In [32]:
from bs4 import BeautifulSoup
from random import randint, shuffle
from time import sleep
import re
import pandas as pd
import json
import math
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
from tqdm import tqdm_notebook as tqdm
import shutil
import numpy as np
import pandas_profiling
import pickle
from IPython.display import Image 
import pickle

# Notebook-wide pandas display settings: no cap on displayed columns,
# untruncated cell text, and floats rendered with thousands separators
# and four decimals.
pd.set_option("display.max_columns", None)
# NOTE(review): -1 is the "no limit" value on older pandas only; pandas >= 1.0
# expects None here — confirm the target pandas version before upgrading.
pd.set_option("display.max_colwidth", -1)
pd.set_option("display.float_format", "{:,.4f}".format)
pd.set_option("display.max_info_rows", 200)


import surprise
from surprise.model_selection import train_test_split
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, SVDpp, BaselineOnly, NMF, CoClustering
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn look for the notebook: white style, "paper" context with
# fonts scaled by 1.8, and a 15-colour "hls" palette.
sns.set_style("white")
sns.set_context("paper", font_scale=1.8)
sns.set_palette("hls", 15)

# Dataset

In [2]:
# Full Sephora product catalogue (snapshot as of June 24); product categories
# are encoded as binary indicator columns.

products = pd.read_csv("df_sephora.csv")

In [3]:
# Per-column overview with non-null counts; max_cols=200 keeps the detailed
# listing even though the frame is very wide.
# NOTE(review): `null_counts` was renamed to `show_counts` in newer pandas —
# confirm the target pandas version before upgrading.
products.info(null_counts=True, max_cols=200)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9007 entries, 0 to 9006
Data columns (total 192 columns):
id                                   9007 non-null object
name                                 9007 non-null object
brand_id                             9007 non-null int64
brand_name                           9007 non-null object
short_desc                           8997 non-null object
long_desc                            8997 non-null object
item                                 9007 non-null int64
item_name                            7696 non-null object
list_price                           9007 non-null float64
variation                            9007 non-null object
variants                             3930 non-null object
price_low                            9007 non-null float64
price_high                           9007 non-null float64
is_limited_edition                   9007 non-null int64
rating                               8792 non-null float64
review_count         

In [34]:
# Only four columns of the raw ratings export are used, so ask the parser for
# just those (usecols) instead of reading every column and discarding the rest.
# Indexing with the same list afterwards pins the column order, because
# usecols returns columns in file order.
rating_columns = ["uid", "author_id", "product_id", "rating"]
df = pd.read_csv("user_item_rating_raw.csv", usecols=rating_columns)[rating_columns].copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [35]:
# Distinct user-profile ids (`uid`) and product ids present in the ratings data.
author_ids = list(df["uid"].unique())
product_ids = list(df["product_id"].unique())

In [36]:
# The id lists above are all that is kept from the ratings frame; remove the
# name (presumably to release the memory held by the frame).
del df

In [40]:
# Catalogue rows restricted to products that occur in the ratings data.
selected_products = products.loc[products["id"].isin(product_ids)]
selected_products.shape

(6839, 192)

In [41]:
# Persist the rated-products subset; get_reco reads this file back to attach
# product details and prices to the predictions.
selected_products.to_csv("df_sephora_selected.csv", index=False)

In [9]:
# Slim view of the rated products (id, rating, review count, category codes),
# ordered from best to worst by rating and then by review count.
products_short = (
    products.loc[products["id"].isin(product_ids),
                 ["id", "rating", "review_count", "categories"]]
    .copy()
    .sort_values(by=["rating", "review_count"], ascending=False)
)

products_short.shape

(6839, 4)

In [145]:
# Ids of products flagged in at least one of the gift-oriented indicator
# columns; saved to for_all.pkl, which get_products loads later.
gift_flag_columns = [
    "gifts_her", "gifts_men", "gifts_them",
    "luxury_gifts", "editors_picks", "gift_sets",
]
for_all = list(
    products.loc[products[gift_flag_columns].eq(1).any(axis=1), "id"].unique())

# Context manager guarantees the pickle file is flushed and closed.
with open("for_all.pkl", "wb") as pkl_file:
    pickle.dump(for_all, pkl_file)

In [125]:
# Ids of products whose `for_men` indicator is set; saved to for_men.pkl,
# which get_products loads later.
for_men = list(products.loc[products["for_men"] == 1, "id"].unique())
# Context manager guarantees the pickle file is flushed and closed.
with open("for_men.pkl", "wb") as pkl_file:
    pickle.dump(for_men, pkl_file)

In [68]:
# Ids of rated products whose category string contains the code "cat150006"
# (stored as the skincare list); rows with missing categories are excluded.
skincare = list(
    products_short.loc[
        products_short["categories"].str.contains("cat150006", na=False), "id"
    ].unique())
# Context manager guarantees the pickle file is flushed and closed.
with open("skincare.pkl", "wb") as pkl_file:
    pickle.dump(skincare, pkl_file)

In [69]:
# Ids of rated products whose category string contains the code "cat160006"
# (stored as the fragrance list); rows with missing categories are excluded.
fragrance = list(
    products_short.loc[
        products_short["categories"].str.contains("cat160006", na=False), "id"
    ].unique())
# Context manager guarantees the pickle file is flushed and closed.
with open("fragrance.pkl", "wb") as pkl_file:
    pickle.dump(fragrance, pkl_file)

In [70]:
# Ids of rated products whose category string contains the code "cat140006"
# (stored as the makeup list); rows with missing categories are excluded.
makeup = list(
    products_short.loc[
        products_short["categories"].str.contains("cat140006", na=False), "id"
    ].unique())
# Context manager guarantees the pickle file is flushed and closed.
with open("makeup.pkl", "wb") as pkl_file:
    pickle.dump(makeup, pkl_file)

In [147]:
# Ids of rated products whose category string matches either code in the
# regex "cat130038|cat140014" (stored as the hair/body list); rows with
# missing categories are excluded.
hair_body = list(
    products_short.loc[
        products_short["categories"].str.contains("cat130038|cat140014", na=False),
        "id",
    ].unique())
# Context manager guarantees the pickle file is flushed and closed.
with open("hair_body.pkl", "wb") as pkl_file:
    pickle.dump(hair_body, pkl_file)

# Model 1

In [None]:
# top_ = get_reco(age=age,
#      stone = stone,
#      stype = stype,
#      sconcerns = sconcerns,
#      ecolor = ecolor,
#      hcolor = hcolor,
#      hcondition = hcondition,
#      hconcerns = hconcerns,
#      cats = cats,
#      rel = rel,
#      gender = gender,
#      budget_max = budget_max,
#      budget_min = budget_min,
#      n=15)

# top_

In [93]:
def get_reco(age,
             stone,
             stype,
             sconcerns,
             ecolor,
             hcolor,
             hcondition,
             hconcerns,
             cats,
             rel,
             gender,
             budget_max = 1000,
             budget_min = 0,
             n=50):
    """Recommend up to ``n`` products for a recipient profile within a budget.

    The candidate products come from ``get_products(cats, rel, gender)``, the
    recipient profile is turned into a uid by ``get_uid`` and the candidates
    are ranked for that uid by ``get_topn``. Rankings are joined with
    ``df_sephora_selected.csv`` and filtered so that
    ``budget_min <= price_low`` and ``price_high <= budget_max``. If fewer
    than ``n`` products survive, rankings of the uids returned by
    ``get_similar_uids`` are appended (duplicates on ``iid`` dropped) until
    ``n`` rows are available or the similar uids are exhausted.

    Parameters
    ----------
    age, stone, stype, sconcerns, ecolor, hcolor, hcondition, hconcerns
        Recipient attributes, forwarded unchanged to ``get_uid``.
    cats, rel, gender
        Category list, relationship and pronoun, forwarded to ``get_products``.
    budget_max, budget_min : number
        Inclusive price bounds.
    n : int
        Number of recommendations wanted.

    Returns
    -------
    pandas.DataFrame or None
        At most ``n`` rows (prediction columns merged with the product
        columns). ``None`` is returned, after printing an error message,
        when the budget bounds are inverted, no candidate products exist, or
        the profile cannot be converted into a uid.
    """
    if budget_min > budget_max:
        print("Error: min budget is greater than max budget")
        return None

    products_to_consider = get_products(cats, rel, gender)
    # get_products reports an invalid relationship by returning a message string.
    if isinstance(products_to_consider, str) or len(products_to_consider) == 0:
        print("Error: reenter details xxx")
        return None

    uid_query = get_uid(age, stone, stype, sconcerns, ecolor, hcolor,
                        hcondition, hconcerns)
    if uid_query == "error":
        print("Error: reenter details")
        return None

    catalog = pd.read_csv("df_sephora_selected.csv")

    def _within_budget(ranked):
        # Attach product details to the ranked iids and keep the affordable ones.
        merged = ranked.merge(catalog, how="left", left_on="iid", right_on="id")
        print("Getting iids within the budget...")
        affordable = ((merged.price_high <= budget_max)
                      & (merged.price_low >= budget_min))
        return merged[affordable].reset_index(drop=True)

    topn = get_topn(uid_query, products_to_consider)
    print("There are {} iids to consider...".format(len(topn)))
    topn = _within_budget(topn).head(n)

    if len(topn) == n:
        return topn

    print("Not enough iids within the budget...")
    ager = age_range(age)
    for uid in get_similar_uids(uid_query, ager):
        addtl = _within_budget(get_topn(uid, products_to_consider))
        # DataFrame.append was removed in pandas 2.0; concat works on every version.
        topn = pd.concat([topn, addtl],
                         ignore_index=True).drop_duplicates(subset="iid")
        if len(topn) >= n:
            return topn.head(n)
        print("Not enough iids within the budget...")

    return topn

In [7]:
def show_image(product_id, image_folder_path=None, width=250, height=250):
    """Display the PNG image of a product inline in the notebook.

    Parameters
    ----------
    product_id : str
        Product id; the image file is expected to be ``<product_id>.png``.
    image_folder_path : str, optional
        Folder containing the images. Defaults to the original author's local
        image folder, so existing calls keep working; pass a path to run the
        notebook on another machine.
    width, height : int
        Display size in pixels (default 250 x 250).
    """
    if image_folder_path is None:
        image_folder_path = "/Users/valmadrid/DataScienceBootcamp/Projects/Gift Recommendation/Gift-Recommendation/image/"
    image_path = os.path.join(image_folder_path, product_id + ".png")
    display(Image(image_path, width=width, height=height))

In [96]:
def get_similar_uids(uidx, ager):
    """Return the uids most similar to ``uidx``, same age group first.

    ``user_matrix.csv`` is read on every call. Its ``similar_uid`` column
    holds "_"-joined row positions into the same file. If ``uidx`` has no row
    of its own, relaxed variants of it are tried: each "_"-separated field
    after the first is replaced by "0" one at a time, and the first matching
    row in the file is used.

    Parameters
    ----------
    uidx : str
        Query uid ("_"-joined profile fields).
    ager : str
        Age-group prefix; uids starting with it are moved to the front while
        the relative order inside each group is kept.

    Returns
    -------
    list of str
        Similar uids, or an empty list when neither the uid nor any relaxed
        variant is present in the matrix.
    """
    print("Getting similar uids...")

    uid_similar = pd.read_csv("user_matrix.csv")

    matches = uid_similar.loc[uid_similar.uid == uidx, "similar_uid"]

    if matches.empty:
        # Build the relaxed uids from the query itself (the original code
        # referenced a misspelled, undefined name here and raised NameError).
        fields = uidx.split("_")
        relaxed_uids = []
        for pos in range(1, len(fields)):
            if fields[pos] != "0":
                relaxed = list(fields)
                relaxed[pos] = "0"
                relaxed_uids.append("_".join(relaxed))
        matches = uid_similar.loc[uid_similar.uid.isin(relaxed_uids),
                                  "similar_uid"]

    if matches.empty:
        # Nothing to fall back on: let the caller continue with what it has.
        print("No similar uids found...")
        return []

    # str() guards against a cell that pandas parsed as a bare integer.
    positions = str(matches.iloc[0]).split("_")
    similar_uids = [uid_similar.uid.iloc[int(pos)] for pos in positions]

    same_age = [uid for uid in similar_uids if uid.startswith(ager)]
    other_age = [uid for uid in similar_uids if not uid.startswith(ager)]
    return same_age + other_age

In [25]:
def get_products(cats=["0"], rel="0", gender="0"):
    """Select the candidate product ids for a recipient.

    The id lists are loaded from the pickle files written earlier in the
    notebook (for_men, for_all, hair_body, skincare, makeup, fragrance).

    Selection rules
    ---------------
    * gender "they"/"0": office relationship (client, boss, colleague)
      -> the ``for_all`` list only; otherwise men + hair_body + skincare +
      fragrance + for_all.
    * gender "he": office relationship -> ``for_all`` and ``fragrance``
      items that are also in the men list; otherwise the men list.
    * any other gender: the requested categories (skincare, makeup,
      fragrance), or for_all + skincare + makeup + fragrance when ``cats``
      is empty or starts with "0"; items in the men list are removed.

    Parameters
    ----------
    cats : list of str
        Requested categories; ``["0"]`` (or an empty list) means no preference.
    rel : str
        One of family, colleague, boss, client, friend, significant other.
    gender : str
        "he", "they", "0" (unknown) or anything else (handled as "she").

    Returns
    -------
    list of str, or str
        De-duplicated product ids in arbitrary order, or the message
        "please reenter details (relationship)" when ``rel`` is not valid.
    """
    print("Getting iids...")

    relationship = [
        "family", "colleague", "boss", "client", "friend", "significant other"
    ]

    if rel not in relationship:
        return "please reenter details (relationship)"

    def _load_ids(path):
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook produced itself.
        with open(path, "rb") as pkl_file:
            return pickle.load(pkl_file)

    men = _load_ids("for_men.pkl")
    for_all = _load_ids("for_all.pkl")
    hair_body = _load_ids("hair_body.pkl")
    skincare = _load_ids("skincare.pkl")
    makeup = _load_ids("makeup.pkl")
    fragrance = _load_ids("fragrance.pkl")

    men_lookup = set(men)  # O(1) membership tests instead of list scans
    is_office = rel in ["client", "boss", "colleague"]

    if gender in ["they", "0"]:
        # Previously the office case was a stand-alone `if` whose result was
        # overwritten by the broader "they"/"0" branch further down.
        if is_office:
            products_to_consider = for_all
        else:
            products_to_consider = (men + hair_body + skincare + fragrance
                                    + for_all)

    elif gender == "he":
        if is_office:
            products_to_consider = (
                [i for i in for_all if i in men_lookup]
                + [i for i in fragrance if i in men_lookup])
        else:
            products_to_consider = men

    else:
        by_category = {
            "skincare": skincare,
            "makeup": makeup,
            "fragrance": fragrance,
        }
        if not cats or cats[0] == "0":
            products_to_consider = for_all + skincare + makeup + fragrance
        else:
            products_to_consider = []
            for cat in cats:
                # unknown category names are ignored
                products_to_consider.extend(by_category.get(cat, []))

        products_to_consider = [
            i for i in products_to_consider if i not in men_lookup
        ]

    return list(set(products_to_consider))

In [21]:
def age_range(age):
    """Map an age to the age-group label used as the first uid field.

    Parameters
    ----------
    age : int, float or numeric str
        Age in years. ``0`` means "unknown". Fractional ages are truncated.

    Returns
    -------
    str
        "0" for unknown, "13to17" (also used for ages 1-12, with a printed
        warning), "18to24", "25to34", "35to44", "45to54", "over54", or
        "error" (after printing a message) when the age is not a number in
        the range 0..100.
    """
    age_dict = {
        "13to17": range(1, 17 + 1),
        "18to24": range(18, 24 + 1),
        "25to34": range(25, 34 + 1),
        "35to44": range(35, 44 + 1),
        "45to54": range(45, 54 + 1),
        "over54": range(55, 100 + 1),  # starts at 55: 54 belongs to "45to54"
        "0": [0]
    }

    # Accept numeric strings (get_uid defaults to "0") and floats; reject
    # anything non-numeric, NaN/inf, or outside 0..100.
    try:
        age_value = float(age)
        if not 0 <= age_value <= 100:
            raise ValueError(age)
        age = int(age_value)
    except (TypeError, ValueError):
        print("please check age input")
        return "error"

    for age_group, ages in age_dict.items():
        if age in ages:
            # Warn for real ages under 13 only, not for the "unknown" marker 0.
            if 0 < age < 13:
                print("products may not be aged appropriate")
            return age_group

    # Unreachable after validation; kept so the function never returns None.
    return "error"


def get_uid(age="0",
            stone="0",
            stype="0",
            sconcerns="0",
            ecolor="0",
            hcolor="0",
            hcondition="0",
            hconcerns="0"):
    """Build the "_"-joined uid string for a recipient profile.

    Every attribute is validated against its list of accepted (lower-case,
    human-readable) options; "0" means unknown. Skin concern, hair condition
    and hair concern are translated from the human-readable spelling to the
    camel-case spelling used inside uids.

    Parameters
    ----------
    age : int or numeric str
        Age in years, converted to an age group by ``age_range``.
    stone, stype, sconcerns, ecolor, hcolor, hcondition, hconcerns : str
        Skin tone, skin type, skin concern, eye colour, hair colour, hair
        condition and hair concern.

    Returns
    -------
    str
        ``agegroup_stone_stype_sconcerns_ecolor_hcolor_hcondition_hconcerns``
        or the string "error" when any attribute is not accepted.
    """
    skintone_options = [
        "0", 'porcelain', 'fair', 'light', 'medium', 'tan', 'olive', 'dark',
        'deep', 'ebony'
    ]

    skintype_options = ["0", "normal", "oily", "dry", "combination"]

    sconcerns_options1 = [
        "0", "acne", "aging", "blackheads", "uneven skintones", "sensitivity",
        "dark circles", "calluses", "pores", "redness", "dullness",
        "cellulite", "sun damage", "cuticles", "stretch marks", "puffiness"
    ]

    sconcerns_options2 = [
        "0", "acne", "aging", "blackheads", "unevenSkinTones", "sensitivity",
        "darkCircles", "calluses", "pores", "redness", "dullness", "cellulite",
        "sunDamage", "cuticles", "stretchMarks", "puffiness"
    ]

    ecolor_options = ["0", 'green', 'blue', 'brown', 'hazel', 'gray']

    hcolor_options = [
        "0", 'brunette', 'black', 'red', 'blonde', 'gray', 'auburn'
    ]

    hcondition_options1 = [
        "0", 'chemically treated', 'normal', 'wavy', 'fine', 'curly', 'coarse',
        'dry', 'oily', 'straight'
    ]

    hcondition_options2 = [
        "0", 'chemicallyTreated', 'normal', 'wavy', 'fine', 'curly', 'coarse',
        'dry', 'oily', 'straight'
    ]

    hconcerns_options1 = [
        "0", 'color protection', 'damage', 'curly enchancing', 'dandruff',
        'volumizing', 'anti aging', 'frizz', 'hold', 'thinning', 'oiliness',
        'straightening & smoothing', 'heat protection', 'shine'
    ]

    hconcerns_options2 = [
        "0", 'ColorProtection', 'Damage', 'CurlyEnhancing', 'Dandruff',
        'Volumizing', 'AntiAging', 'Frizz', 'Hold', 'Thinning', 'Oiliness',
        'StraighteningSmoothing', 'HeatProtection', 'Shine'
    ]

    # Human-readable option -> spelling used inside the uid.
    sconcerns_codes = dict(zip(sconcerns_options1, sconcerns_options2))
    hcondition_codes = dict(zip(hcondition_options1, hcondition_options2))
    hconcerns_codes = dict(zip(hconcerns_options1, hconcerns_options2))
    # The listed option is misspelled ("enchancing"); accept the correct
    # spelling as well so both forms resolve to the same code.
    hconcerns_codes["curly enhancing"] = "CurlyEnhancing"

    print("Getting uid...")

    # The default age is the string "0"; convert numeric strings before the
    # numeric comparisons performed by age_range.
    if isinstance(age, str):
        try:
            age = int(age)
        except ValueError:
            return "error"

    ager = age_range(age)

    # age_range may signal failure with "error" (or None for ages it cannot place).
    if ager is None or ager == "error":
        return "error"

    if stone not in skintone_options:
        return "error"

    if stype not in skintype_options:
        return "error"

    if sconcerns not in sconcerns_codes:
        return "error"

    if ecolor not in ecolor_options:
        return "error"

    if hcolor not in hcolor_options:
        return "error"

    if hcondition not in hcondition_codes:
        return "error"

    if hconcerns not in hconcerns_codes:
        return "error"

    uid_query = "_".join([
        ager, stone, stype, sconcerns_codes[sconcerns], ecolor, hcolor,
        hcondition_codes[hcondition], hconcerns_codes[hconcerns]
    ])

    return uid_query

In [10]:
def get_topn(uid, product_ids):
    """Rank products for one uid by predicted rating.

    The fitted model is unpickled from ``SVD_best.pkl`` on every call and its
    ``test`` method is run on one ``(uid, iid, 0)`` triple per product; the
    trailing 0 is a filler value (presumably the unused "true rating" slot of
    the surprise test-set format — confirm against the model's API).

    Parameters
    ----------
    uid : str
        User/profile id to predict for.
    product_ids : iterable of str
        Candidate product ids.

    Returns
    -------
    pandas.DataFrame
        Columns ``iid`` and ``est`` (taken from each prediction's ``est``
        attribute), sorted by ``est`` descending with a fresh index.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a model file
    # produced by this project.
    with open("SVD_best.pkl", "rb") as model_file:
        algo = pickle.load(model_file)

    # Materialise once: the ids are used both for prediction and as a column.
    product_ids = list(product_ids)

    predictions = algo.test([(uid, iid, 0) for iid in product_ids])

    item_predictions = pd.DataFrame({
        "iid": product_ids,
        "est": [pred.est for pred in predictions]
    }).sort_values("est", ascending=False).reset_index(drop=True)

    return item_predictions

In [103]:
def get_gifts_model1(n=15):
    """Interactively collect a recipient profile and return recommendations.

    Prompts on stdin for relationship, pronoun, age, budget and the skin /
    eye / hair attributes, then delegates to ``get_reco``. Answers are
    stripped of surrounding whitespace and lower-cased. The product-category
    question is asked only when the answer can influence the result: for
    recipients whose pronoun is neither he / they / 0 and whose relationship
    is not client, boss or colleague.

    Parameters
    ----------
    n : int
        Number of recommendations requested from ``get_reco``.

    Returns
    -------
    Whatever ``get_reco`` returns (a DataFrame, or None after an error
    message).

    Raises
    ------
    ValueError
        If the age or a budget answer is not numeric.
    """
    def _ask(prompt):
        # Normalise free-text answers so " He " and "he" are treated alike.
        return str(input(prompt)).strip().lower()

    print("Select from: family, significant other, boss\n",
          "client, colleague, friend")
    rel = _ask("Relationship to recipient: ")
    gender = _ask("He, She or They:  (enter 0 if unknown)")
    age = int(input("Age: (please provide estimate if unknown): "))
    print("\n")

    budget_max = float(input("Maximum budget in US$: "))
    budget_min = float(input("Minimum budget in US$: "))

    print("\n")
    print("Select from: porcelain, fair, light, medium, olive, tan, dark,\n",
          "deep, ebony")
    stone = _ask("Skintone: (enter 0 if unknown):")
    print("\n")
    print("Select from: normal, oily, dry, combination")
    stype = _ask("Primary skintype: (enter 0 if unknown):")
    print("\n")
    print(
        "Select from: acne, aging, blackheads, uneven skintones, sensitivity,\n",
        "dark circles, calluses, pores, redness, dullness, cellulite,\n",
        "sun damage, cuticles, stretch marks, puffiness")
    sconcerns = _ask("Primary skinconcern: (enter 0 if unknown):")
    print("\n")
    print("Select from: green, blue, brown, hazel, gray")
    ecolor = _ask("Eye color: (enter 0 if unknown):")
    print("\n")
    print("Select from: brunette, black, red, blonde, gray, auburn")
    hcolor = _ask("Hair color: (enter 0 if unknown):")
    print("\n")
    print("Select from: chemically treated, normal, wavy, fine,\n",
          "curly, coarse, dry, oily, straight")
    hcondition = _ask("Primary hair condition: (enter 0 if unknown):")
    print("\n")
    print(
        "Select from: color protection, damage, curly enchancing, dandruff,\n",
        "volumizing, anti aging, frizz, hold, thinning, oiliness,\n",
        "straightening & smoothing, heat protection, shine")
    hconcerns = _ask("Primary hair concern: (enter 0 if unknown):")

    # get_products ignores the category list for "he", "they" and unknown
    # recipients, and office relationships always use the default list, so
    # the category question is skipped in those cases. (The old check tested
    # for "dad"/"brother"/"son", which are not valid relationship values.)
    if gender in ["they", "0", "he"]:
        cats = ["0"]
    elif rel in ["client", "boss", "colleague"] and gender == "she":
        cats = ["0"]
    else:
        print(
            "What type of products? Skincare, Makeup, Fragrance:\n(separate options using comma ',')"
        )
        cats = str(input("Enter here: ")).replace(" ", "").replace(
            "-", "").lower().split(",")

    top_ = get_reco(age=age,
                    stone=stone,
                    stype=stype,
                    sconcerns=sconcerns,
                    ecolor=ecolor,
                    hcolor=hcolor,
                    hcondition=hcondition,
                    hconcerns=hconcerns,
                    cats=cats,
                    rel=rel,
                    gender=gender,
                    budget_max=budget_max,
                    budget_min=budget_min,
                    n=n)


    return top_