In [1]:
import numpy as np
import pandas as pd
import gensim 
from gensim.models import Word2Vec
from urllib import request
import warnings
import re
warnings.filterwarnings('ignore')
import spacy




# Data Cleaning

## Import product data and filter out data only for women's clothing

In [2]:
data = pd.read_excel("Behold+product+data+04262021.xlsx")
data['is_womens_clothing'] = data.apply(lambda x: x.astype(str).str.\
                            findall(r'\b(woman|women|girls?|females?|lady|ladies|unisex|women\'s|woman\'s)\b', re.IGNORECASE).any(), axis=1)

# Turn results from "list" to "boolean" (binary)
for i in range(len(data['is_womens_clothing'])):
    if len(data.loc[i,'is_womens_clothing']) == 0:
        data.loc[i,'is_womens_clothing'] = 0
    else:
        data.loc[i,'is_womens_clothing'] = 1
        
# Filter out women's clothings only for the focus of this project
data = data.loc[data.is_womens_clothing == 1, :]

## Combine outfit data & additional tags data with product data

In [3]:
outfit = pd.read_csv("outfit_combinations USC.csv")
fulldata = data.merge(outfit, left_on = "product_id", right_on = "product_id", how = "inner")
fulldata.drop_duplicates()
fulldata = fulldata.loc[:,["product_id","outfit_id","outfit_item_type","brand_x","product_full_name","description","details"]]
fulldata.rename(columns={"brand_x": "brand"}, inplace=True)

In [4]:
tags=pd.read_csv("usc_additional_tags USC.csv")

In [5]:
tag=pd.DataFrame(tags.groupby(['product_id']).agg({'attribute_value':lambda x:' '.join(set(x))}))

In [6]:
# add one more feature--tags
tag

Unnamed: 0_level_0,attribute_value
product_id,Unnamed: 1_level_1
01DMBRYVA2P5H24WK0HTK4R0A1,bottom beiges
01DMBRYVA2PEPWFTT7RMP5AA1T,blazerscoatsjackets
01DMBRYVA2Q2ST7MNYR6EEY4TK,onepiece
01DMBRYVA2S5T9W793F4CY41HE,handbags accessory
01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe
...,...
01E6076GTCE5P3VH76VWJH4MY9,vacation boho none medium blacks weekend dayto...
01E6078G3GRATF2C96VKYYWSGD,modern none bottom blacks highover9 nylon skin...
01E6079DG58YW9K78D57C6J2Y1,buttonedfront vacation modern boho midimidcalf...
01E6079QFKH4HPZFQ31T6WDRRX,cropped denim no clean bottom weekend slim blu...


In [7]:
fulldata = fulldata.merge(tag, left_on = "product_id", right_on = "product_id", how = "inner")

In [8]:
fulldata.head()

Unnamed: 0,product_id,outfit_id,outfit_item_type,brand,product_full_name,description,details,attribute_value
0,01DVA59VHYAPT4PVX32NXW91G5,01DVA879D7TQ59VPTTGCMJWWSK,shoe,Tibi,Juan Embossed Mules,Tibi's Juan embossed mules are made from shiny...,\nAs seen on the Pre-Fall ‘19 runway\nHeel mea...,modern none block medium blacks mule daytonigh...
1,01DT2D39XSRFC204J231X3C7XK,01DVC571VTD70793BKGPVSTF2A,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,\nBelted waist fastening \nComposition: cotton...,belted purecotton blazerscoatsjackets oversize...
2,01DT2D39XSRFC204J231X3C7XK,01DVC571VV0DYHPSK1GJDPQTQT,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,\nBelted waist fastening \nComposition: cotton...,belted purecotton blazerscoatsjackets oversize...
3,01DT2D39XSRFC204J231X3C7XK,01DVC571VV2KR8G4TAZWZM0YQH,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,\nBelted waist fastening \nComposition: cotton...,belted purecotton blazerscoatsjackets oversize...
4,01DT2D39XSRFC204J231X3C7XK,01DVC571VV8YNZS2NC6JCTADP0,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,\nBelted waist fastening \nComposition: cotton...,belted purecotton blazerscoatsjackets oversize...


## Replace null value and select useful features

In [9]:
fulldata = fulldata.replace(np.nan, 'UNKNOWN_TOKEN', regex=True)

# Remove "\n" entries in "details" column
fulldata['details'] = fulldata['details'].str.replace("\n", "")

# Concat columns with text
fulldata['text'] = fulldata['outfit_item_type']+' '+fulldata['brand']+' '+fulldata['product_full_name']+' '+fulldata['description']+' '+fulldata['details']+' '+fulldata['attribute_value']

fulldata.head()

Unnamed: 0,product_id,outfit_id,outfit_item_type,brand,product_full_name,description,details,attribute_value,text
0,01DVA59VHYAPT4PVX32NXW91G5,01DVA879D7TQ59VPTTGCMJWWSK,shoe,Tibi,Juan Embossed Mules,Tibi's Juan embossed mules are made from shiny...,As seen on the Pre-Fall ‘19 runwayHeel measure...,modern none block medium blacks mule daytonigh...,shoe Tibi Juan Embossed Mules Tibi's Juan embo...
1,01DT2D39XSRFC204J231X3C7XK,01DVC571VTD70793BKGPVSTF2A,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,Belted waist fastening Composition: cotton Dry...,belted purecotton blazerscoatsjackets oversize...,accessory2 Frame Belted Double-Faced Cotton Co...
2,01DT2D39XSRFC204J231X3C7XK,01DVC571VV0DYHPSK1GJDPQTQT,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,Belted waist fastening Composition: cotton Dry...,belted purecotton blazerscoatsjackets oversize...,accessory2 Frame Belted Double-Faced Cotton Co...
3,01DT2D39XSRFC204J231X3C7XK,01DVC571VV2KR8G4TAZWZM0YQH,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,Belted waist fastening Composition: cotton Dry...,belted purecotton blazerscoatsjackets oversize...,accessory2 Frame Belted Double-Faced Cotton Co...
4,01DT2D39XSRFC204J231X3C7XK,01DVC571VV8YNZS2NC6JCTADP0,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,Belted waist fastening Composition: cotton Dry...,belted purecotton blazerscoatsjackets oversize...,accessory2 Frame Belted Double-Faced Cotton Co...


In [10]:
len(fulldata)

1547

## Use Spacy to remove stopwords and implement lemmatization to get corpus be ready to be trained

In [11]:
#use large size
nlp = spacy.load('en_core_web_lg') 

In [12]:
# the stopwords_removed_docs is used for creating document vectors
docs=fulldata['text']
stopwords_removed_docs = list(map(lambda doc: [token.text for token in nlp(doc) if not 
                                                        token.is_stop | token.is_punct | token.is_space], docs))
 

In [13]:
len(stopwords_removed_docs )

1547

In [14]:
# clean_docs is used for training Doc2vec model
clean_docs = list(map(lambda doc: " ".join([token.text for token in nlp(doc) if not 
                                                        token.is_stop | token.is_punct | token.is_space]), docs))

# Used Doc2Vec to creat recommendation system

## Train Doc2Vec model

In [15]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
import string
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(clean_docs)]
model = Doc2Vec(documents, vector_size=50, window=4, min_count=2, workers=4)

## Get document vector for each document

In [16]:
docs_vector=[]
for i in stopwords_removed_docs:
    docs_vector.append(model.infer_vector(i))

In [17]:
docs_vector

[array([ 1.0240340e-03,  7.5368141e-03, -4.0190904e-03, -6.1662183e-03,
        -6.2288524e-04, -3.6837457e-04,  4.2732488e-03,  3.8910592e-03,
         7.7155088e-03,  5.1716375e-03, -6.1707823e-03,  3.9479397e-03,
         1.9037497e-03,  8.7071775e-04,  8.6278152e-03,  5.3165602e-03,
         1.4156520e-03, -9.8266061e-03, -2.8021438e-03,  6.9420459e-03,
         2.6562940e-03, -4.4415300e-03,  1.0181058e-03,  5.5265534e-03,
         4.9950900e-03, -8.1800111e-03, -2.4308830e-03, -9.0989675e-03,
         7.7361395e-03, -9.7965458e-03,  8.6649535e-03, -4.6708584e-03,
        -8.2606282e-03, -7.7512800e-03, -1.0084605e-03, -2.7329326e-05,
         6.3922408e-04, -8.9397491e-04,  4.1293479e-03,  3.8860666e-03,
         9.9329352e-03,  7.5475336e-04,  3.8373531e-03,  3.2503188e-03,
        -4.6288534e-03,  2.6969302e-03, -9.9899834e-03, -4.0468471e-03,
         7.9097264e-03, -7.2512003e-03], dtype=float32),
 array([-9.0154307e-03,  9.1916481e-03, -3.3004285e-04,  8.8254996e-03,
       

## Find the most similar outfit for the query

In [18]:
def outfit_recommendation(id):
    
    #Use fuzzy-matching to find product_id that most similar to the input id
    foundID = process.extractOne(id,outfit['product_id'],scorer=fuzz.token_set_ratio)[0]
    
    #Get all outfit_id for which involve matched product
    outfits = outfit[outfit.product_id==foundID].outfit_id
    
    #Select first outfit
    products = outfit[outfit.outfit_id==outfits.values[0]]
    
    outfitDict={}
    #Formatting output of function
    for i in products.index:
        key = products.loc[i,"outfit_item_type"]
        outfitDict[key]=products.loc[i,"product_full_name"]+"("+products.loc[i,"product_id"]+")""
    return outfitDict

In [19]:
def recommendations(query,docs_vector,nlp,fulldata):
    query_token=[token.text for token in nlp(query) if not token.is_stop | token.is_punct | token.is_space]
    query_vector=model.infer_vector(query_token).reshape(1, -1)
    cos = 0
    i=-1
    for doc_vector in docs_vector:
        i+=1
        similar=cosine_similarity(query_vector, doc_vector.reshape(1, -1))
        if similar > cos:
            cos = similar
            productid=fulldata.loc[i,"product_id"]
    return outfit_recommendation(productid)

In [20]:
test1="slim fitting, straight leg pant with a center back zipper and slightly cropped leg Reformation"
recommendations(test1,docs_vector,nlp,fulldata)

accessory2: Draped wool cardigan (01DT51AATB6TJDA52FSD48YT2Y)
top: The Bound Sleeve T-Shirt (01DT8NBXPWVPW48VYJTBV2M8MC)
bottom: Eva Cropped High-Rise Wide-Leg Jeans (01DT8NCZ5QVTKC21F495J1WKPJ)
accessory1: Pristine Mini Two-Tone Leather Shoulder Bag (01DTJ8BNRJCMD36E0NY19MKZD3)
shoe: Celeste Knotted Sandals (01DTJ8BWWDE8MAN8P91395TGQ4)
