# Note before use:

Cell [1-16] contain codes that need to be run one time when the server / machine initializes

Cell [17-18] contains the search function that can be called directly by entering a user search query

Cell [19-20] contain two sample queries

In [1]:
import numpy as np
import pandas as pd
import gensim 
from gensim.models import Word2Vec
from urllib import request
import warnings
import re
warnings.filterwarnings('ignore')
import spacy




# Data Cleaning

## Import product data and filter out data only for women's clothing

In [2]:
data = pd.read_excel("Behold+product+data+04262021.xlsx")
data['is_womens_clothing'] = data.apply(lambda x: x.astype(str).str.\
                            findall(r'\b(woman|women|girls?|females?|lady|ladies|unisex|women\'s|woman\'s)\b', re.IGNORECASE).any(), axis=1)

# Turn results from "list" to "boolean" (binary)
for i in range(len(data['is_womens_clothing'])):
    if len(data.loc[i,'is_womens_clothing']) == 0:
        data.loc[i,'is_womens_clothing'] = 0
    else:
        data.loc[i,'is_womens_clothing'] = 1
        
# Filter out women's clothings only for the focus of this project
data = data.loc[data.is_womens_clothing == 1, :]

## Combine outfit data & additional tags data with product data

In [3]:
outfit = pd.read_csv("outfit_combinations USC.csv")
fulldata = data.merge(outfit, left_on = "product_id", right_on = "product_id", how = "inner")
fulldata.drop_duplicates()
fulldata = fulldata.loc[:,["product_id","outfit_id","outfit_item_type","brand_x","product_full_name","description","details"]]
fulldata.rename(columns={"brand_x": "brand"}, inplace=True)

In [4]:
tags=pd.read_csv("usc_additional_tags USC.csv")

In [5]:
tag=pd.DataFrame(tags.groupby(['product_id']).agg({'attribute_value':lambda x:' '.join(set(x))}))

In [6]:
# add one more feature--tags
tag

Unnamed: 0_level_0,attribute_value
product_id,Unnamed: 1_level_1
01DMBRYVA2P5H24WK0HTK4R0A1,beiges bottom
01DMBRYVA2PEPWFTT7RMP5AA1T,blazerscoatsjackets
01DMBRYVA2Q2ST7MNYR6EEY4TK,onepiece
01DMBRYVA2S5T9W793F4CY41HE,handbags accessory
01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe
...,...
01E6076GTCE5P3VH76VWJH4MY9,none vacation classic medium closedtoe roundto...
01E6078G3GRATF2C96VKYYWSGD,whites modern spandex none mesh highover9 fitt...
01E6079DG58YW9K78D57C6J2Y1,blues modern vacation dress shortsleeve pureco...
01E6079QFKH4HPZFQ31T6WDRRX,clean blues classic weekend denim slim cottonb...


In [7]:
fulldata = fulldata.merge(tag, left_on = "product_id", right_on = "product_id", how = "inner")

In [8]:
fulldata.head()

Unnamed: 0,product_id,outfit_id,outfit_item_type,brand,product_full_name,description,details,attribute_value
0,01DVA59VHYAPT4PVX32NXW91G5,01DVA879D7TQ59VPTTGCMJWWSK,shoe,Tibi,Juan Embossed Mules,Tibi's Juan embossed mules are made from shiny...,\nAs seen on the Pre-Fall ‘19 runway\nHeel mea...,modern pointedtoe classic none work block slid...
1,01DT2D39XSRFC204J231X3C7XK,01DVC571VTD70793BKGPVSTF2A,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,\nBelted waist fastening \nComposition: cotton...,blazerscoatsjackets purecotton coat longsleeve...
2,01DT2D39XSRFC204J231X3C7XK,01DVC571VV0DYHPSK1GJDPQTQT,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,\nBelted waist fastening \nComposition: cotton...,blazerscoatsjackets purecotton coat longsleeve...
3,01DT2D39XSRFC204J231X3C7XK,01DVC571VV2KR8G4TAZWZM0YQH,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,\nBelted waist fastening \nComposition: cotton...,blazerscoatsjackets purecotton coat longsleeve...
4,01DT2D39XSRFC204J231X3C7XK,01DVC571VV8YNZS2NC6JCTADP0,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,\nBelted waist fastening \nComposition: cotton...,blazerscoatsjackets purecotton coat longsleeve...


## Replace null value and select useful features

In [9]:
fulldata = fulldata.replace(np.nan, 'UNKNOWN_TOKEN', regex=True)

# Remove "\n" entries in "details" column
fulldata['details'] = fulldata['details'].str.replace("\n", "")

# Concat columns with text
fulldata['text'] = fulldata['outfit_item_type']+' '+fulldata['brand']+' '+fulldata['product_full_name']+' '+fulldata['description']+' '+fulldata['details']+' '+fulldata['attribute_value']

fulldata.head()

Unnamed: 0,product_id,outfit_id,outfit_item_type,brand,product_full_name,description,details,attribute_value,text
0,01DVA59VHYAPT4PVX32NXW91G5,01DVA879D7TQ59VPTTGCMJWWSK,shoe,Tibi,Juan Embossed Mules,Tibi's Juan embossed mules are made from shiny...,As seen on the Pre-Fall ‘19 runwayHeel measure...,modern pointedtoe classic none work block slid...,shoe Tibi Juan Embossed Mules Tibi's Juan embo...
1,01DT2D39XSRFC204J231X3C7XK,01DVC571VTD70793BKGPVSTF2A,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,Belted waist fastening Composition: cotton Dry...,blazerscoatsjackets purecotton coat longsleeve...,accessory2 Frame Belted Double-Faced Cotton Co...
2,01DT2D39XSRFC204J231X3C7XK,01DVC571VV0DYHPSK1GJDPQTQT,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,Belted waist fastening Composition: cotton Dry...,blazerscoatsjackets purecotton coat longsleeve...,accessory2 Frame Belted Double-Faced Cotton Co...
3,01DT2D39XSRFC204J231X3C7XK,01DVC571VV2KR8G4TAZWZM0YQH,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,Belted waist fastening Composition: cotton Dry...,blazerscoatsjackets purecotton coat longsleeve...,accessory2 Frame Belted Double-Faced Cotton Co...
4,01DT2D39XSRFC204J231X3C7XK,01DVC571VV8YNZS2NC6JCTADP0,accessory2,Frame,Belted Double-Faced Cotton Coat,There are only a few great coats you need to b...,Belted waist fastening Composition: cotton Dry...,blazerscoatsjackets purecotton coat longsleeve...,accessory2 Frame Belted Double-Faced Cotton Co...


In [10]:
len(fulldata)

1547

## Use Spacy to remove stopwords and implement lemmatization to get corpus be ready to be trained

In [11]:
#use large size
nlp = spacy.load('en_core_web_lg') 

In [12]:
# the stopwords_removed_docs is used for creating document vectors
docs=fulldata['text']
stopwords_removed_docs = list(map(lambda doc: [token.text for token in nlp(doc) if not 
                                                        token.is_stop | token.is_punct | token.is_space], docs))
 

In [13]:
len(stopwords_removed_docs )

1547

In [14]:
# clean_docs is used for training Doc2vec model
clean_docs = list(map(lambda doc: " ".join([token.text for token in nlp(doc) if not 
                                                        token.is_stop | token.is_punct | token.is_space]), docs))

# Used Doc2Vec to creat recommendation system

## Train Doc2Vec model

In [15]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
import string
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(clean_docs)]
model = Doc2Vec(documents, vector_size=50, window=4, min_count=2, workers=4)

## Get document vector for each document

In [16]:
docs_vector=[]
for i in stopwords_removed_docs:
    docs_vector.append(model.infer_vector(i))

## Find the most similar outfit for the query

In [17]:
def outfit_recommendation(id):
    
    #Use fuzzy-matching to find product_id that most similar to the input id
    foundID = process.extractOne(id,outfit['product_id'],scorer=fuzz.token_set_ratio)[0]
    
    #Get all outfit_id for which involve matched product
    outfits = outfit[outfit.product_id==foundID].outfit_id
    
    #Select first outfit
    products = outfit[outfit.outfit_id==outfits.values[0]]
    
    outfitDict={}
    #Formatting output of function
    for i in products.index:
        key = products.loc[i,"outfit_item_type"]
        outfitDict[key]=products.loc[i,"product_full_name"]+"("+products.loc[i,"product_id"]+")"
    return outfitDict

In [18]:
def recommendations(query,docs_vector,nlp,fulldata):
    query_token=[token.text for token in nlp(query) if not token.is_stop | token.is_punct | token.is_space]
    query_vector=model.infer_vector(query_token).reshape(1, -1)
    cos = 0
    i=-1
    for doc_vector in docs_vector:
        i+=1
        similar=cosine_similarity(query_vector, doc_vector.reshape(1, -1))
        if similar > cos:
            cos = similar
            productid=fulldata.loc[i,"product_id"]
    return outfit_recommendation(productid)

In [19]:
test1="slim fitting, straight leg pant with a center back zipper and slightly cropped leg Reformation"
recommendations(test1,docs_vector,nlp,fulldata)

{'accessory1': 'Oversized Satin Cloqué Blazer(01DV4HSATZ3X8G7M9CWKFM1F9H)',
 'top': 'Leopold Tinsel Baby Doll Blouse(01DVS1BSSVTBEPS1NJP64JXRD8)',
 'bottom': 'Toni mid-rise straight-leg jeans(01DVS1DM6H1SC40YFGP0NWQ5SZ)',
 'shoe': 'Organza Magani Mules(01DVS1F4V223Y662VKYJQSWK1C)',
 'accessory2': 'Mini Leather Top Handle Bag(01DVS1SHB0MXNERKX17C3P5YXS)'}

In [20]:
test2 = "Sexy silky, a-line mini skirt zipper Benson skirt"
recommendations(test2,docs_vector,nlp,fulldata)

{'onepiece': 'Mia Matelassé Maxi Dress(01DS448JWKPGXK5EGG34W6KCX0)',
 'accessory1': 'Two Way Tulip Large Leather Tote(01DT2CZJ0PNR61AEX3XYNXMS37)',
 'accessory2': 'Leisure hooded mohair-blend cardigan(01DT512CTJKECJG1YB97X7MWNG)',
 'shoe': 'Bou Raffia Heeled Mule(01DTATGYW72E9WEZHCKJBDFTCT)'}