In [1]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import process
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
full_df = pd.read_csv('full_data.csv')
outfit_comb_df = pd.read_csv('outfit_combinations.csv')

# Preprocessing

In [3]:
# clean full_df
df = full_df.drop(['mpn','created_at','updated_at','deleted_at','bc_product_id','labels','brand_canonical_url'], axis=1).groupby('product_id').first().reset_index().fillna('')

In [4]:
def process_text(text):
    '''
    remove stopword and do lemmatization, remove punctuations
    convert multiple whitespace into one, lowercase
    
    Parameter:
    text: str
    '''
    text = ' '.join([token.lemma_ for token in nlp(text) if not token.is_stop])
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).lower()
    return text

def process_details(text):
    '''
    clean text in 'details' field, lowercase, remove '\n', remove punctuations and digits
    convert multiple whitespace into one, remove stopword and do lemmatization
    
    Parameter:
    text: str
    '''
    text = re.sub(r'\n', ' ', text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = ' '.join([token.lemma_ for token in nlp(text) if not token.is_stop])
    return text

In [5]:
# clean description and details columns
df.description = df.description.apply(process_text)
df.details = df.details.apply(process_details)

# combine description and details columns for further embeddings
df['text'] = df['description']+df['details']

In [6]:
def process_outfit_type(text):
    '''
    clean text in 'outfit_item_type' field, lowercase, remove punctuations and digits and whitespace
    
    Parameter:
    text: str
    '''
    text = re.sub(r'[^a-z]', '', text.lower())
    return text

In [7]:
# clean outfit_item_type column
outfit_comb_df.outfit_item_type = outfit_comb_df.outfit_item_type.apply(process_outfit_type)

# Recommendation

## helper function

In [8]:
def recommend_on_id(product_id):
    '''
    look up an outfit_id by a product_id in outfit_comb_df, and print all items under that outfit_id
    select first outfit_id if there are multiple outfit_id
    
    Parameter:
    product_id: str
        product unique id. make sure it's uppercase and appears in outfit_comb_df
    '''
    print('\nOutput')
    outfit_id = outfit_comb_df[outfit_comb_df.product_id==product_id].iloc[0].outfit_id
    for index, row in outfit_comb_df[outfit_comb_df.outfit_id==outfit_id].iterrows():
        print(f'{row.outfit_item_type}: {row.product_full_name} ({row.product_id})')

In [9]:
def recommend_on_doc2vec(outfit_type, brand_category, brand, text):
    '''
    find most similar product given outfit_type, brand_category, brand, text
    sentence embedding by doc2vec
    recommend outfit combination 
    
    Parameters:
    outfit_type: str
        outfit type, shoe/bottom/top/accessory/onepiece (can be empty)
    brand_category: str
        band category, must appear in df (can be empty)
    brand: str
        band name, must appear in df (can be empty)
    text: str
        descrption+details (can be empty)
    '''
    full_product_id_list = list(df.product_id.unique())
    sme_product_id_list = list(outfit_comb_df[outfit_comb_df.outfit_item_type==outfit_type].product_id.unique())
    
    # filter df by brand_category and brand
    train_df = df.copy()
    if brand_category:
        train_df = train_df[train_df.brand_category==brand_category]
    if brand:
        train_df = train_df[train_df.brand==brand]
    
    # doc2vec embedding
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_df.text)]
    model = Doc2Vec(documents, vector_size=50, window=4, min_count=2, workers=4)
    
    # find most similar index given new text
    vector = model.infer_vector(text.split())
    index = model.docvecs.most_similar([vector])[0][0]
    
    # check if the result is in outfit_comb_df
    if train_df.iloc[index,].product_id in sme_product_id_list:
        recommend_on_id(train_df.iloc[index,].product_id)
    else:
        # if not, filter df by outfit item type and re-train the doc2vec model
        train_df = train_df[train_df.product_id.isin(sme_product_id_list)]
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_df.text)]
        model = Doc2Vec(documents, vector_size=50, window=4, min_count=2, workers=4)
        
        # find most similar index given new text
        index = model.docvecs.most_similar([vector])[0][0]
        recommend_on_id(train_df.iloc[index,].product_id)

In [10]:
def check_brand_category(text):
    '''
    make sure input is either empty string or a brand category in df
    if not, give candidate list by fuzzy match and ask to re-enter
    
    Parameters:
    text: str
        brand category (can be empty)
    '''
    if text:
        brand_category_list = list(df.brand_category.unique())
        candidate = process.extractBests(text,brand_category_list)
        while candidate[0][1] != 100:
            print('Do you mean?')
            for i in candidate:
                print(i[0], 'similarity:',i[1])
            text = input('brand category (left blank if none): ')
            if text:
                candidate = process.extractBests(text,brand_category_list)
            else:
                return text
        return candidate[0][0] 
    else:
        return text

In [11]:
def check_brand(text):
    '''
    make sure input is either empty string or a brand name in df
    if not, give candidate list by fuzzy match and ask to re-enter
    
    Parameters:
    text: str
        brand category (can be empty)
    '''
    if text:
        brand_list = list(df.brand.unique())
        candidate = process.extractBests(text,brand_list)
        while candidate[0][1] != 100:
            print('Do you mean?')
            for i in candidate:
                print(i[0], 'similarity:',i[1])
            text = input('brand (left blank if none): ')
            if text:
                candidate = process.extractBests(text,brand_list)
            else:
                return text
        return candidate[0][0] 
    else:
        return text

## main function

In [12]:
print('Input')

# ask user to enter outfit item type
outfit_type = process_outfit_type(input('outfit type [shoe/bottom/top/accessory/onepiece]: '))

# make sure outfit item type is entered correctly
while outfit_type not in ['shoe','bottom','top','accessory','onepiece']:
    print('Wrong input')
    outfit_type = process_outfit_type(input('outfit type [shoe/bottom/top/accessory/onepiece]: '))

# ask user to enter product_id
product_id = input('product id (left blank if none): ').upper()

if product_id:
    if product_id in list(outfit_comb_df['product_id']):
        recommend_on_id(product_id)
        
    elif product_id in list(df['product_id']):
        text = str(df[df.product_id==product_id].text)
        recommend_on_doc2vec(outfit_type,'','',text)
        
    else:
        print('Do you mean?')
        for i in process.extractBests(product_id,df['product_id']):
            print(i[0], 'similarity:', i[1])

else:
    brand_category = check_brand_category(input('brand category (left blank if none): '))
    band = check_brand(input('brand (left blank if none): '))
    description = input('description (left blank if none): ')
    details = input('details (left blank if none): ')
    text = process_text(description+details)
    recommend_on_doc2vec(outfit_type, brand_category, band, text)

Input
outfit type [shoe/bottom/top/accessory/onepiece]: shoe
product id (left blank if none): 01DMBRYVA2ZFDYRYY5TRQZJTBD)
Do you mean?
01DMBRYVA2ZFDYRYY5TRQZJTBD similarity: 100
01DMBRYVA2PEPWFTT7RMP5AA1T similarity: 54
01DMBRYVA2Q2ST7MNYR6EEY4TK similarity: 54
01DMBRYVA2P5H24WK0HTK4R0A1 similarity: 50
01DPBV967NDZSFTDMYPDRTMZYB similarity: 50
