## Product Recommendation System

Reference: https://medium.com/@prateekgaurav/step-by-step-content-based-recommendation-system-823bbfd0541c

In [None]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('wordnet')
nltk.download('punkt_tab')


warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
pd.set_option('display.width', 10000)

# Change the project name here to your project name in your account
client = bigquery.Client(project="ambient-sylph-426620-g6") # Odele

### 1. Querying product information

In [269]:
query = '''
    SELECT distinct
        product.productSKU AS product_id
        ,product.productBrand AS product_brand
        ,product.productVariant AS product_variant
        ,product.v2ProductName AS product_name
        ,product.v2ProductCategory AS product_category
        ,product.productPrice/1000000 AS price_temp
        ,count(DISTINCT date_trunc(PARSE_DATETIME("%Y%m%d %H:%M:%S", CONCAT(date, ' ' ,hour,':',minute,':00')), day)) as days
    FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
        UNNEST(hits) AS hits
    LEFT JOIN UNNEST(hits.product) AS product ON TRUE
    GROUP BY 1,2,3,4,5,6
    
'''

rows = client.query(query)
raw_product_data = rows.to_dataframe()


### 2. Data cleaning

In [270]:
# Removing special characters and leading whitespace 
raw_product_data['product_variant'] = raw_product_data['product_variant'].str.lstrip()
replacements = {'2XL': 'XXL', '3XL':'XXXL', 'L/XL': 'LG XL', 'S/M': 'SM MD', 'M/L': 'MD LG'}
raw_product_data['product_variant'].replace(replacements, inplace = True)

raw_product_data['product_category'] = raw_product_data['product_category'].str.replace(r"[\'&]",'',regex = True)
raw_product_data['product_category'] = raw_product_data['product_category'].str.replace(r"[-/]", " ", regex = True)
raw_product_data['product_category'] = raw_product_data['product_category'].str.replace(r"\$\{.*?\}", '', regex = True)

raw_product_data['product_name'] = raw_product_data['product_name'].str.replace('/', ' ')
raw_product_data['product_name'] = raw_product_data['product_name'].str.replace(r"[\'-]",'',regex = True)

# Turn fields with nonsensical names into NA
raw_product_data.replace(['(not set)', 'Single Option Only'], np.nan, inplace = True)

# NA fields will not contribute anything to the product description
raw_product_data['product_brand'].fillna('', inplace = True)
raw_product_data['product_variant'].fillna('', inplace = True)
raw_product_data['product_category'].fillna('', inplace = True)
raw_product_data['product_name'].fillna('', inplace = True)

# Join product category, brand, name and variant into single string
raw_product_data['product_desc'] = raw_product_data.apply(lambda x: ' '.join([x['product_category'], x['product_brand'], x['product_name'], x['product_variant']]), axis = 1)

# Resolve price discrepancies by replacing by modal price
product_by_id = raw_product_data.groupby(['product_id', 'price_temp'])['days'].sum().reset_index()
product_prices = product_by_id.loc[product_by_id.groupby('product_id')['days'].idxmax(), ["product_id", "price_temp"]]
product_prices.rename({"price_temp":"product_price"}, axis = 1, inplace = True)
raw_product_data = raw_product_data.merge(product_prices, how = 'left', on = 'product_id')
raw_product_data.drop(['days', 'price_temp'], axis = 1, inplace = True)

# Resolve discrepancies in product descriptions by combining descriptions
product_df = raw_product_data.groupby(['product_id', 'product_price'])['product_desc'].apply(lambda x: ' '.join(set(x))).reset_index()

In [None]:
# Tokenize and lemmatize product descriptions
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
product_df['tokens'] = product_df['product_desc'].apply(lambda x: word_tokenize(x))

def lemmatize(l):
    lemmatized = [word.lower() for word in l if word not in stop_words and word.isalpha()]
    return [lemmatizer.lemmatize(word) for word in lemmatized]

product_df['lemma'] = product_df['tokens'].apply(lambda x: lemmatize(x))

### 3. Word embeddings

In [154]:
# Download and load GloVe embeddings
model = api.load('glove-wiki-gigaword-100')

In [299]:
# Get vector embedding associated with each product 
def mean_vector(desc, model, vocab, id_map, vector_dim):
    vec = np.zeros(vector_dim)
    word_count = 0

    for word in desc:
        if word in id_map:                      # verify that there is a vector embedding for that word
            vec = np.add(vec, model[word])     
            word_count += 1
    vec = np.divide(vec, word_count)            # find mean of vectors representing words in product description
    return np.array(vec)

vocab = set(product_df['lemma'].sum())
id_map = model.key_to_index
corpus = product_df['lemma']
product_df['vector'] = product_df.apply(lambda x: mean_vector(x['lemma'], model, vocab, id_map, 100), axis = 1)


In [307]:
def upsell(id):
    mat = np.vstack(product_df['vector'])                                                       # embeddings for each product
    base_vec = np.array([product_df.loc[product_df['product_id'] == id, 'vector'].values[0]])   # embedding for queried product
    cos_sim = cosine_similarity(base_vec, mat).flatten()                                        # find cosine similarity between queried product and all othe prroducts

    df = product_df.copy()
    df['similarity'] = cos_sim
    df.drop(['tokens', 'lemma', 'vector'], axis = 1, inplace = True)
    curr_price = df.loc[df['product_id'] == id, 'product_price'].values[0]                      # obtain price of current product
    df = df.loc[df['product_id'] != id]
    
    return df.loc[(df['product_id'] != id) & (df['product_price'] > curr_price)].sort_values('similarity', ascending = False).head(10)  # only return products that are more expensive

def crossell(id):
    mat = np.vstack(product_df['vector'])
    base_vec = np.array([product_df.loc[product_df['product_id'] == id, 'vector'].values[0]])
    cos_sim = cosine_similarity(base_vec, mat).flatten()

    df = product_df.copy()
    df['similarity'] = cos_sim
    df.drop(['tokens', 'lemma', 'vector'], axis = 1, inplace = True)
    df = df.loc[df['product_id'] != id]
    
    return df.loc[(df['product_id'] != id) & (df['similarity'] < 0.8)].sort_values('similarity', ascending = False).head(10)    # return products that are similar, but not too similar?

In [308]:
# upsell('GGOEGADJ059417')
crossell('GGOEGADJ059417')

Unnamed: 0,product_id,product_price,product_desc,similarity
772,GGOEGAAQ010416,16.99,Google Mens 100% Cotton Short Sleeve Hero Te...,0.799889
1096,GGOEGAEA030417,19.99,Google Womens 3 4 Sleeve Baseball Raglan Hea...,0.799815
816,GGOEGAAX0281,16.99,Home Shop by Brand Google Google Womens Shor...,0.799278
912,GGOEGAAX0580,109.99,Home Shop by Brand Google Google Womens 1 4 ...,0.799009
715,GGOEGAAJ032616,18.99,Google Mens Short Sleeve Badge Tee Charcoal ...,0.798683
1333,GGOEGAPB057813,67.19,Womens Performance Full Zip Jacket Black ...,0.797855
10,10 14215,0.0,Mens Outerwear Google Fleece FullZip Hoodie,0.79783
906,GGOEGAAX0574,89.99,Home Apparel Google Mens Performance Polo Gr...,0.797605
1127,GGOEGAEC033117,24.99,Google Long Sleeve Raglan Badge Henley Ocean...,0.79758
826,GGOEGAAX0291,18.99,Google Womens Short Sleeve Hero Tee Sky Blue...,0.797265
