### Load the Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import numpy as np
import os
pd.options.display.max_rows = 20
%matplotlib inline

import glob
from IPython.display import display, HTML

In [2]:
train_orders = pd.read_csv("data/order_products__train.csv")
prior_orders = pd.read_csv("data/order_products__prior.csv")
products = pd.read_csv("data/products.csv").set_index('product_id')

In [3]:
orders = pd.read_csv("data/orders.csv")
prior_orders = prior_orders.merge(right=orders[['user_id','order_id','order_number']],on='order_id',how='left')

In [4]:
products.head()

Unnamed: 0_level_0,product_name,aisle_id,department_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Chocolate Sandwich Cookies,61,19
2,All-Seasons Salt,104,13
3,Robust Golden Unsweetened Oolong Tea,94,7
4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
5,Green Chile Anytime Sauce,5,13


### Turn the product ID to a string
#### This is necessary because Gensim's Word2Vec expects sentences, so we have to resort to this dirty workaround

In [5]:
train_orders["product_id"] = train_orders["product_id"].astype(str)
prior_orders["product_id"] = prior_orders["product_id"].astype(str)
prior_orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number
0,2,33120,1,1,202279,3
1,2,28985,2,1,202279,3
2,2,9327,3,0,202279,3
3,2,45918,4,1,202279,3
4,2,30035,5,0,202279,3


In [6]:
#This line is very important to make sense of users orders
prior_orders.sort_values(by=['user_id','order_number','add_to_cart_order'],inplace=True)
prior_orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number
24076664,2539329,196,1,0,1,1
24076665,2539329,14084,2,0,1,1
24076666,2539329,12427,3,0,1,1
24076667,2539329,26088,4,0,1,1
24076668,2539329,26405,5,0,1,1


In [7]:
user1_set = set(prior_orders[prior_orders.user_id==196001].product_id.tolist())
user2_set = set(prior_orders[prior_orders.user_id==54166].product_id.tolist())
print(list(user1_set&user2_set))

['32478', '46041']


### Prepare docs (user with all orders)

In [8]:
#train_products = train_orders.groupby("order_id").apply(lambda order: order['product_id'].tolist())
combined_orders_by_user_id = prior_orders.groupby("user_id").apply(lambda order: ' '.join(order['product_id'].tolist()))

combined_orders_by_user_id = pd.DataFrame(combined_orders_by_user_id,columns=['all_orders'])
print(combined_orders_by_user_id.shape)
combined_orders_by_user_id.head()

(206209, 1)


Unnamed: 0_level_0,all_orders
user_id,Unnamed: 1_level_1
1,196 14084 12427 26088 26405 196 10258 12427 13...
2,32792 47766 20574 12000 48110 22474 16589 3591...
3,9387 17668 15143 16797 39190 47766 21903 39922...
4,36606 7350 35469 2707 42329 7160 1200 17769 43...
5,15349 21413 48775 28289 8518 11777 31717 26604...


In [9]:
combined_orders_by_user_id.reset_index(inplace=True)

combined_orders_by_user_id.head()

Unnamed: 0,user_id,all_orders
0,1,196 14084 12427 26088 26405 196 10258 12427 13...
1,2,32792 47766 20574 12000 48110 22474 16589 3591...
2,3,9387 17668 15143 16797 39190 47766 21903 39922...
3,4,36606 7350 35469 2707 42329 7160 1200 17769 43...
4,5,15349 21413 48775 28289 8518 11777 31717 26604...


In [10]:
combined_orders_by_user_id.user_id = combined_orders_by_user_id.user_id.astype(str)

In [12]:
!pip3 install gensim



In [15]:
# This is the iterator approach
from tqdm.notebook import tqdm
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
import multiprocessing as mp
import timeit
import gensim

dimension_size = 200

start_time = timeit.default_timer()

class TaggedDocumentIterator(object):
    def __init__(self, df):
       self.df = df
    def __iter__(self):
        for row in self.df.itertuples():
            #yield LabeledSentence(words=dict(row._asdict())['all_orders'].split(),tags=[dict(row._asdict())['user_id']])
            yield TaggedDocument(words=dict(row._asdict())['all_orders'].split(),tags=[dict(row._asdict())['user_id']])
            
it = TaggedDocumentIterator(combined_orders_by_user_id)

model = gensim.models.Doc2Vec(vector_size=dimension_size, window=5, 
                              min_count=10, workers=mp.cpu_count(),
                              alpha=0.055, min_alpha=0.055,
                              epochs=120) # use fixed learning rate
#model = gensim.models.Doc2Vec(size=dimension_size, window=5, min_count=5,iter=10) # use fixed learning rate

train_corpus = list(it)

model.build_vocab(train_corpus)

for epoch in tqdm(range(10)):
    model.alpha -= 0.005 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no deca
    model.train(train_corpus,total_examples=model.corpus_count,epochs=model.iter)
    print('iteration:',epoch)

print(timeit.default_timer() - start_time )



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))



KeyboardInterrupt: 

In [0]:
train_corpus[206208:]

In [0]:
# What percent of the users are most similar to themselves
ranks = []
second_ranks = []
for doc_id in range(200000,200020):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    doc_id +=1
    rank = [docid for docid, sim in sims].index(str(doc_id))
    ranks.append(rank)
    
    second_ranks.append(sims[1])

print(np.median(ranks))
print(np.mean(ranks))
print(ranks)
collections.Counter(ranks)

In [0]:
model.save('user2vec.model')

In [0]:
model.docvecs.most_similar('196001')

In [0]:
model.docvecs['196001']

### Prepare pandas dataframe

In [0]:
# # This is the non-iterator approach
# from gensim.models.doc2vec import LabeledSentence
# start_time = timeit.default_timer()

# combined_orders_by_user_id_list = prior_orders.groupby("user_id").apply(lambda order: order['product_id'].tolist())

# combined_orders_by_user_id_list_user_ids = 


# sentences = LabeledSentence(words=dict(row._asdict())['all_orders'].split(),tags=[dict(row._asdict())['user_id']])

# model = gensim.models.Doc2Vec(size=300, window=10, min_count=5, workers=11,alpha=0.025, min_alpha=0.025,iter=1) # use fixed learning rate

# print(timeit.default_timer() - start_time )

In [0]:
user_num = combined_orders_by_user_id.shape[0]
user_vectors = np.ndarray((user_num,dimension_size+1), dtype=np.float32)


for user_id in range(user_num):
    
    user_vectors[user_id,1:] = model.docvecs[str(user_id+1)]
    user_vectors[user_id,0] = int(user_id+1)
    
    

In [0]:
product_vector_df = pd.DataFrame(data = user_vectors)
product_vector_df.rename(index=str,columns = lambda x : 'uv_' + str(x),inplace=True)
product_vector_df.rename(index=str,columns={'uv_0':'user_id'},inplace=True)
product_vector_df.user_id = product_vector_df.user_id.astype(int)
product_vector_df.set_index('user_id',inplace=True)
product_vector_df.to_csv('user_vectors_'+str(dimension_size)+'.csv')
print(product_vector_df.shape)
product_vector_df.head()

(206209, 20)


Unnamed: 0_level_0,uv_1,uv_2,uv_3,uv_4,uv_5,uv_6,uv_7,uv_8,uv_9,uv_10,uv_11,uv_12,uv_13,uv_14,uv_15,uv_16,uv_17,uv_18,uv_19,uv_20
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,-0.520328,6.466732,8.108711,20.394876,-3.021416,3.561839,6.041957,0.555195,-0.630227,0.302646,-15.767176,-3.373981,20.488789,-2.917972,-21.544838,-0.492404,-8.259093,-10.42634,-12.178179,12.187541
2,2.419382,-2.376617,2.092973,7.325765,3.112566,-2.381903,1.373984,3.155437,4.530574,-3.539254,5.12388,5.844696,1.20751,9.917433,3.76713,-6.942054,-0.086864,6.330691,-13.983377,3.547235
3,5.324028,4.561249,-7.733359,15.654157,2.985079,-11.23767,-13.007057,13.304809,-3.006064,-4.459224,5.757115,-2.564979,9.221734,4.273202,6.182935,8.882185,4.320265,8.839524,-16.229982,10.646294
4,-7.075795,4.15211,-0.784316,13.434557,8.032644,-7.97202,-1.96324,6.657787,-2.934692,5.20292,5.457757,11.731471,-4.42362,-3.123632,-11.043481,-1.491571,-10.759295,-17.403214,0.905145,-10.391628
5,4.8689,0.135775,2.382523,-1.627105,-5.048021,-4.72031,7.633911,-5.767624,1.067342,-0.123035,-1.496084,-0.447919,0.868454,6.227093,-2.920099,-2.748935,7.493649,5.58018,-7.296006,0.909572
