# COMPLETE USER PROFILE CAN BE READ IN via CSV <br>
# `complete_user_profile.csv`

# Creating User Profile

This notebook will look at user habits 

In [None]:
import collections
import pathlib
import random


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import seaborn as sns
sns.set_style('whitegrid')

%matplotlib inline
# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

### We want to look at user habits 
* Mean order DOW
* Mean order hour of day 
* Mean days since prior order 
* Mean number of products in an order
* Total number of orders made
* Total number of products bought 
* List of all products ordered

### Process test data 

In [None]:
# Load the order-history rows. Later cells read the columns user_id, order_id,
# product_name, order_dow, order_hour_of_day and days_since_prior_order.
user_data = pd.read_csv('project_order_history_test_data.csv')

In [None]:
# import necessary data and columns
# user_data = pd.read_csv('All_Orders_TRAIN.csv', usecols = ['user_id','order_id', 'product_name', 'order_dow', 'order_hour_of_day','days_since_prior_order','reordered'])

User Profile --> Average of ----->  `order_dow`, `order_hour_of_day`, `days_since_prior_order`, `reordered`

In [None]:
# Per-user mean of order day-of-week, order hour-of-day and days since the
# prior order.
# Several columns must be selected from a GroupBy with a *list* of names;
# tuple-style selection (['a', 'b', 'c'] written without the inner brackets)
# was removed in pandas 2.0 and raises there.
# GroupBy.mean() skips NaN by default, so null days_since_prior_order values
# are ignored without needing np.nanmean.
profile_columns = ['order_dow', 'order_hour_of_day', 'days_since_prior_order']
user_mean = user_data.groupby('user_id')[profile_columns].mean()

In [None]:
# Number of distinct orders placed by each user.
# The result is a Series indexed by user_id; it is attached to the profile
# frame in a later cell.
total_orders = user_data.groupby('user_id')['order_id'].nunique()

In [None]:
# Number of distinct product names each user has ordered
# (a product bought several times is counted once).
num_products = user_data.groupby('user_id')['product_name'].nunique()

In [None]:
# Attach the per-user counts to the profile frame; each Series aligns on the
# user_id index.
# NOTE(review): avg_num_products_per_order is distinct products / distinct
# orders, which differs from the mean basket size whenever a user reorders
# the same product -- confirm this is the intended definition.
user_mean = user_mean.assign(
    num_orders=total_orders,
    num_products=num_products,
    avg_num_products_per_order=num_products / total_orders,
)

In [None]:
# user_mean.head()
# Quick check of the container type returned by selecting the product_name column.
type(user_data['product_name'])

In [None]:
# Build one space-separated string per user containing every product name
# that user ordered. Iterating the grouped column yields (user_id, Series)
# pairs, one per user.
product_list = [
    ' '.join(names)
    for _user_id, names in user_data.groupby('user_id')['product_name']
]


In [None]:
# product_list is a plain list, so it is assigned by position rather than by
# user_id. This relies on it having been built by iterating the same
# groupby('user_id') order that produced user_mean's index.
user_mean['Products'] = product_list

In [None]:
# Preview the profile frame, now including the Products text column.
user_mean.head()

# Now we need to turn the product names into numbers

Enter Word2Vec <br>
<a href = 'https://www.youtube.com/watch?v=hQwFeIupNP0'> Word2Vec Explanation Video </a> <br> 
<a href = 'https://youtu.be/Q2NtCcqmIww'> Word2Vec Coding Video </a> <br>
<a href = 'https://github.com/codebasics/deep-learning-keras-tf-tutorial/blob/master/42_word2vec_gensim/42_word2vec_gensim.ipynb'> Word2Vec Github Link </a> <br><a href = 'https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py'> Word2Vec Documentation Link </a>

In [None]:
import gensim

In [None]:
# Tokenise each user's product string with gensim's simple_preprocess, then
# key the resulting token lists by user_id (the index of user_mean).
tokenized_products = user_mean['Products'].apply(gensim.utils.simple_preprocess)
product_text = tokenized_products.to_dict()
type(product_text)

In [None]:
# Token count of every user's product document, in dictionary order.
# These lengths inform the choice of window size for doc2vec.
len_list = [len(tokens) for tokens in product_text.values()]

In [None]:
# Empty frame that will hold the per-user token counts.
average_product_df = pd.DataFrame()

In [None]:
# One row per user document; the value is that document's token count.
average_product_df['num_words'] = len_list

In [None]:
# Summary statistics of tokens per user document.
# NOTE(review): the window size of 660 suggested below does not match the
# window=4735 actually passed to Doc2Vec later in this notebook -- confirm
# which value is intended.
average_product_df['num_words'].describe() # <- let's use a window size of 660 because it covers 75% of the data

### Might need to look at phrases instead of individual words (bigrams or trigrams)
<a href = 'https://www.markhneedham.com/blog/2015/02/12/pythongensim-creating-bigrams-over-how-i-met-your-mother-transcripts/'> NLP Phrases Tutorial </a> <br>


In [None]:
from gensim.models import Phrases
from gensim.models import Word2Vec
import nltk 
import string

## Word 2 Vec Model 1, window=5

In [None]:
# Untrained Word2Vec model; the vocabulary is built and the model trained in
# the following cells.
model = gensim.models.Word2Vec(
    window=5,     # maximum distance between the target word and a context word
    min_count=2,  # ignore words that appear fewer than 2 times in the corpus
    workers=4,    # number of worker threads used for training
)

Build Vocabulary of Products 

In [None]:
# Build the vocabulary from the token lists (one list per user).
product_sentences = product_text.values()
model.build_vocab(product_sentences)

Train the Word2Vec Model

In [None]:
# Train on the same token lists used for the vocabulary, reusing the corpus
# size and epoch count already stored on the model.
model.train(
    product_text.values(),
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Persist the trained Word2Vec model next to the notebook.
model.save("./word2vec_product_names.model")

Finding similar words and similarity between words 

In [None]:
# Example lookup of the words closest to "almond" in the learned vector space.
# NOTE(review): presumably raises KeyError if "almond" did not make it into
# the vocabulary (e.g. filtered out by min_count) -- confirm for this dataset.
model.wv.most_similar("almond")

## Doc 2 Vec Model 6 - Vector = 10, Window = max <br>
Need to limit the vector size (number of embedding dimensions) so we can reduce the number of features in kmeans

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts

In [None]:
# One TaggedDocument per user: the words are the user's product tokens and
# the single tag is the user_id.
product_text_dict = product_text
documents = [
    TaggedDocument(words=tokens, tags=[user_id])
    for user_id, tokens in product_text_dict.items()
]

# NOTE: rebinding `model` here replaces the Word2Vec model from earlier cells.
# NOTE(review): the window size is still an open question; 4735 is hard-coded
# and presumably the longest document length seen -- confirm for this dataset.
model = Doc2Vec(
    documents,
    vector_size=10,
    window=4735,
    min_count=20,
    workers=4,
)

# Spot check on one user: infer a vector from their tokens and list the most
# similar tagged documents. The user id is hard-coded and must exist in
# product_text_dict.
cuser = 199
products = product_text_dict[cuser]
product_document_vector = model.infer_vector(products)
model.dv.most_similar([product_document_vector])

# Let's go with Doc 2 Vec Model 6 <br> 
* limited vector size (10 dimensions) without sacrificing accuracy 
* Next, we add the vectors to a dictionary so they can be joined back to `user_mean`

In [None]:
# Persist the trained Doc2Vec model next to the notebook.
model.save("./final_doc2vec.model")

In [None]:
# Display the vector inferred for the spot-check user's tokens (`products`).
# NOTE(review): inference is presumably not deterministic between calls, so
# the values may differ slightly from the previous cell -- confirm.
model.infer_vector(products)

In [None]:
# Map every user_id to the Doc2Vec vector inferred from that user's own
# product tokens.
# Each user's tokens must be passed to infer_vector; passing the spot-check
# variable `products` instead would give every user an embedding of the same
# document.
vector_dict = {
    user_id: model.infer_vector(tokens)
    for user_id, tokens in product_text_dict.items()
}


In [None]:
# Turn the {user_id: vector} mapping into a frame with one row per user_id
# and one integer-labelled column per vector component.
vector_df_index = vector_dict.keys()
vector_df = pd.DataFrame.from_dict(data=vector_dict, orient='index')
vector_df.head()

In [None]:
# Move the user_id index into a regular column (named 'index' by default).
vector_df = vector_df.reset_index()

In [None]:
# vector_df_1 is a second name for the same DataFrame object, not a copy.
vector_df_1 = vector_df
vector_df_1.head()

In [None]:
# Give the columns readable names: 'index' becomes 'user_id' and the integer
# component labels 0..9 become 'vector_1'..'vector_10'.
column_names = {'index': 'user_id'}
column_names.update({i: f'vector_{i + 1}' for i in range(10)})
user_mean_v = vector_df_1.rename(columns=column_names)
user_mean_v.head()

### `user_mean_v` is ready, now we need to join `user_mean_v` with `user_mean`

In [None]:
# Preview the left side of the join. Here user_id is the index, whereas in
# user_mean_v it is an ordinary column.
user_mean.head()

In [None]:
# Left-join the vector columns onto the profile frame by user_id.
# 'user_id' names the index level of user_mean and a column of user_mean_v.
user_mean_join = user_mean.merge(user_mean_v, how='left', on='user_id')

In [None]:
# The raw product text is no longer needed now that the vectors are attached.
user_mean_join = user_mean_join.drop(columns='Products')

In [None]:
# Use user_id as the row index of the finished profile.
user_mean_final = user_mean_join.set_index(keys='user_id')

In [None]:
# Preview the finished profile: the aggregate columns plus vector_1..vector_10.
user_mean_final.head()

In [None]:
# Write the finished profile, including the user_id index, to the CSV named
# at the top of this notebook.
user_mean_final.to_csv('./complete_user_profile.csv')