In [21]:
import pandas as pd
import numpy as np
import itertools
from collections import defaultdict

# display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# custom helper functions
%load_ext autoreload
%autoreload 2
from helper.general_helper import *
from helper.keras_predictions import *

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

# view plotly in jupyter 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ValueError: attempted relative import beyond top-level package

In [2]:
df = pd.read_csv('../../merged_df.csv')

In [38]:
from PIL import Image
import requests

url = df.head(1).imUrl.values[0]

im = Image.open(requests.get(url, stream=True).raw)

#### Reduce Sparcity

In [3]:
df = df[df['title'].notna()]

# select reviewer and product values
customers = df['reviewerID'].value_counts()
products = df['asin'].value_counts()

# filter by 10 reviews per product per customer, products with 20 or more reviews
customers = customers[customers >= 10]
products = products[products >= 20]

# merge dataframe
df = df.merge(pd.DataFrame({'reviewerID': customers.index})).merge(pd.DataFrame({'asin': products.index}))


# select test size
n = 39601

# shuffle dataframe
df = df.sample(frac=1).reset_index(drop=True)

# split into train and test set
df_train = df[:-n]
df_test = df[-n:]

#### Recommending off KMeans

In [None]:
pivot_df = df.pivot_table(index='reviewerID', columns='asin', values='overall', fill_value=0)

In [None]:
kmeans = KMeans(n_clusters=100, random_state=0).fit(pivot_df)

In [None]:
preds = kmeans.predict(pivot_df)

In [None]:
user_pred_dict = defaultdict(int)
for user, pred in zip(pivot_df.index, preds):
    user_pred_dict[user] = pred

In [None]:
df['class'] = df.reviewerID.apply(lambda x: user_pred_dict[x])

In [None]:
pred_dict = dict(df.groupby('class').mean().overall)

In [None]:
df['preds'] = df['class'].apply(lambda x: pred_dict[x])

In [None]:
test_values = df[['overall', 'preds']]

In [None]:
test_values

In [None]:
rmse = np.sqrt(mean_squared_error(test_values['overall'], test_values['preds']))
rmse

In [None]:
error = abs(df['overall'] - df['preds'])

In [None]:
figure = sns.distplot(error, kde=True)
plt.axvline(np.median(error),color='r', linestyle='--')
plt.title('Distribution of Error: KMeans', fontsize=18)
sns.set(rc={'figure.figsize':(11.7,8.27)})
plt.xlabel('Error', fontsize=16)
plt.ylabel('Count', fontsize=16)
# plt.savefig(r"../images/kmeans_error");

In [None]:
alg = AgglomerativeClustering(n_clusters=20)
preds = alg.fit_predict(pivot_df)

In [None]:
user_pred_dict = defaultdict(int)
for user, pred in zip(pivot_df.index, preds):
    user_pred_dict[user] = pred

In [None]:
df['class'] = df.reviewerID.apply(lambda x: user_pred_dict[x])

In [None]:
pred_dict = dict(df.groupby('review').mean().overall)

In [None]:
df['preds'] = df['class'].apply(lambda x: pred_dict[x])

In [None]:
test_values = df[['overall', 'preds']]

In [None]:
test_values

In [None]:
rmse = np.sqrt(mean_squared_error(test_values['overall'], test_values['preds']))
rmse

In [None]:
error = abs(df['overall'] - df['preds'])

figure = sns.distplot(error, kde=True)
plt.axvline(np.median(error),color='r', linestyle='--')
plt.title('Distribution of Error: Agglomerative Clustering', fontsize=18)
sns.set(rc={'figure.figsize':(11.7,8.27)})
plt.xlabel('Error', fontsize=16)
plt.ylabel('Count', fontsize=16)
# plt.savefig(r"../images/kmeans_error");

#### cos sim for items

#### content similarity

In [25]:
# set nlp dict with list
nlp_dict = defaultdict(list)

# get unique IDs
unique_asin = df['asin'].unique()

# popupate dictionary with keys
for asin in unique_asin:
    nlp_dict[asin]
    
for idx, text in zip(df['asin'], df['summary']):
    nlp_dict[idx].append(text)
    
# get unique descriptions and titles and dedupe
desc_title = df[['asin', 'description', 'title']].drop_duplicates()
    
    
for idx, text_1, text_2 in zip(desc_title['asin'], desc_title['description'], desc_title['title']):
    nlp_dict[idx].append(text_1)
    nlp_dict[idx].append(text_2)
    

In [26]:
# set subdf with asin for mapping
asin_df = df[['asin']].drop_duplicates()

# map nlp dict to asin
asin_df['text'] = asin_df['asin'].apply(lambda x: nlp_dict[x])

In [5]:
# nlp preprocessing function

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
#     stem_words=[stemmer.stem(w) for w in filtered_words]
#     lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)



In [27]:
# set asin as index for easy reference and drop as a column
asin_df.index = asin_df['asin']
asin_df.drop('asin', axis=1, inplace=True)

# run nlp cleaner over all text and create new column called clean
asin_df['clean'] = asin_df['text'].map(lambda x: preprocess(x))

# drop unprocessed text column
asin_df.drop('text', axis=1, inplace=True)

# save index to ordered list for easy future interpretation
index_list = asin_df.index.tolist()

In [None]:
tfidf = TfidfVectorizer()
tfidf_df = tfidf.fit_transform(asin_df.clean)

In [None]:
product_final = pd.DataFrame(data=tfidf_df.todense(), index=asin_df.index)

In [None]:
# product_final.to_csv('../../product_final.csv', index=False)

#### User similarity

In [7]:
# set user dict with list
user_dict = defaultdict(list)

# get unique IDs
unique_user = df['reviewerID'].unique()

# popupate dictionary with keys
for id in unique_user:
    user_dict[id]
    
# get unique descriptions and titles and dedupe
desc_title = df[['reviewerID', 'description', 'title']].drop_duplicates()
    
    
for idx, text_1, text_2 in zip(desc_title['reviewerID'], desc_title['description'], desc_title['title']):
    user_dict[idx].append(text_1)
    user_dict[idx].append(text_2)
    

In [9]:
# set subdf with asin for mapping
user_df = df[['reviewerID']].drop_duplicates()

# map nlp dict to asin
user_df['text'] = user_df['reviewerID'].apply(lambda x: user_dict[x])

In [10]:
# set user as index for easy reference and drop as a column
user_df.index = user_df['reviewerID']
user_df.drop('reviewerID', axis=1, inplace=True)

# run nlp cleaner over all text and create new column called clean
user_df['clean'] = user_df['text'].map(lambda x: preprocess(x))

# drop unprocessed text column
user_df.drop('text', axis=1, inplace=True)

# save index to ordered list for easy future interpretation
index_list = user_df.index.tolist()

In [12]:
# tfidf vectorize the text
tfidf = TfidfVectorizer()
tfidf_df = tfidf.fit_transform(user_df['clean'])

In [13]:
# save for later for ease of use
user_final = pd.DataFrame(data=tfidf_df.todense(), index=user_df.index)

user_final.to_csv('../../user_final.csv', index=False)

In [14]:
# user_final.to_csv('../../user_final.csv', index=False)

In [17]:
# product_final.to_csv('../../product_final.csv', index=False)
# user_final.to_csv('../../user_final.csv', index=False)

product_nlp = pd.read_csv('../../product_final.csv')
user_nlp = pd.read_csv('../../user_final.csv')

In [18]:
from sklearn.decomposition import TruncatedSVD

In [29]:
# svd = TruncatedSVD(n_components=100)
# compressed_product = svd.fit_transform(product_nlp)
# compressed_user = svd.fit_transform(user_nlp)

# compressed_product_ = pd.DataFrame(data=compressed_product, index=asin_df.index)
compressed_user_ = pd.DataFrame(data=compressed_user, index=user_df.index)


# compressed_product_.to_csv('../../product_final.csv', index=False)
compressed_user_.to_csv('../../user_final.csv', index=False)

In [15]:
pivot_df = df.pivot_table(index='reviewerID', columns='asin', values='overall', fill_value=0)

In [16]:
nmf = NMF(n_components=100)
nmf.fit(pivot_df)
W = nmf.transform(pivot_df)
H = nmf.components_

NameError: name 'NMF' is not defined

In [None]:
W, H = (np.around(x,2) for x in (W,H))
W = pd.DataFrame(W,index=pivot_df.index)
H = pd.DataFrame(H,columns=pivot_df.columns)

In [None]:
top_products = H.iloc[0].sort_values(ascending=False).index[:3]
top_products

In [None]:
top_users = W.iloc[:,0].sort_values(ascending=False).index[:2]
top_users

In [None]:
df.head(1)

In [None]:
W.loc['Emily']

In [None]:
thresh = .2
for g in range(100):
    all_products = H.iloc[g,:]
    included = H.columns[all_movies >= (thresh * all_products.max())]
    print("Concept %i contains: %s" % (g, ', '.join(included)))

In [None]:
thresh = .2  # user is included if at least 20% of max weight
for g in range(2):
    all_users = W.iloc[:,g]
    included = W.index[all_users >= (thresh * all_users.max())]
    print("\nConcept %i contains: %s" % (g, ', '.join(included)))

In [None]:
"""
I am going to select a user and with NMF find their top 3 categories.
I will find the top 10 products in each category that the user has not bought.
I will then find others users that have bought that item and fill a list of users.
I will then using cosine similarity, find the top 10 most similar users.
I will then mean their ratings for the product and use that as my recomended ratings.
I will then sort these items by the meaned ratings highest to lowest and recomend top 10.

How do I test this?

I will get users that have rated at least 50 items?
use this method with items they have already bought and see what the MSE is.

"""

In [None]:
"""

for each user, I want to find their top 3 categories.
for each of these categories I want to find the top 10 products

TOP 10 item for TOP 3 Cats (30 items total)

For each of these items I want to find users that are most similar to our target user.
How have they rated these items? I will mean their recomendation and call it as the targets recomendation.
I will then sort the items from highest to lowest and recomend them.




I then want to use their history to find out what items in these categories are most similar to what they bought
but have not bought.

I then want to recomend them the top 10 most similar items to their history

"""

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

user_sim = cosine_similarity(user_final)

def get_index_from_user(id):
    return user_final[user_final. == title]["index"].values[0]

In [15]:
from helper.keras_predictions import *

SyntaxError: unmatched ')' (keras_predictions.py, line 207)

In [14]:
!python3 helper.karas_predictions.py

Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.
