In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')
import nltk
import sys
%matplotlib inline

In [2]:
# Location of the review dump; lines=True below means one JSON record per line.
data_path = "./data/Digital_Music_5.json"
# `orient` is passed by keyword: pandas 2.x accepts only the path positionally.
# reset_index(drop=True) gives the frame a clean 0..n-1 row index.
table = pd.read_json(data_path, orient='records', lines=True).reset_index(drop=True)
print(table.head())

       reviewerID        asin          reviewerName helpful  \
0  A3EBHHCZO6V2A4  5555991584  Amaranth "music fan"  [3, 3]   
1   AZPWAXJG9OJXV  5555991584             bethtexas  [0, 0]   
2  A38IRL0X2T4DPF  5555991584           bob turnley  [2, 2]   
3  A22IK3I6U76GX0  5555991584                 Calle  [1, 1]   
4  A1AISPOIIHTHXX  5555991584           Cloud "..."  [1, 1]   

                                          reviewText  overall  \
0  It's hard to believe "Memory of Trees" came ou...        5   
1  A clasically-styled and introverted album, Mem...        5   
2  I never thought Enya would reach the sublime h...        5   
3  This is the third review of an irish album I w...        5   
4  Enya, despite being a successful recording art...        4   

                        summary  unixReviewTime   reviewTime  
0       Enya's last great album      1158019200  09 12, 2006  
1      Enya at her most elegant       991526400   06 3, 2001  
2               The best so far      1058

In [3]:
from collections import defaultdict
from textblob import TextBlob
def get_sentiment(table):
    """Return a copy of ``table`` with TextBlob sentiment scores attached.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with a ``reviewText`` column holding the review body.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying ``polarity`` and ``subjectivity`` columns. When
        the input does not have them yet they are appended as the last two
        columns, in that order; when it does, they are overwritten in place.
        The input frame itself is never modified, so re-running the call is
        safe.
    """
    # Non-string entries (e.g. missing reviews) are scored as empty text
    # rather than being handed to TextBlob as-is.
    sentiments = [
        TextBlob(text if isinstance(text, str) else "").sentiment
        for text in table['reviewText']
    ]
    return table.assign(
        polarity=[s.polarity for s in sentiments],
        subjectivity=[s.subjectivity for s in sentiments],
    )

In [None]:
# Attach the polarity / subjectivity scores to every review and rebind `table`
# to the result, then preview the first rows.
table = get_sentiment(table)
display(table.head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Distribution of the per-review polarity score.
num_bins = 50
plt.figure(figsize=(10, 5))
n, bins, patches = plt.hist(table['polarity'], bins=num_bins, facecolor='blue', alpha=0.5)
plt.xlabel('Polarity')
plt.ylabel('Number of Reviews')
plt.title('Histogram of Polarity Score')
plt.show();

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Distribution of the per-review subjectivity score.
num_bins = 50
plt.figure(figsize=(10, 5))
n, bins, patches = plt.hist(table['subjectivity'], bins=num_bins, facecolor='blue', alpha=0.5)
plt.xlabel('Subjectivity')
plt.ylabel('Number of Reviews')
plt.title('Histogram of Subjectivity Score')
plt.show();

In [None]:
# Reduce bias/noise in the star rating by blending in the review's sentiment.
def unbias(overall, polarity, subjectivity):
    """Blend a star rating with sentiment and rescale the result.

    The polarity term is weighted by ``2 * subjectivity`` up to a
    subjectivity of 0.5 and by ``2 - 2 * subjectivity`` above it, so the
    sentiment counts most at subjectivity 0.5 and not at all at 0 or 1.
    The shifted sum is then scaled by 4/14 and offset by 1.
    """
    if subjectivity > 0.5:
        adjustment = 5*polarity*(2-subjectivity*2)
    else:
        adjustment = 5*polarity*subjectivity*2
    return ((overall+adjustment)+4)*4/14+1
# Compute the sentiment-adjusted rating row by row and preview the result.
table['unbias_overall'] = table.apply(
    lambda review: unbias(review['overall'], review['polarity'], review['subjectivity']),
    axis=1,
)
display(table.head())

In [None]:
# (rows, columns) of the review table after the sentiment columns were added.
table.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) of the adjusted rating.
table['unbias_overall'].describe()

In [None]:
# Per-column count of missing entries.
missing_per_column = table.isna().sum()
print('Number of missing values across columns: \n', missing_per_column)

In [None]:
# Headline sizes: number of rating rows, distinct reviewers, distinct products.
print("Total data ")
print("-" * 50)
print("\nTotal no of ratings :", len(table))
print("Total No of Users   :", np.unique(table['reviewerID']).size)
print("Total No of products  :", np.unique(table['asin']).size)

In [None]:
# Drop every column the recommenders do not need, in a single non-mutating step.
# errors='ignore' keeps the cell re-runnable: the original in-place drops raised
# KeyError the second time because the columns were already gone.
table = table.drop(
    columns=[
        'reviewerName',
        'helpful',
        'reviewText',
        'summary',
        'unixReviewTime',
        'reviewTime',
        'polarity',
        'overall',
        'subjectivity',
    ],
    errors='ignore',
)

In [None]:
# Analysis of ratings given per user: rating count for each reviewer,
# most active reviewers first.
no_of_rated_products_per_user = (
    table.groupby('reviewerID')['unbias_overall']
    .count()
    .sort_values(ascending=False)
)

no_of_rated_products_per_user.head()

In [None]:
# Summary statistics of the per-user rating counts.
no_of_rated_products_per_user.describe()

In [None]:
# Per-user rating count at every percentile from 0.00 to 1.00 in 0.01 steps;
# interpolation='higher' picks an actual observed count rather than a blend.
quantiles = no_of_rated_products_per_user.quantile(np.arange(0,1.01,0.01), interpolation='higher')

In [None]:
# Plot the per-user rating count as a function of the quantile.
plt.figure(figsize=(10,10))
plt.title("Quantiles and their Values")
quantiles.plot()
# quantiles with 0.05 difference (every 5th entry of the 0.01-spaced index)
plt.scatter(x=quantiles.index[::5], y=quantiles.values[::5], c='orange', label="quantiles with 0.05 intervals")
# quantiles with 0.25 difference (every 25th entry)
plt.scatter(x=quantiles.index[::25], y=quantiles.values[::25], c='m', label = "quantiles with 0.25 intervals")
plt.ylabel('No of ratings by user')
# The x-axis carries the quantile itself; the value at that quantile is on y.
plt.xlabel('Quantile')
plt.legend(loc='best')
plt.show()

In [None]:
# Number of users with at least 50 ratings. The comparison is inclusive, so the
# message says "50 or more" and names users, which is what is being counted.
print('\n No of users who rated 50 or more products : {}\n'.format((no_of_rated_products_per_user >= 50).sum()))

In [None]:
##Popularity Based Recommendation

# Keep only the rows of products (grouped by asin) that received at least
# MIN_RATINGS_PER_PRODUCT ratings. The filter is per product, not per user.
MIN_RATINGS_PER_PRODUCT = 50
new_df = table.groupby("asin").filter(
    lambda product_ratings: product_ratings['unbias_overall'].count() >= MIN_RATINGS_PER_PRODUCT
)

In [None]:
# Rating count of every retained product, most-rated first.
no_of_ratings_per_product = (
    new_df.groupby('asin')['unbias_overall']
    .count()
    .sort_values(ascending=False)
)

fig = plt.figure(figsize=plt.figaspect(0.5))
ax = plt.gca()
plt.plot(no_of_ratings_per_product.values)
plt.title('# RATINGS per Product')
plt.xlabel('Product')
plt.ylabel('No of ratings per product')
# The x positions are just ranks, so the tick labels are blanked out.
ax.set_xticklabels([])

plt.show()

In [None]:
# Average adjusted rating of each product (first few products in asin order).

new_df.groupby('asin')['unbias_overall'].mean().head()

In [None]:
# Products with the highest average adjusted rating.
new_df.groupby('asin')['unbias_overall'].mean().sort_values(ascending=False).head()

In [None]:
# Total number of ratings per product, most-rated products first.

new_df.groupby('asin')['unbias_overall'].count().sort_values(ascending=False).head()

In [None]:
# One row per product, holding its mean adjusted rating in 'unbias_overall'.
ratings_mean_count = new_df.groupby('asin')['unbias_overall'].mean().to_frame()

In [None]:
# Add the number of ratings per product next to its mean rating (aligned on asin).
ratings_mean_count['rating_counts'] = new_df.groupby('asin')['unbias_overall'].count()

In [None]:
# Preview the per-product mean rating and rating count.
ratings_mean_count.head()

In [None]:
# Largest number of ratings any single product received.
ratings_mean_count['rating_counts'].max()

In [None]:
# Distribution of the number of ratings per product.
plt.figure(figsize=(8, 6))
plt.rcParams.update({'patch.force_edgecolor': True})
ratings_mean_count.rating_counts.hist(bins=50)

In [None]:
# Distribution of the mean adjusted rating per product.
plt.figure(figsize=(8, 6))
plt.rcParams.update({'patch.force_edgecolor': True})
ratings_mean_count.unbias_overall.hist(bins=50)

In [None]:
# Mean rating vs. number of ratings per product.
plt.rcParams['patch.force_edgecolor'] = True
# jointplot is a figure-level seaborn function that builds its own figure, so the
# plt.figure() call that used to precede it only left an empty figure behind.
sns.jointplot(x='unbias_overall', y='rating_counts', data=ratings_mean_count, alpha=0.4)

In [None]:
# Bar chart of the 30 products with the most ratings.
popular_products = new_df.groupby('asin')['unbias_overall'].count().to_frame()
most_popular = popular_products.sort_values(by='unbias_overall', ascending=False)
most_popular.head(30).plot(kind="bar")

In [None]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

In [None]:
#Reading the dataset
# The reader is told to expect ratings on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
# Surprise reads the frame positionally as (user, item, rating), so the three
# columns are selected explicitly instead of relying on the current column order.
df = new_df[['reviewerID', 'asin', 'unbias_overall']]
# Preview only: printing the whole frame adds nothing for the reader.
print(df.head())

data = Dataset.load_from_df(df, reader)

In [None]:
# Splitting the dataset: 80% train / 20% test with a fixed seed.
# NOTE(review): `train_test_split` is imported twice in this notebook (sklearn
# first, surprise.model_selection later); this call relies on the later import
# having rebound the name, since `data` is a Surprise dataset.
trainset, testset = train_test_split(data, test_size=0.2,random_state=0)

In [None]:
# 'user_based': False selects item-based collaborative filtering; set it to True
# for the user-based variant.
item_based_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNWithMeans(k=5, sim_options=item_based_options)
algo.fit(trainset)

In [None]:
# Run the fitted model against the held-out test set and keep the predictions.
test_pred = algo.test(testset)

In [None]:
# Show only a handful of predictions; echoing the whole list floods the output.
test_pred[:5]

In [None]:
# RMSE of the item-based model's predictions on the test set
# (verbose=True asks surprise to report the value itself).
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

In [None]:
#Model-based collaborative filtering system

# Reviewer x product matrix of adjusted ratings; pairs with no rating become 0.
ratings_matrix = new_df.pivot_table(
    index='reviewerID',
    columns='asin',
    values='unbias_overall',
    fill_value=0,
)
ratings_matrix.head()

In [None]:
# (number of reviewers, number of products) in the pivoted matrix.
ratings_matrix.shape

In [None]:
# Flip to product x reviewer so that each row describes one product.
X = ratings_matrix.transpose()
X.head()

In [None]:
# NOTE(review): this binds a second name to the same object (not a copy), and
# X1 is not referenced anywhere in the visible cells — confirm it is still needed.
X1 = X

In [None]:
#Decomposing the Matrix
from sklearn.decomposition import TruncatedSVD
# Reduce every product row to 10 latent components. TruncatedSVD's default solver
# is randomized, so the seed is pinned to make the decomposition (and everything
# derived from it) reproducible across runs.
SVD = TruncatedSVD(n_components=10, random_state=0)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

In [None]:
# Correlation matrix: np.corrcoef treats each row of decomposed_matrix as one
# variable, so entry [a, b] is the correlation between rows a and b.

correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

In [None]:
# Look up the row label at position 253 of X.
# NOTE(review): 253 is a hard-coded position; this raises IndexError if X has
# fewer rows — confirm against the filtered data.
X.index[253]

In [None]:
# Label of the query product, as it appears in X's index.
i = "B005QJZ5FA"

# Row position of the query product in X (and thus in the correlation matrix).
product_names = X.index.tolist()
product_ID = product_names.index(i)
product_ID

In [None]:
# Row of the correlation matrix for the query product: its correlation with
# every row of the decomposed matrix.
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

In [None]:
# Correlation of every product with the query product, labelled by X's index.
product_correlations = pd.Series(correlation_product_ID, index=X.index)

# Keep products above the 0.65 threshold and remove the item already bought by
# the customer. errors='ignore' avoids the ValueError the old list.remove(i)
# raised whenever the query item itself was not above the threshold
# (e.g. a NaN self-correlation).
similar_products = product_correlations[product_correlations > 0.65].drop(labels=i, errors='ignore')

# Rank best match first so the slice below returns the strongest candidates
# instead of whichever happen to come first in index order.
Recommend = list(similar_products.sort_values(ascending=False, kind='stable').index)

Recommend[0:24]