In [None]:
import IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivivty = "all"
import numpy as np
import pandas as pd
import math
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbours import NearestNeighbours
from sklearn.externals import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')
%matplotlib inline

In [None]:
# Column names passed explicitly to read_csv via `names=`.
columns = ['userId', 'productId', 'ratings', 'timestamp']
# Read the ratings file (path is relative to the working directory).
electronics_df=pd.read_csv('ratings_Electronics.csv',names=columns)


In [None]:
# Preview the first rows of the loaded frame.
electronics_df.head()

In [None]:
# Remove the 'timestamp' column in place.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
electronics_df.drop(columns=['timestamp'], inplace=True, errors='ignore')

In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
electronics_df.info()

In [None]:
# Unpack the frame's (row count, column count).
# NOTE: this rebinds `columns`, which earlier held the list of CSV column names,
# to an integer.
rows,columns=electronics_df.shape
print('Number of rows: ',rows)
print('Number of columns: ',columns)

In [None]:
# Show the dtype of each column (`.dtypes`; DataFrame has no `.types` attribute).
electronics_df.dtypes

In [None]:
# Keep only the first 50,000 rows (every column) as the working subset.
electronics_df1 = electronics_df.iloc[:50000, :]

In [None]:
# Print the column dtypes, non-null counts and memory usage of the subset.
electronics_df1.info()

In [None]:
# Summary statistics of the 'ratings' column of the subset.
electronics_df1['ratings'].describe().transpose()

In [None]:
# Report the smallest and largest rating in the subset; %d renders each value
# as an integer.
print('Minimum rating is: %d' %(electronics_df1.ratings.min()))
print('Maximum rating is: %d' %(electronics_df1.ratings.max()))

In [None]:
# Count nulls per column.
# NOTE(review): this inspects the full `electronics_df`, not the
# `electronics_df1` subset used by the surrounding cells - confirm intent.
print('Number of missing values across columns: \n',electronics_df.isnull().sum())

In [None]:
# Bar chart of how many ratings fall on each rating value.
# `catplot` is the current name of the former `factorplot`, and the plotted
# variable must be passed by keyword (`x=`) in current seaborn.
with sns.axes_style('white'):
    g = sns.catplot(x="ratings", data=electronics_df1, aspect=2.0, kind='count')
    g.set_ylabels("Total number of ratings")

In [None]:
# Distinct users and distinct products present in the working subset.
print('Number of unique users in Raw data = ', electronics_df1['userId'].nunique())
print('Number of unique products in Raw data = ', electronics_df1['productId'].nunique())

In [None]:
# Number of ratings per user, largest first, limited to the ten most active users.
most_rated = (
    electronics_df1
    .groupby('userId')
    .size()
    .sort_values(ascending=False)
    .head(10)
)
print('Top 10 users based on ratings: \n', most_rated)

In [None]:
# Keep only ratings made by users with at least MIN_RATINGS_PER_USER ratings.
MIN_RATINGS_PER_USER = 15

# Ratings per user in the working subset.
counts = electronics_df1.userId.value_counts()
# Column is 'userId' (lower-case d); .copy() detaches the result from the
# parent frame so later column assignments do not hit a slice.
electronics_df1_final = electronics_df1[
    electronics_df1.userId.isin(counts[counts >= MIN_RATINGS_PER_USER].index)
].copy()

# len() of the filtered frame is a number of rating rows, not of users.
print('Number of ratings from users who have rated {} or more items ='.format(MIN_RATINGS_PER_USER),
      len(electronics_df1_final))
print('Number of unique users in the final data =', electronics_df1_final['userId'].nunique())
print('Number of unique products in the final data =', electronics_df1_final['productId'].nunique())


In [None]:
# User x product matrix of ratings; pairs with no rating become 0.
# The keyword is `values` (not `value`). pivot raises if a (userId, productId)
# pair occurs more than once.
final_ratings_matrix = electronics_df1_final.pivot(index='userId', columns='productId', values='ratings').fillna(0)
final_ratings_matrix.head()

In [None]:
# (number of rows, number of columns) of the pivoted ratings matrix.
print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)


In [None]:
# Density of the ratings matrix: non-zero cells as a percentage of all cells.
given_num_of_ratings = np.count_nonzero(final_ratings_matrix)
print('given_num_of_ratings =', given_num_of_ratings)

# Total number of cells (rows * columns).
possible_num_of_ratings = final_ratings_matrix.size
print('possible_num_of_ratings = ', possible_num_of_ratings)

density = (given_num_of_ratings / possible_num_of_ratings) * 100
print('density: {:4.2f}%'.format(density))

In [None]:
# Hold out 30% of the filtered rating rows as test data; random_state=0 fixes
# the shuffle so the split is reproducible.
train_data, test_data = train_test_split(electronics_df1_final, test_size = 0.3, random_state=0)
train_data.head()


In [None]:
# Report the sizes of the two splits (a comma was missing in the first call).
print('Shape of training data: ',train_data.shape)
print('Shape of testing data: ',test_data.shape)

In [None]:
# Popularity score per product = number of training rows (user ratings) for it.
train_data_grouped = (
    train_data
    .groupby('productId')
    .agg({'userId': 'count'})
    .rename(columns={'userId': 'score'})
    .reset_index()
)
train_data_grouped.head(40)

In [None]:
# Order products by score (highest first), breaking ties by productId ascending.
# Booleans are used for `ascending`, matching the documented parameter type.
train_data_sort = train_data_grouped.sort_values(['score', 'productId'], ascending=[False, True])
# method='first' gives tied scores distinct ranks in their order of appearance.
train_data_sort['rank'] = train_data_sort['score'].rank(ascending=False, method='first')
# .copy() so the top-5 table is an independent frame rather than a slice of
# train_data_sort (it is consumed, and historically mutated, by recommend()).
popularity_recommendations = train_data_sort.head(5).copy()
popularity_recommendations

In [None]:
def recommend(user_id):
    """Return the popularity-based recommendations labelled with ``user_id``.

    The result is the module-level ``popularity_recommendations`` table with a
    leading ``userId`` column set to ``user_id`` on every row. The same items
    are returned for every user; ``user_id`` is only attached as a label.

    The global table is never modified: a fresh frame is built on each call, so
    results from earlier calls keep their own ``userId`` values.

    Parameters
    ----------
    user_id : scalar
        Value written into the ``userId`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``userId`` as the first column, followed by the columns
        of ``popularity_recommendations``.
    """
    # drop() returns a new frame; errors='ignore' tolerates a table that does
    # not (yet) carry a 'userId' column.
    user_recommendations = popularity_recommendations.drop(columns=['userId'], errors='ignore')
    user_recommendations.insert(0, 'userId', user_id)
    return user_recommendations

In [None]:
# Print the popularity-based recommendations for a few sample user ids.
find_recom = [10,100,150]
for i in find_recom:
    print("The list of recommendations for the userId: %d\n" %(i))
    print(recommend(i))
    print("\n")

In [None]:
# Recombine the train and test rows into one frame for collaborative filtering.
# reset_index() is called without drop=True, so the previous index is kept as
# an 'index' column.
electronics_df_CF = pd.concat([train_data, test_data]).reset_index()
electronics_df_CF.head()


In [None]:
# User x product ratings matrix for the SVD model; missing ratings become 0.
# Column name is 'productId' and the method is `fillna`.
pivot_df = electronics_df_CF.pivot(index='userId', columns='productId', values='ratings').fillna(0)
pivot_df.head()

In [None]:
# (number of rows, number of columns) of the pivot table.
print('Shape of the pivot table: ', pivot_df.shape)

In [None]:
# Add a 0-based positional id for every row (np.arange; `np.arrange` does not exist).
pivot_df['user_index'] = np.arange(0, pivot_df.shape[0], 1)
pivot_df.head()

In [None]:
# Replace the row index with the 'user_index' column, in place.
# NOTE: not re-runnable - after the first run 'user_index' is no longer a column.
pivot_df.set_index(['user_index'], inplace=True)
pivot_df.head()

In [None]:
# Truncated SVD keeping 10 singular values.
# svds expects an ndarray / sparse matrix / LinearOperator with a floating
# dtype, so hand it the frame's values as a float array rather than the
# DataFrame itself. k must be smaller than both matrix dimensions.
U, sigma, Vt = svds(pivot_df.to_numpy(dtype=float), k = 10)

In [None]:
# Show U, the first array returned by svds.
print('Left singular matrix: \n',U)

In [None]:
# Show sigma, the second value returned by svds.
print('Sigma: \n',sigma)

In [None]:
# Expand the 1-D array of singular values into a diagonal matrix.
# The ndim guard makes the cell safe to re-run: np.diag on an already 2-D
# array would extract its diagonal and turn sigma back into a vector.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
print('Diagonal matrix: \n',sigma)

In [None]:
# Show Vt, the third array returned by svds.
print('Right singular matrix: \n',Vt)

In [None]:
# Rebuild the low-rank approximation of the ratings matrix: U . sigma . Vt.
all_user_predicted_ratings = (U @ sigma) @ Vt
# Label the prediction columns with the same labels as pivot_df's columns.
preds_df = pd.DataFrame(data=all_user_predicted_ratings, columns=pivot_df.columns)
preds_df.head()

In [None]:
def recommend_items(userID, pivot_df, pred_df, num_recommendations):
    """Print the highest-predicted items that a user has not rated yet.

    Parameters
    ----------
    userID : int
        1-based position of the user's row in ``pivot_df`` and ``pred_df``
        (row ``userID - 1`` is read from both).
    pivot_df : pandas.DataFrame
        Actual ratings, one row per user and one column per item; a value of 0
        is treated as "not rated".
    pred_df : pandas.DataFrame
        Predicted ratings, laid out like ``pivot_df``.
    num_recommendations : int
        Maximum number of rows to print.

    Raises
    ------
    IndexError
        If ``userID`` is not in ``1..len(pivot_df)``. Without this check,
        ``userID <= 0`` would silently wrap around to a different user.
    """
    if not 1 <= userID <= len(pivot_df):
        raise IndexError(
            'userID must be between 1 and {}, got {}'.format(len(pivot_df), userID)
        )
    user_idx = userID - 1

    # Use the predictions passed in by the caller (`pred_df`), not a global.
    sorted_user_ratings = pivot_df.iloc[user_idx].sort_values(ascending=False)
    sorted_user_predictions = pred_df.iloc[user_idx].sort_values(ascending=False)

    # Align actual and predicted ratings by item label.
    temp = pd.concat([sorted_user_ratings, sorted_user_predictions], axis=1)
    temp.index.name = 'Recommended Items'
    temp.columns = ['user_ratings', 'user_predictions']

    # Keep only unrated items, best prediction first.
    temp = temp.loc[temp.user_ratings == 0]
    temp = temp.sort_values('user_predictions', ascending=False)

    print('\nBelow are the recommended items for user(user_id = {}):\n'.format(userID))
    print(temp.head(num_recommendations))

In [None]:
# Print up to 5 recommendations for userID 4.
userID = 4
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)

In [None]:
# Print up to 5 recommendations for userID 6.
userID = 6
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)

In [None]:
# Print up to 5 recommendations for userID 8.
userID = 8
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)

In [None]:
# First rows of the actual ratings matrix.
final_ratings_matrix.head()

In [None]:
# Per-column mean of the actual ratings matrix (first few columns).
final_ratings_matrix.mean().head()

In [None]:
# First rows of the predicted ratings.
preds_df.head()

In [None]:
# Per-column mean of the predicted ratings (first few columns).
preds_df.mean().head()

In [None]:
# Side-by-side table of per-column means: actual ratings vs. predicted ratings.
rmse_df = pd.concat([final_ratings_matrix.mean(), preds_df.mean()], axis=1)
rmse_df.columns = ['Avg_actual_ratings', 'Avg_predicted_ratings']
print(rmse_df.shape)
# 0-based positional id per row (np.arange; `np.arrange` does not exist).
rmse_df['item_index'] = np.arange(0, rmse_df.shape[0], 1)
rmse_df.head()

In [None]:
# Root-mean-square difference between the Avg_actual_ratings and
# Avg_predicted_ratings columns of rmse_df, rounded to 5 decimals.
# NOTE(review): this compares the averaged columns, not individual ratings -
# confirm this is the intended evaluation metric.
RMSE = round((((rmse_df.Avg_actual_ratings - rmse_df.Avg_predicted_ratings) ** 2).mean() ** 0.5), 5)
print('\nRMSE SVD Model = {} \n'.format(RMSE))


In [None]:
# Print up to 5 recommendations for userID 9.
userID = 9
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)