In [12]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import math
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

import joblib

import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')


In [13]:
# Load the ratings table from 'collaborative.csv' (path is relative to the
# notebook's working directory) and report its (rows, columns) shape.
df = pd.read_csv('collaborative.csv')
print(df.shape)

(1000000, 4)


In [14]:
# Preview the first rows to see which columns the CSV actually provides.
df.head()

Unnamed: 0.1,Unnamed: 0,user-id,product-id,reviews
0,0,0,36250,1
1,1,0,33831,4
2,2,0,12799,2
3,3,0,33431,5
4,4,0,18536,1


In [15]:
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier
# export -- confirm against the CSV's producer).
# Rebinding `df` with errors='ignore' instead of dropping in place keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
print(df.shape)

(1000000, 3)


In [16]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype
---  ------      --------------    -----
 0   user-id     1000000 non-null  int64
 1   product-id  1000000 non-null  int64
 2   reviews     1000000 non-null  int64
dtypes: int64(3)
memory usage: 22.9 MB


In [17]:
# Unpack the frame's shape into row and column counts and print each one.
rows,columns=df.shape
print('Number of rows: ',rows)
print('Number of columns: ',columns)

Number of rows:  1000000
Number of columns:  3


In [18]:
# Show the dtype of every column.
df.dtypes

user-id       int64
product-id    int64
reviews       int64
dtype: object

In [19]:
# Summary statistics of the rating column. `describe()` on a single column
# already returns a Series, so no transpose is needed (transposing a Series
# returns the Series itself).
df['reviews'].describe()

count    1000000.000000
mean           3.665633
std            1.490878
min            1.000000
25%            2.000000
50%            4.000000
75%            5.000000
max            5.000000
Name: reviews, dtype: float64

In [20]:
# Report the smallest and largest rating values present in the data.
lowest_rating = df['reviews'].min()
highest_rating = df['reviews'].max()
print('Minimum rating is: %d' % lowest_rating)
print('Maximum rating is: %d' % highest_rating)

Minimum rating is: 1
Maximum rating is: 5


In [21]:
# Count null entries per column.
print('Number of missing values across columns: \n',df.isnull().sum())

Number of missing values across columns: 
 user-id       0
product-id    0
reviews       0
dtype: int64


In [22]:
# Number of distinct user ids in the raw data.
print('Number of unique users in Raw data = ', df['user-id'].nunique())
# Number of distinct product ids in the raw data.
print('Number of unique product in Raw data = ', df['product-id'].nunique())

Number of unique users in Raw data =  5001
Number of unique product in Raw data =  44424


In [23]:
# Count the rating rows contributed by each user, then keep the ten users
# with the highest counts (largest first).
ratings_per_user = df.groupby('user-id').size()
most_rated = ratings_per_user.sort_values(ascending=False).iloc[:10]
print('Top 10 users based on ratings: \n', most_rated)

Top 10 users based on ratings: 
 user-id
2597    258
1814    255
580     253
3422    251
234     251
742     247
3486    246
686     246
3868    246
4639    243
dtype: int64


In [24]:
# Keep only users who have given at least MIN_RATINGS_PER_USER ratings.
# The same constant drives both the filter and the printed message, so the
# reported threshold can no longer disagree with the one actually applied.
MIN_RATINGS_PER_USER = 15
counts = df['user-id'].value_counts()
active_user_ids = counts[counts >= MIN_RATINGS_PER_USER].index
df_final = df[df['user-id'].isin(active_user_ids)]
# Report the number of users that pass the filter, not the number of all users.
print('Number of users who have rated %d or more items =' % MIN_RATINGS_PER_USER, len(active_user_ids))
print('Number of unique users in the final data = ', df_final['user-id'].nunique())
print('Number of unique products in the final data = ', df_final['product-id'].nunique())

Number of users who have rated 25 or more items = 5001
Number of unique users in the final data =  5001
Number of unique products in the final data =  44424


In [25]:
# Reshape the long ratings table into a dense matrix with one row per user-id
# and one column per product-id; (user, product) pairs without a rating are
# filled with 0.
# NOTE(review): `pivot` requires each (user-id, product-id) pair to be unique,
# and the dense result is large -- consider a sparse representation if memory
# becomes a problem.
final_ratings_matrix = df_final.pivot(index = 'user-id', columns ='product-id', values = 'reviews').fillna(0)
final_ratings_matrix.head()

product-id,1163,1164,1165,1525,1526,1528,1529,1530,1531,1532,...,59990,59991,59992,59993,59994,59995,59996,59998,59999,60000
user-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Report the matrix dimensions as (users, products).
print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)

Shape of final_ratings_matrix:  (5001, 44424)


In [27]:
# Density = percentage of matrix cells that hold a (non-zero) rating.
n_matrix_rows, n_matrix_cols = final_ratings_matrix.shape

given_num_of_ratings = np.count_nonzero(final_ratings_matrix)
print('given_num_of_ratings = ', given_num_of_ratings)

possible_num_of_ratings = n_matrix_rows * n_matrix_cols
print('possible_num_of_ratings = ', possible_num_of_ratings)

# Fraction of filled cells, scaled to a percentage.
density = (given_num_of_ratings / possible_num_of_ratings) * 100
print('density: {:4.2f}%'.format(density))

given_num_of_ratings =  1000000
possible_num_of_ratings =  222164424
density: 0.45%


In [28]:
# Hold out 30% of the rating rows for testing; random_state=0 makes the split
# repeatable. The split is over individual rating rows, so one user's ratings
# can land in both train and test.
train_data, test_data = train_test_split(df_final, test_size = 0.3, random_state=0)
train_data.head()

Unnamed: 0,user-id,product-id,reviews
823666,4118,33155,5
70066,352,46839,5
591943,2963,46293,1
578645,2896,8612,5
675461,3379,42804,1


In [29]:
# Report the sizes of the two partitions.
print('Shape of training data: ',train_data.shape)
print('Shape of testing data: ',test_data.shape)

Shape of training data:  (700000, 3)
Shape of testing data:  (300000, 3)


In [30]:
# Popularity score of a product = number of training rows (ratings) it has.
train_data_grouped = (
    train_data
    .groupby('product-id')
    .agg(score=('user-id', 'count'))
    .reset_index()
)
train_data_grouped.head(40)

Unnamed: 0,product-id,score
0,1163,8
1,1164,13
2,1165,19
3,1525,19
4,1526,9
5,1528,10
6,1529,19
7,1530,14
8,1531,11
9,1532,11


In [31]:
# Order products by score (highest first), breaking ties by ascending
# product-id, then attach a rank in which tied scores are numbered in the
# order they appear.
train_data_sort = (
    train_data_grouped
    .sort_values(by=['score', 'product-id'], ascending=[False, True])
    .assign(rank=lambda ordered: ordered['score'].rank(method='first', ascending=False))
)

# The five highest-ranked products are the popularity recommendations.
popularity_recommendations = train_data_sort.head(5)
popularity_recommendations

Unnamed: 0,product-id,score,rank
15615,20046,35,1.0
7597,10564,33,2.0
28330,37344,33,3.0
15255,19647,32,4.0
30779,40749,32,5.0


In [32]:
def recommend(user_id):
    """Return the popularity recommendations labelled with ``user_id``.

    The recommended products do not depend on the user: the result is the
    module-level ``popularity_recommendations`` table with a constant
    'user-id' column placed first.

    Parameters
    ----------
    user_id
        Identifier written into every row of the 'user-id' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'user-id' as the first column followed by the
        columns of ``popularity_recommendations``. The shared table itself
        is left untouched.
    """
    # Work on a copy: assigning a column directly on the shared frame would
    # mutate notebook-level state on every call (and, because that frame is a
    # head() slice, trigger a chained-assignment warning).
    user_recommendations = popularity_recommendations.copy()
    user_recommendations['user-id'] = user_id

    # Move 'user-id' to the front, keeping the other columns in their
    # existing order.
    other_cols = [col for col in user_recommendations.columns if col != 'user-id']
    return user_recommendations[['user-id'] + other_cols]

In [33]:

# Example call: print the product-id in the fifth row (position 4) of the
# recommendations returned for user 1.
print(recommend(1)['product-id'].values[4])


40749
