In [1]:
import pandas as pd
import numpy as np

In [2]:
cols = ['UserID','ItemID','Rating','Time']
ds = pd.read_csv('ratings_Electronics.csv',names = cols)

In [3]:
ds.shape

(7824482, 4)

In [4]:
ds.head()

Unnamed: 0,UserID,ItemID,Rating,Time
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [5]:
ds.UserID.unique().shape[0]

4201696

In [6]:
ds.drop(['Time'],axis=1,inplace =True)

In [7]:
ds.ItemID.unique().shape[0]

476002

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
Unique_Items_Count = ds.ItemID.value_counts().reset_index()
#print(Unique_Items_Count)

In [10]:
# As the dataset is too huge, from the above we know that there are items that are reated only once., 
# we can think of removing tose items

In [11]:
User_count_df = ds.groupby(['UserID', 'ItemID']).size().groupby('UserID').size()
print('# Users: %d' % len(User_count_df))
User_with_enough_ratings_df = User_count_df[User_count_df >= 2].reset_index()[['UserID']]
print('# Users Who has rated atleast twice: %d' % len(User_with_enough_ratings_df))
User_with_enough_ratings_df.head()

# Users: 4201696
# Users Who has rated atleast twice: 1319864


Unnamed: 0,UserID
0,A000063614T1OE0BUSKUT
1,A0001528BGUBOEVR6T5U
2,A00018041RRVMCICCAP79
3,A000187635I595IAVSQLH
4,A00033481VZEEGYXEN32T


In [12]:
Item_count_df = ds.groupby(['UserID', 'ItemID' ,'Rating']).size().groupby('ItemID').size()
print('# items: %d' % len(Item_count_df))
Item_with_enough_ratings_df = Item_count_df[Item_count_df >= 2].reset_index()[['ItemID']]
print('# Items with atleast 2 purchases: %d' % len(Item_with_enough_ratings_df))

# items: 476002
# Items with atleast 2 purchases: 296264


In [13]:
Item_with_enough_ratings_df.head()

Unnamed: 0,ItemID
0,0439886341
1,0511189877
2,0528881469
3,059400232X
4,0594012015


In [14]:
density_Ratio = float(len(User_with_enough_ratings_df) / len(User_count_df))
density_Ratio

0.31412648606657884

In [15]:
# The data has density of 31%

In [16]:
User_count_df.head()

UserID
A00000262KYZUE4J55XGL    1
A000063614T1OE0BUSKUT    2
A00009182QVLSWIGHLS1B    1
A00009661LC9LQPGKJ24G    1
A00010809P09NUU6ZP6H     1
dtype: int64

In [17]:
ds.Rating.value_counts()

5.0    4347541
4.0    1485781
1.0     901765
3.0     633073
2.0     456322
Name: Rating, dtype: int64

In [18]:
ds.isna().sum()

UserID    0
ItemID    0
Rating    0
dtype: int64

In [19]:
# of unique Users / Interactions
print(float(ds.UserID.unique().shape[0]/ds.shape[0]))

0.5369935032120976


In [20]:
## Of the total number of rating data we have, we are having the 53.7% unique users data. 
## Rest 46 % are the Same Users for various Items rating data.


In [21]:
## Make the data Sparse by selectin those users who have atleast rated twice
#df = ds.merge(User_with_enough_ratings_df,how = "inner", on = 'UserID')
df = ds.head(200000)

In [22]:
df.shape

(200000, 3)

In [23]:
df.columns

Index(['UserID', 'ItemID', 'Rating'], dtype='object')

In [24]:
item_grouped = df.groupby(['ItemID']).agg({'Rating':'count'}).reset_index()
item_grp_sum = item_grouped.Rating.sum()
item_grouped['Percentage'] = (item_grouped['Rating'].div(item_grp_sum))*100
item_grouped.head(5)

Unnamed: 0,ItemID,Rating,Percentage
0,132793040,1,0.0005
1,321732944,1,0.0005
2,439886341,3,0.0015
3,511189877,6,0.003
4,528881469,27,0.0135


In [25]:
## Sort items based on Ratings in descending order, coz, more the number of Ratings, more frequent / used / good / popular the item is
item_grouped.sort_values(['Rating','ItemID'],ascending=[0,1])

Unnamed: 0,ItemID,Rating,Percentage
5130,B00004ZCJE,2547,1.2735
2111,B00001P4ZH,2075,1.0375
9388,B000065BP9,1714,0.8570
3268,B00004T8R2,1692,0.8460
2248,B00001WRSJ,1586,0.7930
9390,B000065BPB,1304,0.6520
7345,B00005N6KG,1296,0.6480
8104,B00005T3G0,1287,0.6435
7088,B00005LEN4,1107,0.5535
10953,B00006B7DA,1106,0.5530


In [26]:
from sklearn.model_selection import train_test_split

In [27]:
train_df, test_df = train_test_split(df,test_size=0.30,random_state=42)

In [28]:
import time
import Recommenders as Recommenders
import Evaluation as Evaluation

In [29]:
# Build Popularity based model
PM = Recommenders.popularity_recommender_py()

In [30]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140000 entries, 21269 to 121958
Data columns (total 3 columns):
UserID    140000 non-null object
ItemID    140000 non-null object
Rating    140000 non-null float64
dtypes: float64(1), object(2)
memory usage: 4.3+ MB


In [31]:
def create_popularity_recommendation(train_data, user_id, item_id):
    #Get a count of user_ids for each unique song as recommendation score
    train_data_grouped = train_data.groupby([item_id]).agg({user_id: 'count'}).reset_index()
    train_data_grouped.rename(columns = {user_id: 'score'},inplace=True)
    
    #Sort the songs based upon recommendation score
    train_data_sort = train_data_grouped.sort_values(['score', item_id], ascending = [0,1])
    
    #Generate a recommendation rank based upon score
    train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first')
        
    #Get the top 10 recommendations
    popularity_recommendations = train_data_sort.head(20)
    return popularity_recommendations

In [32]:
recommendations = create_popularity_recommendation(train_df,'UserID','ItemID')

In [33]:
recommendations

Unnamed: 0,ItemID,score,Rank
4572,B00004ZCJE,1777,1.0
1803,B00001P4ZH,1471,2.0
8458,B000065BP9,1206,3.0
2871,B00004T8R2,1183,4.0
1930,B00001WRSJ,1077,5.0
7285,B00005T3G0,931,6.0
8460,B000065BPB,887,7.0
6590,B00005N6KG,863,8.0
6343,B00005LEN4,795,9.0
9865,B00006B7DA,769,10.0


In [34]:
uid = ds['UserID'][35575]
uid

'A3FTRN2V98QBCP'

In [35]:
def recommend_item(user_id):    
        user_recommendations = recommendations
        
        #Add user_id column for which the recommendations are being generated
        user_recommendations['userId'] = user_id
    
        #Bring user_id column to the front
        cols = user_recommendations.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        user_recommendations = user_recommendations[cols]
        
        return (user_recommendations)

In [36]:
recommended_items = recommend_item(uid)

In [37]:
recommended_items

Unnamed: 0,userId,ItemID,score,Rank
4572,A3FTRN2V98QBCP,B00004ZCJE,1777,1.0
1803,A3FTRN2V98QBCP,B00001P4ZH,1471,2.0
8458,A3FTRN2V98QBCP,B000065BP9,1206,3.0
2871,A3FTRN2V98QBCP,B00004T8R2,1183,4.0
1930,A3FTRN2V98QBCP,B00001WRSJ,1077,5.0
7285,A3FTRN2V98QBCP,B00005T3G0,931,6.0
8460,A3FTRN2V98QBCP,B000065BPB,887,7.0
6590,A3FTRN2V98QBCP,B00005N6KG,863,8.0
6343,A3FTRN2V98QBCP,B00005LEN4,795,9.0
9865,A3FTRN2V98QBCP,B00006B7DA,769,10.0


In [38]:
print(train_df[train_df['UserID'] == uid])

               UserID      ItemID  Rating
37377  A3FTRN2V98QBCP  B00004RFIP     4.0
18022  A3FTRN2V98QBCP  B00000JYWQ     5.0
35575  A3FTRN2V98QBCP  B00003WGP5     4.0


In [39]:
## Above Popularity Based approach shows no Personalisation

In [40]:
## To have some personalisation - use Item based

In [41]:
item_PM = Recommenders.item_similarity_recommender_py()

In [42]:
item_PM.create(train_df,'UserID','ItemID')

In [43]:
item_PM.get_user_items(uid)

['B00004RFIP', 'B00000JYWQ', 'B00003WGP5']

In [44]:

def recommend_item(user):
    user_songs = item_PM.get_user_items(user)    
    print("No. of unique items for the user: %d" % len(user_songs))
    all_songs = item_PM.get_all_items_train_data()
    print("no. of unique items in the training set: %d" % len(all_songs))
    cooccurence_matrix = item_PM.construct_cooccurence_matrix(user_songs, all_songs)
    df_recommendations = item_PM.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)
    return df_recommendations

In [45]:
recommended_items = recommend_item(uid)

No. of unique items for the user: 3
no. of unique items in the training set: 11796
Non zero values in cooccurence_matrix :72


In [46]:
recommended_items

Unnamed: 0,userId,movieId,score,rank
0,A3FTRN2V98QBCP,B00000JBRP,0.00641,1
1,A3FTRN2V98QBCP,B00004Z5SG,0.00641,2
2,A3FTRN2V98QBCP,B00004RG7O,0.00641,3
3,A3FTRN2V98QBCP,B00004XREC,0.006289,4
4,A3FTRN2V98QBCP,B00000J4A5,0.006173,5
5,A3FTRN2V98QBCP,B00000JI4A,0.006173,6
6,A3FTRN2V98QBCP,B00005A1KV,0.005747,7
7,A3FTRN2V98QBCP,B000066TPO,0.005496,8
8,A3FTRN2V98QBCP,B00004UE2J,0.004975,9
9,A3FTRN2V98QBCP,B00006HUAS,0.004762,10


In [73]:
### When we compare the recommended items for the same user, popularity based RS recommends 20 products,where when compared with 
### item based recommendation sys, it gets only top 10 recommendations.

In [74]:
### Collaborative Filtering

In [90]:
from scipy import stats
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD, evaluate, KNNBasic

In [85]:
reader = Reader()
data = Dataset.load_from_df(df[['UserID', 'ItemID', 'Rating']], reader)
data.split(n_folds=5)
evaluate(svd, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 1.2911
MAE:  1.0195
------------
Fold 2
RMSE: 1.3020
MAE:  1.0262
------------
Fold 3
RMSE: 1.3017
MAE:  1.0251
------------
Fold 4
RMSE: 1.2848
MAE:  1.0139
------------
Fold 5
RMSE: 1.2983
MAE:  1.0258
------------
------------
Mean RMSE: 1.2956
Mean MAE : 1.0221
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [1.0194572807581757,
                             1.026206623076863,
                             1.025143227117134,
                             1.0139475878289075,
                             1.0258396873954028],
                            'rmse': [1.2911411201424752,
                             1.3019651089080901,
                             1.3016709404138072,
                             1.28482654452442,
                             1.2982584822199361]})

In [87]:
from surprise import accuracy
# We'll use the famous SVD algorithm.
algo = SVD()
trainset, testset = train_test_split(data, test_size=.25)
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)


RMSE: 1.2928


1.2928090005490744

In [80]:
ds[ds['UserID'] == uid]

Unnamed: 0,UserID,ItemID,Rating
18022,A3FTRN2V98QBCP,B00000JYWQ,5.0
35575,A3FTRN2V98QBCP,B00003WGP5,4.0
37377,A3FTRN2V98QBCP,B00004RFIP,4.0
327214,A3FTRN2V98QBCP,B0000BVV37,4.0
545360,A3FTRN2V98QBCP,B00065HLTW,4.0
678891,A3FTRN2V98QBCP,B0009Q4PH4,4.0
1406961,A3FTRN2V98QBCP,B000UVM0ES,4.0


In [94]:
algo.predict(uid, 'B00000JYWQ' , 5)

Prediction(uid='A3FTRN2V98QBCP', iid='B00000JYWQ', r_ui=5, est=4.268064398163256, details={'was_impossible': False})

In [93]:
from surprise.model_selection import GridSearchCV
param_grid = {'n_epochs': [10,20], 'lr_all': [0.001, 0.005],
              'reg_all': [0.6, 0.8]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.3021912443295884
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.6}


In [95]:
best_SVD_algo = gs.best_estimator['rmse']

In [96]:
best_SVD_algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1df4cba0208>

In [98]:
pred = best_SVD_algo.predict(uid,'B00000JYWQ',5)

In [99]:
score=pred.est
print(score)

4.182052572679787


In [100]:
### Making Recommendations

In [102]:
iids = ds['ItemID'].unique()
iids_uid = ds.loc[ds['UserID'] == uid , 'ItemID']
iids_to_predict = np.setdiff1d(iids,iids_uid)

In [104]:
testset = [[uid,iid,5] for iid in iids_to_predict]
predictions = best_SVD_algo.test(testset)
predictions[0]

Prediction(uid='A3FTRN2V98QBCP', iid='0132793040', r_ui=5, est=4.040179286799432, details={'was_impossible': False})

In [105]:
predictions

[Prediction(uid='A3FTRN2V98QBCP', iid='0132793040', r_ui=5, est=4.040179286799432, details={'was_impossible': False}),
 Prediction(uid='A3FTRN2V98QBCP', iid='0321732944', r_ui=5, est=4.257434617900313, details={'was_impossible': False}),
 Prediction(uid='A3FTRN2V98QBCP', iid='0439886341', r_ui=5, est=4.0380265087503755, details={'was_impossible': False}),
 Prediction(uid='A3FTRN2V98QBCP', iid='0511189877', r_ui=5, est=4.301470389721395, details={'was_impossible': False}),
 Prediction(uid='A3FTRN2V98QBCP', iid='0528881469', r_ui=5, est=3.393588729172877, details={'was_impossible': False}),
 Prediction(uid='A3FTRN2V98QBCP', iid='0558835155', r_ui=5, est=4.059025202855743, details={'was_impossible': False}),
 Prediction(uid='A3FTRN2V98QBCP', iid='059400232X', r_ui=5, est=4.137776725798517, details={'was_impossible': False}),
 Prediction(uid='A3FTRN2V98QBCP', iid='0594012015', r_ui=5, est=3.1849475471181337, details={'was_impossible': False}),
 Prediction(uid='A3FTRN2V98QBCP', iid='0594017

In [106]:
# Here each prediction is a object, hence we need to convert it to aparse matrix, then pick recommendations

In [None]:
pred_rating = np.array[[pred.est for pred in predictions]]
item_max = pred_rating.argpartition(5)
iids = iids_to_predict(item_max)
