# Collaborative Filtering on Blog Post.

In [1]:
import pandas as pd
import numpy as np

from surprise.model_selection import cross_validate, train_test_split
from surprise.accuracy import rmse
from surprise import (Reader, Dataset, SVD, SVDpp, SlopeOne, NMF, 
                        NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, 
                        KNNWithZScore, BaselineOnly, CoClustering)
from unittest import result
import warnings

In [3]:
df_posts = pd.read_csv('/home/dewan/codespace/news-recommender-system/data/raw/posts.csv')
df_users = pd.read_csv('/home/dewan/codespace/news-recommender-system/data/raw/users.csv')
df_views = pd.read_csv('/home/dewan/codespace/news-recommender-system/data/raw/views.csv')

df_posts.rename(columns={"_id":"post_id", ' post_type': 'post_type'}, inplace=True)
df_users.rename(columns={"_id":"user_id"}, inplace=True)


merged_df = pd.merge(df_views, df_users, on="user_id")
merged_df = pd.merge(merged_df, df_posts, on="post_id")
merged_df

Unnamed: 0,user_id,post_id,timestamp,name,gender,academics,title,category,post_type
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,2020-06-01T10:46:45.131Z,Niriksha Sharma,female,undergraduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
1,5d7c994d5720533e15c3b1e9,5ec821ddec493f4a2655889e,2020-05-22T20:11:32.317Z,Varun Chowhan,male,undergraduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
2,5e5af599d701ab08af792b63,5ec821ddec493f4a2655889e,2020-05-22T20:01:37.309Z,Ilupeju Ayokunnumi,female,graduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
3,5de50d768eab6401affbb135,5ec821ddec493f4a2655889e,2020-05-22T20:10:41.100Z,thesocialcomment,male,graduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
4,5deeef6142a8854bf6eabab9,5ec821ddec493f4a2655889e,2020-05-22T20:08:45.061Z,siddharth saxena,male,undergraduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
...,...,...,...,...,...,...,...,...,...
1442,5e899be7a3258347b42f25d0,5e3ab644eab55d319938a72d,2020-04-05T08:57:02.583Z,Srinidhi A,female,undergraduate,Travelling,Photography,artwork
1443,5e78ce84cfc8b713f5ac7cee,5e78d6dccfc8b713f5ac7cf4,2020-04-05T05:44:03.998Z,Fauziya Shaikh,female,undergraduate,Childhood Sketch,Drawings|Watercolours,artwork
1444,5e880f02a3258347b42f252c,5e783590cfc8b713f5ac7c5d,2020-04-04T04:44:54.488Z,Keshant Sharma,male,graduate,Quarantine days,Painting,artwork
1445,5e783375cfc8b713f5ac7c5b,5e787636cfc8b713f5ac7cbe,2020-03-28T11:45:05.321Z,Bhargav Prakash,male,undergraduate,Sunsets in Ottawa,Painting,artwork


In [4]:
print(merged_df.gender.unique())
print(merged_df.academics.unique())
print(merged_df.post_type.unique())

['female' 'male' 'undefined']
['undergraduate' 'graduate' 'undefined']
['artwork' 'blog' 'project' 'skill']


In [5]:
merged_df.title.unique()[:10]

array(['Save Earth.', 'Machine Learning”&“Operations" (MlOps)',
       'Computer Aided Machine Drawing (CAMD)', 'EID MUBARAK',
       'Dakrai Artwork.', 'My First Animated Post.',
       'Zero-Waste Lifestyle', 'Eid Mubarak',
       'Future Communication Predictions', 'Happy Eid-ul-Fitr 2020'],
      dtype=object)

In [6]:
# assigning weights to different factors

w1 = {'artwork': 2.1, 'blog':0.9, 'project':3, 'skill':4.1}
w2 = {'female':2.5, 'male':3.5, 'undefined':1}
w3 = {'undergraduate':3, 'graduate':4, 'undefined':1}

merged_df['rank']= (
    merged_df['post_type'].apply(lambda x : w1[x]/4)+
    merged_df['gender'].apply(lambda x : w2[x]/4.1)+
    merged_df['academics'].apply(lambda x : w3[x]/3.5))/3
merged_df['rank'] = 5*(merged_df['rank']/max(merged_df['rank']))

merged_df = merged_df[['user_id', 'post_id','rank']]
merged_df

Unnamed: 0,user_id,post_id,rank
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,3.296192
1,5d7c994d5720533e15c3b1e9,5ec821ddec493f4a2655889e,3.699801
2,5e5af599d701ab08af792b63,5ec821ddec493f4a2655889e,3.768991
3,5de50d768eab6401affbb135,5ec821ddec493f4a2655889e,4.172601
4,5deeef6142a8854bf6eabab9,5ec821ddec493f4a2655889e,3.699801
...,...,...,...
1442,5e899be7a3258347b42f25d0,5e3ab644eab55d319938a72d,3.296192
1443,5e78ce84cfc8b713f5ac7cee,5e78d6dccfc8b713f5ac7cf4,3.296192
1444,5e880f02a3258347b42f252c,5e783590cfc8b713f5ac7c5d,4.172601
1445,5e783375cfc8b713f5ac7c5b,5e787636cfc8b713f5ac7cbe,3.699801


In [7]:
reader = Reader()
data = Dataset.load_from_df(merged_df, reader)
bestalgo  = []


for algo in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    warnings.filterwarnings("ignore")
    result = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=0)
    temp = pd.DataFrame.from_dict(result).mean(axis=0)
    temp = temp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    bestalgo.append(temp)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [8]:
result = pd.DataFrame(bestalgo).sort_values('test_rmse').set_index('Algorithm')
result

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNNWithMeans,0.275107,0.180359,0.00066,0.002054
KNNWithZScore,0.284733,0.189637,0.002121,0.002434
SVDpp,0.311211,0.2275,0.0505,0.012176
SlopeOne,0.322196,0.230243,0.002348,0.004187
SVD,0.354166,0.263014,0.00726,0.001956
BaselineOnly,0.358271,0.265151,0.000878,0.001005
NMF,0.387409,0.292511,0.020366,0.001592
KNNBasic,0.394547,0.293871,0.000185,0.002052
CoClustering,0.575269,0.47681,0.025193,0.001302
NormalPredictor,0.704246,0.55869,0.000452,0.001156


In [9]:
# although all these parameters have some default values. Refer documentation
# other names are pearson_baseline, msd
sim_options = {'name': 'cosine', 'user_based': True, 'shrinkage': 0}    # 'user_based': True means perform user based recommendation, false means do item based recommendation

# Using Alternating Least Squares (ALS)
bsl_optionsA = {'method': 'als', 'reg_u': 15, 'reg_i': 5, 'n_epochs': 20}  # reg_u, reg_i = regularization parameter for users and items
# Using Stochastic Gradient Descent (SGD)
bsl_optionsS = {'method': 'sgd', 'reg': 0.02, 'learning_rate': .00005, 'n_epochs': 20}

algoA = KNNWithMeans(sim_options=sim_options, bsl_options=bsl_optionsA)
algoS = KNNWithMeans(sim_options=sim_options, bsl_options=bsl_optionsS)

print('ALS-------------------------------------------------------------------------------------------------------------')
cross_validate(algoA, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('SGD-------------------------------------------------------------------------------------------------------------')
cross_validate(algoS, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

ALS-------------------------------------------------------------------------------------------------------------
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2414  0.2561  0.2314  0.2585  0.2522  0.2479  0.0101  
MAE (testset)     0.1601  0.1730  0.1464  0.1635  0.1718  0.1629  0.0096  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
SGD------------------------------------------------------------------

{'test_rmse': array([0.24959121, 0.25413118, 0.29053209, 0.22868166, 0.24812114]),
 'test_mae': array([0.15993302, 0.15780348, 0.1743584 , 0.15434995, 0.16866142]),
 'fit_time': (0.001310586929321289,
  0.0008149147033691406,
  0.0008502006530761719,
  0.0013425350189208984,
  0.001207590103149414),
 'test_time': (0.0016074180603027344,
  0.0014297962188720703,
  0.0013823509216308594,
  0.002393960952758789,
  0.0014255046844482422)}

In [10]:
train, test = train_test_split(data, test_size=0.2, random_state=200)
algo = KNNWithMeans(algo=KNNWithMeans(sim_options=sim_options, bsl_options=bsl_optionsA))
prediction = algo.fit(train).test(test)
rmse(prediction)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.2562


0.25620998355107905

In [11]:
def getU(ruid):
    try:
        return len(train.ur[train.to_inner_uid(ruid)])
    except ValueError: # User id is not a part of trainset
        return 0

def getI(riid):
    try:
        return len(train.ir[train.to_inner_iid(riid)])
    except ValueError: # Item id is not a part of trainset
        return 0

df_new = pd.DataFrame(prediction, columns=['user_id', 'post_id', 'rui', 'est', 'details'])
df_new['no_item_rated_by_user'] = df_new.user_id.apply(getU)
df_new['no_user_rated_item'] = df_new.post_id.apply(getI)
df_new['errors'] = abs(df_new.est - df_new.rui)
df_new.head()

Unnamed: 0,user_id,post_id,rui,est,details,no_item_rated_by_user,no_user_rated_item,errors
0,5e5855ced701ab08af792b51,5e7bd922cfc8b713f5ac7da9,2.799752,3.627067,"{'was_impossible': True, 'reason': 'User and/o...",0,2,0.827315
1,5ecb979eeaff6b0c3a58a4f0,5eca8fceeaff6b0c3a58a3c0,3.668521,3.693427,"{'actual_k': 6, 'was_impossible': False}",39,7,0.024906
2,5d60098a653a331687083238,5ec278b574f7660d73aa10d5,3.699801,3.582145,"{'actual_k': 5, 'was_impossible': False}",178,5,0.117656
3,5e35a5ed8d344822fed4d13e,5ed0e31a76027d35905cc302,4.141321,4.198898,"{'actual_k': 7, 'was_impossible': False}",16,7,0.057577
4,5e1ef04c2a37d20505da2b8b,5eb1551e10426255a7aaa003,3.296192,3.334776,"{'actual_k': 3, 'was_impossible': False}",46,3,0.038584


In [12]:
bestPred = df_new.sort_values(by='errors')
worstPred = df_new.sort_values(by='errors', ascending=False)

In [13]:
bestPred.head()

Unnamed: 0,user_id,post_id,rui,est,details,no_item_rated_by_user,no_user_rated_item,errors
278,5d7c994d5720533e15c3b1e9,5eb2cbde10426255a7aaa074,3.699801,3.699801,"{'actual_k': 1, 'was_impossible': False}",75,1,0.0
232,5e99b0d4a3258347b42f2bf0,5e9a7e73a3258347b42f2c24,3.699801,3.699801,"{'actual_k': 1, 'was_impossible': False}",3,1,0.0
49,5d9a2982979d5962253c2f81,5e81be54a3258347b42f221d,3.203361,3.203361,"{'actual_k': 0, 'was_impossible': False}",1,2,0.0
183,5d610ae1653a331687083239,5eaf8b9310426255a7aa9f7e,5.0,5.0,"{'actual_k': 2, 'was_impossible': False}",105,3,0.0
256,5deeef6142a8854bf6eabab9,5e947c4aa3258347b42f283e,3.203361,3.203361,"{'actual_k': 1, 'was_impossible': False}",56,1,0.0


In [14]:
worstPred.head()

Unnamed: 0,user_id,post_id,rui,est,details,no_item_rated_by_user,no_user_rated_item,errors
289,5e4ce251f5561b1994c8e40d,5ea7cd9610426255a7aa9bd2,5.0,3.627067,"{'was_impossible': True, 'reason': 'User and/o...",0,4,1.372933
195,5e5af599d701ab08af792b63,5de8d73249e8203ff9219a74,4.596391,3.627067,"{'was_impossible': True, 'reason': 'User and/o...",50,0,0.969324
96,5e840a75a3258347b42f2437,5e4ed85af5561b1994c8e470,4.54493,3.627067,"{'was_impossible': True, 'reason': 'User and/o...",0,1,0.917864
45,5d60098a653a331687083238,5ec2d29074f7660d73aa113b,4.5272,3.627067,"{'was_impossible': True, 'reason': 'User and/o...",178,0,0.900134
282,5ea5bf5110426255a7aa9b88,5ea5aacd10426255a7aa9b71,4.141321,3.272552,"{'actual_k': 0, 'was_impossible': False}",1,4,0.868769


In [15]:
df_new = pd.merge(df_new, df_posts, on='post_id')
df_new = df_new[['user_id', 'post_id', 'title', 'category', 'post_type', 'rui', 'est', 'errors']]
df_new.head()

Unnamed: 0,user_id,post_id,title,category,post_type,rui,est,errors
0,5e5855ced701ab08af792b51,5e7bd922cfc8b713f5ac7da9,What sports will look like in the future,Computer Technology|Robotics|Data Science|Info...,blog,2.799752,3.627067,0.827315
1,5ecb979eeaff6b0c3a58a4f0,5eca8fceeaff6b0c3a58a3c0,Zero-Waste Lifestyle,,project,3.668521,3.693427,0.024906
2,5d60098a653a331687083238,5ec278b574f7660d73aa10d5,Rides,Drawings,artwork,3.699801,3.582145,0.117656
3,5e35a5ed8d344822fed4d13e,5ec278b574f7660d73aa10d5,Rides,Drawings,artwork,3.768991,3.786627,0.017636
4,5e35a5ed8d344822fed4d13e,5ed0e31a76027d35905cc302,Designing Cmos circuit from Boolean expression...,,project,4.141321,4.198898,0.057577


In [16]:
df_new[df_new['user_id']==df_new.user_id.value_counts().index[0]].sort_values(by='errors').head()

Unnamed: 0,user_id,post_id,title,category,post_type,rui,est,errors
117,5d60098a653a331687083238,5eb4fab110426255a7aaa0ed,God Drawing,Drawings,artwork,3.699801,3.71088,0.011078
136,5d60098a653a331687083238,5e52fd0ed701ab08af792a1f,Network Security Threats,Computer Technology|Computer Application,blog,3.203361,3.174578,0.028783
274,5d60098a653a331687083238,5e7df283a3258347b42f2128,screw2,Photography,artwork,3.699801,3.659776,0.040025
230,5d60098a653a331687083238,5e7df068a3258347b42f2125,screw town,Photography,artwork,3.699801,3.659776,0.040025
206,5d60098a653a331687083238,5ecf818376027d35905cbf03,GAN's INTRODUCTION,Computer Technology|Machine Learning,blog,3.203361,3.161402,0.04196


In [17]:
df_test = pd.DataFrame(test, columns=['user_id', 'post_id', 'merged'])
df_test

Unnamed: 0,user_id,post_id,merged
0,5e5855ced701ab08af792b51,5e7bd922cfc8b713f5ac7da9,2.799752
1,5ecb979eeaff6b0c3a58a4f0,5eca8fceeaff6b0c3a58a3c0,3.668521
2,5d60098a653a331687083238,5ec278b574f7660d73aa10d5,3.699801
3,5e35a5ed8d344822fed4d13e,5ed0e31a76027d35905cc302,4.141321
4,5e1ef04c2a37d20505da2b8b,5eb1551e10426255a7aaa003,3.296192
...,...,...,...
285,5ed237ee76027d35905cc6c5,5e5e3b35fbc8805f69e02c9e,3.676161
286,5df20f1fee4bb5252b4f5351,5e8c2d01a3258347b42f2627,4.072131
287,5d60098a653a331687083238,5e9489e7a3258347b42f2896,3.699801
288,5e5af599d701ab08af792b63,5e9415d2a3258347b42f27f8,3.272552


In [18]:
def recommend(user_id, n=10):
    res = pd.DataFrame(columns=['user_id', 'post_id', 'estimate'])
    for i in df_test.post_id.unique():
        temp = pd.DataFrame([[user_id, i, algo.predict(user_id, i)[3]]], columns=['user_id', 'post_id', 'estimate'])
        res = res.append(temp, ignore_index=True)
    res = pd.merge(res, df_posts, on='post_id')
    return res.sort_values(by='estimate', ascending=False).reset_index(drop=True)[:n]

In [19]:
user1 = '5e4ce251f5561b1994c8e40d'
user2 = '5e1ef04c2a37d20505da2b8b'

In [20]:
print(recommend(user1)['title'].values[:3])
print(recommend(user2)['title'].values[:3])

['What sports will look like in the future' 'peace'
 'Benefits of Buying Grocery Online in Twin city of Odisha']
['Always Remember !!!' 'Keen learner.' 'Quick Sketch of Gangster Skull.']
