In [0]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## User & Item based Collaborative Filtering

### Load data

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Load the ratings extract; the file is read with ISO-8859-1 (Latin-1) encoding.
ratings_path = "../data/ratings_sub.csv"
ratings = pd.read_csv(ratings_path, encoding="ISO-8859-1")

In [0]:
# (rows, columns) of the loaded ratings table
ratings.shape

(487469, 7)

In [0]:
# Treat both identifiers as opaque string labels rather than numbers.
for id_column in ("userId", "movieId"):
    ratings[id_column] = ratings[id_column].astype(str)

In [0]:
# Column labels available in the ratings table
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres', 'year'], dtype='object')

### Data Exploration & Transformation

<b> Find the top 10 most popular movies watched </b>


In [0]:
# Count the distinct user ids present in the ratings table
n_unique_users = len(ratings["userId"].unique())
print("total unique users - ", n_unique_users)

total unique users -  2827


<b> Q: Which users have watched the largest number of movies? </b>

In [0]:
# Users with the largest number of rows in the ratings table (top 5 by count)
ratings["userId"].value_counts().head()

65117     200
115822    200
129585    200
21711     200
59148     200
Name: userId, dtype: int64

### Transforming data to surprise format

In [0]:
from surprise import Dataset,Reader
# The data contains half-star ratings as low as 0.5 (a 0.5 rating is visible in
# the test predictions further down), so the scale has to start at 0.5.
# Surprise clips every estimate to this range; with (1, 5) no estimate could
# ever go below 1.
reader = Reader(rating_scale=(0.5, 5))

In [0]:
# Surprise reads the three columns as (user, item, rating), in that order;
# the movie title is used as the item identifier here.
surprise_columns = ['userId', 'title', 'rating']
data = Dataset.load_from_df(ratings[surprise_columns], reader)

In [0]:
# Show the Surprise dataset object built from the dataframe
data

<surprise.dataset.DatasetAutoFolds at 0x1ce346c6390>

In [0]:
# Hold out 25% of the ratings for evaluation; the fixed seed makes the split repeatable
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, random_state=123, test_size=.25)

# Alternative: train on every rating instead of holding any out
#trainset = data.build_full_trainset()

In [0]:
# Check the type of the object returned for the training portion
type(trainset)

surprise.trainset.Trainset

### Making sense of trainset 

Points to Note:
    
    1) Trainset is no longer a pandas dataframe. Rather, it is a specific data type defined by the Surprise library
    2) UserId and ItemId in the pandas dataframe can contain any value (string, integer, etc.). However, Trainset converts these raw ids into numeric indexes called "inner ids"
    3) Methods are provided to convert a raw id to an inner id and vice versa

In [0]:
# The per-user rating records are exposed through the trainset's `ur` attribute.
# Keys are inner user ids (see the raw-id lookup further down).
user_records = trainset.ur
type(user_records)

collections.defaultdict

In [0]:
# Iterating a dictionary yields its keys, i.e. the inner user ids
for inner_uid in user_records:
    print(inner_uid)

In [0]:
# Records stored for the user whose inner id is 0
user_records[0]

In [0]:
# The ids above are Surprise inner ids, not the raw ids from the dataframe.
# Raw ids can be obtained as follows:

print(trainset.to_raw_uid(0))      # raw user id for inner user id 0
print(trainset.to_raw_iid(1066))   # raw item id (movie title) for inner item id 1066

248
Step Up 2 the Streets (2008)


In [0]:
# Look at inner user 0 again, now that its raw id is known
user_records[0]

<b> In Class Assignment </b>

Confirm the raw to internal id mapping with original data, for a given user/item combination (uid - 0 & iid - 1066)


### Training the model

In [0]:
from surprise import KNNWithMeans
from surprise import accuracy

In [0]:
# Item-based neighbourhood model: user_based=False makes similarities item-to-item.
# NOTE(review): the recorded output mentions "pearson_baseline" while the code
# requests 'pearson' -- the output appears to come from an earlier run; confirm.
item_sim_options = {'name': 'pearson', 'user_based': False}
algo = KNNWithMeans(k=50, sim_options=item_sim_options)
algo.fit(trainset)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1ce00006668>

### Find K most similar items

<b> In-class assignment </b>

Which movies are most similar to Finding Nemo? (Hint: Use the <b> get_neighbors </b> method of the algo object)

### Evaluating Model Performance

In [0]:
# Number of held-out ratings
len(testset)

121868

In [0]:
# First five test entries: (raw user id, raw item id, actual rating)
testset[0:5]

[('107317', 'Signs (2002)', 2.5),
 ('103061', 'Inconvenient Truth, An (2006)', 4.5),
 ('84115', 'Battlefield Earth (2000)', 2.5),
 ('130756',
  'Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006)',
  2.0),
 ('24878', 'Drive (2011)', 4.5)]

In [0]:
# Evaluate on the held-out test set
test_pred = algo.test(testset)

# Compute RMSE between the actual and the estimated ratings
accuracy.rmse(test_pred)

RMSE: 0.7900


0.7900129665152281

In [0]:
# View a particular prediction
test_pred[12]

# Individual fields are attributes of the prediction, e.g. the estimate is test_pred[12].est

Prediction(uid='7051', iid='Black Hawk Down (2001)', r_ui=5.0, est=4.265561774995584, details={'actual_k': 10, 'was_impossible': False})

In [0]:
# `details` is a plain dict; read the "actual_k" entry recorded for this prediction
test_pred[12].details["actual_k"]

10

In [0]:
# Turn the list of predictions into a dataframe (one row per prediction)
test_pred_df = pd.DataFrame(test_pred)
# Lift the "was_impossible" flag out of each row's details dict into its own column
test_pred_df["was_impossible"] = test_pred_df["details"].map(
    lambda info: info["was_impossible"]
)

In [0]:
# First five test predictions that were flagged as impossible
test_pred_df.loc[test_pred_df.was_impossible].head(5)

Unnamed: 0,uid,iid,r_ui,est,details,was_impossible
159,36730,Grill Point (Halbe Treppe) (2002),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
604,131040,Escape from Planet Earth (2013),2.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
827,116349,No Good Deed (2014),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
1865,124431,Films to Keep You Awake: The Christmas Tale (P...,0.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
2020,21811,Insanitarium (2008),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True


<b> In class assignment </b>

What does <i>"was_impossible": True</i> indicate?  

For how many cases in the test set is the prediction flagged as "was_impossible"? And what could be the reasons for it?

### Predictions

In [0]:
# Make a prediction for a single user.
# Raw user ids in this dataset are plain numeric strings (e.g. '248'). An id such
# as "user_405" is unknown to the trainset, so the model could only return its
# fallback estimate. Use an id that is guaranteed to exist in the trainset.
# NOTE(review): the title must also be present in the trainset for a real
# neighbourhood estimate -- confirm.
algo.predict(uid=trainset.to_raw_uid(0), iid="Wrong Trousers, The (1993)")

### Generating top n recommendations

In [0]:
# Candidate pairs for recommendation, built from the trainset's anti-testset.
# Presumably these are the (user, item) pairs with no rating in the trainset -- confirm
# against the Surprise docs. This list is very large (see its length below).
testset_new = trainset.build_anti_testset()

In [0]:
# Number of candidate (user, item) pairs
len(testset_new)

17308818

In [0]:
# First five candidate tuples: (raw user id, raw item id, placeholder rating)
testset_new[0:5]

[('248', 'Disturbia (2007)', 3.511396303620614),
 ('248', 'Hamlet 2 (2008)', 3.511396303620614),
 ('248', 'Unbreakable (2000)', 3.511396303620614),
 ('248', 'Finding Neverland (2004)', 3.511396303620614),
 ('248', 'X2: X-Men United (2003)', 3.511396303620614)]

In [0]:
# Score only the first 10,000 candidate pairs to keep the run short; recommendations
# are therefore produced only for the users that appear in this slice.
predictions = algo.test(testset_new[0:10000])

In [0]:
predictions_df = pd.DataFrame([[x.uid,x.est] for x in predictions])

In [0]:
predictions_df.columns = ["userId","est_rating"]
predictions_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)

In [0]:
# Inspect the top of the sorted predictions
predictions_df.head()

Unnamed: 0,userId,est_rating
8040,45844,5.0
8348,45844,5.0
8394,45844,5.0
9039,45844,5.0
9147,45844,5.0


In [0]:
# predictions_df is already sorted by estimated rating (descending) within each
# user, so the first 10 rows of every user group are that user's top estimates.
per_user = predictions_df.groupby("userId")
top_10_recos = per_user.head(10).reset_index(drop=True)

## SVD Based Recommendation

In [0]:
# Keep only popular movies: titles with MORE than MIN_RATINGS ratings.
# (The earlier comment said "less than 5", which did not match the threshold used.)
MIN_RATINGS = 200
movie_count = ratings["title"].value_counts(ascending=False)
pop_movie = movie_count[movie_count > MIN_RATINGS].index
len(pop_movie)


567

In [0]:
# Restrict the ratings table to the popular titles selected above
is_popular = ratings["title"].isin(pop_movie)
ratings = ratings.loc[is_popular]
ratings.shape

(350710, 7)

In [0]:
from surprise import Dataset,Reader
# Ratings in this data go down to 0.5 (half-star scale), so the scale must start
# at 0.5. Surprise clips estimates to this range, and a lower bound of 1 would
# make the lowest ratings unreachable.
reader = Reader(rating_scale=(0.5, 5))
# Rebuild the Surprise dataset from the filtered (popular movies only) ratings
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [0]:
# Shape of the ratings table the dataset above was built from
ratings.shape

(350710, 7)

In [0]:
# 75/25 train/test split of the filtered data, seeded so the split is repeatable
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, random_state=123, test_size=.25)

# Alternative: fit on every rating instead of holding any out
#trainset = data.build_full_trainset()

In [0]:
from surprise import SVD
from surprise import accuracy

In [0]:
# Unbiased SVD (biased=False): the estimate is just the dot product of the user
# and item factor vectors, which the matrix-multiplication cells below rely on.
# The factors are initialised randomly, so fix the seed to make the RMSE and the
# factor values reproducible across runs.
svd_model = SVD(n_factors=50, biased=False, random_state=123)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ce39a132b0>

In [0]:
# Predict every held-out rating with the fitted SVD model
test_pred = svd_model.test(testset)

In [0]:

# Compute RMSE of the SVD predictions on the held-out test set
accuracy.rmse(test_pred)

RMSE: 0.7748


0.7747645710104428

<b> In class assignment </b>

What is the impact of changing the number of SVD components on model accuracy?

<b> Recreating the SVD predictions using matrix multiplication of the user and item factors </b>

In [0]:
# Pull the learned latent-factor matrices out of the fitted model:
# `pu` holds one row per user, `qi` one row per item.
user_factors = svd_model.pu
item_factors = svd_model.qi
user_factors.shape
item_factors.shape

(2827, 50)

(567, 50)

In [0]:
# Full user x item score matrix: row u, column i is the dot product of
# user u's factors with item i's factors.
pred = user_factors @ item_factors.T

In [0]:
# Reconstructed scores for inner user 1523 against inner items 0 to 4
pred[1523,0:5]

array([4.01220552, 2.86285687, 4.27408716, 4.06345127, 3.81461572])

In [0]:
# The model's own estimate for the same pair (inner user 1523, inner item 0),
# looked up through the raw ids; it should agree with pred[1523, 0].
svd_model.predict(uid = trainset.to_raw_uid(1523), iid = trainset.to_raw_iid(0))

Prediction(uid='19573', iid='Shanghai Noon (2000)', r_ui=None, est=4.012205521710225, details={'was_impossible': False})

<b> Parameter tuning of SVD Recommendation system </b>

In [0]:
from surprise.model_selection import GridSearchCV
# 3 x 2 = 6 candidate settings, each scored by 3-fold cross-validated RMSE.
# refit=True asks for the best setting to be refitted afterwards, so that the
# search object can be used for predictions.
param_grid = {
    'n_factors': [5, 10, 15],
    "reg_all": [0.01, 0.02],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, refit=True)


In [0]:
# Run the search on the full (filtered) dataset
gs.fit(data)

In [0]:
# List every parameter combination that was evaluated
gs.param_combinations

[{'n_factors': 5, 'reg_all': 0.01},
 {'n_factors': 5, 'reg_all': 0.02},
 {'n_factors': 10, 'reg_all': 0.01},
 {'n_factors': 10, 'reg_all': 0.02},
 {'n_factors': 15, 'reg_all': 0.01},
 {'n_factors': 15, 'reg_all': 0.02}]

In [0]:
# Best parameters found, keyed by the measure name ('rmse')
gs.best_params

{'rmse': {'n_factors': 15, 'reg_all': 0.01}}

In [0]:
# Use the "best model" for prediction and summarise it with RMSE instead of
# echoing the whole list of predictions.
# CAUTION: the search was fitted on `data`, which contains `testset`, and
# refit=True retrains the best model on that whole dataset. The model has already
# seen these ratings, so this RMSE is optimistic and is not a true hold-out score.
best_pred = gs.test(testset)
accuracy.rmse(best_pred)

<b> Computing Similarity Matrix </b>

In [0]:
import numpy as np

In [0]:
# Item factor matrix taken from svd_model.qi earlier in the notebook
item_factors

In [0]:
# Correlation between every pair of rows of item_factors (rows are treated as variables)
item_sim = np.corrcoef(item_factors)
# Negate so that an ascending argsort lists, for each row, the column indices
# from most to least correlated
max_val = np.argsort(-item_sim, axis=1)

In [0]:
# Keep the 20 best-ranked indices per item.
# NOTE(review): the first column is presumably the item itself (self-correlation 1) -- confirm.
top_indices = max_val[:, :20]
topk = pd.DataFrame(top_indices)

In [0]:
# Create the inner item id -> raw item id (movie title) dictionary.
# The item count is taken from the trainset rather than hard-coded, so the
# mapping stays correct if the popularity filter or the split changes.
n_items = trainset.n_items
all_movies = [trainset.to_raw_iid(inner_iid) for inner_iid in range(n_items)]
movie_iid_dict = dict(enumerate(all_movies))

In [0]:
# Translate every inner item id in the table into its movie title
topk = topk.apply(lambda column: column.map(movie_iid_dict))

In [0]:
# Label each row with its own movie title (all_movies is ordered by inner item id)
topk["movie"] = all_movies

In [0]:
# Save the similar-movies table to the working directory, without the row index
topk.to_csv("sim_movies_svd.csv",index=False)