# Ranking/Recommendation on small movielens dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
movie_data = pd.read_csv('mlens/movies.csv')
print(movie_data.shape)
movie_data.head()

(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
rating_data = pd.read_csv('mlens/ratings.csv')
print(rating_data.shape)
rating_data.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
combined_df = rating_data.merge(movie_data, on='movieId', how='left')
print(combined_df.shape)
combined_df.head(10)

(100836, 6)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
5,1,70,3.0,964982400,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller
6,1,101,5.0,964980868,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance
7,1,110,4.0,964982176,Braveheart (1995),Action|Drama|War
8,1,151,5.0,964984041,Rob Roy (1995),Action|Drama|Romance|War
9,1,157,5.0,964984100,Canadian Bacon (1995),Comedy|War


In [5]:
final_df = combined_df[['userId', 'movieId', 'genres', 'rating']]
final_df['keys'] = final_df['genres'].apply(lambda x: ' '.join(x.lower().split('|')))
final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,userId,movieId,genres,rating,keys
0,1,1,Adventure|Animation|Children|Comedy|Fantasy,4.0,adventure animation children comedy fantasy
1,1,3,Comedy|Romance,4.0,comedy romance
2,1,6,Action|Crime|Thriller,4.0,action crime thriller
3,1,47,Mystery|Thriller,5.0,mystery thriller
4,1,50,Crime|Mystery|Thriller,5.0,crime mystery thriller


In [6]:
final_df.loc[final_df['movieId'] == 49]

Unnamed: 0,userId,movieId,genres,rating,keys
29386,202,49,Drama|Romance,3.0,drama romance


In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
out_mat = cv.fit_transform(final_df['keys'])
out_mat.shape

(100836, 24)

In [8]:
import scipy as sp
comb = sp.sparse.hstack((out_mat, final_df[['userId', 'movieId']].values))
comb.shape

(100836, 26)

In [9]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(comb, final_df[['rating']], test_size=0.25)
xtrain.shape, ytrain.shape

((75627, 26), (75627, 1))

In [10]:
dt = DecisionTreeRegressor(max_depth=15)
dt.fit(xtrain, ytrain)

DecisionTreeRegressor(max_depth=15)

In [11]:
from sklearn.metrics import mean_squared_error
ypred = dt.predict(xtest)
mean_squared_error(ytest, ypred)


1.0165440701255322

In [12]:
test_mat = out_mat[29386]

In [13]:
test_comb = sp.sparse.hstack((test_mat, final_df.loc[final_df['movieId'] == 49][['userId', 'movieId']]))

In [14]:
dt.predict(test_comb)

array([3.54028698])

Can expect user 1 to predict movieId 49 as 3.67
# Ranking attempt

In [15]:
final_df.head()

Unnamed: 0,userId,movieId,genres,rating,keys
0,1,1,Adventure|Animation|Children|Comedy|Fantasy,4.0,adventure animation children comedy fantasy
1,1,3,Comedy|Romance,4.0,comedy romance
2,1,6,Action|Crime|Thriller,4.0,action crime thriller
3,1,47,Mystery|Thriller,5.0,mystery thriller
4,1,50,Crime|Mystery|Thriller,5.0,crime mystery thriller


In [16]:
final_df['keys'] = final_df['genres'].apply(lambda x: x.lower().split('|'))
final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,movieId,genres,rating,keys
0,1,1,Adventure|Animation|Children|Comedy|Fantasy,4.0,"[adventure, animation, children, comedy, fantasy]"
1,1,3,Comedy|Romance,4.0,"[comedy, romance]"
2,1,6,Action|Crime|Thriller,4.0,"[action, crime, thriller]"
3,1,47,Mystery|Thriller,5.0,"[mystery, thriller]"
4,1,50,Crime|Mystery|Thriller,5.0,"[crime, mystery, thriller]"


In [17]:
uniques = []
for i, row in final_df.iterrows():
    for j in row['keys']:
        uniques.append(j)
set(uniques)

{'(no genres listed)',
 'action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'film-noir',
 'horror',
 'imax',
 'musical',
 'mystery',
 'romance',
 'sci-fi',
 'thriller',
 'war',
 'western'}

In [18]:
tra_df = final_df.reindex(columns=['userId', 'movieId', 'rating'] + list(set(uniques)))

In [19]:
tra_df.head()

Unnamed: 0,userId,movieId,rating,romance,horror,crime,documentary,drama,fantasy,thriller,...,comedy,western,animation,(no genres listed),imax,musical,sci-fi,war,film-noir,children
0,1,1,4.0,,,,,,,,...,,,,,,,,,,
1,1,3,4.0,,,,,,,,...,,,,,,,,,,
2,1,6,4.0,,,,,,,,...,,,,,,,,,,
3,1,47,5.0,,,,,,,,...,,,,,,,,,,
4,1,50,5.0,,,,,,,,...,,,,,,,,,,


In [20]:
for i, row in final_df.iterrows():
    for j in row['keys']:
        tra_df.at[i, str(j)] = 1
tra_df.head()

Unnamed: 0,userId,movieId,rating,romance,horror,crime,documentary,drama,fantasy,thriller,...,comedy,western,animation,(no genres listed),imax,musical,sci-fi,war,film-noir,children
0,1,1,4.0,,,,,,1.0,,...,1.0,,1.0,,,,,,,1.0
1,1,3,4.0,1.0,,,,,,,...,1.0,,,,,,,,,
2,1,6,4.0,,,1.0,,,,1.0,...,,,,,,,,,,
3,1,47,5.0,,,,,,,1.0,...,,,,,,,,,,
4,1,50,5.0,,,1.0,,,,1.0,...,,,,,,,,,,


In [21]:
tra_df.fillna(0, inplace=True)
tra_df.head()

Unnamed: 0,userId,movieId,rating,romance,horror,crime,documentary,drama,fantasy,thriller,...,comedy,western,animation,(no genres listed),imax,musical,sci-fi,war,film-noir,children
0,1,1,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,3,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,47,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,50,5.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
import lightgbm as lgb
train_df = tra_df[:2 * tra_df.shape[0]//3]
test_df = tra_df[2 * tra_df.shape[0]//3:]
train_df.shape, test_df.shape

((67224, 23), (33612, 23))

In [23]:
qids_train = train_df.groupby('userId')['userId'].count().to_numpy()
x_train = train_df.drop(['userId', 'rating'], axis=1)
y_train = train_df['rating'].astype(int)

In [24]:
qids_test = test_df.groupby('userId')['userId'].count().to_numpy()
x_test = test_df.drop(['userId', 'rating'], axis=1)
y_test = test_df['rating'].astype(int)

In [25]:
model = lgb.LGBMRanker(objective='lambdarank', metric='ndcg')
model.fit(x_train, y_train, group=qids_train, eval_set=[(x_test, y_test)], eval_group=[qids_test], eval_at=10, verbose=10)



[10]	valid_0's ndcg@10: 0.661394
[20]	valid_0's ndcg@10: 0.660533
[30]	valid_0's ndcg@10: 0.661057
[40]	valid_0's ndcg@10: 0.660558
[50]	valid_0's ndcg@10: 0.66251
[60]	valid_0's ndcg@10: 0.661821
[70]	valid_0's ndcg@10: 0.660374
[80]	valid_0's ndcg@10: 0.66141
[90]	valid_0's ndcg@10: 0.660974
[100]	valid_0's ndcg@10: 0.660843


LGBMRanker(metric='ndcg', objective='lambdarank')

In [26]:
y_pred = model.predict(x_test)
y_pred.shape

(33612,)

In [27]:
x_test['pred_ranking'] = y_pred
x_test.sort_values('pred_ranking', ascending=False)

Unnamed: 0,movieId,romance,horror,crime,documentary,drama,fantasy,thriller,action,adventure,...,western,animation,(no genres listed),imax,musical,sci-fi,war,film-noir,children,pred_ranking
100789,141799,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.772527
79760,140523,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.772527
88119,144620,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.746027
70436,144620,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.746027
100804,149011,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.744346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98777,546,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-2.237362
92698,181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.320871
98705,181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.320871
97077,181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.320871


In [28]:
x_query = train_df.drop(['userId', 'rating'], axis=1).iloc[0].to_numpy().reshape(1, -1)
x_query

array([[1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0.,
        0., 0., 0., 0., 1.]])

In [29]:
model.predict(x_query)

array([0.16113497])

# Matrix Factorization attempt

In [56]:
rating_data_filt = rating_data.groupby('movieId').filter(lambda x: x['movieId'].count() >= 2)
rating_data_filt = rating_data_filt.groupby('userId').filter(lambda x: x['userId'].count() >= 5)
rating_data_filt.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [59]:
from surprise import Reader, Dataset, NMF, SVD
from surprise.model_selection import cross_validate, GridSearchCV

reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(rating_data_filt[['movieId', 'userId', 'rating']], reader)
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

In [60]:
param_grid = {'n_factors': [4, 10, 15, 20, 25]}
gs = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=5)
gs.fit(data)
gs.best_score['rmse']

0.91126809566844

In [61]:
gs.best_params['rmse']

{'n_factors': 15}

In [62]:
nmf = NMF(n_factors=15)
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1ed76142708>

In [63]:
preds = nmf.test(testset)

In [64]:
preds[:10]

[Prediction(uid=1, iid=2, r_ui=3.5110432282575212, est=4.203438251507705, details={'was_impossible': False}),
 Prediction(uid=1, iid=3, r_ui=3.5110432282575212, est=1.8162635525176203, details={'was_impossible': False}),
 Prediction(uid=1, iid=4, r_ui=3.5110432282575212, est=3.510630974197849, details={'was_impossible': False}),
 Prediction(uid=1, iid=6, r_ui=3.5110432282575212, est=3.9287586753984813, details={'was_impossible': False}),
 Prediction(uid=1, iid=8, r_ui=3.5110432282575212, est=4.012005694669968, details={'was_impossible': False}),
 Prediction(uid=1, iid=9, r_ui=3.5110432282575212, est=4.036510008093392, details={'was_impossible': False}),
 Prediction(uid=1, iid=10, r_ui=3.5110432282575212, est=3.525463089951031, details={'was_impossible': False}),
 Prediction(uid=1, iid=11, r_ui=3.5110432282575212, est=4.277004345903196, details={'was_impossible': False}),
 Prediction(uid=1, iid=12, r_ui=3.5110432282575212, est=4.805876355895821, details={'was_impossible': False}),
 Pred

In [82]:
from collections import defaultdict
def get_top_n(preds, userId, movie_df, rating_df, n=10):
    top_n = defaultdict(list)
    for uid, iid, r, est, _ in preds:
        top_n[uid].append((iid, est))
    for uid, ratings in top_n.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = ratings[:n]
    # hist data
    user_data = rating_df[rating_df.userId == (userId)]
    print(f'User {userId} has already rated {user_data.shape[0]} movies')

    preds_df = pd.DataFrame([(id, pair[0], pair[1]) for id, row in top_n.items() for pair in row], columns=['userId', 'movieId', 'pred_rating'])
    pred_usr = preds_df[preds_df['userId'] == (userId)].merge(movie_df, how='left', left_on='movieId', right_on='movieId')
    hist_usr = rating_df[rating_df.userId == (userId)].sort_values('rating', ascending=False).merge(movie_df, how='left', on='movieId')
    return hist_usr, pred_usr

hist_1, pred_1 = get_top_n(preds, 1, movie_data, rating_data)

User 1 has already rated 232 movies


In [66]:
hist_1.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,5060,5.0,964984002,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War
1,1,2872,5.0,964981680,Excalibur (1981),Adventure|Fantasy
2,1,1291,5.0,964981909,Indiana Jones and the Last Crusade (1989),Action|Adventure
3,1,1298,5.0,964984086,Pink Floyd: The Wall (1982),Drama|Musical
4,1,2948,5.0,964982191,From Russia with Love (1963),Action|Adventure|Thriller


In [67]:
pred_1.head(10)

Unnamed: 0,userId,movieId,pred_rating,title,genres
0,1,53,5.0,Lamerica (1994),Adventure|Drama
1,1,154,5.0,Beauty of the Day (Belle de jour) (1967),Drama
2,1,250,5.0,Heavyweights (Heavy Weights) (1995),Children|Comedy
3,1,429,5.0,Cabin Boy (1994),Comedy
4,1,452,5.0,Widows' Peak (1994),Drama
5,1,475,4.949152,In the Name of the Father (1993),Drama
6,1,543,4.906793,So I Married an Axe Murderer (1993),Comedy|Romance|Thriller
7,1,519,4.828402,RoboCop 3 (1993),Action|Crime|Drama|Sci-Fi|Thriller
8,1,586,4.826953,Home Alone (1990),Children|Comedy
9,1,441,4.817136,Dazed and Confused (1993),Comedy


In [73]:
param_grid = {'n_factors': [1, 3, 5, 7, 9, 15, 27]}
gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

In [74]:
print(gs.best_params)
print(gs.best_score)

{'rmse': {'n_factors': 20}}
{'rmse': 0.9131329410615603}


In [75]:
svd = SVD(n_factors=20)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ecc082ae08>

In [79]:
testset = trainset.build_anti_testset()
preds_svd = svd.test(testset)

In [84]:
hist_103_svd, preds_103_svd = get_top_n(preds_svd, 103, movie_data, rating_data)

User 103 has already rated 377 movies


In [85]:
preds_103_svd.head(10)

Unnamed: 0,userId,movieId,pred_rating,title,genres
0,103,53,4.653938,Lamerica (1994),Adventure|Drama
1,103,43,4.596248,Restoration (1995),Drama
2,103,12,4.303246,Dracula: Dead and Loving It (1995),Comedy|Horror
3,103,276,4.267126,Milk Money (1994),Comedy|Romance
4,103,452,4.265711,Widows' Peak (1994),Drama
5,103,523,4.215394,Ruby in Paradise (1993),Drama
6,103,337,4.209635,What's Eating Gilbert Grape (1993),Drama
7,103,584,4.195718,,
8,103,515,4.137509,"Remains of the Day, The (1993)",Drama|Romance
9,103,543,4.135358,So I Married an Axe Murderer (1993),Comedy|Romance|Thriller
