# Рекомендации на основе содержания

1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
* TF-IDF на тегах и жанрах
* Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [None]:
#tags[tags['movieId']==34]

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies.shape

(9742, 3)

In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
tags.shape

(3683, 4)

In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
ratings.shape

(100836, 4)

In [10]:
movies['genres_new'] = movies['genres'].str.replace('-', ' ').str.replace(' ', '')

In [11]:
movies['year'] = movies['title'].str.slice(start=-5, stop=-1)

In [12]:
movies = movies.drop(['genres'], axis=1)

In [13]:
text_genres = movies.genres_new.values
text_genres[:5]

array(['Adventure|Animation|Children|Comedy|Fantasy',
       'Adventure|Children|Fantasy', 'Comedy|Romance',
       'Comedy|Drama|Romance', 'Comedy'], dtype=object)

In [14]:
vectorizer = TfidfVectorizer()

In [15]:
X_genres = vectorizer.fit_transform(text_genres)

In [16]:
vectorizer.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 15,
 'drama': 7,
 'action': 0,
 'crime': 5,
 'thriller': 17,
 'horror': 10,
 'mystery': 13,
 'scifi': 16,
 'war': 18,
 'musical': 12,
 'documentary': 6,
 'imax': 11,
 'western': 19,
 'filmnoir': 9,
 'nogenreslisted': 14}

In [17]:
vectorizer.idf_

array([2.67277971, 3.04226778, 3.76757207, 3.68451732, 1.95292831,
       3.09422752, 4.09525948, 1.80361841, 3.52501044, 5.70696754,
       3.29777271, 5.11540016, 4.37017383, 3.83167496, 6.6289563 ,
       2.80842221, 3.2957319 , 2.63733024, 4.23626937, 5.06034038])

In [18]:
print(X_genres.shape)
print(X_genres.toarray())

(9742, 20)
[[0.         0.41684567 0.51622547 ... 0.         0.         0.        ]
 [0.         0.51236121 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.57860574 0.         0.81560738 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [19]:
vectorizer.get_feature_names()

['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'filmnoir',
 'horror',
 'imax',
 'musical',
 'mystery',
 'nogenreslisted',
 'romance',
 'scifi',
 'thriller',
 'war',
 'western']

In [20]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [21]:
tags[tags['movieId']==60756]

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
205,62,60756,comedy,1528934384
206,62,60756,funny,1528934381
207,62,60756,will ferrell,1528934379
909,424,60756,funny,1457846127
910,424,60756,will ferrell,1457846129


In [None]:
#tags['tag'] = tags['tag'].str.replace('-', ' ').str.replace(' ', '')

In [22]:
tags = tags[['movieId','tag']].drop_duplicates()

In [23]:
tags['tag_new'] = tags[['movieId', 'tag']].groupby(['movieId'])['tag'].transform(lambda x: ' '.join(x))

In [24]:
tags = tags.drop(columns='tag')

In [25]:
tags = tags.drop_duplicates()

In [26]:
tags.head()

Unnamed: 0,movieId,tag_new
0,60756,funny Highly quotable will ferrell comedy
3,89774,Boxing story MMA Tom Hardy
6,106782,drugs Leonardo DiCaprio Martin Scorsese Stock ...
9,48516,way too long Leonardo DiCaprio suspense twist ...
10,431,Al Pacino gangster mafia


In [27]:
tags.shape

(1572, 2)

In [28]:
movies['tags_for_tfidf'] = movies.apply(lambda r: ' '.join(tags[tags['movieId']==r['movieId']]['tag_new'].values), axis=1)

In [29]:
movies.head()

Unnamed: 0,movieId,title,genres_new,year,tags_for_tfidf
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,pixar fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,moldy old
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,
4,5,Father of the Bride Part II (1995),Comedy,1995,pregnancy remake


In [30]:
tag_strings = movies['tags_for_tfidf'].values
tag_strings[:5]

array(['pixar fun', 'fantasy magic board game Robin Williams game',
       'moldy old', '', 'pregnancy remake'], dtype=object)

In [31]:
vectorizer_tags = TfidfVectorizer()

In [32]:
X_tags = vectorizer_tags.fit_transform(tag_strings)

In [33]:
vectorizer_tags.vocabulary_

{'pixar': 1186,
 'fun': 609,
 'fantasy': 548,
 'magic': 952,
 'board': 187,
 'game': 617,
 'robin': 1307,
 'williams': 1714,
 'moldy': 1023,
 'old': 1120,
 'pregnancy': 1216,
 'remake': 1279,
 'politics': 1200,
 'president': 1219,
 'mafia': 950,
 'jane': 837,
 'austen': 116,
 'hollywood': 736,
 'serial': 1379,
 'killer': 879,
 'alcoholism': 50,
 'shakespeare': 1389,
 'in': 785,
 'netflix': 1083,
 'queue': 1245,
 'kidnapping': 877,
 'high': 719,
 'school': 1353,
 'teacher': 1537,
 'time': 1578,
 'travel': 1604,
 'brad': 204,
 'pitt': 1185,
 'bruce': 221,
 'willis': 1715,
 'mindfuck': 1011,
 'post': 1207,
 'apocalyptic': 88,
 'twist': 1621,
 'ending': 507,
 'animal': 77,
 'movie': 1045,
 'pigs': 1183,
 'villain': 1664,
 'nonexistent': 1099,
 'or': 1129,
 'not': 1106,
 'needed': 1075,
 'for': 586,
 'good': 649,
 'story': 1480,
 'death': 414,
 'penalty': 1167,
 'nun': 1110,
 'twins': 1620,
 'chick': 283,
 'flick': 582,
 'funny': 610,
 'paul': 1162,
 'rudd': 1322,
 'quotable': 1249,
 'seen'

In [34]:
print(X_tags.shape)
print(X_tags.toarray())

(9742, 1744)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [35]:
X_tags = X_tags.toarray()[:, :400]
X_tags.shape

(9742, 400)

In [36]:
movies[movies['movieId']==2]

Unnamed: 0,movieId,title,genres_new,year,tags_for_tfidf
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,fantasy magic board game Robin Williams game


In [37]:
# tags[tags['tag_new'].str.contains('villainnonexistentornotneededforgoodstory')]

In [38]:
X_genres_tfidf = pd.DataFrame(X_genres.toarray(), columns=vectorizer.get_feature_names())
X_genres_tfidf.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
X_tags_tfidf = pd.DataFrame(X_tags, columns=vectorizer_tags.get_feature_names()[:400])
X_tags_tfidf.shape

(9742, 400)

In [None]:
#X_tags_tfidf = pd.DataFrame(X_tags.toarray(), columns=vectorizer_tags.get_feature_names())
#X_tags_tfidf.shape

In [40]:
movies1 = movies.join(X_genres_tfidf, how='left')
movies1.head()

Unnamed: 0,movieId,title,genres_new,year,tags_for_tfidf,action,adventure,animation,children,comedy,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,pixar fun,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,fantasy magic board game Robin Williams game,0.0,0.512361,0.0,0.620525,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,moldy old,0.0,0.0,0.0,0.0,0.570915,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,,0.0,0.0,0.0,0.0,0.505015,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,1995,pregnancy remake,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
movies_tfidf = movies1.join(X_tags_tfidf, how='left', lsuffix='t')
movies_tfidf.head()

Unnamed: 0,movieId,title,genres_new,year,tags_for_tfidf,actiont,adventuret,animationt,childrent,comedyt,...,cusack,cuts,cyberpunk,cyborg,cynical,dahl,damon,dan,dance,dancing
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,pixar fun,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,fantasy magic board game Robin Williams game,0.0,0.512361,0.0,0.620525,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,moldy old,0.0,0.0,0.0,0.0,0.570915,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,,0.0,0.0,0.0,0.0,0.505015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,1995,pregnancy remake,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
movies_tfidf.shape

(9742, 425)

In [None]:
#movies_tfidf[movies_tfidf['zooey']!=0]

In [None]:
# movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId', how='left')
# movies_with_tags.head()

In [None]:
# movies_with_tags.shape

In [None]:
#movies_with_tags['tag_new'].fillna(0,inplace=True)

In [None]:
#tag_strings = tags['tag_new'].values
#tag_strings[-5:]

In [None]:
#vectorizer_tags = TfidfVectorizer()

In [None]:
#X_tags = vectorizer_tags.fit_transform(tag_strings)

In [None]:
#vectorizer_tags.vocabulary_

In [None]:
#vectorizer_tags.idf_

In [None]:
#print(X_tags.shape)
#print(X_tags.toarray())

In [43]:
ratings.groupby('userId').count().sort_values('rating', ascending=False).head(5)

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
414,2698,2698,2698
599,2478,2478,2478
474,2108,2108,2108
448,1864,1864,1864
274,1346,1346,1346


In [44]:
User = 448

In [45]:
ratings_with_mean = ratings.groupby('movieId')['rating'].agg(['mean', 'var', 'count']).reset_index()
ratings_with_mean=ratings_with_mean.fillna(0)
ratings_with_mean.head()

Unnamed: 0,movieId,mean,var,count
0,1,3.92093,0.69699,215
1,2,3.431818,0.777419,110
2,3,3.259615,1.112651,52
3,4,2.357143,0.72619,7
4,5,3.071429,0.822917,49


In [46]:
ratings_with_mean[ratings_with_mean.movieId ==138610]

Unnamed: 0,movieId,mean,var,count
8963,138610,1.5,0.0,1


In [47]:
ratings[ratings.movieId ==138610]

Unnamed: 0,userId,movieId,rating,timestamp
100778,610,138610,1.5,1493848976


In [48]:
ratings_with_mean = ratings.merge(ratings_with_mean)
ratings_with_mean.head()

Unnamed: 0,userId,movieId,rating,timestamp,mean,var,count
0,1,1,4.0,964982703,3.92093,0.69699,215
1,5,1,4.0,847434962,3.92093,0.69699,215
2,7,1,4.5,1106635946,3.92093,0.69699,215
3,15,1,2.5,1510577970,3.92093,0.69699,215
4,17,1,4.5,1305696483,3.92093,0.69699,215


In [49]:
df_finish = ratings_with_mean.merge(movies_tfidf)
df_finish.head()

Unnamed: 0,userId,movieId,rating,timestamp,mean,var,count,title,genres_new,year,...,cusack,cuts,cyberpunk,cyborg,cynical,dahl,damon,dan,dance,dancing
0,1,1,4.0,964982703,3.92093,0.69699,215,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,1,4.0,847434962,3.92093,0.69699,215,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,1,4.5,1106635946,3.92093,0.69699,215,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15,1,2.5,1510577970,3.92093,0.69699,215,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,1,4.5,1305696483,3.92093,0.69699,215,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
df_for_user = df_finish[df_finish['userId']==User]
df_for_user.head()

Unnamed: 0,userId,movieId,rating,timestamp,mean,var,count,title,genres_new,year,...,cusack,cuts,cyberpunk,cyborg,cynical,dahl,damon,dan,dance,dancing
154,448,1,5.0,1019126661,3.92093,0.69699,215,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
252,448,3,3.0,1019128536,3.259615,1.112651,52,Grumpier Old Men (1995),Comedy|Romance,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
526,448,47,4.0,1019132386,3.975369,0.850875,203,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
729,448,50,4.0,1064741727,4.237745,0.641475,204,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1995,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
843,448,101,3.5,1076328586,3.782609,1.086957,23,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance,1996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
df_for_user = df_for_user.drop(columns=['title', 'genres_new', 'timestamp', 'tags_for_tfidf'])

In [52]:
df_for_user.loc[:, df_for_user.dtypes == object].head()

Unnamed: 0,year
154,1995
252,1995
526,1995
729,1995
843,1996


In [53]:
y = df_for_user['rating']
y.shape

(1864,)

In [54]:
X = df_for_user.drop(columns=['rating','userId', 'movieId'])
X.shape

(1864, 424)

In [55]:
X_dummies = pd.get_dummies(X, columns=['year'])

In [56]:
X_dummies.head()

Unnamed: 0,mean,var,count,actiont,adventuret,animationt,childrent,comedyt,crimet,documentary,...,year_2008,year_2009,year_2010,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017
154,3.92093,0.69699,215,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
252,3.259615,1.112651,52,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
526,3.975369,0.850875,203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
729,4.237745,0.641475,204,0.0,0.0,0.0,0.0,0.0,0.553854,0.0,...,0,0,0,0,0,0,0,0,0,0
843,3.782609,1.086957,23,0.0,0.55059,0.0,0.0,0.353441,0.559994,0.0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
from sklearn.model_selection import train_test_split

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, test_size=0.2)

In [59]:
from sklearn.preprocessing import StandardScaler

In [60]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [61]:
from sklearn.ensemble import RandomForestRegressor

In [62]:
model = RandomForestRegressor(max_depth=5)

In [63]:
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=5)

In [64]:
y_pred = model.predict(X_test)

In [65]:
from sklearn.metrics import r2_score, mean_squared_error

In [66]:
r2_score(y_test, y_pred)

0.5730453119144072

In [67]:
mean_squared_error(y_test, y_pred)

0.4254448539996861

In [68]:
from sklearn.model_selection import GridSearchCV

In [69]:
grid_params = {'max_depth': list(range(1, 20)), 'n_estimators': [30, 50, 70, 100, 120]}

In [70]:
forest = GridSearchCV(RandomForestRegressor(), param_grid=grid_params, cv=10, scoring = 'r2')
forest.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19],
                         'n_estimators': [30, 50, 70, 100, 120]},
             scoring='r2')

In [71]:
y_pred = forest.best_estimator_.predict(X_test)

In [75]:
forest.best_params_

{'max_depth': 8, 'n_estimators': 70}

In [72]:
r2_score(y_test, y_pred)

0.5812002008531747

In [74]:
mean_squared_error(y_test, y_pred)**0.5

0.6460021695662642

In [76]:
from lightgbm import LGBMRegressor

In [77]:
lgbm = LGBMRegressor()

In [78]:
lgbm.fit(X_train, y_train)

LGBMRegressor()

In [79]:
y_pred = lgbm.predict(X_test)

In [80]:
r2_score(y_test, y_pred)

0.570248472117419

In [81]:
mean_squared_error(y_test, y_pred)**0.5

0.6543942262657482