In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
movies = pd.read_csv('./netology-recsys-master/lecture-1/movies.csv')

In [3]:
tags = pd.read_csv('./netology-recsys-master/lecture-1/tags.csv')

In [4]:
ratings = pd.read_csv('./netology-recsys-master/lecture-1/ratings.csv')

In [5]:
# закинем все тэги в одну строку
tags_grpd = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x))

In [6]:
# выделим уникальные movieId для тэгов
tags_unique = pd.DataFrame(tags['movieId'].unique(), columns=['movieId'])

In [7]:
tags_grpd_df = pd.merge(tags_unique, tags_grpd, on='movieId')

In [8]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

### Оформим tf-idf для тэгов

In [9]:
cv = CountVectorizer()
cv_tag = cv.fit_transform(tags_grpd_df['tag'])

In [10]:
tft = TfidfTransformer()
tft_tag = tft.fit_transform(cv_tag).todense()

In [11]:
cols = [None for _ in range(len(cv.vocabulary_))]
for i in cv.vocabulary_:
    cols[cv.vocabulary_[i]] = i

In [12]:
tf_tag_df = pd.DataFrame(tft_tag, columns=cols)

In [13]:
tf_tag_df = pd.concat((tags_unique, tf_tag_df), axis=1)

In [14]:
movies['genres_split'] = movies.genres.str.split('|')

In [15]:
movies['genres_space'] = movies.apply(lambda r: ' '.join(r['genres_split']), axis=1)

In [16]:
cv = CountVectorizer()
cv_genre = cv.fit_transform(movies['genres_space'])

In [17]:
tft = TfidfTransformer()
tft_genre = tft.fit_transform(cv_genre).todense()

In [38]:
cols = [None for _ in range(len(cv.vocabulary_))]
for i in cv.vocabulary_:
    cols[cv.vocabulary_[i]] = i

### Очередь tf-idf для жанров

In [39]:
genre_unique = pd.DataFrame(movies['movieId'].unique(), columns=['movieId'])

In [40]:
tf_genre_df = pd.DataFrame(tft_genre, columns=cols)
tf_genre_df = pd.concat((genre_unique, tf_genre_df), axis=1)

In [41]:
tfs = tf_genre_df.join(tf_tag_df, on = ['movieId'], lsuffix = '_genre', rsuffix = '_tag')

In [42]:
rates_cnt = ratings.groupby('movieId')[['userId']].count()

In [23]:
rates_mean = ratings.groupby('movieId')[['rating']].mean()

In [24]:
rates_mrg = pd.merge(rates_cnt, rates_mean, on = 'movieId')

In [25]:
rate_users = len(ratings['userId'].unique())
rates_mrg['rateing_weighted'] = rates_mrg['userId']*rates_mrg['rating']/rate_users

In [26]:
rates_mrg.columns = ['users', 'rating', 'rating_weighted']

In [27]:
# схлопываем все в одну таблицу
df_fin = pd.merge(rates_mrg, tfs, left_on = 'movieId', right_on = 'movieId_genre')

In [28]:
df_fin = df_fin.fillna(df_fin.mean())

### Строим саму модель

In [29]:
# подготовим список переменных для регрессии
cols_reg = list(df_fin.columns)
cols_reg.remove('users')
cols_reg.remove('movieId_genre')
cols_reg.remove('movieId_tag')
cols_reg.remove('rating')

In [30]:
X, y = df_fin[cols_reg], df_fin['rating']

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)

In [33]:
# строим случайный лес

model = RandomForestRegressor()
model.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [34]:
# смотрим на результат

print('R^2_train =', model.score(X_train, y_train))
print('R^2_test =', model.score(X_test, y_test))
print('MAE_train =', mean_absolute_error(model.predict(X_train), y_train))
print('MAE_test =', mean_absolute_error(model.predict(X_test), y_test))
print('MSE_train =', mean_squared_error(model.predict(X_train), y_train))
print('MSE_test =', mean_squared_error(model.predict(X_test), y_test))

R^2_train = 0.8075972675922378
R^2_test = 0.5325848256097261
MAE_train = 0.20996729409425072
MAE_test = 0.37674112077681904
MSE_train = 0.14611597750403124
MSE_test = 0.3482934418330896


In [36]:
model.predict(X_test)

array([1.5       , 3.25388391, 1.        , ..., 3.65916667, 3.05740741,
       2.49375   ])

In [37]:
X_train

Unnamed: 0,rating_weighted,action_genre,adventure_genre,animation_genre,children_genre,comedy_genre,crime_genre,documentary_genre,drama_genre,fantasy_genre,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
8595,0.004918,0.000000,0.0,0.0,0.0,0.000000,0.527067,0.0,0.307226,0.000000,...,0.003937,0.0,0.0,0.000629,0.000383,0.000424,0.00043,0.0012,0.003393,0.000331
9492,0.006557,0.000000,0.0,0.0,0.0,0.478734,0.758507,0.0,0.442132,0.000000,...,0.003937,0.0,0.0,0.000629,0.000383,0.000424,0.00043,0.0012,0.003393,0.000331
4826,0.006557,0.000000,0.0,0.0,0.0,0.000000,0.321186,0.0,0.187219,0.000000,...,0.003937,0.0,0.0,0.000629,0.000383,0.000424,0.00043,0.0012,0.003393,0.000331
2712,0.011475,0.502044,0.0,0.0,0.0,0.000000,0.000000,0.0,0.338784,0.000000,...,0.003937,0.0,0.0,0.000629,0.000383,0.000424,0.00043,0.0012,0.003393,0.000331
4914,0.038525,0.000000,0.0,0.0,0.0,0.397590,0.000000,0.0,0.000000,0.717646,...,0.003937,0.0,0.0,0.000629,0.000383,0.000424,0.00043,0.0012,0.003393,0.000331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,0.036885,0.000000,0.0,0.0,0.0,0.734632,0.000000,0.0,0.678466,0.000000,...,0.003937,0.0,0.0,0.000629,0.000383,0.000424,0.00043,0.0012,0.003393,0.000331
189,0.098361,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.540377,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.0000,0.000000,0.000000
8606,0.006557,0.000000,0.0,0.0,0.0,0.734632,0.000000,0.0,0.678466,0.000000,...,0.003937,0.0,0.0,0.000629,0.000383,0.000424,0.00043,0.0012,0.003393,0.000331
4313,0.004098,0.000000,0.0,0.0,0.0,0.734632,0.000000,0.0,0.678466,0.000000,...,0.003937,0.0,0.0,0.000629,0.000383,0.000424,0.00043,0.0012,0.003393,0.000331
