In [13]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split

In [14]:
root = '../HW_Datasets/movie_lens/'

tags_path = 'tags.csv'
movies_path = 'movies.csv'
ratings_path = 'ratings.csv'

In [15]:
tags = pd.read_csv(root + tags_path).drop('timestamp', axis=1)
movies = pd.read_csv(root + movies_path)
ratings = pd.read_csv(root + ratings_path).drop('timestamp', axis=1)

## Get all data in one table

In [16]:
temp = pd.merge(
    left=ratings, 
    right=tags,
    how='left',
    left_on=['userId', 'movieId'],
    right_on=['userId', 'movieId']
)

In [17]:
temp1 = pd.merge(
    left = temp,
    right = movies,
    how='left',
    left_on='movieId',
    right_on='movieId'
).sort_values('userId').dropna()

In [18]:
temp1

Unnamed: 0,userId,movieId,rating,tag,title,genres
252,2,89774,5.0,Boxing story,Warrior (2011),Drama
253,2,89774,5.0,MMA,Warrior (2011),Drama
254,2,89774,5.0,Tom Hardy,Warrior (2011),Drama
259,2,106782,5.0,Leonardo DiCaprio,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama
260,2,106782,5.0,Martin Scorsese,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama
...,...,...,...,...,...,...
99749,606,3578,3.5,Romans,Gladiator (2000),Action|Adventure|Drama
99522,606,1948,3.5,British,Tom Jones (1963),Adventure|Comedy|Romance
102673,610,168248,5.0,Heroic Bloodshed,John Wick: Chapter Two (2017),Action|Crime|Thriller
101553,610,3265,5.0,gun fu,Hard-Boiled (Lat sau san taam) (1992),Action|Crime|Drama|Thriller


## Global preprocessing

In [19]:
temp1['genres'] = temp1['genres'].apply(lambda x: ' '.join(x.lower().split('|')))

d = {}

for movie, group in temp1.groupby('movieId'):
    d[movie] = group.rating.values.mean()
    
temp1['mean_rating'] = [d[i] for i in temp1['movieId']]

In [26]:
users = []
movies = []
user_mean_ratings = []
user_act_ratings = []
movie_ratings = []
user_tags = []
genres = []
new_names = ['userId', 'movieId', 'user_mean_rate', 'user_act_rate', 'movie_mean_rate', 'user_tags', 'genres']

 # Creating a final dataset with all values
for user, user_group in temp1.groupby('userId'):
    for movie, movie_group in user_group.groupby('movieId'):
        users.append(user)
        movies.append(movie)
        user_mean_ratings.append(user_group.rating.mean())
        user_act_ratings.append(movie_group.rating.values[0])
        movie_ratings.append(d[movie_group.movieId.values[0]])
        user_tags.append(' '.join(i.replace(' ', '').replace('-', '') for i in movie_group.tag.values).lower())
        genres.append(movie_group.genres.values[0])

In [27]:
fin = pd.DataFrame([users, movies, user_mean_ratings, user_act_ratings, movie_ratings, user_tags, genres]).T

fin = fin.rename(columns={old:new for old, new in zip(fin.columns, new_names)})

In [28]:
fin

Unnamed: 0,userId,movieId,user_mean_rate,user_act_rate,movie_mean_rate,user_tags,genres
0,2,60756,5.0,5.0,4.1875,funny highlyquotable willferrell,comedy
1,2,89774,5.0,5.0,5.0,boxingstory mma tomhardy,drama
2,2,106782,5.0,5.0,5.0,leonardodicaprio martinscorsese drugs,comedy crime drama
3,7,48516,1.0,1.0,3.95,waytoolong,crime drama thriller
4,18,431,4.125,4.0,4.0,mafia gangster alpacino,crime drama
...,...,...,...,...,...,...,...
1630,606,5694,3.75,3.0,3.0,70mm,comedy drama musical
1631,606,6107,3.75,4.0,4.0,worldwarii,drama war
1632,606,7382,3.75,4.5,4.166667,forkatie,drama mystery thriller
1633,610,3265,5.0,5.0,5.0,gunfu heroicbloodshed,action crime drama thriller


### Adding tags-genres features

In [29]:
words = (fin['user_tags'].values + fin['genres'].values)

In [30]:
cv = CountVectorizer()
cv.fit(words)
words_counts = cv.transform(words)

In [31]:
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(words_counts)

In [32]:
tfidf = pd.DataFrame.sparse.from_spmatrix(tfidf)

### Separate onto X and y data
### Scale numeric data in X dataframe

In [38]:
X = pd.concat([fin, tfidf], axis=1).drop(['user_tags', 'genres', 'userId', 'movieId', 'user_act_rate'], axis=1)
X['movie_mean_rate'] = minmax_scale(X['movie_mean_rate'])
X['user_mean_rate'] = minmax_scale(X['user_mean_rate'])
X = X.rename(columns={'user_mean_rate':-2, 'movie_mean_rate':-1}) # rename so model wont show us errors

y = fin['user_act_rate']

In [41]:
X

Unnamed: 0,-2,-1,0,1,2,3,4,5,6,7,...,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038
0,1.000000,0.819444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.000000,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.000000,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.111111,0.766667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.805556,0.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630,0.722222,0.555556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1631,0.722222,0.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1632,0.722222,0.814815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1633,1.000000,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train-test split

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=15)

In [43]:
X_train

Unnamed: 0,-2,-1,0,1,2,3,4,5,6,7,...,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038
680,0.711535,0.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
617,0.711535,0.888889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
897,0.711535,0.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60,0.763964,0.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328,0.711535,0.984127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.711535,0.444444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156,0.793210,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
384,0.711535,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
645,0.711535,0.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Models fitting and evaluation

In [44]:
from sklearn.ensemble import GradientBoostingRegressor as GBR

In [46]:
from sklearn.metrics import mean_squared_error as MSE #Слишком старый у меня склеарн

In [47]:
regressor = GBR()

In [48]:
regressor.fit(X_train, y_train)



In [49]:
yhat = regressor.predict(X_test)



In [50]:
mse = MSE(y_test, yhat)
rmse = np.sqrt(mse)

In [51]:
print(f'Final RMSE metric on test: {rmse}')

Final RMSE metric on test: 0.2850929453464323
