In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

In [21]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [22]:
data = pd.merge(ratings, movies, on='movieId', how='inner')

In [23]:
tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

In [24]:
data = pd.merge(data, tags_grouped, on='movieId', how='left')
data['tag'] = data['tag'].fillna('')

In [25]:
data['combined_features'] = data['tag'] + ' ' + data['genres']

In [26]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(data['combined_features'])

In [27]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,tag,combined_features
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar pixar fun,pixar pixar fun Adventure|Animation|Children|C...
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,moldy old,moldy old Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,,Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,mystery twist ending serial killer,mystery twist ending serial killer Mystery|Thr...
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,mindfuck suspense thriller tricky twist ending...,mindfuck suspense thriller tricky twist ending...


In [28]:
# Среднее, медиана и стандартное отклонение оценок пользователя и фильма
mean_user_rating = data.groupby('userId')['rating'].transform('mean').values.reshape(-1, 1)
median_user_rating = data.groupby('userId')['rating'].transform('median').values.reshape(-1, 1)
std_user_rating = data.groupby('userId')['rating'].transform('std').fillna(0).values.reshape(-1, 1)

mean_movie_rating = data.groupby('movieId')['rating'].transform('mean').values.reshape(-1, 1)
median_movie_rating = data.groupby('movieId')['rating'].transform('median').values.reshape(-1, 1)
std_movie_rating = data.groupby('movieId')['rating'].transform('std').fillna(0).values.reshape(-1, 1)


In [29]:
numerical_features = np.hstack([mean_user_rating, median_user_rating, std_user_rating,
                                mean_movie_rating, median_movie_rating, std_movie_rating])

In [30]:
X = hstack((X_tfidf, numerical_features)).tocsr()
y = data['rating'].values

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

In [33]:
predictions = model.predict(X_test)

In [34]:
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'RMSE: {rmse:.4f}')

RMSE: 0.8187
