In [1]:
# movie recommender using linear regression

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [7]:
#loading the dataset
ratings = pd.read_csv('ratings.csv')  
movies = pd.read_csv('movies.csv')

In [9]:
# Merge datasets
data = pd.merge(ratings, movies, on='movieId')


In [17]:
data['year'] = data['title'].str.extract(r'\((\d{4})\)', expand=False).fillna(0).astype(int)
data = pd.get_dummies(data, columns=['genres'], drop_first=True)


In [19]:
# aggregeation
user_stats = data.groupby('userId').agg({'rating': ['mean', 'count']}).reset_index()
movie_stats = data.groupby('movieId').agg({'rating': ['mean', 'count']}).reset_index()
user_stats.columns = ['userId', 'user_avg_rating', 'user_rating_count']
movie_stats.columns = ['movieId', 'movie_avg_rating', 'movie_rating_count']

In [21]:
# Merge stats back to data
data = pd.merge(data, user_stats, on='userId', how='left')
data = pd.merge(data, movie_stats, on='movieId', how='left')

In [23]:
# Define features and target
X = data[['user_avg_rating', 'user_rating_count', 'movie_avg_rating', 'movie_rating_count', 'year'] + list(data.filter(like='genres_').columns)]
y = data['rating']


In [25]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)


In [29]:
# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 116438459317762.23


In [31]:
# Make predictions
def recommend_movies(user_id, n=5):
    user_data = data[data['userId'] == user_id]
    user_features = user_data[X.columns]
    predicted_ratings = model.predict(user_features)
    user_data['predicted_rating'] = predicted_ratings
    return user_data.sort_values('predicted_rating', ascending=False).head(n)

In [35]:
print(recommend_movies(user_id=3))

     userId  movieId  rating   timestamp  \
290       3     5746     5.0  1306463708   
294       3     6835     5.0  1306463670   
291       3     5764     4.5  1306464021   
295       3     7899     4.5  1306464036   
289       3     5181     5.0  1306463718   

                                                 title  year  genres_Action  \
290                    Galaxy of Terror (Quest) (1981)  1981          False   
294                         Alien Contamination (1980)  1980          False   
291                                      Looker (1981)  1981          False   
295  Master of the Flying Guillotine (Du bi quan wa...  1975           True   
289                                   Hangar 18 (1980)  1980          False   

     genres_Action|Adventure  genres_Action|Adventure|Animation  \
290                    False                              False   
294                    False                              False   
291                    False                              F

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data['predicted_rating'] = predicted_ratings
