In [3]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

In [4]:
import kagglehub

path = kagglehub.dataset_download("parasharmanas/movie-recommendation-system")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/movie-recommendation-system


In [5]:
ratings_full_df = pd.read_csv('/kaggle/input/movie-recommendation-system/ratings.csv')
movies_df = pd.read_csv('/kaggle/input/movie-recommendation-system/movies.csv')

In [23]:
mean_movie_ratings = ratings_full_df.groupby("movieId")["rating"].mean()
movies_df = movies_df.merge(mean_movie_ratings, how="left", on="movieId").rename({"rating":"ratings_movie_mean"}, axis=1)
movies_df

Unnamed: 0,movieId,title,genres,ratings_movie_mean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.251527
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547
4,5,Father of the Bride Part II (1995),Comedy,3.058434
...,...,...,...,...
62418,209157,We (2018),Drama,1.500000
62419,209159,Window of the Soul (2001),Documentary,3.000000
62420,209163,Bad Poems (2018),Comedy|Drama,4.500000
62421,209169,A Girl Thing (2001),(no genres listed),3.000000


In [6]:
mean_user_ratings = ratings_full_df.groupby("userId")["rating"].mean()
mean_user_ratings

userId
1         3.814286
2         3.630435
3         3.697409
4         3.378099
5         3.752475
            ...   
162537    4.039604
162538    3.415584
162539    4.510638
162540    3.829545
162541    3.365385
Name: rating, Length: 162541, dtype: float64

In [7]:
ratings_df = ratings_full_df.merge(mean_user_ratings, how="left", on="userId", suffixes=("", "_user_mean"))
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp,rating_user_mean
0,1,296,5.0,1147880044,3.814286
1,1,306,3.5,1147868817,3.814286
2,1,307,5.0,1147868828,3.814286
3,1,665,5.0,1147878820,3.814286
4,1,899,3.5,1147868510,3.814286
...,...,...,...,...,...
25000090,162541,50872,4.5,1240953372,3.365385
25000091,162541,55768,2.5,1240951998,3.365385
25000092,162541,56176,2.0,1240950697,3.365385
25000093,162541,58559,4.0,1240953434,3.365385


In [8]:
datetimes = pd.to_datetime(ratings_df['timestamp'], unit='s')

ratings_df['year'] = datetimes.dt.year
ratings_df['month'] = datetimes.dt.month
ratings_df['day'] = datetimes.dt.day
ratings_df['day_of_week'] = datetimes.dt.dayofweek
ratings_df['hour'] = datetimes.dt.hour

In [9]:
def get_part_of_day(hour):
    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

ratings_df['part_of_day'] = ratings_df['hour'].apply(get_part_of_day)

ratings_df = pd.get_dummies(ratings_df, columns=['part_of_day'], prefix='time')

ratings_df = ratings_df.drop(columns=['timestamp'])
ratings_df.head(15)

Unnamed: 0,userId,movieId,rating,rating_user_mean,year,month,day,day_of_week,hour,time_afternoon,time_evening,time_morning,time_night
0,1,296,5.0,3.814286,2006,5,17,2,15,True,False,False,False
1,1,306,3.5,3.814286,2006,5,17,2,12,True,False,False,False
2,1,307,5.0,3.814286,2006,5,17,2,12,True,False,False,False
3,1,665,5.0,3.814286,2006,5,17,2,15,True,False,False,False
4,1,899,3.5,3.814286,2006,5,17,2,12,True,False,False,False
5,1,1088,4.0,3.814286,2006,5,17,2,12,True,False,False,False
6,1,1175,3.5,3.814286,2006,5,17,2,12,True,False,False,False
7,1,1217,3.5,3.814286,2006,5,17,2,15,True,False,False,False
8,1,1237,5.0,3.814286,2006,5,17,2,12,True,False,False,False
9,1,1250,4.0,3.814286,2006,5,17,2,12,True,False,False,False


In [10]:
def is_weekend(d):
  if d > 5:
    return True
  else:
    return False

ratings_df['is_weekend'] = ratings_df['day_of_week'].apply(is_weekend)

ratings_df.head(15)

Unnamed: 0,userId,movieId,rating,rating_user_mean,year,month,day,day_of_week,hour,time_afternoon,time_evening,time_morning,time_night,is_weekend
0,1,296,5.0,3.814286,2006,5,17,2,15,True,False,False,False,False
1,1,306,3.5,3.814286,2006,5,17,2,12,True,False,False,False,False
2,1,307,5.0,3.814286,2006,5,17,2,12,True,False,False,False,False
3,1,665,5.0,3.814286,2006,5,17,2,15,True,False,False,False,False
4,1,899,3.5,3.814286,2006,5,17,2,12,True,False,False,False,False
5,1,1088,4.0,3.814286,2006,5,17,2,12,True,False,False,False,False
6,1,1175,3.5,3.814286,2006,5,17,2,12,True,False,False,False,False
7,1,1217,3.5,3.814286,2006,5,17,2,15,True,False,False,False,False
8,1,1237,5.0,3.814286,2006,5,17,2,12,True,False,False,False,False
9,1,1250,4.0,3.814286,2006,5,17,2,12,True,False,False,False,False


In [11]:
genres_list = movies_df['genres'].apply(lambda x: x.split('|'))
genres_list

0        [Adventure, Animation, Children, Comedy, Fantasy]
1                           [Adventure, Children, Fantasy]
2                                        [Comedy, Romance]
3                                 [Comedy, Drama, Romance]
4                                                 [Comedy]
                               ...                        
62418                                              [Drama]
62419                                        [Documentary]
62420                                      [Comedy, Drama]
62421                                 [(no genres listed)]
62422                           [Action, Adventure, Drama]
Name: genres, Length: 62423, dtype: object

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb.fit(genres_list)
mlb.transform(genres_list)
genre_encoded = pd.DataFrame(mlb.transform(genres_list),
                             columns=mlb.classes_,
                             index=movies_df.index)
genre_encoded.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
movies_encoded = pd.concat([movies_df.drop("genres", axis=1), genre_encoded], axis=1)
movies_encoded.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df = ratings_df.merge(movies_encoded, how="left", on="movieId")
df

Unnamed: 0,userId,movieId,rating,rating_user_mean,year,month,day,day_of_week,hour,time_afternoon,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,296,5.0,3.814286,2006,5,17,2,15,True,...,0,0,0,0,0,0,0,1,0,0
1,1,306,3.5,3.814286,2006,5,17,2,12,True,...,0,0,0,0,0,0,0,0,0,0
2,1,307,5.0,3.814286,2006,5,17,2,12,True,...,0,0,0,0,0,0,0,0,0,0
3,1,665,5.0,3.814286,2006,5,17,2,15,True,...,0,0,0,0,0,0,0,0,1,0
4,1,899,3.5,3.814286,2006,5,17,2,12,True,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25000090,162541,50872,4.5,3.365385,2009,4,28,1,21,False,...,0,0,0,0,0,0,0,0,0,0
25000091,162541,55768,2.5,3.365385,2009,4,28,1,20,False,...,0,0,0,0,0,0,0,0,0,0
25000092,162541,56176,2.0,3.365385,2009,4,28,1,20,False,...,0,0,0,0,0,0,0,0,0,0
25000093,162541,58559,4.0,3.365385,2009,4,28,1,21,False,...,0,0,1,0,0,0,0,0,0,0


In [15]:
mean_genres_by_users = pd.concat(
    [
        df["userId"], 
        df[list(mlb.classes_)].multiply(df['rating'],axis=0)
    ], axis=1
).groupby("userId").mean()
df = df.merge(mean_genres_by_users, how="left", on="userId", suffixes=("", "_mean_gen"))

In [17]:
df = df[:100000]

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["rating", "title"], axis=1), df["rating"], test_size=0.2, random_state=41)

In [26]:
X_train

Unnamed: 0,userId,movieId,rating_user_mean,year,month,day,day_of_week,hour,time_afternoon,time_evening,...,Film-Noir_mean_gen,Horror_mean_gen,IMAX_mean_gen,Musical_mean_gen,Mystery_mean_gen,Romance_mean_gen,Sci-Fi_mean_gen,Thriller_mean_gen,War_mean_gen,Western_mean_gen
84229,633,1460,4.677419,1998,5,22,4,12,True,False,...,0.000000,0.354839,0.000000,0.000000,0.774194,1.193548,0.548387,0.774194,0.000000,0.000000
1433,8,1440,3.612903,1998,3,21,5,14,True,False,...,0.032258,0.219355,0.051613,0.200000,0.451613,0.651613,0.554839,1.458065,0.187097,0.077419
21185,171,3534,4.219821,2005,1,12,2,21,False,False,...,0.038003,0.235469,0.025335,0.111028,0.385991,0.687034,0.490313,1.175112,0.248882,0.073770
3060,18,587,3.371658,2005,2,13,6,5,False,False,...,0.000000,0.312834,0.053476,0.120321,0.200535,0.582888,0.705882,1.074866,0.131016,0.018717
59152,468,6979,3.891061,2005,3,29,1,5,False,False,...,0.044693,0.145251,0.050279,0.078212,0.290503,0.765363,0.332402,0.969274,0.134078,0.022346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53491,431,2959,2.503322,2005,3,24,3,14,True,False,...,0.010797,0.155316,0.032392,0.093854,0.191030,0.362957,0.365449,0.672757,0.259136,0.118771
85986,648,45431,2.493411,2017,2,11,5,12,True,False,...,0.024890,0.087116,0.128111,0.055637,0.244510,0.430454,0.249634,0.657394,0.243777,0.054173
61324,489,1917,4.356918,2019,5,25,5,5,False,False,...,0.000000,0.493711,0.443396,0.000000,0.518868,0.444969,1.683962,1.737421,0.474843,0.020440
931,4,1288,3.378099,2019,11,16,5,22,False,False,...,0.000000,0.130165,0.411157,0.105372,0.283058,0.142562,1.111570,0.820248,0.123967,0.105372


In [27]:
#using kNN

def get_most_popular_rating(X_train, y_train, X_test, userId, movieId):
    movie_data = X_train[X_train["movieId"] == movieId]   
    if movie_data.empty:
        return y_train.mean()
    else:
        return movie_data.iloc[0]['rating_user_mean']

predict = [get_most_popular_rating(X_train, y_train, X_test, userId, movieId) for userId, movieId in zip(X_test["userId"], X_test["movieId"])]
predict

[3.9,
 3.586,
 3.6202898550724636,
 4.217563989408649,
 3.6538461538461537,
 3.4411764705882355,
 3.586,
 3.092063492063492,
 3.139130434782609,
 3.723684210526316,
 3.8767123287671232,
 3.1041666666666665,
 3.9056603773584904,
 3.4741379310344827,
 3.8168316831683167,
 3.840909090909091,
 3.6166666666666667,
 3.4477611940298507,
 3.3098739495798317,
 3.696759259259259,
 3.865625,
 3.7070063694267517,
 3.586,
 3.776315789473684,
 3.737142857142857,
 3.139064475347661,
 3.3220338983050848,
 3.401123595505618,
 3.2451612903225806,
 3.139064475347661,
 3.918067226890756,
 3.8484848484848486,
 3.865625,
 3.827996340347667,
 3.8238993710691824,
 3.24252801992528,
 3.4011627906976742,
 3.8867924528301887,
 3.7685185185185186,
 3.526190476190476,
 3.3098739495798317,
 3.6707317073170733,
 3.6056511056511056,
 3.24252801992528,
 3.773529411764706,
 3.7479338842975207,
 3.7615384615384615,
 4.685483870967742,
 3.586,
 3.1822222222222223,
 4.155882352941177,
 3.7479338842975207,
 3.586,
 2.57304

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import pandas as pd

def evaluate_model(predict, y_test):
    metrics = {
        "Mean Squared Error (MSE)": mean_squared_error(y_test, predict),
        "Mean Absolute Error (MAE)": mean_absolute_error(y_test, predict),
        "Mean Absolute Percentage Error (MAPE)": mean_absolute_percentage_error(y_test, predict) * 100,
        "R-squared (R²)": r2_score(y_test, predict)
    }
    results_df = pd.DataFrame(metrics.items(), columns=["Metric", "Value"])
    print("\nModel Evaluation Metrics:")
    print(results_df.to_string(index=False, float_format="%.4f"))


evaluate_model(predict, y_test)


Model Evaluation Metrics:
                               Metric   Value
             Mean Squared Error (MSE)  1.2400
            Mean Absolute Error (MAE)  0.8784
Mean Absolute Percentage Error (MAPE) 38.5658
                       R-squared (R²) -0.1473


In [31]:
def get_recommendation(X_train, y_train, X_test, userId, movieId):
    movie_data = X_train[X_train["movieId"] == movieId]
    if movie_data.empty:
        return y_train.mean()
    user_data = X_test[X_test["userId"] == userId]

    movie_mean_rating = movie_data.iloc[0]["rating_user_mean"]
    user_mean_rating = user_data["rating_user_mean"].iloc[0]

    final_score = (0.5 * movie_mean_rating) + (0.5 * user_mean_rating)

    return final_score



predict = [get_recommendation(X_train, y_train, X_test, userId, movieId) for userId, movieId in zip(X_test["userId"], X_test["movieId"])]
evaluate_model(predict, y_test)


Model Evaluation Metrics:
                               Metric   Value
             Mean Squared Error (MSE)  0.9523
            Mean Absolute Error (MAE)  0.7663
Mean Absolute Percentage Error (MAPE) 34.3396
                       R-squared (R²)  0.1189


In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["userId", "movieId", "rating", "title"], axis=1), df["rating"], test_size=0.2, random_state=41)

In [34]:
#using linear regression

from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
predict = linear_model.predict(X_test)
evaluate_model(predict, y_test)


Model Evaluation Metrics:
                               Metric   Value
             Mean Squared Error (MSE)  0.8308
            Mean Absolute Error (MAE)  0.7047
Mean Absolute Percentage Error (MAPE) 31.0413
                       R-squared (R²)  0.2313


In [None]:
#using XGBoost

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
xgb_model = XGBRegressor()
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

xgb_model = XGBRegressor()

param_grid = {
    'max_depth': [7, 9, 10],         # Different depths for the trees
    'gamma': [0.1, 0.15, 0.3],         # Minimum loss reduction to make a split
    'alpha': [0.1, 0.15, 0.3],         # L1 regularization term on weights
    'reg_lambda': [1.5, 2, 2.5],          # L2 regularization term on weights
    'learning_rate': [0.02, 0.05, 0.1], # Learning rate
    'n_estimators': [200, 300, 400]   # Number of trees
}

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
best_params = random_search.best_params_
best_model = random_search.best_estimator_

print("Best Parameters:", best_params)
# there should be ... Best Parameters: {'reg_lambda': 2.5, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.1, 'alpha': 0.15}

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [None]:
predict = best_model.predict(X_test)

In [None]:
evaluate_model(predict, y_test)

In [None]:
import matplotlib.pyplot as plt

best_model.feature_importances_
sorted_idx = best_model.feature_importances_.argsort()
plt.figure(figsize=(10,25))
plt.barh(X_test.columns[sorted_idx], best_model.feature_importances_[sorted_idx])