In [18]:
import numpy as np 
import pandas as pd 
import os
from ast import literal_eval
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.functional import mse_loss

from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import Reader, Dataset, SVD, SlopeOne, accuracy

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error

import ast

In [19]:
def extract_actor_name(cast_list, target_order):
    cast_list = list(cast_list)
    if isinstance(cast_list, list):
        for member in cast_list:
            if isinstance(member, dict) and member.get('order') == target_order:
                return member.get('name')
    return np.nan  

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words


credits = pd.read_csv('/kaggle/input/the-movies-dataset/credits.csv')
keywords = pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')
movies = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

movies = movies.drop([19730, 29503, 35587])
movies['id'] = movies['id'].astype('int')
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
print("Movies before empty genres filter: ", len(movies))
movies = movies[movies['genres'].apply(lambda x: len(x) > 0)].reset_index(drop=True)
print("Movies after empty genres filter: ", len(movies))
movies['description'] = movies['overview']
movies['description'] = movies['description'].fillna('')
movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')
movies['cast'] = movies['cast'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

#Add top 5 actors
for i in range(5):
    movies[f'order_{i}'] = movies['cast'].apply(lambda x: extract_actor_name(x, i))

columns_take = ['genres', 'id', 'title', 'description', 'cast', 'crew', 'keywords', 'order_0', 'order_1', 'order_2', 'order_3', 'order_4']
all_columns = movies.columns
columns_drop = [column for column in all_columns if column not in columns_take]
movies = movies.drop(columns=columns_drop)

#movies['cast'] = movies['cast'].apply(literal_eval)
movies['crew'] = movies['crew'].apply(literal_eval)
movies['keywords'] = movies['keywords'].apply(literal_eval)

movies['director'] = movies['crew'].apply(get_director)
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movies['cast'] = movies['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
movies['keywords'] = movies['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
print("Movies before empty keywords filter: ", len(movies))
movies = movies[movies['keywords'].apply(lambda x: len(x) > 0)].reset_index(drop=True)
print("Movies after empty keywords filter: ", len(movies))
movies['cast'] = movies['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
movies['director'] = movies['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
movies['director'] = movies['director'].apply(lambda x: [x,x, x])

global s
s = movies.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

stemmer = SnowballStemmer('english')

movies['keywords'] = movies['keywords'].apply(filter_keywords)
movies['keywords'] = movies['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
movies['soup'] = movies['keywords'] + movies['cast'] + movies['director'] + movies['genres']
movies['soup'] = movies['soup'].apply(lambda x: ' '.join(x))

avg_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
avg_ratings.rename(columns={'rating': 'avg_rating'}, inplace=True)
movies = movies.merge(avg_ratings, how='left', left_on='id', right_on='movieId')
movies.drop(columns=['movieId'], inplace=True)

#movies = movies[:500]

movie_ids_in_ratings = ratings['movieId'].unique()
print(f"Number of movie IDs in ratings dataframe: {len(movie_ids_in_ratings)}")
movie_ids_in_movies = movies['id'].unique()
print(f"Number of movie IDs in movies dataframe: {len(movie_ids_in_movies)}")

missing_ids_ratings = [movie_id for movie_id in movie_ids_in_ratings 
                    if movie_id not in movie_ids_in_movies]
missing_ids_movies = [movie_id for movie_id in movie_ids_in_movies 
                    if movie_id not in movie_ids_in_ratings]

print(f"Number of movie IDs present in ratings but missing from movies: {len(missing_ids_ratings)}")
print(f"Percentage of missing movies: {len(missing_ids_ratings) / len(movie_ids_in_ratings) * 100:.2f}%")

print(f"Number of movie IDs present in movies but missing from ratings: {len(missing_ids_movies)}")
print(f"Percentage of missing movies: {len(missing_ids_movies) / len(movie_ids_in_movies) * 100:.2f}%")
# Ensure consistent types
movies['id'] = movies['id'].astype(int)
ratings['movieId'] = ratings['movieId'].astype(int)

valid_movie_ids = set(movies['id'])  # Convert to set for fast lookup

ratings = ratings[ratings['movieId'].isin(valid_movie_ids)]
movies = movies[movies['id'].isin(ratings['movieId'].unique())]
print(f"New number of rows in ratings after removing invalid movie IDs: {len(ratings)}")
movies = movies.drop_duplicates(subset='id', keep='first')
num_users = ratings['userId'].nunique()
num_items = ratings['movieId'].nunique()

user_mapping = {id: idx for idx, id in enumerate(ratings['userId'].unique())}
item_mapping = {id: idx for idx, id in enumerate(ratings['movieId'].unique())}
#convert non-sequential user IDs to sequential indices for matrix factorization
ratings['userId'] = ratings['userId'].map(user_mapping)
ratings['movieId'] = ratings['movieId'].map(item_mapping)
movies['id'] = movies['id'].map(item_mapping)
movies.drop(columns=['crew', 'cast'], inplace=True)
ratings.drop(columns=['timestamp'], inplace=True)

  movies = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')


Movies before empty genres filter:  45463
Movies after empty genres filter:  43021
Movies before empty keywords filter:  44104
Movies after empty keywords filter:  31362
Number of movie IDs in ratings dataframe: 45115
Number of movie IDs in movies dataframe: 30728
Number of movie IDs present in ratings but missing from movies: 39481
Percentage of missing movies: 87.51%
Number of movie IDs present in movies but missing from ratings: 25094
Percentage of missing movies: 81.66%
New number of rows in ratings after removing invalid movie IDs: 10911543


In [20]:
movies.columns

Index(['genres', 'id', 'title', 'description', 'keywords', 'order_0',
       'order_1', 'order_2', 'order_3', 'order_4', 'director', 'soup',
       'avg_rating'],
      dtype='object')

In [21]:
from collections import Counter

all_genres = movies['genres'].explode()  
genre_counts = Counter(all_genres)
genre_counts_df = pd.DataFrame(genre_counts.items(), columns=['genre', 'count']).sort_values(by='count', ascending=False)

print(genre_counts_df)


              genre  count
7             Drama   2991
1            Comedy   1601
8          Thriller   1222
5            Action    989
9           Romance    971
6             Crime    791
3         Adventure    629
13           Horror    607
10  Science Fiction    524
11          Mystery    414
4           Fantasy    401
16      Documentary    309
2            Family    304
14          History    262
12            Music    204
15              War    188
17          Western    167
0         Animation    157
18          Foreign    145
19         TV Movie     56


In [22]:
len(ratings)

10911543

In [23]:
print(ratings['movieId'].max())
print(len(movies))
print(movies['id'].max())

5633
5634
5633


In [24]:
rating_counts = ratings['movieId'].value_counts()
popular_movie_ids = rating_counts[rating_counts > 100].index

# Step 3: Filter ratings and movies DataFrames
filtered_ratings = ratings[ratings['movieId'].isin(popular_movie_ids)].copy()
filtered_movies = movies[movies['id'].isin(popular_movie_ids)].copy()

In [25]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [26]:
"""
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(filtered_ratings[['userId', 'movieId', 'rating']], reader)

# Define hyperparameter grid
sim_options = {
    "n_factors": [x * 50 for x in range(2,6)],
    "n_epochs": [x * 10 for x in range(4, 7)],
    "lr_all": [x / 50 for x in range(1,3)],
    "reg_all": [0.01, 0.02]

}
#param_grid = {"sim_options": sim_options}

split = 5
gs = GridSearchCV(SVD, sim_options,
                   measures=['rmse', 'mae'], cv=split, 
                   n_jobs=6, refit=False)
gs.fit(data)

def print_results_table(gs_results, param_name):
    print(f"\nResults for '{param_name}':")
    results_df = pd.DataFrame(gs_results)

    # Filter columns to only include 'mean_test_rmse', 'mean_test_mae' and the parameter
    param_col = f'param_{param_name}'
    relevant_columns = [param_col, 'mean_test_rmse', 'mean_test_mae']
    filtered_results = results_df[relevant_columns]

    # Group by the parameter and calculate the mean of RMSE and MAE
    # This is crucial because gs.cv_results contains all combinations,
    # so we need to average across other parameters for a clean view of one.
    grouped_results = filtered_results.groupby(param_col).agg(
        Avg_RMSE=('mean_test_rmse', 'mean'),
        Avg_MAE=('mean_test_mae', 'mean')
    ).reset_index()

    # Rename the parameter column for cleaner output
    grouped_results = grouped_results.rename(columns={param_col: param_name})

    # Sort by the parameter for consistent output
    grouped_results = grouped_results.sort_values(by=param_name)

    # Format the numerical columns to 3 decimal places
    grouped_results['Avg_RMSE'] = grouped_results['Avg_RMSE'].map('{:.3f}'.format)
    grouped_results['Avg_MAE'] = grouped_results['Avg_MAE'].map('{:.3f}'.format)

    print(grouped_results.to_markdown(index=False))


# Print tables for each hyperparameter
for param in sim_options.keys():
    print_results_table(gs.cv_results, param)

print("\n--- Best Scores and Parameters ---")
print(f"Best RMSE: {gs.best_score['rmse']:.3f}")
print(f"Best parameters for RMSE: {gs.best_params['rmse']}")
print(f"Best MAE: {gs.best_score['mae']:.3f}")
print(f"Best parameters for MAE: {gs.best_params['mae']}")
"""

'\nreader = Reader(rating_scale=(0.5, 5))\ndata = Dataset.load_from_df(filtered_ratings[[\'userId\', \'movieId\', \'rating\']], reader)\n\n# Define hyperparameter grid\nsim_options = {\n    "n_factors": [x * 50 for x in range(2,6)],\n    "n_epochs": [x * 10 for x in range(4, 7)],\n    "lr_all": [x / 50 for x in range(1,3)],\n    "reg_all": [0.01, 0.02]\n\n}\n#param_grid = {"sim_options": sim_options}\n\nsplit = 5\ngs = GridSearchCV(SVD, sim_options,\n                   measures=[\'rmse\', \'mae\'], cv=split, \n                   n_jobs=6, refit=False)\ngs.fit(data)\n\ndef print_results_table(gs_results, param_name):\n    print(f"\nResults for \'{param_name}\':")\n    results_df = pd.DataFrame(gs_results)\n\n    # Filter columns to only include \'mean_test_rmse\', \'mean_test_mae\' and the parameter\n    param_col = f\'param_{param_name}\'\n    relevant_columns = [param_col, \'mean_test_rmse\', \'mean_test_mae\']\n    filtered_results = results_df[relevant_columns]\n\n    # Group by the

In [None]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, SlopeOne, accuracy
import time
from surprise.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import KFold
import pickle

reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(filtered_ratings[['userId', 'movieId', 'rating']], reader)
#best_model = SVD(n_factors = 200, n_epochs = 30, lr_all = 0.02, reg_all = 0.02)
best_model = pickle.load(open('/kaggle/input/mf/scikitlearn/default/1/matrix_fac.pickle', "rb"))
trainset, testset = train_test_split(data, test_size=0.2)

#best_model.fit(trainset)
predictions = best_model.test(testset)
def round_to_half(x):
    return round(x * 2) / 2

rounded_predictions = [
    pred._replace(est=np.clip(round_to_half(pred.est), 0.5, 5.0))
    for pred in predictions
]
print("MSE: ", predictions)
print("Rounded MSE: ",accuracy.mse(rounded_predictions))

pickle.dump(best_model, open('matrix_fac.pickle', "wb"))

i = 0
j = 0
print(test.predict(i, j, r_ui= filtered_ratings[(filtered_ratings.userId==i)&(filtered_ratings.movieId==j)]['rating'].mean()))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [29]:
def calculate_actor_average_ratings(df):
    movies_df = df.copy()
    actor_ratings = {}
    order_columns = ['order_0', 'order_1', 'order_2', 'order_3', 'order_4']
    
    print("Calculating actor average ratings...")
    
    for idx, row in movies_df.iterrows():
        movie_rating = row['avg_rating']
        if pd.isna(movie_rating):
            continue
            
        for col in order_columns:
            actor = row[col]
            if pd.isna(actor) or actor == '':
                continue
            if actor not in actor_ratings:
                actor_ratings[actor] = {'total_rating': 0, 'movie_count': 0}
            
            # Add rating and increment count
            actor_ratings[actor]['total_rating'] += movie_rating
            actor_ratings[actor]['movie_count'] += 1
    
    actor_avg_ratings = {}
    for actor, data in actor_ratings.items():
        actor_avg_ratings[actor] = data['total_rating'] / data['movie_count']
    
    all_actor_ratings = list(actor_avg_ratings.values())
    overall_avg_rating = np.mean(all_actor_ratings) if all_actor_ratings else 0
    
    print(f"Total unique actors: {len(actor_avg_ratings)}")
    print(f"Overall average rating across all actors: {overall_avg_rating:.3f}")
    
    print("Substituting actor average ratings in order columns...")
    
    for col in order_columns:
        movies_df[col] = movies_df[col].map(actor_avg_ratings)
        movies_df[col] = movies_df[col].fillna(overall_avg_rating)
    
    print("Substitution completed!")
    
    return movies_df, actor_avg_ratings, overall_avg_rating

# Example usage and verification function
def verify_results(original_df, modified_df, actor_avg_ratings, overall_avg_rating):
    """
    Verify the results and show some statistics
    """
    print("\n" + "="*50)
    print("VERIFICATION RESULTS")
    print("="*50)
    
    order_columns = ['order_0', 'order_1', 'order_2', 'order_3', 'order_4']
    
    # Show sample of actor average ratings
    print(f"\nSample of actor average ratings:")
    sample_actors = list(actor_avg_ratings.items())[:10]
    for actor, rating in sample_actors:
        print(f"  {actor}: {rating:.3f}")
    
    # Show statistics for order columns
    print(f"\nOrder columns statistics (after substitution):")
    for col in order_columns:
        non_null_count = modified_df[col].notna().sum()
        avg_rating = modified_df[col].mean()
        print(f"  {col}: {non_null_count} non-null values, avg rating: {avg_rating:.3f}")
    
    # Show before/after comparison for first few rows
    print(f"\nBefore/After comparison (first 5 rows):")
    print("Original order columns:")
    print(original_df[order_columns].head())
    print("\nModified order columns (with average ratings):")
    print(modified_df[order_columns].head())

modified_df, actor_ratings, overall_avg = calculate_actor_average_ratings(movies)

Calculating actor average ratings...
Total unique actors: 14814
Overall average rating across all actors: 3.134
Substituting actor average ratings in order columns...
Substitution completed!


In [30]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genres_clean = modified_df['genres'].fillna('').apply(lambda x: x if isinstance(x, list) else [])
genres_encoded = mlb.fit_transform(genres_clean)
genres_df = pd.DataFrame(genres_encoded, 
                        columns=[f'{genre}' for genre in mlb.classes_],
                        index=modified_df.index)
modified_df = pd.concat([modified_df, genres_df], axis=1)

In [31]:
modified_df.drop(columns=['description', 'title', 'director', 'soup','keywords'], inplace=True)

In [32]:
modified_df.head()

Unnamed: 0,genres,id,order_0,order_1,order_2,order_3,order_4,avg_rating,Action,Adventure,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,"[Animation, Comedy, Family]",2710,3.218628,3.571914,3.638176,3.216323,3.205199,3.59893,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[Adventure, Fantasy, Family]",3738,3.16837,3.760163,3.090169,3.760163,3.817607,3.760163,0,1,...,0,0,0,0,0,0,0,0,0,0
5,"[Action, Crime, Drama, Thriller]",1221,3.385523,3.363182,3.202128,3.126661,3.098204,3.905544,1,0,...,0,0,0,0,0,0,0,1,0,0
8,"[Adventure, Action, Thriller]",2006,3.337938,3.345751,3.013024,2.893813,3.006704,2.740334,1,1,...,0,0,0,0,0,0,0,1,0,0
13,"[Action, Adventure]",302,3.645755,3.158339,2.829485,3.459919,3.710181,3.710181,1,1,...,0,0,0,0,0,0,0,0,0,0


In [33]:
modified_df.columns

Index(['genres', 'id', 'order_0', 'order_1', 'order_2', 'order_3', 'order_4',
       'avg_rating', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
       'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie',
       'Thriller', 'War', 'Western'],
      dtype='object')

In [40]:
import random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

movies = movies.reset_index(drop=True)

tfidf_vectorizer = TfidfVectorizer(min_df=3, max_df=0.95, stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['soup'])

min_ratings = 100
max_ratings = 150

user_counts = ratings['userId'].value_counts()
eligible_users = user_counts[(user_counts >= min_ratings) & (user_counts < max_ratings)].index.tolist()

if not eligible_users:
    print("No users found in this rating interval.")
else:
    selected_user = random.choice(eligible_users)
    user_ratings = ratings[ratings['userId'] == selected_user]
    user_movies = user_ratings.merge(movies, left_on='movieId', right_on='id')

    print(f"\nSelected User ID: {selected_user}")

    selected_movie = user_movies.sample(1).iloc[0]
    selected_movie_id = selected_movie['id']
    selected_movie_index = movies[movies['id'] == selected_movie_id].index[0]

    print(f"\nSelected Movie for Similarity Search: {selected_movie['title']} (Movie ID: {selected_movie_id})")
    cosine_sim = cosine_similarity(tfidf_matrix[selected_movie_index], tfidf_matrix).flatten()

    similar_indices = cosine_sim.argsort()[::-1]
    top_indices = [i for i in similar_indices if i != selected_movie_index][:15]

    # Step 10: Print top 10 similar movies
    print("\nTop 15 Similar Movies:")
    for idx in top_indices:
        print(f"{movies.iloc[idx]['title']} (Similarity: {cosine_sim[idx]:.4f})")



Selected User ID: 169281

Selected Movie for Similarity Search: The Chorus (Movie ID: 773)

Top 15 Similar Movies:
Zero for Conduct (Similarity: 0.2858)
The Chemical Brothers: Don't Think (Similarity: 0.2726)
The Duke Is Tops (Similarity: 0.2726)
Les Misérables in Concert - The 25th Anniversary (Similarity: 0.2425)
Chapiteau-Show (Similarity: 0.2368)
The Heat's On (Similarity: 0.2368)
Radio Day (Similarity: 0.2368)
Five Dances (Similarity: 0.2299)
Diabolique (Similarity: 0.2165)
The Hessen Affair (Similarity: 0.2101)
Mädchen in Uniform (Similarity: 0.2099)
Jesus Christ Superstar (Similarity: 0.2098)
The Age of Love (Similarity: 0.2025)
Still Bill (Similarity: 0.1843)
Official Rejection (Similarity: 0.1798)


In [36]:
tfidf_vectorizer = TfidfVectorizer(min_df=3, max_df=0.95, stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['soup'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), 
                       columns=tfidf_vectorizer.get_feature_names_out(),
                       index=movies.index)

print(f"Tf-idf matrix shape: ", tfidf_matrix.shape)

movie_id_to_index = {movie_id: i for i, movie_id in enumerate(movies['id'])}
for i in range(1,5):
    min_ratings = i * 100
    max_ratings = min_ratings + 50

    print(f"\nRating Interval: [{min_ratings}, {max_ratings})")
    
    # Select all users within the rating count interval
    user_counts = ratings['userId'].value_counts()
    eligible_users = user_counts[(user_counts >= min_ratings) & (user_counts < max_ratings)].index

    if len(eligible_users) == 0:
        print("No users in this rating interval. Skipping...")
        continue

    lr_mse_list = []
    lr_mae_list = []
    ridge_mse_list = []
    ridge_mae_list = []

    for user_id in eligible_users:
        user_ratings = ratings[ratings['userId'] == user_id]
        user_movies = user_ratings.merge(movies, left_on='movieId', right_on='id')

        user_movie_indices = [movie_id_to_index.get(movie_id) for movie_id in user_movies['id'].values]
        user_movie_indices = [idx for idx in user_movie_indices if idx is not None]

        if len(user_movie_indices) < 5:
            continue

        X = tfidf_matrix[user_movie_indices]
        y = user_movies.loc[user_movies['id'].isin(
            [movies['id'].iloc[idx] for idx in user_movie_indices]), 'rating'].values

        if len(y) < 5:
            continue

        try:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Linear Regression
            lr_model = LinearRegression()
            lr_model.fit(X_train, y_train)
            lr_pred_raw = lr_model.predict(X_test)
            lr_pred = np.round(lr_pred_raw * 2) / 2
            lr_pred = np.clip(lr_pred, 0.5, 5.0)

            lr_mse = mean_squared_error(y_test, lr_pred)
            lr_mae = mean_absolute_error(y_test, lr_pred)
            lr_mse_list.append(lr_mse)
            lr_mae_list.append(lr_mae)

            # Ridge (SGDRegressor)
            sgd_model = SGDRegressor(penalty='l2', alpha=0.01, random_state=42)
            sgd_model.fit(X_train, y_train)
            ridge_pred_raw = sgd_model.predict(X_test)
            ridge_pred = np.round(ridge_pred_raw * 2) / 2
            ridge_pred = np.clip(ridge_pred, 0.5, 5.0)

            ridge_mse = mean_squared_error(y_test, ridge_pred)
            ridge_mae = mean_absolute_error(y_test, ridge_pred)
            ridge_mse_list.append(ridge_mse)
            ridge_mae_list.append(ridge_mae)

        except Exception as e:
            continue

    # Report averages for this interval
    if lr_mse_list:
        print("Average Model Evaluation:")
        print(f"Linear Regression - MSE: {np.mean(lr_mse_list):.4f}, MAE: {np.mean(lr_mae_list):.4f}")
        print(f"Ridge (SGDRegressor) - MSE: {np.mean(ridge_mse_list):.4f}, MAE: {np.mean(ridge_mae_list):.4f}")
    else:
        print("Not enough valid users/data to compute average metrics.")


Tf-idf matrix shape:  (5634, 5000)

Rating Interval: [100, 150)
Average Model Evaluation:
Linear Regression - MSE: 0.9923, MAE: 0.7571
Ridge (SGDRegressor) - MSE: 4.2617, MAE: 1.8526

Rating Interval: [200, 250)
Average Model Evaluation:
Linear Regression - MSE: 1.0463, MAE: 0.7795
Ridge (SGDRegressor) - MSE: 2.9519, MAE: 1.4969

Rating Interval: [300, 350)
Average Model Evaluation:
Linear Regression - MSE: 1.1101, MAE: 0.8042
Ridge (SGDRegressor) - MSE: 2.4330, MAE: 1.3357

Rating Interval: [400, 450)
Average Model Evaluation:
Linear Regression - MSE: 1.1967, MAE: 0.8395
Ridge (SGDRegressor) - MSE: 2.1220, MAE: 1.2285


In [None]:
'''
count = CountVectorizer(min_df=3, max_df=0.95, stop_words='english', max_features=5000)
count_data = count.fit_transform(movies['description'])

transformer = TfidfTransformer()
trans_data = transformer.fit_transform(count_data)
'''


In [41]:
feature_columns = [
    'order_0', 'order_1', 'order_2', 'order_3', 'order_4',
    'avg_rating', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
    'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie',
    'Thriller', 'War', 'Western'
]

movies_features = modified_df[feature_columns].copy()
movies_features.fillna(0, inplace=True) 

scaler = StandardScaler()
scaled_movie_features = scaler.fit_transform(movies_features)
scaled_movie_features_df = pd.DataFrame(scaled_movie_features, columns=feature_columns, index=movies.index)


print(f"Movie features shape: {scaled_movie_features_df.shape}")
movie_id_to_index = {movie_id: i for i, movie_id in enumerate(movies['id'])}

for i in range(1, 10):
    min_ratings = i * 50
    max_ratings = min_ratings + 50

    print(f"\nRating Interval: [{min_ratings}, {max_ratings})")

    # Select all users within the rating count interval
    user_counts = ratings['userId'].value_counts()
    eligible_users = user_counts[(user_counts >= min_ratings) & (user_counts < max_ratings)].index

    if len(eligible_users) == 0:
        print("No users in this rating interval. Skipping...")
        continue

    lr_mse_list = []
    lr_mae_list = []
    ridge_mse_list = []
    ridge_mae_list = []

    for user_id in eligible_users:
        user_ratings = ratings[ratings['userId'] == user_id]
        user_movies = user_ratings.merge(movies, left_on='movieId', right_on='id')

        user_movie_indices = [movie_id_to_index.get(movie_id) for movie_id in user_movies['id'].values]
        user_movie_indices = [idx for idx in user_movie_indices if idx is not None]

        if len(user_movie_indices) < 5: # Need at least 5 data points for train/test split
            continue

        X = scaled_movie_features_df.iloc[user_movie_indices].values
        y = user_movies.loc[user_movies['id'].isin(
            [movies['id'].iloc[idx] for idx in user_movie_indices]), 'rating'].values

        if len(y) < 5: 
            continue

        try:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Linear Regression
            lr_model = LinearRegression()
            lr_model.fit(X_train, y_train)
            lr_pred_raw = lr_model.predict(X_test)
            lr_pred = np.round(lr_pred_raw * 2) / 2
            lr_pred = np.clip(lr_pred, 0.5, 5.0)

            lr_mse = mean_squared_error(y_test, lr_pred)
            lr_mae = mean_absolute_error(y_test, lr_pred)
            lr_mse_list.append(lr_mse)
            lr_mae_list.append(lr_mae)

            # Ridge (SGDRegressor)
            sgd_model = SGDRegressor(penalty='l2', alpha=0.01, random_state=42, max_iter=1000) # Added max_iter
            sgd_model.fit(X_train, y_train)
            ridge_pred_raw = sgd_model.predict(X_test)
            ridge_pred = np.round(ridge_pred_raw * 2) / 2
            ridge_pred = np.clip(ridge_pred, 0.5, 5.0)

            ridge_mse = mean_squared_error(y_test, ridge_pred)
            ridge_mae = mean_absolute_error(y_test, ridge_pred)
            ridge_mse_list.append(ridge_mse)
            ridge_mae_list.append(ridge_mae)

        except Exception as e:
            continue
    if lr_mse_list:
        print("Average Model Evaluation:")
        print(f"Linear Regression - MSE: {np.mean(lr_mse_list):.4f}, MAE: {np.mean(lr_mae_list):.4f}")
        print(f"Ridge (SGDRegressor) - MSE: {np.mean(ridge_mse_list):.4f}, MAE: {np.mean(ridge_mae_list):.4f}")
    else:
        print("Not enough valid users/data to compute average metrics.")

Movie features shape: (5634, 26)

Rating Interval: [50, 100)
Average Model Evaluation:
Linear Regression - MSE: 1.2721, MAE: 0.8363
Ridge (SGDRegressor) - MSE: 1.5101, MAE: 0.9100

Rating Interval: [100, 150)
Average Model Evaluation:
Linear Regression - MSE: 0.9765, MAE: 0.7339
Ridge (SGDRegressor) - MSE: 1.0828, MAE: 0.7686

Rating Interval: [150, 200)
Average Model Evaluation:
Linear Regression - MSE: 0.8878, MAE: 0.6998
Ridge (SGDRegressor) - MSE: 0.9387, MAE: 0.7178

Rating Interval: [200, 250)
Average Model Evaluation:
Linear Regression - MSE: 0.8441, MAE: 0.6817
Ridge (SGDRegressor) - MSE: 0.8714, MAE: 0.6927

Rating Interval: [250, 300)
Average Model Evaluation:
Linear Regression - MSE: 0.8085, MAE: 0.6691
Ridge (SGDRegressor) - MSE: 0.8238, MAE: 0.6754

Rating Interval: [300, 350)
Average Model Evaluation:
Linear Regression - MSE: 0.7814, MAE: 0.6576
Ridge (SGDRegressor) - MSE: 0.7942, MAE: 0.6628

Rating Interval: [350, 400)
Average Model Evaluation:
Linear Regression - MSE: 