In [2]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split


# Make sure every output folder written to later in the notebook exists.
# exist_ok=True keeps this cell safe to re-run when the folders are already there.
for output_dir in (
    '/Users/yugjain/Documents/Machine_learning/Movies/datasets/processed',
    '/Users/yugjain/Documents/Machine_learning/Movies/datasets/splits',
    '/Users/yugjain/Documents/Machine_learning/Movies/models',
):
    os.makedirs(output_dir, exist_ok=True)



In [3]:
# Read the raw ratings and movie tables from disk.
ratings = pd.read_csv(
    '/Users/yugjain/Documents/Machine_learning/Movies/datasets/raw/rating.csv'
)
movies = pd.read_csv(
    '/Users/yugjain/Documents/Machine_learning/Movies/datasets/raw/movie.csv'
)

# Report (rows, columns) of each freshly loaded table.
for label, frame in (("ratings", ratings), ("movies", movies)):
    print(f"Original {label} shapes : {frame.shape}")


Original ratings shapes : (2512621, 4)
Original movies shapes : (27278, 3)


In [4]:
# Show the per-column count of missing values in the raw ratings table.
print("Missing values in rating Before cleaning")
print(ratings.isna().sum())
print()

# Keep only rows where userId, movieId and rating are all present;
# nulls in other columns (e.g. timestamp) do not cause a row to be dropped.
required_cols = ['userId', 'movieId', 'rating']
ratings_clean = ratings.dropna(subset=required_cols)

n_dropped = len(ratings) - len(ratings_clean)
print(f"rows removed {n_dropped}")
print(f"ratings shape after removing the missing values : {ratings_clean.shape}")


Missing values in rating Before cleaning
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

rows removed 0
ratings shape after removing the missing values : (2512621, 4)


In [6]:
# Report and repair missing values in the movies table.
print("Missing values on movies Before cleaning")
print(movies.isnull().sum())
print()

# Rows without a title are dropped.
missing_titles = movies['title'].isnull().sum()
if missing_titles > 0:
    print(f"removing {missing_titles} movies with missing title")
    movies = movies.dropna(subset=['title'])

# Rows without genres are kept; the gap is filled with the placeholder 'Unknown'.
# Fix: the column is named 'genres' (plural). The previous fill step used
# 'genre', which does not exist and would raise KeyError as soon as a
# missing genre was actually present.
missing_genres = movies['genres'].isnull().sum()
if missing_genres > 0:
    print(f"found{missing_genres} movies with missing genre")
    # assign() returns a new frame, so no slice of the raw table is mutated.
    movies = movies.assign(genres=movies['genres'].fillna('Unknown'))
print(f"Movies shape after cleaning :{movies.shape}")



Missing values on movies Before cleaning
movieId    0
title      0
genres     0
dtype: int64

Movies shape after cleaning :(27278, 3)


In [7]:
# --- Ratings: at most one row per (userId, movieId) pair ---
rating_key = ['userId', 'movieId']
print(f"duplicate rating Before{ratings_clean.duplicated(subset=rating_key).sum()}")

# Order rows by timestamp first, so keep='last' retains the row that sorts
# last by timestamp for every (userId, movieId) pair.
ratings_clean = (
    ratings_clean
    .sort_values('timestamp')
    .drop_duplicates(subset=rating_key, keep='last')
)

print(f"ratings shape after removing duplicate :{ratings_clean.shape}")

# --- Movies: at most one row per movieId (the first occurrence wins) ---
movie_key = ['movieId']
print(f"duplicate movies before : {movies.duplicated(subset=movie_key).sum()}")

movies = movies.drop_duplicates(subset=movie_key, keep='first')
print(f"movies shape after removing duplicates :{movies.shape}")



duplicate rating Before0
ratings shape after removing duplicate :(2512621, 4)
duplicate movies before : 0
movies shape after removing duplicates :(27278, 3)


In [8]:
# Drop users that have rated fewer than Min_User_Rating movies.

# Minimum number of ratings a user needs in order to be kept.
Min_User_Rating = 10

# Number of rating rows per userId.
ratings_per_user = ratings_clean.groupby('userId').size()

# userIds that meet the threshold.
meets_threshold = ratings_per_user.ge(Min_User_Rating)
active_user = ratings_per_user.loc[meets_threshold].index

print(f"User Before filtering : {ratings_clean['userId'].nunique()}")
print(f"user with >= {Min_User_Rating} ratings:{len(active_user)}")

# Keep only the rating rows that belong to an active user.
from_active_user = ratings_clean['userId'].isin(active_user)
ratings_clean = ratings_clean.loc[from_active_user]
print(f"Rating after filtering low-activity users : {ratings_clean.shape}")


User Before filtering : 17011
user with >= 10 ratings:17011
Rating after filtering low-activity users : (2512621, 4)


In [9]:
# Drop movies that received fewer than Min_Movies_Ratings ratings.

# Minimum number of ratings a movie needs in order to be kept.
Min_Movies_Ratings = 10

# Number of rating rows per movieId (counted after the user filter above).
ratings_per_movies = ratings_clean.groupby('movieId').size()

# movieIds that meet the threshold.
meets_threshold = ratings_per_movies.ge(Min_Movies_Ratings)
popular_movies = ratings_per_movies.loc[meets_threshold].index

print(f"movies Before filtering : {ratings_clean['movieId'].nunique()}")
print(f"movies with >={Min_Movies_Ratings} ratings :{len(popular_movies)}")

# Keep only the rating rows that refer to a popular movie.
for_popular_movie = ratings_clean['movieId'].isin(popular_movies)
ratings_clean = ratings_clean.loc[for_popular_movie]

print(f"ratings after filtering unpopular movies : {ratings_clean.shape}")


movies Before filtering : 17690
movies with >=10 ratings :9391
ratings after filtering unpopular movies : (2487172, 4)


In [10]:
# Restrict the movies table to the movieIds that still appear in the
# filtered ratings. .copy() makes movies_clean an independent frame, so
# columns added to it later do not touch `movies`.
rated_movie_ids = ratings_clean['movieId'].unique()
is_rated = movies['movieId'].isin(rated_movie_ids)
movies_clean = movies.loc[is_rated].copy()

n_before, n_after = len(movies), len(movies_clean)
print(f"movies Before : {n_before}")
print(f"movies after {n_after}")
print(f"movies removed : {n_before - n_after}")


movies Before : 27278
movies after 9391
movies removed : 17887


In [11]:
# Verify that every movieId referenced by a rating has a row in movies_clean,
# and drop any rating that does not.

print("-"*30)
print("Data consistency check")
print("-"*30)

ratings_movies_ids = set(ratings_clean['movieId'].unique())
movies_movies_ids = set(movies_clean['movieId'].unique())

# movieIds that occur in ratings but have no row in the movies table.
missing_in_movies = ratings_movies_ids - movies_movies_ids
if len(missing_in_movies) > 0:
    print(f"Waring : {len(missing_in_movies)} moviesIds in ratings not found in movies")
    print("Removing these ratinggg.............")
    # Fix: the result used to be assigned to the misspelled name
    # `rating_clean`, so ratings_clean was never actually filtered.
    ratings_clean = ratings_clean[ratings_clean['movieId'].isin(movies_movies_ids)]
else:
    print("All mmovies ids in ratings exists in movies")

print(f"\n Final rating shapes : {ratings_clean.shape}")
print(f"\nFinal movies shapes : {movies_clean.shape}")


------------------------------
Data consistency check
------------------------------
All mmovies ids in ratings exists in movies

 Final rating shapes : (2487172, 4)

Final movies shapes : (9391, 3)


In [12]:
# Replace the row labels left over from filtering with a fresh 0..n-1 index
# on both cleaned tables; drop=True discards the old labels.
ratings_clean, movies_clean = (
    frame.reset_index(drop=True) for frame in (ratings_clean, movies_clean)
)
print("reset successfully")



reset successfully


In [13]:
# Persist the cleaned tables as CSV.
# NOTE(review): to_csv is called without index=False here, so the row index
# is written as an extra unnamed first column. The later saves in this
# notebook pass index=False -- confirm which layout downstream readers expect.
processed_outputs = (
    (ratings_clean,
     '/Users/yugjain/Documents/Machine_learning/Movies/datasets/processed/ratings_cleaned.csv',
     "saved success"),
    (movies_clean,
     '/Users/yugjain/Documents/Machine_learning/Movies/datasets/processed/movies_cleaned.csv',
     "saved successfully"),
)
for frame, csv_path, done_message in processed_outputs:
    frame.to_csv(csv_path)
    print(done_message)


saved success
saved successfully


In [15]:
# One-hot encode genres: a pipe-separated string such as
# "Action|Adventure|Thriller" becomes one 0/1 column per genre.
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each pipe-separated genres string into a list of genre names.
movies_clean['genres_list'] = movies_clean['genres'].str.split('|')

# Fit the binarizer on the genre lists; classes_ holds the column labels.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_clean['genres_list'])

# Label rows by movieId and columns by genre name.
genre_df = pd.DataFrame(
    genre_matrix,
    index=movies_clean['movieId'],
    columns=mlb.classes_,
)
print(f"Genre matrix shape : {genre_df.shape}")
print(f"genre found : {list(mlb.classes_)}")
print("\nFirst few rows of the genre matrix : ")
display(genre_df.head())

# Persist the genre matrix; movieId is the index and is written as the first column.
genre_df.to_csv('/Users/yugjain/Documents/Machine_learning/Movies/datasets/processed/genre_matrix.csv')
print("successfully saved")


Genre matrix shape : (9391, 19)
genre found : ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

First few rows of the genre matrix : 


Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


successfully saved


In [16]:
# Build the user-item matrix: one row per userId, one column per movieId,
# cell value = rating (NaN where the user has not rated the movie).
user_item_matrix = ratings_clean.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)

n_users, n_movies = user_item_matrix.shape
print(f"user-item matrix shape : {user_item_matrix.shape}")
print(f"user : {n_users}")
print(f"Movies : {n_movies}")

# Sparsity = percentage of cells that hold no rating.
total_cells = n_users * n_movies
filled_cells = user_item_matrix.count().sum()  # count() skips NaN cells
sparsity = (1 - filled_cells / total_cells) * 100

print(f"\nMatrix sparsity : {sparsity:.3f}%")
print(f"Filled cells {filled_cells} out of {total_cells}")

# Persist the matrix as a pickle.
with open('/Users/yugjain/Documents/Machine_learning/Movies/models/user_item_matrix.pkl', 'wb') as f:
    pickle.dump(user_item_matrix, f)
print("\n Saved successfull file user_item_matrix.pkl")


user-item matrix shape : (17011, 9391)
user : 17011
Movies : 9391

Matrix sparsity : 98.443%
Filled cells 2487172 out of 159750301

 Saved successfull file user_item_matrix.pkl


In [17]:
# Mean-centre ratings per user: subtract each user's average rating from
# every rating that user gave.

# Average rating per userId.
user_mean_ratings = ratings_clean.groupby('userId')['rating'].mean()

# Attach each row's user average, then the centred rating, as new columns.
ratings_clean['user_mean'] = ratings_clean['userId'].map(user_mean_ratings)
ratings_clean['rating_normalization'] = ratings_clean['rating'].sub(ratings_clean['user_mean'])

print("sample of normalized ratings :")
preview_cols = ['userId', 'movieId', 'rating', 'user_mean', 'rating_normalization']
display(ratings_clean[preview_cols].head(20))

# Persist the ratings together with the two new columns (row index not written).
ratings_clean.to_csv(
    '/Users/yugjain/Documents/Machine_learning/Movies/datasets/processed/ratings_with_normalized.csv',
    index=False,
)
print("\nSaved: successfully as ratings_with_normalized")


sample of normalized ratings :


Unnamed: 0,userId,movieId,rating,user_mean,rating_normalization
0,8050,70,1.0,3.268817,-2.268817
1,8050,21,3.0,3.268817,-0.268817
2,8050,10,3.0,3.268817,-0.268817
3,8050,1,5.0,3.268817,1.731183
4,8050,32,5.0,3.268817,1.731183
5,8050,50,4.0,3.268817,0.731183
6,8050,76,3.0,3.268817,-0.268817
7,8050,62,4.0,3.268817,0.731183
8,8050,16,4.0,3.268817,0.731183
9,8050,14,3.0,3.268817,-0.268817



Saved: successfully as ratings_with_normalized


In [18]:
# Split the rating rows into train and test sets for evaluation.
# test_size=0.2 holds out 20% of the rows; random_state=42 makes the split
# reproducible across runs.
train_ratings, test_ratings = train_test_split(ratings_clean, test_size=0.2, random_state=42)

# Fix: both messages were missing the closing ")" after the percentage,
# unlike the matching lines in the summary cell.
n_total = len(ratings_clean)
print(f"Train set size : {len(train_ratings)} ({len(train_ratings)/n_total*100:.2f}%)")
print(f"test set size : {len(test_ratings)}({len(test_ratings)/n_total*100:.2f}%)")

# Report how many distinct users and movies ended up on each side of the split.
print(f"\nunique user in train :{train_ratings['userId'].nunique()}")
print(f"Unique user in test : {test_ratings['userId'].nunique()}")
print(f"unique movies in train :{train_ratings['movieId'].nunique()}")
print(f"unique movies in test : {test_ratings['movieId'].nunique()}")

# Persist both splits (row index not written).
train_ratings.to_csv('/Users/yugjain/Documents/Machine_learning/Movies/datasets/splits/train_ratings.csv', index=False)
test_ratings.to_csv('/Users/yugjain/Documents/Machine_learning/Movies/datasets/splits/test_ratings.csv', index=False)

print("\n Saved train and test splits")



Train set size : 1989737 (80.00%
test set size : 497435(20.00%

unique user in train :17011
Unique user in test : 16991
unique movies in train :9391
unique movies in test : 9273

 Saved train and test splits


In [None]:
# Print a before/after summary of the whole preprocessing run.

print("*"*30)
print("Preprocessing Summary")
print("*"*30)

print("Original data")
print(f"Rating: {len(ratings):,}")
print(f"user: {ratings['userId'].nunique():,}")
print(f"Movies : {len(movies)}")

print(f"\nCleaned data:")
print(f"ratings : {len(ratings_clean)}")
print(f"user : {ratings_clean['userId'].nunique():,}")
print(f"Movies: {len(movies_clean)}")

print("\ndata removed")
# Fix: the percentage used to be written as
#   len(ratings) - len(ratings_clean)/len(ratings)*100
# which, by operator precedence, subtracts a percentage from a row count.
# The difference is now computed first and then divided; the missing
# closing ")" in the message is also restored.
removed_ratings = len(ratings) - len(ratings_clean)
removed_pct = removed_ratings / len(ratings) * 100
print(f"ratings{removed_ratings:,}({removed_pct:.2f}%)")
print(f"movies {len(movies)-len(movies_clean)}")

print("\n train test split")
print(f"train : {len(train_ratings)} ({len(train_ratings)/len(ratings_clean)*100:.2f}%)")
print(f"test : {len(test_ratings)} ({len(test_ratings)/len(ratings_clean)*100:.2f}%)")

print(f"features created: ")
print(f"genre matrix : {genre_df.shape}")
print(f"user_item matrix: {user_item_matrix.shape}")
print(" make the normalized ratings")


print("The preprocessing is complete ready for buiding")


In [21]:
# Rebuild the content-based (genre) similarity matrix for the filtered movies.
import pickle
from sklearn.metrics.pairwise import cosine_similarity

print("\nRebuilding similarity matrix with filtered movies...")

# Step 1: keep only the genre rows whose movieId is present in movies_clean.
existing_movie_ids = movies_clean['movieId'].unique()
in_clean_movies = genre_df.index.isin(existing_movie_ids)
genre_df_filtered = genre_df.loc[in_clean_movies]

print(f"Genre matrix before filtering: {genre_df.shape}")
print(f"Genre matrix after filtering: {genre_df_filtered.shape}")

# Step 2: pairwise cosine similarity between the movies' genre vectors.
similarity_matrix = cosine_similarity(genre_df_filtered)

# Step 3: label both axes with movieId so entries can be looked up by id.
movie_index = genre_df_filtered.index
similarity_df = pd.DataFrame(similarity_matrix, index=movie_index, columns=movie_index)

print(f"\nSimilarity matrix shape: {similarity_df.shape}")

# Step 4: persist the labelled similarity matrix as a pickle.
with open('/Users/yugjain/Documents/Machine_learning/Movies/models/content_similarity_matrix.pkl', 'wb') as f:
    pickle.dump(similarity_df, f)

print("✓ Similarity matrix rebuilt and saved!")
print(f"  Movies in similarity matrix: {len(similarity_df)}")



Rebuilding similarity matrix with filtered movies...
Genre matrix before filtering: (9391, 19)
Genre matrix after filtering: (9391, 19)

Similarity matrix shape: (9391, 9391)
✓ Similarity matrix rebuilt and saved!
  Movies in similarity matrix: 9391
