In [15]:
import numpy as numpy
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from scipy.sparse import coo_matrix

In [2]:
users= pd.read_fwf('zee-users.dat',encoding='ISO-8859-1')
users=users['UserID::Gender::Age::Occupation::Zip-code'].str.split("::",expand=True)
users.columns=['UserId','Gender','Age','Occupation','Zip-code']
users

Unnamed: 0,UserId,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [3]:
movies=pd.read_fwf('zee-movies.dat',encoding='ISO-8859-1')
movies=movies['Movie ID::Title::Genres'].str.split("::",expand=True)
movies.columns=['MovieId','Title','Genres']
movies

Unnamed: 0,MovieId,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
ratings=pd.read_fwf('zee-ratings.dat',encoding='ISO-8859-1')
ratings=ratings['UserID::MovieID::Rating::Timestamp'].str.split("::",expand=True)
ratings.columns=['UserId','MovieId','Rating','Timestamp']
ratings

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
df1= pd.merge(movies,ratings, how='inner',on='MovieId')
data= pd.merge(df1,users, how='inner',on='UserId')
data

Unnamed: 0,MovieId,Title,Genres,UserId,Rating,Timestamp,Gender,Age,Occupation,Zip-code
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268,F,1,10,48067
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008,F,50,9,55117
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496,M,25,12,11413
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952,M,25,17,61614
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474,F,35,1,95370
...,...,...,...,...,...,...,...,...,...,...
1000204,3952,"Contender, The (2000)",Drama|Thriller,5812,4,992072099,F,25,7,92120
1000205,3952,"Contender, The (2000)",Drama|Thriller,5831,3,986223125,M,25,1,92120
1000206,3952,"Contender, The (2000)",Drama|Thriller,5837,4,1011902656,M,25,7,60607
1000207,3952,"Contender, The (2000)",Drama|Thriller,5927,1,979852537,M,35,14,10003


In [7]:
# convert Rating to float
data['Rating']= data['Rating'].astype(float)

In [8]:
df_pivot= pd.pivot_table(data,index='UserId',columns='MovieId',values='Rating')
df_pivot

MovieId,1,10,100,1000,1002,1003,1004,1005,1006,1007,...,99,990,991,992,993,994,996,997,998,999
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
10,5.0,,,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,
1000,5.0,,,,,,,,,,...,,,,,,,,,,
1001,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,,,,,,,,,,...,,,,,,,,,,
996,4.0,,,,,,,,,,...,,,,,,,,,,3.0
997,4.0,,,,,,,,,,...,,,,,,,,,,
998,,,,,,,,,,,...,,,,,,,,,,


In [9]:
df_pivot.isna().sum().sum()

21384031

In [13]:
sparsity = df_pivot.isna().sum().sum()/df_pivot.size
print("sparsity of the data" ,round(sparsity*100,2) ,"%")

sparsity of the data 95.53 %


In [32]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import warnings
warnings.filterwarnings("ignore")


# Prepare Surprise dataset
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data[['UserId', 'MovieId', 'Rating']], reader)

# Train SVD with hyperparameter tuning
param_grid = {'n_factors': [2, 4, 10, 20]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(dataset)
best_model = gs.best_estimator['rmse']

# Train best model on full data
trainset = dataset.build_full_trainset()
best_model.fit(trainset)

# Create User-Movie Matrix with actual ratings
actual_ratings_matrix = data.pivot(index="UserId", columns="MovieId", values="Rating")

# Predict missing values
predicted_ratings_matrix = actual_ratings_matrix.copy()
user_ids = actual_ratings_matrix.index
movie_ids = actual_ratings_matrix.columns
for user in user_ids:
    for movie in movie_ids:
        if pd.isna(actual_ratings_matrix.at[user, movie]):
            predicted_ratings_matrix.at[user, movie] = best_model.predict(user, movie).est

# Replace MovieID with Titles
movieid_to_title = dict(zip(movies.MovieId, movies.Title))
predicted_ratings_matrix.columns = [movieid_to_title.get(col, col) for col in predicted_ratings_matrix.columns]

# Content-Based Filtering using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
movie_genres_matrix = vectorizer.fit_transform(movies['Genres'].fillna(""))
cosine_sim = cosine_similarity(movie_genres_matrix, movie_genres_matrix)

def recommend_movies(movie_title, top_n=5):
    if movie_title not in movies['Title'].values:
        return f"Error: Movie '{movie_title}' not found in the dataset."
    idx = movies[movies['Title'] == movie_title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][['Title', 'Genres']]

# MLflow Logging
mlflow.set_experiment("Hybrid Recommender System")
with mlflow.start_run():
    mlflow.log_param("Best_n_factors", gs.best_params['rmse']['n_factors'])
    
    testset = trainset.build_testset()
    predictions = best_model.test(testset)
    y_true = np.array([pred.r_ui for pred in predictions])
    y_pred = np.array([pred.est for pred in predictions])
    
    if len(y_true) > 0 and len(y_pred) > 0:
        rmse = mean_squared_error(y_true, y_pred) ** 0.5  # Manually taking the square root
        mape = mean_absolute_percentage_error(y_true, y_pred)
    else:
        rmse, mape = np.nan, np.nan
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAPE", mape)
    mlflow.sklearn.log_model(best_model, "SVD_Model")
    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("mlflow.runName", "Hybrid_Recommender")

    

    


Note: you may need to restart the kernel to use updated packages.




In [33]:
recommend_movies("Liar Liar", top_n=5)

"Error: Movie 'Liar Liar' not found in the dataset."

In [35]:
recommend_movies("Toy Story (1995)", top_n=10)

Unnamed: 0,Title,Genres
2072,"American Tail, An (1986)",Animation|Children's|Comedy
2285,"Rugrats Movie, The (1998)",Animation|Children's|Comedy
2286,"Bug's Life, A (1998)",Animation|Children's|Comedy
3045,Toy Story 2 (1999),Animation|Children's|Comedy
3542,Saludos Amigos (1943),Animation|Children's|Comedy
3682,Chicken Run (2000),Animation|Children's|Comedy
12,Balto (1995),Animation|Children's
241,Gumby: The Movie (1995),Animation|Children's
310,"Swan Princess, The (1994)",Animation|Children's
592,Pinocchio (1940),Animation|Children's


In [38]:
recommend_movies("Waiting to Exhale (1995)", top_n=10)

Unnamed: 0,Title,Genres
44,To Die For (1995),Comedy|Drama
71,Kicking and Screaming (1995),Comedy|Drama
74,Big Bully (1996),Comedy|Drama
83,Last Summer in the Hamptons (1995),Comedy|Drama
104,Nobody Loves Me (Keiner liebt mich) (1994),Comedy|Drama
131,Nueba Yol (1995),Comedy|Drama
164,"Doom Generation, The (1995)",Comedy|Drama
203,Unstrung Heroes (1995),Comedy|Drama
216,Boys on the Side (1995),Comedy|Drama
229,Eat Drink Man Woman (1994),Comedy|Drama


In [24]:
movies

Unnamed: 0,MovieId,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [26]:
recommend_movies("Two Family House (2000)", top_n=5)

Unnamed: 0,Title,Genres
25,Othello (1995),Drama
26,Now and Then (1995),Drama
29,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Drama
30,Dangerous Minds (1995),Drama
35,Dead Man Walking (1995),Drama


In [39]:
!mlflow ui

[2025-02-02 23:27:55 +0530] [34498] [INFO] Starting gunicorn 23.0.0
[2025-02-02 23:27:55 +0530] [34498] [INFO] Listening at: http://127.0.0.1:5000 (34498)
[2025-02-02 23:27:55 +0530] [34498] [INFO] Using worker: sync
[2025-02-02 23:27:55 +0530] [34499] [INFO] Booting worker with pid: 34499
[2025-02-02 23:27:55 +0530] [34500] [INFO] Booting worker with pid: 34500
[2025-02-02 23:27:55 +0530] [34501] [INFO] Booting worker with pid: 34501
[2025-02-02 23:27:55 +0530] [34502] [INFO] Booting worker with pid: 34502
^C
[2025-02-02 23:55:12 +0530] [34498] [INFO] Handling signal: int
[2025-02-02 23:55:12 +0530] [34499] [INFO] Worker exiting (pid: 34499)
[2025-02-02 23:55:12 +0530] [34502] [INFO] Worker exiting (pid: 34502)
[2025-02-02 23:55:12 +0530] [34500] [INFO] Worker exiting (pid: 34500)
[2025-02-02 23:55:12 +0530] [34501] [INFO] Worker exiting (pid: 34501)
