In [17]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import csv
from scipy.sparse import csr_matrix
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from preprocess import *
from recommendation_system_methods import *
import pickle

: 

## 1. Data loading

In [None]:
# Relative paths to the raw input files consumed by the loading cell below.
movies_data_path = "data/movie_titles.csv"
combined_data_path = "data/combined_data_1.txt"

In [3]:
# Second argument to preprocess_movie_titles and the file load_movie_titles reads.
movies_data_path_new = "data/movie_titles_new.csv"

# NOTE(review): load_combined_data comes from a star import; presumably it parses
# the combined ratings file into a DataFrame — confirm against its module.
ratings_df = load_combined_data(combined_data_path)
# Keep this call ahead of load_movie_titles: it receives both the original and the
# new path, so it presumably writes the cleaned titles file that is loaded next — TODO confirm.
preprocess_movie_titles(movies_data_path, movies_data_path_new)
movies_df = load_movie_titles(movies_data_path_new)

In [4]:
# Preview the first five rating rows.
ratings_df.head(n=5)

Unnamed: 0,movie_id,user_id,rating
0,1,1488844,3
1,1,822109,5
2,1,885013,4
3,1,30878,4
4,1,823519,3


In [5]:
# Preview the first five movie-title rows.
movies_df.head(n=5)

Unnamed: 0,movie_id,title
0,1,Dinosaur Planet
1,2,Isle of Man TT 2004 Review
2,3,Character
3,4,Paula Abdul's Get Up & Dance
4,5,The Rise and Fall of ECW


In [6]:
# Attach each rating's movie title through a join on the shared movie_id column
# (default join type, i.e. inner).
merged_df = pd.merge(ratings_df, movies_df, on="movie_id")
merged_df.head(n=5)

Unnamed: 0,movie_id,user_id,rating,title
0,1,1488844,3,Dinosaur Planet
1,1,822109,5,Dinosaur Planet
2,1,885013,4,Dinosaur Planet
3,1,30878,4,Dinosaur Planet
4,1,823519,3,Dinosaur Planet


## 2. Data preprocessing

#### Filter movies and customers by review thresholds

In [7]:
# Aggregations handed to filter_dataframe for both the movie pass and the user pass.
functions = ["count", "mean"]

# Each call yields the ids to discard plus the benchmark value reported below.
drop_movie_list, movie_benchmark = filter_dataframe(merged_df, "movie_id", functions)
print(f"Movie minimum times of review: {movie_benchmark}")

drop_cust_list, cust_benchmark = filter_dataframe(merged_df, "user_id", functions)
print(f"Customer minimum times of review: {cust_benchmark}")

# Keep only ratings whose movie AND user both survived: a row is removed when
# either its movie_id or its user_id appears in the corresponding drop list.
ratings_df = ratings_df[
    ~(
        ratings_df["movie_id"].isin(drop_movie_list)
        | ratings_df["user_id"].isin(drop_cust_list)
    )
]

Movie minimum times of review: 1799.0
Customer minimum times of review: 52.0


## 3. Apply collaborative filtering

In [8]:
# Use Surprise for collaborative filtering
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[["user_id", "movie_id", "rating"]], reader)

# Hold out 20% of the ratings for evaluation; the split is seeded so it is repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Seed the model as well: SVD initialises its factors randomly, so without
# random_state the reported RMSE changes on every Restart & Run All even though
# the train/test split is fixed.
model = SVD(random_state=42)
model.fit(trainset)

predictions = model.test(testset)
# accuracy.rmse prints its own "RMSE: ..." line and also returns the value,
# which is printed again below at full precision.
rmse = accuracy.rmse(predictions)
print(f"Model RMSE: {rmse}")


RMSE: 0.8478
Model RMSE: 0.8478395996778104


## 4. Save DataFrames and model to pickle files

In [16]:
# Persist the filtered ratings and the movie titles, each to its own pickle file.
for frame, pickle_path in ((ratings_df, "ratings_df.pkl"), (movies_df, "movies_df.pkl")):
    save_dataframe_to_pickle(frame, pickle_path)

# NOTE(review): a notebook-level save_model_to_pickle is defined in a later cell;
# on a fresh top-to-bottom run this call relies on one of the star imports
# providing the name — confirm, or move that definition above this cell.
save_model_to_pickle(model)

DataFrame saved to ratings_df.pkl
DataFrame saved to movies_df.pkl


### Example usage

In [11]:
# Step 5: Create a function to display movies seen by a user
def seen_movies(user_id, ratings=None, movies=None):
    """Return the movies a user has rated, with titles and the ratings given.

    Parameters
    ----------
    user_id
        Value matched against the ``user_id`` column of the ratings frame.
    ratings : DataFrame, optional
        Frame with ``user_id``, ``movie_id`` and ``rating`` columns. Defaults to
        the notebook-level ``ratings_df``.
    movies : DataFrame, optional
        Frame with ``movie_id`` and ``title`` columns. Defaults to the
        notebook-level ``movies_df``.

    Returns
    -------
    DataFrame
        Columns ``movie_id``, ``title``, ``rating``; empty when the user has no
        rows in ``ratings``.
    """
    # Fall back to the notebook globals only when no frame is supplied, so
    # existing one-argument calls behave exactly as before.
    if ratings is None:
        ratings = ratings_df
    if movies is None:
        movies = movies_df

    user_ratings = ratings[ratings["user_id"] == user_id]
    seen = user_ratings.merge(movies, on="movie_id")
    return seen[["movie_id", "title", "rating"]]

In [12]:
# Find top users by movie count
top_users = top_users_by_movie_count(10)
# One print call; the newline separator puts the heading and the table on separate lines.
print("Top users by the number of movies seen:", top_users, sep="\n")


Top users by the number of movies seen:
user_id
305344     1344
387418     1339
2439493    1324
2118461    1305
1664010    1257
716173     1169
1314869    1132
2056022    1072
1852040    1053
2606799    1039
dtype: int64


In [13]:

# Example usage
num_recommendations = 5
user_id = 305344   # Replace with a valid user ID

# Display movies the user has already seen
seen = seen_movies(user_id)
print(f"Movies already seen by user {user_id}:", seen, sep="\n")

# Get recommendations and display them under a heading
recommendations = recommend_movies(user_id, num_recommendations)
print(
    f"Top {num_recommendations} recommendations for user {user_id}:",
    recommendations,
    sep="\n",
)



Movies already seen by user 305344:
      movie_id                       title  rating
0            3                   Character       2
1            8  What the #$*! Do We Know!?       1
2           16                   Screamers       1
3           17                   7 Seconds       2
4           18            Immortal Beloved       2
...        ...                         ...     ...
1339      4488                 Wonder Boys       3
1340      4490                   Ned Kelly       2
1341      4492                  Club Dread       1
1342      4493           Ju-on: The Grudge       1
1343      4496       Farewell My Concubine       1

[1344 rows x 3 columns]
Top 5 recommendations for user 305344:
   movie_id           title  estimated_rating
0      3153  Doctor Zhivago          3.176996
1      2251      The Rookie          3.084893
2       886             Ray          2.889629
3      3309      The Pledge          2.862669
4      3680         Hackers          2.178755


In [14]:
# Display ratings_df as it stands after the movie/user threshold filtering above.
ratings_df

Unnamed: 0,movie_id,user_id,rating
693,3,712664,5
694,3,1331154,4
695,3,2632461,3
696,3,44937,5
697,3,656399,4
...,...,...,...
24052345,4496,520675,3
24052346,4496,1055714,5
24052347,4496,2643029,4
24052349,4496,1559566,3


In [15]:
import pickle

# Step 4: Save the model to a pickle file
def save_model_to_pickle(model, filename="svd_model.pkl"):
    """Pickle ``model`` into ``filename`` and print a confirmation line.

    Parameters
    ----------
    model
        Any picklable object (the notebook passes its fitted model).
    filename : str, optional
        Destination path, opened in binary write mode so an existing file is
        overwritten. Defaults to ``"svd_model.pkl"``.
    """
    with open(filename, "wb") as pickle_handle:
        pickle.dump(model, pickle_handle)
    confirmation = f"Model saved to {filename}"
    print(confirmation)

# Write the fitted model to the default path, spelled out for clarity.
# NOTE(review): the section-4 cell already calls save_model_to_pickle(model);
# this may be a redundant second save — confirm whether both are needed.
save_model_to_pickle(model, filename="svd_model.pkl")

Model saved to svd_model.pkl
