In [None]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install seaborn
# %pip install scikit-surprise
# %pip install joblib

In [None]:
# Import necessary libraries
import gc
import os
import zipfile

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from surprise import Dataset, Reader, SVD
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split

In [None]:
# Function to read a single data file and return a DataFrame
def read_data_file(file_path):
    """Parse one ratings file into a DataFrame.

    The file alternates between header lines of the form ``<movie_id>:`` and
    rating lines of the form ``<customer_id>,<rating>,<date>``. Every rating
    line is attributed to the most recent header above it.

    Args:
        file_path: Path of the text file to read.

    Returns:
        pandas.DataFrame with columns ``Cust_Id`` (int), ``Movie_Id`` (int),
        ``Rating`` (float) and ``Date`` (str, kept exactly as written).

    Raises:
        ValueError: If a rating line appears before any movie header, or if a
            line cannot be parsed.
    """
    data_list = []
    current_movie_id = None
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no data;
                # without this guard the tuple unpack below would fail.
                continue
            if line.endswith(':'):
                current_movie_id = int(line.replace(':', ''))
                continue
            if current_movie_id is None:
                # Refuse to emit rows with Movie_Id=None: they would silently
                # corrupt everything built on top of this frame.
                raise ValueError(
                    f"{file_path}: rating on line {line_number} appears before any movie header"
                )
            customer_id, rating, date = line.split(',')
            data_list.append([int(customer_id), current_movie_id, float(rating), date])
    return pd.DataFrame(data_list, columns=['Cust_Id', 'Movie_Id', 'Rating', 'Date'])

In [None]:
# Function to filter out inactive users based on the specified quantile threshold
def filter_active_users(ratings, quantile_threshold=0.7):
    """Return only the rows of the customers with the most ratings.

    Args:
        ratings: DataFrame with a ``Cust_Id`` column, one row per rating.
        quantile_threshold: Quantile of the per-customer rating counts used
            as the cut-off; customers at or above it are kept.

    Returns:
        The subset of ``ratings`` whose ``Cust_Id`` meets the cut-off
        (original index labels are preserved).
    """
    per_user_counts = ratings['Cust_Id'].value_counts()
    cutoff = per_user_counts.quantile(quantile_threshold)
    kept_ids = per_user_counts.loc[per_user_counts >= cutoff].index
    row_mask = ratings['Cust_Id'].isin(kept_ids)
    return ratings[row_mask]

In [None]:
# Function to prepare data for the Surprise library
def prepare_data_for_surprise(ratings):
    """Wrap a ratings DataFrame in a Surprise ``Dataset``.

    Args:
        ratings: DataFrame containing at least the columns ``Cust_Id``,
            ``Movie_Id`` and ``Rating``.

    Returns:
        The object produced by ``Dataset.load_from_df`` for those three
        columns (in that order), read with a 1-5 rating scale.
    """
    surprise_columns = ['Cust_Id', 'Movie_Id', 'Rating']
    scale_reader = Reader(rating_scale=(1, 5))
    triples = ratings[surprise_columns]
    return Dataset.load_from_df(triples, scale_reader)

In [None]:
# List of combined data files
# File names only; the loading cells join each one onto the dataset directory.
# Entries are addressed by position (index 0-3) further down the notebook.
data_files = [
    'combined_data_1.txt',
    'combined_data_2.txt',
    'combined_data_3.txt',
    'combined_data_4.txt'
]

In [None]:
# Parse the first two rating files (data_files[0] and data_files[1]) for training
train_ratings_1, train_ratings_2 = (
    read_data_file(os.path.join('/kaggle/input/netflix-prize-data/', shard_name))
    for shard_name in data_files[:2]
)

# Stack both frames into a single training frame
train_ratings = pd.concat([train_ratings_1, train_ratings_2])

print(train_ratings.head())


In [None]:
# The per-file frames have been concatenated; drop their notebook-level names
del train_ratings_1, train_ratings_2

In [None]:
# Keep only ratings from active customers (helper's default quantile threshold)
train_ratings = train_ratings.pipe(filter_active_users)
print(train_ratings.shape)


In [None]:
# Basic statistics for training data
# describe() summarises the numeric columns of the filtered frame.
print(train_ratings.describe())


In [None]:
# Rating distribution: one bar per distinct rating value in the training data
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train_ratings, x='Rating', palette='viridis', ax=ax)
ax.set_title('Rating Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Number of ratings per movie, shown as a histogram with a log-scaled y axis
ratings_per_movie = train_ratings.groupby('Movie_Id').size()
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(ratings_per_movie, bins=50, color='purple')
ax.set_title('Number of Ratings per Movie')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Count')
ax.set_yscale('log')
plt.show()


In [None]:
# Number of ratings per user, shown as a histogram with a log-scaled y axis
ratings_per_user = train_ratings.groupby('Cust_Id').size()
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(ratings_per_user, bins=50, color='orange')
ax.set_title('Number of Ratings per User')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Count')
ax.set_yscale('log')
plt.show()


In [None]:
# Prepare training data for the Surprise library
# Wraps the Cust_Id / Movie_Id / Rating columns of the filtered frame in a
# Surprise dataset via the helper defined earlier in the notebook.
train_data = prepare_data_for_surprise(train_ratings)

In [None]:
# Drop the notebook-level name of the ratings DataFrame; the cells that follow
# work from train_data instead.
del train_ratings

In [None]:
# Split the training data into train and validation sets (75% / 25%).
# A fixed random_state makes the shuffle, and therefore the reported
# validation metrics, reproducible across Restart & Run All.
trainset, valset = train_test_split(train_data, test_size=0.25, random_state=42)


In [None]:
# Drop the notebook-level name of the full dataset; only trainset and valset
# are used from here on.
del train_data

In [None]:
# Define the SVD model: 20 SGD epochs, other hyper-parameters left at library
# defaults. random_state pins the random initialisation so that repeated runs
# of the notebook train the same model.
algo = SVD(n_epochs=20, random_state=42)

In [None]:
# Train the SVD model
# Fits on the 75% training split produced by train_test_split above.
algo.fit(trainset)


In [None]:
# Evaluate the model on the validation set
# algo.test() scores every rating held out in valset; the two accuracy helpers
# reduce those predictions to a single RMSE / MAE value each.
# NOTE(review): Surprise's accuracy helpers look verbose by default, so each
# metric may be printed twice (once by the helper, once below) - confirm.
val_predictions = algo.test(valset)
val_rmse = accuracy.rmse(val_predictions)
val_mae = accuracy.mae(val_predictions)
print(f'Validation RMSE: {val_rmse}, Validation MAE: {val_mae}')


In [None]:
# Clear unnecessary data structures to free up memory.
# gc is imported with the other modules in the notebook's import cell.
del trainset, valset, val_predictions

gc.collect()

In [None]:
# Save the trained model to a file
# The path is relative, so the file lands in the notebook's working directory.
model_filename = 'svd_model.joblib'
joblib.dump(algo, model_filename)
print(f"Model saved to {model_filename}")

In [None]:
# Load the saved model
# Rebinds algo to the object read back from disk (a save/load round trip of
# the model written by the previous cell).
# joblib.load unpickles the file: only load model files you produced yourself.
model_filename = 'svd_model.joblib'
algo = joblib.load(model_filename)
print("Model loaded successfully")

In [None]:
# Parse the remaining two rating files (data_files[2] and data_files[3])
train_ratings_3, train_ratings_4 = (
    read_data_file(os.path.join('/kaggle/input/netflix-prize-data/', shard_name))
    for shard_name in data_files[2:4]
)

In [None]:
# Concatenate the two additional files into one frame.
# Only files 3 and 4 are combined here; the ratings from files 1 and 2 are not
# part of this frame.
additional_ratings = pd.concat([train_ratings_3, train_ratings_4])

In [None]:
# The per-file frames have been concatenated; drop their notebook-level names
del train_ratings_3, train_ratings_4

In [None]:
# Keep only ratings from active customers in the additional data
# (the activity cut-off is computed on this frame alone)
additional_ratings = additional_ratings.pipe(filter_active_users)
print(additional_ratings.shape)

In [None]:
# Basic statistics for additional data
# describe() summarises the numeric columns of the filtered frame.
print(additional_ratings.describe())

In [None]:
# Rating distribution: one bar per distinct rating value in the additional data
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=additional_ratings, x='Rating', palette='viridis', ax=ax)
ax.set_title('Rating Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of ratings per movie in the additional data (log-scaled y axis)
ratings_per_movie = additional_ratings.groupby('Movie_Id').size()
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(ratings_per_movie, bins=50, color='purple')
ax.set_title('Number of Ratings per Movie')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Count')
ax.set_yscale('log')
plt.show()

In [None]:
# Number of ratings per user in the additional data (log-scaled y axis)
ratings_per_user = additional_ratings.groupby('Cust_Id').size()
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(ratings_per_user, bins=50, color='orange')
ax.set_title('Number of Ratings per User')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Count')
ax.set_yscale('log')
plt.show()

In [None]:
# Prepare additional data for the Surprise library
# Same helper as for the first training round, applied to the filtered
# additional ratings.
additional_data = prepare_data_for_surprise(additional_ratings)

In [None]:
# Drop the notebook-level name of the additional ratings DataFrame; the cells
# that follow work from additional_data instead.
del additional_ratings

In [None]:
# Split the additional data into train and validation sets (75% / 25%).
# A fixed random_state makes the shuffle, and therefore the reported
# validation metrics, reproducible across Restart & Run All.
additional_trainset, additional_valset = train_test_split(additional_data, test_size=0.25, random_state=42)

In [None]:
# Drop the notebook-level name of the full additional dataset; only the two
# splits are used from here on.
del additional_data

In [None]:
# Refit the model on the additional training split.
# NOTE(review): Surprise's SVD.fit() appears to re-initialise its biases and
# latent factors on every call, so this is a fresh retrain on files 3-4 only
# rather than an incremental update of the earlier fit - confirm against the
# Surprise documentation before relying on "incremental" behaviour.
algo.fit(additional_trainset)

In [None]:
# Evaluate the model on the additional validation set
# algo.test() scores every rating held out in additional_valset; the two
# accuracy helpers reduce those predictions to a single RMSE / MAE value each.
# NOTE(review): Surprise's accuracy helpers look verbose by default, so each
# metric may be printed twice (once by the helper, once below) - confirm.
additional_val_predictions = algo.test(additional_valset)
additional_val_rmse = accuracy.rmse(additional_val_predictions)
additional_val_mae = accuracy.mae(additional_val_predictions)
print(f'Additional Validation RMSE: {additional_val_rmse}, Additional Validation MAE: {additional_val_mae}')

In [None]:
# Clear unnecessary data structures to free up memory.
del additional_trainset, additional_valset
# The prediction list holds one entry per validation rating; release it as
# well, matching the cleanup performed after the first training round.
del additional_val_predictions

# gc is imported with the other modules in the notebook's import cell.
gc.collect()

In [None]:
# Save the updated model
# Written under a different file name, so the first model file is left intact.
updated_model_filename = 'svd_model_updated.joblib'
joblib.dump(algo, updated_model_filename)
print(f"Updated model saved to {updated_model_filename}")

In [None]:
# Path to the model file
model_filename = 'svd_model_updated.joblib'

# Name of the zip file
zip_filename = 'svd_model.zip'

# Create a zip file and add the model file to it.
# ZipFile stores members uncompressed unless a compression method is given,
# so request DEFLATE to make the archive actually smaller than the model file.
# (zipfile is imported with the other modules in the notebook's import cell.)
with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(model_filename)

print(f"Model has been zipped and saved as {zip_filename}")


In [None]:
# Load the saved model
updated_model_filename = 'svd_model_updated.joblib'
# Load through the name assigned on the line above, so this cell does not
# depend on whatever value an earlier cell left in another variable.
# joblib.load unpickles the file: only load model files you produced yourself.
algo = joblib.load(updated_model_filename)
print("Model loaded successfully")

In [None]:
# Function to get top-N recommendations for a user
def get_top_n_recommendations(algo, user_id, movie_ids, n=10):
    """Return the ``n`` movies with the highest predicted rating for a user.

    Args:
        algo: Fitted model exposing ``predict(user_id, movie_id)``; each
            returned prediction must provide ``.iid`` and ``.est``.
        user_id: Raw user id, in the same form used when the model was trained.
        movie_ids: Iterable of raw movie ids to score, in the same form used
            when the model was trained.
        n: Maximum number of recommendations to return.

    Returns:
        Tuple ``(movie_ids, scores)`` of two equally long lists, ordered by
        descending predicted rating (ties keep their input order).
    """
    # Pass the ids through unchanged. The ratings in this notebook are loaded
    # with integer movie ids, so converting them to str here would make every
    # lookup miss and each movie would be scored as an unknown item.
    predictions = [algo.predict(user_id, movie_id) for movie_id in movie_ids]

    # Sort the predictions by estimated rating in descending order
    top_n_predictions = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:n]

    # Extract the movie IDs and estimated scores from the top-N predictions
    top_n_movie_ids = [int(pred.iid) for pred in top_n_predictions]
    top_n_scores = [pred.est for pred in top_n_predictions]

    return top_n_movie_ids, top_n_scores

In [None]:
# Load movie titles with custom parser to handle inconsistent number of columns
def read_movie_titles(file_path):
    """Parse the movie-titles file into a DataFrame.

    Each line is split on its first two commas only, so a title may itself
    contain commas. Lines that do not yield three fields, or whose id or year
    is not an integer, are skipped; a year of ``NULL`` is stored as ``None``.

    Args:
        file_path: Path of the ISO-8859-1 encoded titles file.

    Returns:
        pandas.DataFrame with columns ``Movie_Id``, ``Year`` and ``Name``.
    """
    rows = []
    with open(file_path, 'r', encoding='ISO-8859-1') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(',', 2)
            if len(fields) != 3:
                continue
            id_text, year_text, title = fields
            try:
                parsed_id = int(id_text)
                parsed_year = None if year_text == 'NULL' else int(year_text)
            except ValueError:
                # Non-numeric id or year: drop the whole line
                continue
            rows.append([parsed_id, parsed_year, title])
    return pd.DataFrame(rows, columns=['Movie_Id', 'Year', 'Name'])

# Parse the title catalogue from the dataset directory into a DataFrame with
# Movie_Id, Year and Name columns.
movie_titles = read_movie_titles('/kaggle/input/netflix-prize-data/movie_titles.csv')

In [None]:
# Get all movie IDs from the movie titles dataset
# These are the candidate items scored for every user further down, so the
# candidates come from the title catalogue rather than from the training data.
all_movie_ids = movie_titles['Movie_Id'].unique()

In [None]:
# Load the qualifying dataset
def read_qualifying_file(file_path):
    """Parse a qualifying file into a DataFrame of (customer, movie) pairs.

    The file alternates between header lines of the form ``<movie_id>:`` and
    lines of the form ``<customer_id>,<date>``. Every such line is attributed
    to the most recent header above it; the date is validated as a second
    field but not kept.

    Args:
        file_path: Path of the text file to read.

    Returns:
        pandas.DataFrame with integer columns ``Cust_Id`` and ``Movie_Id``.

    Raises:
        ValueError: If a customer line appears before any movie header, or if
            a line cannot be parsed.
    """
    qualifying_data = []
    current_movie_id = None
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no data;
                # without this guard the tuple unpack below would fail.
                continue
            if line.endswith(':'):
                current_movie_id = int(line.replace(':', ''))
                continue
            if current_movie_id is None:
                # Refuse to emit rows with Movie_Id=None
                raise ValueError(
                    f"{file_path}: customer on line {line_number} appears before any movie header"
                )
            # Unpacking into two names keeps the "exactly two fields" check
            customer_id, _date = line.split(',')
            qualifying_data.append([int(customer_id), current_movie_id])
    return pd.DataFrame(qualifying_data, columns=['Cust_Id', 'Movie_Id'])

# Parse the qualifying file from the dataset directory into a DataFrame of
# (Cust_Id, Movie_Id) pairs.
qualifying_data = read_qualifying_file('/kaggle/input/netflix-prize-data/qualifying.txt')

In [None]:
# Get unique user IDs from the qualifying dataset
# Only the customer column is used from here on; the movie each customer was
# paired with in the qualifying file is not consulted again.
unique_user_ids = qualifying_data['Cust_Id'].unique()

In [None]:
# Generate predictions for the qualifying set
# For every distinct customer, score all ids in all_movie_ids and keep the ten
# best as [user, movie, score] rows.
# NOTE(review): each iteration calls predict once per movie id, so the total
# work is len(unique_user_ids) * len(all_movie_ids) predictions - expect this
# cell to dominate the notebook's run time.
# The loop variables (user_id, movie_id, score, ...) stay defined at notebook
# scope after the loop finishes.
qualifying_predictions = []
for user_id in unique_user_ids:
    top_n_recommendations, top_n_scores = get_top_n_recommendations(algo, user_id, all_movie_ids, n=10)
    for movie_id, score in zip(top_n_recommendations, top_n_scores):
        qualifying_predictions.append([user_id, movie_id, score])

# One row per recommended (customer, movie) pair
qualifying_predictions_df = pd.DataFrame(qualifying_predictions, columns=['Cust_Id', 'Movie_Id', 'Estimated_Score'])

In [None]:
# Merge with movie titles
# Joins on Movie_Id (merge's default inner join) to attach Year and Name to
# each recommendation row.
# NOTE(review): the result is assigned back to the same name, so re-running
# this cell would merge the title columns a second time.
qualifying_predictions_df = qualifying_predictions_df.merge(movie_titles, on='Movie_Id')


In [None]:
# Display the results for a specific user (for example, user_id = 712664)
# The id is held in one variable so that the filter and the heading always
# refer to the same customer, independent of any value left over from an
# earlier loop.
target_user_id = 712664
specific_user_predictions = qualifying_predictions_df[qualifying_predictions_df['Cust_Id'] == target_user_id]
print(f"Top 10 movie recommendations for user {target_user_id}:")
print(specific_user_predictions[['Movie_Id', 'Year', 'Name', 'Estimated_Score']])