In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from math import sqrt
import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:

# Locations of the train/test rating files inside the Kaggle dataset folder
DATA_DIR = '/kaggle/input/movie-data'
train_path = f'{DATA_DIR}/recommendation-ratings-train.txt'
test_path = f'{DATA_DIR}/recommendation-ratings-test.txt'

# Load the data
def load_data(file_path):
    """
    Read a tab-separated ratings file into a DataFrame.

    The file is parsed without a header row; the four fields of each line
    are labelled userId, movieId, rating and timestamp, in that order.

    Parameters:
    -----------
    file_path : str or file-like
        Anything accepted by pandas.read_csv

    Returns:
    --------
    pandas.DataFrame
        One row per line of the file
    """
    return pd.read_csv(
        file_path,
        sep='\t',
        header=None,  # passing names without a header already implies this
        names=['userId', 'movieId', 'rating', 'timestamp'],
    )

# Read both rating splits
train_data = load_data(train_path)
test_data = load_data(test_path)

# Report how many rows/columns each split has
print("Training data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

# Peek at the first rows of the training split
print("\nTraining data sample:")
print(train_data.head())

# Summary statistics of the training columns
print("\nTraining data statistics:")
print(train_data.describe())

# Per-column null counts for each split
for heading, frame in (
    ("\nMissing values in training data:", train_data),
    ("\nMissing values in test data:", test_data),
):
    print(heading)
    print(frame.isnull().sum())

# Bar chart of how often each rating value occurs in the training split
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='rating', data=train_data, ax=ax)
ax.set_title('Rating Distribution in Training Data')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

# Create user-item matrices
def create_matrix(data):
    """
    Pivot long-format ratings into a user-by-movie table.

    Rows are userId values, columns are movieId values and each cell holds
    the rating (pivot_table's default mean if a pair occurs more than once);
    pairs that never occur are NaN.
    """
    return pd.pivot_table(data, values='rating', index='userId', columns='movieId')

user_item_matrix_train = create_matrix(train_data)
print("\nUser-Item Matrix Shape (Train):", user_item_matrix_train.shape)

# Sparsity = percentage of user/movie cells that hold no rating
n_users, n_movies = user_item_matrix_train.shape
n_observed = user_item_matrix_train.count().sum()
sparsity = 100 * (1 - n_observed / (n_users * n_movies))
print(f"Matrix Sparsity: {sparsity:.2f}%")


In [None]:

class UserCF:
    """
    User-based collaborative filtering on a user-item rating matrix.

    A rating for (user, item) is predicted as the similarity-weighted average
    of the ratings that the most similar users gave to that item.
    """

    def __init__(self, similarity_method='pearson', k=30):
        """
        Initialize User-based Collaborative Filtering

        Parameters:
        -----------
        similarity_method : str
            Method to calculate similarity ('pearson' or 'cosine')
        k : int
            Number of neighbors to consider
        """
        self.similarity_method = similarity_method
        self.k = k
        self.user_similarity = None
        self.user_item_matrix = None
        # Mean of all observed ratings, computed lazily and reused by fallbacks
        self._global_mean = None

    def fit(self, user_item_matrix):
        """
        Fit the model with user-item matrix

        Parameters:
        -----------
        user_item_matrix : pandas.DataFrame
            User-item matrix with users as index and items as columns

        Raises:
        -------
        ValueError
            If similarity_method is neither 'pearson' nor 'cosine'
        """
        self.user_item_matrix = user_item_matrix
        self._global_mean = None  # drop any value cached for a previous matrix

        # Calculate user similarity matrix
        if self.similarity_method == 'pearson':
            self.user_similarity = self.user_item_matrix.T.corr(method='pearson')
        elif self.similarity_method == 'cosine':
            self.user_similarity = self._cosine_similarity(self.user_item_matrix)
        else:
            raise ValueError(
                f"Unknown similarity_method {self.similarity_method!r}; "
                "expected 'pearson' or 'cosine'"
            )

        print(f"User similarity matrix shape: {self.user_similarity.shape}")

    @staticmethod
    def _cosine_similarity(user_item_matrix):
        """Cosine similarity between mean-centered user rows, as a float DataFrame."""
        # Center every user on their own mean; unrated cells then contribute 0
        centered = user_item_matrix.subtract(user_item_matrix.mean(axis=1), axis=0).fillna(0)
        vectors = centered.to_numpy(dtype=float)

        # Scale rows to unit length. Rows with zero norm stay all-zero, so
        # their similarity to every user (themselves included) is 0.
        norms = np.linalg.norm(vectors, axis=1)
        unit_vectors = vectors / np.where(norms == 0, 1.0, norms)[:, np.newaxis]

        # A single matrix product replaces the pairwise Python loop and yields
        # a float64 result, which Series.nlargest requires in predict().
        return pd.DataFrame(
            unit_vectors @ unit_vectors.T,
            index=user_item_matrix.index,
            columns=user_item_matrix.index,
        )

    def _global_mean_rating(self):
        """Mean of every observed rating in the fitted matrix (cached)."""
        if getattr(self, '_global_mean', None) is None:
            self._global_mean = self.user_item_matrix.stack().mean()
        return self._global_mean

    def _user_mean_or_global(self, user_id):
        """The user's mean rating, or the global mean when the user has no ratings."""
        user_mean = self.user_item_matrix.loc[user_id].mean()
        return user_mean if not np.isnan(user_mean) else self._global_mean_rating()

    def predict(self, user_id, item_id):
        """
        Predict rating for a user-item pair

        Only neighbors with a strictly positive similarity are used, so the
        prediction is a convex combination of ratings actually given to the
        item. Neighbors whose similarity is NaN (e.g. Pearson with too little
        overlap), zero or negative are ignored.

        Parameters:
        -----------
        user_id : int
            User ID
        item_id : int
            Item ID

        Returns:
        --------
        float
            Predicted rating. Falls back to the global mean for unknown
            users/items and to the user's mean when no usable neighbor exists.
        """
        ratings_matrix = self.user_item_matrix
        if user_id not in ratings_matrix.index or item_id not in ratings_matrix.columns:
            # If user or item not in training data, return global mean
            return self._global_mean_rating()

        # Ratings of the users who rated this item
        item_ratings = ratings_matrix[item_id].dropna()
        if item_ratings.empty:
            # If no user rated this item, return user's mean rating
            return self._user_mean_or_global(user_id)

        if user_id not in self.user_similarity.index:
            # If user not in similarity matrix, return global mean
            return self._global_mean_rating()

        # Similarity of the target user to each rater; `> 0` also drops NaN
        similarities = self.user_similarity.loc[user_id, item_ratings.index].astype(float)
        similarities = similarities[similarities > 0]

        # Keep the k most similar raters
        if len(similarities) > self.k:
            similarities = similarities.nlargest(self.k)

        weight_sum = similarities.sum()
        if weight_sum == 0:
            # If no valid neighbors, return user's mean rating
            return self._user_mean_or_global(user_id)

        neighbor_ratings = item_ratings.loc[similarities.index]
        return np.dot(similarities.to_numpy(), neighbor_ratings.to_numpy()) / weight_sum

    def predict_all(self, test_data):
        """
        Predict ratings for all user-item pairs in test data

        Parameters:
        -----------
        test_data : pandas.DataFrame
            Test data with userId, movieId columns

        Returns:
        --------
        pandas.Series
            Predicted ratings, indexed like test_data
        """
        # Iterating the two columns keeps the ids in their own dtype; iterrows
        # would upcast a row mixing int ids and float ratings to float.
        predictions = [
            self.predict(user_id, item_id)
            for user_id, item_id in zip(test_data['userId'], test_data['movieId'])
        ]
        # Reuse the test index so element-wise arithmetic with test columns aligns
        return pd.Series(predictions, index=test_data.index, dtype=float)


In [None]:

class ItemCF:
    """
    Item-based collaborative filtering on a user-item rating matrix.

    A rating for (user, item) is predicted as the similarity-weighted average
    of the ratings the user gave to the items most similar to the target item.
    """

    def __init__(self, similarity_method='pearson', k=30):
        """
        Initialize Item-based Collaborative Filtering

        Parameters:
        -----------
        similarity_method : str
            Method to calculate similarity ('pearson' or 'cosine')
        k : int
            Number of neighbors to consider
        """
        self.similarity_method = similarity_method
        self.k = k
        self.item_similarity = None
        self.user_item_matrix = None
        # Mean of all observed ratings, computed lazily and reused by fallbacks
        self._global_mean = None

    def fit(self, user_item_matrix):
        """
        Fit the model with user-item matrix

        Parameters:
        -----------
        user_item_matrix : pandas.DataFrame
            User-item matrix with users as index and items as columns

        Raises:
        -------
        ValueError
            If similarity_method is neither 'pearson' nor 'cosine'
        """
        self.user_item_matrix = user_item_matrix
        self._global_mean = None  # drop any value cached for a previous matrix

        # Calculate item similarity matrix
        if self.similarity_method == 'pearson':
            self.item_similarity = self.user_item_matrix.corr(method='pearson')
        elif self.similarity_method == 'cosine':
            self.item_similarity = self._cosine_similarity(self.user_item_matrix)
        else:
            raise ValueError(
                f"Unknown similarity_method {self.similarity_method!r}; "
                "expected 'pearson' or 'cosine'"
            )

        print(f"Item similarity matrix shape: {self.item_similarity.shape}")

    @staticmethod
    def _cosine_similarity(user_item_matrix):
        """Cosine similarity between mean-centered item columns, as a float DataFrame."""
        # Center every item on its own mean; unrated cells then contribute 0
        centered = user_item_matrix.subtract(user_item_matrix.mean(axis=0), axis=1).fillna(0)
        # One row per item
        vectors = centered.to_numpy(dtype=float).T

        # Scale rows to unit length. Rows with zero norm stay all-zero, so
        # their similarity to every item (themselves included) is 0.
        norms = np.linalg.norm(vectors, axis=1)
        unit_vectors = vectors / np.where(norms == 0, 1.0, norms)[:, np.newaxis]

        # A single matrix product replaces the pairwise Python loop and yields
        # a float64 result, which Series.nlargest requires in predict().
        return pd.DataFrame(
            unit_vectors @ unit_vectors.T,
            index=user_item_matrix.columns,
            columns=user_item_matrix.columns,
        )

    def _global_mean_rating(self):
        """Mean of every observed rating in the fitted matrix (cached)."""
        if getattr(self, '_global_mean', None) is None:
            self._global_mean = self.user_item_matrix.stack().mean()
        return self._global_mean

    def _item_mean_or_global(self, item_id):
        """The item's mean rating, or the global mean when the item has no ratings."""
        item_mean = self.user_item_matrix[item_id].mean()
        return item_mean if not np.isnan(item_mean) else self._global_mean_rating()

    def predict(self, user_id, item_id):
        """
        Predict rating for a user-item pair

        Only neighbors with a strictly positive similarity are used, so the
        prediction is a convex combination of ratings the user actually gave.
        Neighbors whose similarity is NaN (e.g. Pearson with too little
        overlap), zero or negative are ignored.

        Parameters:
        -----------
        user_id : int
            User ID
        item_id : int
            Item ID

        Returns:
        --------
        float
            Predicted rating. Falls back to the global mean for unknown
            users/items and to the item's mean when no usable neighbor exists.
        """
        ratings_matrix = self.user_item_matrix
        if user_id not in ratings_matrix.index or item_id not in ratings_matrix.columns:
            # If user or item not in training data, return global mean
            return self._global_mean_rating()

        # Ratings this user gave to other items
        user_ratings = ratings_matrix.loc[user_id].dropna()
        if user_ratings.empty:
            # If user hasn't rated any items, return item's mean rating
            return self._item_mean_or_global(item_id)

        if item_id not in self.item_similarity.index:
            # If item not in similarity matrix, return global mean
            return self._global_mean_rating()

        # Similarity of the target item to each rated item; `> 0` also drops NaN
        similarities = self.item_similarity.loc[item_id, user_ratings.index].astype(float)
        similarities = similarities[similarities > 0]

        # Keep the k most similar rated items
        if len(similarities) > self.k:
            similarities = similarities.nlargest(self.k)

        weight_sum = similarities.sum()
        if weight_sum == 0:
            # If no valid neighbors, return item's mean rating
            return self._item_mean_or_global(item_id)

        neighbor_ratings = user_ratings.loc[similarities.index]
        return np.dot(similarities.to_numpy(), neighbor_ratings.to_numpy()) / weight_sum

    def predict_all(self, test_data):
        """
        Predict ratings for all user-item pairs in test data

        Parameters:
        -----------
        test_data : pandas.DataFrame
            Test data with userId, movieId columns

        Returns:
        --------
        pandas.Series
            Predicted ratings, indexed like test_data
        """
        # Iterating the two columns keeps the ids in their own dtype; iterrows
        # would upcast a row mixing int ids and float ratings to float.
        predictions = [
            self.predict(user_id, item_id)
            for user_id, item_id in zip(test_data['userId'], test_data['movieId'])
        ]
        # Reuse the test index so element-wise arithmetic with test columns aligns
        return pd.Series(predictions, index=test_data.index, dtype=float)


In [None]:

def evaluate_model(model, test_data, model_name):
    """
    Evaluate model using RMSE

    Runs model.predict_all on the test data, prints the RMSE against the
    'rating' column together with the time the prediction took, and returns
    both the score and the predictions.

    Parameters:
    -----------
    model : object
        Model with predict_all method
    test_data : pandas.DataFrame
        Test data with userId, movieId, rating columns
    model_name : str
        Name of the model for display

    Returns:
    --------
    float
        RMSE score
    pandas.Series
        Predictions
    """
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    predictions = model.predict_all(test_data)
    elapsed = time.perf_counter() - start_time

    rmse = sqrt(mean_squared_error(test_data['rating'], predictions))

    print(f"{model_name} RMSE: {rmse:.4f}")
    print(f"{model_name} Prediction Time: {elapsed:.2f} seconds")

    return rmse, predictions

def _fit_and_evaluate(model_class, similarity_method, banner, label):
    """Print the banner, fit a k=30 model on the training matrix and score it on test_data."""
    print(banner)
    fitted = model_class(similarity_method=similarity_method, k=30)
    fitted.fit(user_item_matrix_train)
    score, preds = evaluate_model(fitted, test_data, label)
    return fitted, score, preds

# User-based CF, Pearson correlation
user_cf_pearson, rmse_user_cf_pearson, pred_user_cf_pearson = _fit_and_evaluate(
    UserCF, 'pearson', "\n--- User-based CF with Pearson Correlation ---", "User-CF (Pearson)")

# User-based CF, cosine similarity
user_cf_cosine, rmse_user_cf_cosine, pred_user_cf_cosine = _fit_and_evaluate(
    UserCF, 'cosine', "\n--- User-based CF with Cosine Similarity ---", "User-CF (Cosine)")

# Item-based CF, Pearson correlation
item_cf_pearson, rmse_item_cf_pearson, pred_item_cf_pearson = _fit_and_evaluate(
    ItemCF, 'pearson', "\n--- Item-based CF with Pearson Correlation ---", "Item-CF (Pearson)")

# Item-based CF, cosine similarity
item_cf_cosine, rmse_item_cf_cosine, pred_item_cf_cosine = _fit_and_evaluate(
    ItemCF, 'cosine', "\n--- Item-based CF with Cosine Similarity ---", "Item-CF (Cosine)")


In [None]:

# Model labels and their RMSE scores, in matching order
models = ['User-CF (Pearson)', 'User-CF (Cosine)', 'Item-CF (Pearson)', 'Item-CF (Cosine)']
rmse_scores = [rmse_user_cf_pearson, rmse_user_cf_cosine, rmse_item_cf_pearson, rmse_item_cf_cosine]

# Bar chart of RMSE per model
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(models, rmse_scores, color=['blue', 'skyblue', 'green', 'lightgreen'])
ax.set_title('RMSE Comparison of Different Recommendation Algorithms')
ax.set_xlabel('Algorithm')
ax.set_ylabel('RMSE (lower is better)')
plt.xticks(rotation=15)

# Write each score just above its bar
for rect in bars:
    top = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2., top + 0.01,
            f'{top:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Histogram of predicted ratings for each model on a 2x2 grid
prediction_sets = [pred_user_cf_pearson, pred_user_cf_cosine,
                   pred_item_cf_pearson, pred_item_cf_cosine]

plt.figure(figsize=(15, 10))
for position, (label, preds) in enumerate(zip(models, prediction_sets), start=1):
    plt.subplot(2, 2, position)
    sns.histplot(preds, bins=20, kde=True)
    plt.title(f'{label} Predictions')
    plt.xlabel('Predicted Rating')

plt.tight_layout()
plt.show()

# Analyze prediction errors
def analyze_errors(true_ratings, predictions, model_name):
    """
    Plot and summarise the prediction errors of one model.

    The error is predictions minus true_ratings, so positive values are
    overestimates. Shows a histogram with a reference line at zero and prints
    the mean, standard deviation, maximum and minimum of the errors.

    Parameters:
    -----------
    true_ratings : pandas.Series
        Observed ratings
    predictions : pandas.Series
        Predicted ratings
    model_name : str
        Name of the model for display
    """
    errors = predictions - true_ratings

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(errors, bins=20, kde=True, ax=ax)
    ax.set_title(f'Error Distribution for {model_name}')
    ax.set_xlabel('Prediction Error')
    ax.axvline(x=0, color='r', linestyle='--')
    plt.show()

    print(f"\n{model_name} Error Statistics:")
    for label, value in (
        ("Mean Error", errors.mean()),
        ("Std Dev of Error", errors.std()),
        ("Max Overestimation", errors.max()),
        ("Max Underestimation", errors.min()),
    ):
        print(f"{label}: {value:.4f}")

# Error analysis for every model, in the same order as the RMSE comparison
for label, preds in (
    ("User-CF (Pearson)", pred_user_cf_pearson),
    ("User-CF (Cosine)", pred_user_cf_cosine),
    ("Item-CF (Pearson)", pred_item_cf_pearson),
    ("Item-CF (Cosine)", pred_item_cf_cosine),
):
    analyze_errors(test_data['rating'], preds, label)


In [None]:

def tune_k(model_class, similarity_method, k_values, user_item_matrix, test_data, model_name):
    """
    Tune the number of neighbors (k) for CF models

    The model is fitted once and its `k` attribute is updated before each
    evaluation; an RMSE-vs-k curve is plotted and the best k is printed.

    Parameters:
    -----------
    model_class : class
        Class of the model (UserCF or ItemCF)
    similarity_method : str
        Similarity method to use
    k_values : list
        List of k values to try
    user_item_matrix : pandas.DataFrame
        User-item matrix
    test_data : pandas.DataFrame
        Test data
    model_name : str
        Name of the model for display

    Returns:
    --------
    int
        The k with the lowest RMSE (the first one on ties)
    float
        RMSE obtained with that k

    Raises:
    -------
    ValueError
        If k_values is empty
    """
    k_values = list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one value")

    # fit() in UserCF/ItemCF only builds the similarity matrix and never reads
    # k (k is used in predict()), so the expensive fit is done a single time.
    model = model_class(similarity_method=similarity_method, k=k_values[0])
    model.fit(user_item_matrix)

    # NOTE(review): k is selected by RMSE on test_data itself, so the reported
    # best RMSE is optimistically biased; consider a separate validation split.
    rmse_scores = []
    for k in k_values:
        model.k = k
        rmse, _ = evaluate_model(model, test_data, f"{model_name} (k={k})")
        rmse_scores.append(rmse)

    # Plot RMSE vs k
    plt.figure(figsize=(10, 6))
    plt.plot(k_values, rmse_scores, marker='o')
    plt.title(f'RMSE vs Number of Neighbors (k) for {model_name}')
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel('RMSE (lower is better)')
    plt.grid(True)
    plt.show()

    # Find best k
    best_idx = np.argmin(rmse_scores)
    best_k = k_values[best_idx]
    best_rmse = rmse_scores[best_idx]

    print(f"\nBest k for {model_name}: {best_k} with RMSE: {best_rmse:.4f}")

    return best_k, best_rmse

# Neighborhood sizes to evaluate
k_values = [5, 10, 20, 30, 50, 70, 100]

# User-based CF, Pearson correlation
print("\n--- Tuning k for User-CF (Pearson) ---")
best_k_user_pearson, best_rmse_user_pearson = tune_k(
    model_class=UserCF,
    similarity_method='pearson',
    k_values=k_values,
    user_item_matrix=user_item_matrix_train,
    test_data=test_data,
    model_name="User-CF (Pearson)",
)

# Item-based CF, Pearson correlation
print("\n--- Tuning k for Item-CF (Pearson) ---")
best_k_item_pearson, best_rmse_item_pearson = tune_k(
    model_class=ItemCF,
    similarity_method='pearson',
    k_values=k_values,
    user_item_matrix=user_item_matrix_train,
    test_data=test_data,
    model_name="Item-CF (Pearson)",
)


In [None]:

print("\n--- Recommendation System Comparison Summary ---")
print("\nRMSE Scores:")
for model, rmse in zip(models, rmse_scores):
    print(f"{model}: {rmse:.4f}")

# (name, best k, best RMSE) for each model that went through k tuning
tuned_results = [
    ("User-CF (Pearson)", best_k_user_pearson, best_rmse_user_pearson),
    ("Item-CF (Pearson)", best_k_item_pearson, best_rmse_item_pearson),
]

print("\nBest Models After Tuning:")
for tuned_name, tuned_k, tuned_rmse in tuned_results:
    print(f"{tuned_name} with k={tuned_k}: RMSE = {tuned_rmse:.4f}")

# Pick the tuned model with the lowest RMSE
best_model_idx = np.argmin([entry[2] for entry in tuned_results])
best_model_name, best_model_k, best_model_rmse = tuned_results[best_model_idx]

print(f"\nOverall Best Model: {best_model_name} with k={best_model_k} (RMSE = {best_model_rmse:.4f})")

# Closing remarks
print("\nFinal Observations:")
for observation in (
    "1. We implemented and compared User-based and Item-based collaborative filtering algorithms.",
    f"2. We evaluated the models using RMSE and found that {best_model_name} performed best.",
    "3. We tuned the number of neighbors (k) and found optimal values for each algorithm.",
    "4. The visualization of rating distributions and prediction errors provided insights into model behavior.",
    "5. Future improvements could include matrix factorization methods, hybrid approaches, or deep learning models.",
):
    print(observation)