# Table of contents
1. [Libraries](#libraries)
2. [Numerical columns](#numerical)
3. [Categorical columns](#categorical)



<h1 id = "libraries"> 1. Libraries </h1>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from scipy.stats import sem

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Load the retyped dataset. The path is relative, so it is resolved against the
# directory the notebook kernel is running in.
raw_data_path = '../Data/retyped_data.csv'
raw_data = pd.read_csv(raw_data_path)

<div id = "numerical"> <h1>Evaluation</h1> </div>

#### This notebook is dedicated to evaluating imputation using a similarity strategy

+ Function to fill missing values using the similarity strategy

In [None]:

# Module-level batch settings: a batch size of 32 and the bounds 0..32 of the first batch.
# NOTE(review): the functions defined below receive their batch bounds as arguments or
# compute them locally, so confirm whether anything still reads these three values.
batch_size = 32
start, end = 0, batch_size

# Define the function to calculate similarities
def calculate_similarities(ratings, batch_start, batch_end, eps=0.001):
    """Compute the similarity between a batch of rows and every row of ``ratings``.

    The similarity of two rows is the inverse of their mean absolute difference,
    where the mean is taken only over the columns in which both rows have a value
    (NaN marks a missing value).

    Parameters
    ----------
    ratings : array-like of shape (n_rows, n_cols)
        Numeric matrix; NaN marks missing entries.
    batch_start, batch_end : int
        Bounds of the batch, used as ``ratings[batch_start:batch_end]``. As with
        any slice, an end past the last row is clamped.
    eps : float, default 0.001
        Added to the mean absolute difference so that identical rows do not cause
        a division by zero. It also caps the similarity at ``1 / eps``.

    Returns
    -------
    numpy.ndarray of shape (n_batch_rows, n_rows)
        ``out[i, j]`` is the similarity between batch row ``i`` and row ``j`` of
        ``ratings``. Pairs of rows that share no observed column get 0.
    """
    ratings = np.asarray(ratings)
    if not np.issubdtype(ratings.dtype, np.floating):
        ratings = ratings.astype(float)
    batch_ratings = ratings[batch_start:batch_end]

    # Broadcast (batch, 1, cols) against (rows, cols) -> (batch, rows, cols).
    # np.newaxis relies on the slice's real length, so a batch_end beyond the last
    # row works instead of failing on a reshape to `batch_end - batch_start` rows.
    abs_diff = np.abs(ratings - batch_ratings[:, np.newaxis, :])

    # A difference is NaN whenever either of the two rows is missing that column.
    shared = ~np.isnan(abs_diff)
    n_shared = shared.sum(axis=2)
    total_diff = np.where(shared, abs_diff, 0.0).sum(axis=2)

    # Mean over the shared columns, computed by hand: pairs without any shared
    # column are set to 0 directly rather than producing NaN (and an
    # "empty slice" RuntimeWarning) that has to be patched afterwards.
    similarities = np.zeros(n_shared.shape, dtype=abs_diff.dtype)
    has_overlap = n_shared > 0
    similarities[has_overlap] = 1 / (total_diff[has_overlap] / n_shared[has_overlap] + eps)
    return similarities

def fill_missing(data, batch_size=32):
    """Fill the NaN entries of ``data`` with similarity-weighted column averages.

    For a missing cell ``(i, j)`` the estimate is the average of column ``j`` over
    all rows that have a value there, each row weighted by its similarity to row
    ``i`` as returned by ``calculate_similarities``. Observed cells are returned
    unchanged; only missing cells receive an estimate.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_cols)
        Numeric matrix with NaN marking missing entries. Values are expected to be
        finite or NaN.
    batch_size : int, default 32
        Number of rows whose similarities are computed at once; this bounds the
        size of the intermediate arrays.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_cols)
        Copy of ``data`` with missing entries filled. A missing cell for which no
        row with non-zero similarity has a value is filled with 0.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    ratings = np.asarray(data)
    if not np.issubdtype(ratings.dtype, np.floating):
        ratings = ratings.astype(float)

    observed = ~np.isnan(ratings)
    # Zero out the gaps so they add nothing to the weighted sums below.
    known_values = np.where(observed, ratings, 0.0)
    observed_weight = observed.astype(ratings.dtype)

    n_rows = ratings.shape[0]
    filled_ratings = ratings.copy()

    for start in range(0, n_rows, batch_size):
        end = min(start + batch_size, n_rows)

        # (batch, n_rows): similarity of each batch row to every row.
        similarities = calculate_similarities(ratings, start, end)

        # Weighted sum of the observed values and the matching sum of weights,
        # both (batch, n_cols). Two matrix products give the same weighted mean
        # as normalising a (batch, n_rows, n_cols) weight tensor, without
        # materialising that tensor.
        weighted_sum = similarities @ known_values
        weight_total = similarities @ observed_weight

        # Divide only where some donor row exists; the remaining cells keep 0.
        # NOTE(review): that 0 is a placeholder, not an estimate -- consider a
        # column-mean fallback if such cells occur in practice.
        estimates = np.zeros_like(weighted_sum)
        np.divide(weighted_sum, weight_total, out=estimates, where=weight_total > 0)

        # Keep what was observed; take the estimate only for the gaps.
        filled_ratings[start:end] = np.where(
            observed[start:end], ratings[start:end], estimates
        )

    return filled_ratings



+ Evaluate by columns (using scikit-learn's `KNNImputer`)

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Prepare the data (numeric columns only)
columns = ['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']
test_size = 0.3     # fraction of rows whose value is hidden and then re-imputed
N_NEIGHBORS = 10    # neighbours averaged by the KNN imputer
RANDOM_STATE = 42   # seed for the held-out row sample

# Keep complete cases only, so every hidden value has a known ground truth.
# This does not depend on the column under test, so it is done once.
# NOTE(review): dropna() removes rows with a gap in ANY column of raw_data, not just
# in `columns` -- confirm whether dropna(subset=columns) was intended.
complete_cases = raw_data.dropna().reset_index(drop=True)

# The seed is fixed, so the same rows are held out for every column.
test_rows = complete_cases.sample(frac=test_size, random_state=RANDOM_STATE).index

# NOTE(review): StandardScaler is imported in this cell but never applied, so the
# imputer measures distances on the raw column values -- confirm whether the columns
# were meant to be scaled first.
for test_col in columns:
    # Step 1: hide the held-out values of this column in a fresh copy
    raw_data_copy = complete_cases.copy()
    y_test = raw_data_copy.loc[test_rows, test_col].copy()
    raw_data_copy.loc[test_rows, test_col] = np.nan

    # Step 2: apply KNN imputation on the numeric columns
    knn_imputer = KNNImputer(n_neighbors=N_NEIGHBORS, weights="uniform")
    imputed_data = knn_imputer.fit_transform(raw_data_copy[columns])

    # Rebuild a DataFrame that shares raw_data_copy's index so test_rows still applies
    imputed_df = pd.DataFrame(imputed_data, columns=columns, index=raw_data_copy.index)

    # Step 3: compare the imputed values with the hidden ground truth
    y_pred = imputed_df.loc[test_rows, test_col]

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'Summary of {test_col}:')
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('R² Score:', r2)
    print('--------\n')


Summary of Tomatoes CriticScore:
Mean Absolute Error: 7.453907815631262
Mean Squared Error: 102.86180360721441
R² Score: 0.866499879281352
--------

Summary of Tomatoes UserScore:
Mean Absolute Error: 10.875150300601202
Mean Squared Error: 188.2387975951904
R² Score: 0.548752980416857
--------

Summary of Metascore:
Mean Absolute Error: 6.166032064128257
Mean Squared Error: 64.38800601202405
R² Score: 0.8176613400599261
--------

Summary of Meta UserScore:
Mean Absolute Error: 0.5880761523046093
Mean Squared Error: 0.6473482965931864
R² Score: 0.580026019098285
--------

