# Table of contents
1. [Libraries](#libraries)
2. [Numerical columns](#numerical)
3. [Categorical columns](#categorical)



<h1 id = "libraries"> 1. Libraries </h1>

In [60]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from scipy.stats import sem

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [61]:
# Read the CSV at ../Data/retyped_data.csv into the DataFrame the rest of the notebook works from.
raw_data = pd.read_csv(filepath_or_buffer='../Data/retyped_data.csv')

<div id = "numerical"> <h1>Evaluation </h1> </div>

#### This notebook is dedicated to evaluating imputation using a similarity strategy

+ Function to fill missing values using a similarity strategy

In [62]:

# Module-level batch settings. The functions defined in this cell receive their
# batch bounds as arguments and do not read these globals.
batch_size = 32
start, end = 0, batch_size

# Define the function to calculate similarities
def calculate_similarities(ratings, batch_start, batch_end, eps=0.001):
    """Compute similarities between a batch of rows and every row of ``ratings``.

    The similarity of two rows is ``1 / (d + eps)``, where ``d`` is the mean
    absolute difference taken over the columns in which both rows hold a value
    (NaN marks a missing value). Row pairs that share no such column get a
    similarity of 0.

    Parameters
    ----------
    ratings : array-like, 2-D
        Numeric matrix with one row per item; NaN marks missing entries.
    batch_start, batch_end : int
        Bounds of the row slice ``ratings[batch_start:batch_end]``. As with any
        slice, a bound past the end of the array is clamped.
    eps : float, default 0.001
        Added to the mean difference so that identical rows do not cause a
        division by zero.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(rows in the batch, total rows)``; entry ``[i, j]`` is
        the similarity between batch row ``i`` and row ``j`` of ``ratings``.
    """
    ratings = np.asarray(ratings)
    batch_ratings = ratings[batch_start:batch_end]

    # Add the broadcast axis by indexing instead of reshaping to
    # (batch_end - batch_start, 1, n_cols): the slice can be shorter than
    # requested when batch_end runs past the last row, and reshape then raises.
    abs_diff = np.abs(ratings - batch_ratings[:, np.newaxis, :])

    # Columns where both rows of a pair hold a value.
    shared = ~np.isnan(abs_diff)
    n_shared = shared.sum(axis=2)
    has_shared = n_shared > 0

    # NaN-aware mean computed by hand: np.nanmean emits a RuntimeWarning for
    # every pair without a shared column, which floods the notebook output.
    total_diff = np.where(shared, abs_diff, 0).sum(axis=2)
    mean_diff = total_diff / np.where(has_shared, n_shared, 1)

    # Inverse mean difference; pairs with nothing in common are set to 0.
    return np.where(has_shared, 1 / (mean_diff + eps), 0.0)

def fill_missing(data, batch_size = 32):
    """Estimate every cell of ``data`` as a similarity-weighted average of its column.

    Rows are processed in batches of ``batch_size``. For each batch row, the
    weight of another row in a given column is that row's similarity to the
    batch row (from ``calculate_similarities``) if the row has a value in that
    column and 0 otherwise; the weights are normalised per column and the
    weighted sum of the observed values becomes the estimate.

    Notes
    -----
    * Every cell of the result is an estimate -- observed cells are
      re-estimated too, not copied through.
    * A cell whose column has no observed value among rows with non-zero
      similarity comes out as 0.0 (the normalisation is 0/0 and ``np.nansum``
      of an all-NaN slice is 0).

    Parameters
    ----------
    data : array-like, 2-D
        Numeric matrix with NaN marking missing entries.
    batch_size : int, default 32
        Number of rows handled per batch; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    # A non-positive batch size used to either divide by zero or skip the loop
    # entirely and hand back the uninitialised np.empty_like buffer.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Work on a float ndarray: DataFrame input has no .reshape for the helper,
    # and an integer buffer from empty_like would truncate the estimates.
    data = np.asarray(data)
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype(float)

    n_movies = data.shape[0]
    observed = ~np.isnan(data)  # loop-invariant mask of available values
    filled_ratings = np.empty_like(data)

    for start in range(0, n_movies, batch_size):
        end = min(start + batch_size, n_movies)

        similarities = calculate_similarities(data, start, end)

        # Shape (batch, rows, columns): similarity where a value exists, else 0.
        weights = observed * similarities.reshape(end - start, -1, 1)

        # Columns without any contributing row normalise as 0/0; silence the
        # resulting floating-point warning, nansum below maps those to 0.
        with np.errstate(divide="ignore", invalid="ignore"):
            weights /= weights.sum(axis=1, keepdims=True)

        filled_ratings[start:end] = np.nansum(data * weights, axis=1)

    return filled_ratings



+ Evaluate by columns

In [None]:
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score columns to impute (numeric columns only)
columns = ['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']

# Work on a copy so raw_data stays untouched. 'Meta UserScore' is multiplied by
# 10 before imputation -- presumably to bring a 0-10 score onto the same range
# as the other three columns so it weighs comparably in the KNN distance
# (TODO confirm the source scale).
data_tmp_2 = raw_data.copy()
data_tmp_2['Meta UserScore'] = data_tmp_2['Meta UserScore'] * 10

# Imputer that replaces a missing value with the unweighted mean of 20 neighbours
knn_imputer = KNNImputer(n_neighbors=20, weights="uniform")

# Step 1: impute the four columns together; the result is a plain ndarray
imputed_data = knn_imputer.fit_transform(data_tmp_2[columns])

# Rebuild a DataFrame that carries the source frame's index. combine_first
# below aligns on index labels, so a default RangeIndex here would mis-assign
# (or drop) imputed values whenever raw_data is not indexed 0..n-1.
imputed_df = pd.DataFrame(imputed_data, columns=columns, index=data_tmp_2.index)

# Step 2: keep every original value and take the imputed one only where it was NaN
for col in columns:
    data_tmp_2[col] = data_tmp_2[col].combine_first(imputed_df[col])



In [66]:
# Show data_tmp_2 as the cell's last expression (rich DataFrame display).
data_tmp_2

Unnamed: 0,Title,Tomatoes CriticScore,Tomatoes UserScore,Link,PlatformReleased,Cast,Director,Genre,Rating,Runtime,Studio,Release Date,Production Budget,Domestic Gross,Worldwide Gross,Metascore,Meta UserScore
0,L.A. Confidential,99.000000,94.00000,https://www.rottentomatoes.com/m/la_confidential,Cinema,"['Kevin Spacey', 'Russell Crowe', 'Guy Pearce'...",['Curtis Hanson'],"['Crime', 'Drama']",R,"{'hours': 2, 'minutes': 16}",Warner Home Vídeo,1997-09-19,35000000.0,64604977.0,1.262050e+08,91.000000,86.000000
1,The Godfather,97.000000,98.00000,https://www.rottentomatoes.com/m/the_godfather,Cinema,"['Marlon Brando', 'Al Pacino', 'James Caan', '...",['Francis Ford Coppola'],"['Crime', 'Drama']",R,"{'hours': 2, 'minutes': 57}",Paramount Pictures,1972-03-15,7000000.0,136479994.0,2.700074e+08,100.000000,93.000000
2,Casablanca,99.000000,95.00000,https://www.rottentomatoes.com/m/1003707-casab...,Cinema,"['Humphrey Bogart', 'Ingrid Bergman', 'Paul He...",['Michael Curtiz'],['Drama'],PG,"{'hours': 1, 'minutes': 42}",Warner Bros. Pictures,1943-01-23,1039000.0,10462500.0,1.056718e+07,100.000000,87.000000
3,Parasite,99.000000,90.00000,https://www.rottentomatoes.com/m/parasite_2019,Cinema,"['Song Kang-ho', 'Lee Sun-kyun', 'Jo Yeo-jeong...",['Bong Joon Ho'],"['Comedy', 'Mystery & Thriller', 'Drama']",R,"{'hours': 2, 'minutes': 12}",Neon,2019-05-30,11800000.0,53369745.0,2.532679e+08,97.000000,89.000000
4,Top Gun: Maverick,96.000000,99.00000,https://www.rottentomatoes.com/m/top_gun_maverick,Cinema,"['Tom Cruise', 'Miles Teller', 'Jennifer Conne...",['Joseph Kosinski'],"['Action', 'Adventure']",PG-13,"{'hours': 2, 'minutes': 11}",Paramount Pictures,2022-05-20,170000000.0,718732821.0,1.464400e+09,78.000000,84.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5101,Antarctic Edge: 70 South,57.000000,50.00000,https://www.rottentomatoes.com/m/antarctic_edg...,Cinema,"['Dena Seidel', 'Richard Ludescher', 'Dena Sei...",['Dena Seidel'],"['Documentary', 'Adventure']",,"{'hours': 1, 'minutes': 12}",First Run,2015-04-17,150000.0,7193.0,7.193000e+03,62.000000,70.125000
5102,8 Days to Hell,58.367886,62.45744,https://www.rottentomatoes.com/m/8_days_to_hell,Cinema,"['Eric Roberts', 'Shane Woodson', 'Drew Hale',...",['Shane Woodson'],['Horror'],,"{'hours': 1, 'minutes': 25}",Pegasus Flying Films,2021-07-30,100000.0,0.0,0.000000e+00,55.605100,64.939799
5103,20 Dates,35.000000,47.00000,https://www.rottentomatoes.com/m/20_dates,Cinema,"['Myles Berkowitz', 'Elisabeth Wagner', 'Richa...",['Myles Berkowitz'],"['Comedy', 'Romance']",R,"{'hours': 1, 'minutes': 32}",Fox,1999-02-26,66000.0,541636.0,6.029200e+05,35.000000,58.000000
5104,Happy 40th,58.367886,62.45744,https://www.rottentomatoes.com/m/happy_40th,Cinema,"['Fernando Acosta', 'Jenni Blong', 'Robyn Cohe...",['Madoka Raine'],['Drama'],,"{'hours': 1, 'minutes': 40}",English,,35000.0,0.0,0.000000e+00,55.605100,64.939799
