# Table of contents
1. [Libraries](#libraries)
2. [Numerical columns](#numerical)
3. [Categorical columns](#categorical)



<h1 id = "libraries"> 1. Libraries </h1>

In [70]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from scipy.stats import sem


In [71]:
raw_data = pd.read_csv('../Data/retyped_data.csv')

<div id = "numerical"> <h1> 2. Numerical columns </h1> </div>

In [None]:
import numpy as np
np.set_printoptions(suppress=True)  # To make numpy output more readable

# Assuming raw_data is available
# Copy the raw data and add an ID column
tmp_data = raw_data.copy()
tmp_data['id'] = tmp_data.index
tmp_data['Meta UserScore'] = tmp_data['Meta UserScore'] * 10  # Scaling Meta UserScore

# Select only the ID and score columns
tmp_data = tmp_data[['id', 'Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']].to_numpy()

batch_size = 100
start = 0
end = 100

# Define the function to calculate similarities
def calculate_similarities(ratings, batch_start, batch_end):
    # Select the batch of users
    batch_ratings = ratings[batch_start:batch_end]
    
    # Calculate the absolute difference between the batch and all users
    abs_diff = np.abs(ratings - batch_ratings.reshape(batch_end - batch_start, 1, ratings.shape[1]))
    
    # Calculate the mean absolute difference across movies, ignoring NaN values
    mean_diff = np.nanmean(abs_diff, axis=2)
    
    # Compute similarity as the inverse of the mean absolute difference
    similarities = 1 / (mean_diff + 0.001)  # Adding a small epsilon to avoid division by zero
    similarities[np.isnan(similarities)] = 0
    return similarities

# Calculate similarities for the first batch
similarities = calculate_similarities(tmp_data, start, end)

# Prepare for filling missing values in batches
n_movies = tmp_data.shape[0]
filled_ratings = np.empty_like(tmp_data)
num_batches = int(np.ceil(n_movies / batch_size))

# Fill missing values in batches
for i in range(num_batches):
    start = i * batch_size
    end = min((i + 1) * batch_size, n_movies)
    
    # Calculate similarities for the current batch
    similarities = calculate_similarities(tmp_data, start, end)
    
    # Calculate weights for imputation
    weights = ~np.isnan(tmp_data) * similarities.reshape(end - start, -1, 1)
    weights /= weights.sum(axis=1, keepdims=True)
    
    # Perform imputation
    filled_ratings[start:end] = np.nansum(tmp_data * weights, axis=1)

# Extract imputed values
filled_nanvals = filled_ratings[np.isnan(tmp_data)]
filled_nanvals

# Fill missing values in the original data
tmp_data[np.isnan(tmp_data)] = filled_nanvals

# Turn the filled data into an original DataFrame
filled_df = pd.DataFrame(
    filled_ratings[:, 1:],  # Exclude 'id' column
    columns=['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']
)

filled_df['Meta UserScore'] /= 10  # Revert scaling
tmp_data_2 = raw_data.copy()

# Fill the NaN values in raw_data with values from filled_df
for col in filled_df.columns:
    tmp_data_2[col].fillna(filled_df[col], inplace=True)

tmp_data_2

Unnamed: 0,Title,Tomatoes CriticScore,Tomatoes UserScore,Link,PlatformReleased,Cast,Director,Genre,Rating,Runtime,Studio,Release Date,Production Budget,Domestic Gross,Worldwide Gross,Metascore,Meta UserScore
0,L.A. Confidential,99.000000,94.000000,https://www.rottentomatoes.com/m/la_confidential,Cinema,"['Kevin Spacey', 'Russell Crowe', 'Guy Pearce'...",['Curtis Hanson'],"['Crime', 'Drama']",R,"{'hours': 2, 'minutes': 16}",Warner Home Vídeo,1997-09-19,35000000.0,64604977.0,1.262050e+08,91.000000,8.600000
1,The Godfather,97.000000,98.000000,https://www.rottentomatoes.com/m/the_godfather,Cinema,"['Marlon Brando', 'Al Pacino', 'James Caan', '...",['Francis Ford Coppola'],"['Crime', 'Drama']",R,"{'hours': 2, 'minutes': 57}",Paramount Pictures,1972-03-15,7000000.0,136479994.0,2.700074e+08,100.000000,9.300000
2,Casablanca,99.000000,95.000000,https://www.rottentomatoes.com/m/1003707-casab...,Cinema,"['Humphrey Bogart', 'Ingrid Bergman', 'Paul He...",['Michael Curtiz'],['Drama'],PG,"{'hours': 1, 'minutes': 42}",Warner Bros. Pictures,1943-01-23,1039000.0,10462500.0,1.056718e+07,100.000000,8.700000
3,Parasite,99.000000,90.000000,https://www.rottentomatoes.com/m/parasite_2019,Cinema,"['Song Kang-ho', 'Lee Sun-kyun', 'Jo Yeo-jeong...",['Bong Joon Ho'],"['Comedy', 'Mystery & Thriller', 'Drama']",R,"{'hours': 2, 'minutes': 12}",Neon,2019-05-30,11800000.0,53369745.0,2.532679e+08,97.000000,8.900000
4,Top Gun: Maverick,96.000000,99.000000,https://www.rottentomatoes.com/m/top_gun_maverick,Cinema,"['Tom Cruise', 'Miles Teller', 'Jennifer Conne...",['Joseph Kosinski'],"['Action', 'Adventure']",PG-13,"{'hours': 2, 'minutes': 11}",Paramount Pictures,2022-05-20,170000000.0,718732821.0,1.464400e+09,78.000000,8.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5101,Antarctic Edge: 70 South,57.000000,50.000000,https://www.rottentomatoes.com/m/antarctic_edg...,Cinema,"['Dena Seidel', 'Richard Ludescher', 'Dena Sei...",['Dena Seidel'],"['Documentary', 'Adventure']",,"{'hours': 1, 'minutes': 12}",First Run,2015-04-17,150000.0,7193.0,7.193000e+03,62.000000,6.308006
5102,8 Days to Hell,57.792858,58.259400,https://www.rottentomatoes.com/m/8_days_to_hell,Cinema,"['Eric Roberts', 'Shane Woodson', 'Drew Hale',...",['Shane Woodson'],['Horror'],,"{'hours': 1, 'minutes': 25}",Pegasus Flying Films,2021-07-30,100000.0,0.0,0.000000e+00,52.376220,6.341546
5103,20 Dates,35.000000,47.000000,https://www.rottentomatoes.com/m/20_dates,Cinema,"['Myles Berkowitz', 'Elisabeth Wagner', 'Richa...",['Myles Berkowitz'],"['Comedy', 'Romance']",R,"{'hours': 1, 'minutes': 32}",Fox,1999-02-26,66000.0,541636.0,6.029200e+05,35.000000,6.228791
5104,Happy 40th,57.097283,58.999071,https://www.rottentomatoes.com/m/happy_40th,Cinema,"['Fernando Acosta', 'Jenni Blong', 'Robyn Cohe...",['Madoka Raine'],['Drama'],,"{'hours': 1, 'minutes': 40}",English,,35000.0,0.0,0.000000e+00,51.232347,6.332519


In [None]:
# Convert the updated data back to a DataFrame
# predicted_data = pd.DataFrame(tmp_data, columns=['id', 'Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore'])
# predicted_data
# # Fill missing values in the original data with the predicted values
# filled_data = raw_data.copy()
# filled_data[['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']] = predicted_data[['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']]

<div id = "categorical"> <h1> 3. Categorical columns </h1> </div>