# Table of contents
1. [Libraries](#libraries)
2. [Numerical columns](#numerical)
3. [Categorical columns](#categorical)



<h1 id = "libraries"> 1. Libraries </h1>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from scipy.stats import sem

from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the dataset from the project's Data directory (path is relative to this notebook).
raw_data = pd.read_csv(filepath_or_buffer='../Data/retyped_data.csv')

<div id = "numerical"> <h1> 2. Numerical columns </h1> </div>

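- Impute missing values in the four score columns (`Tomatoes CriticScore`, `Tomatoes UserScore`, `Metascore`, `Meta UserScore`) using k-nearest-neighbours (KNN) imputation.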

In [3]:
# Impute missing review scores with KNN, using only the numeric score columns.
columns = ['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']

# Work on a copy so `raw_data` stays untouched and this cell can be re-run safely.
data_tmp_2 = raw_data.copy()
# Multiply 'Meta UserScore' by 10 before imputing (presumably to bring it onto the
# same scale as the other score columns — TODO confirm). The column is left in the
# scaled form in `data_tmp_2`.
data_tmp_2['Meta UserScore'] = data_tmp_2['Meta UserScore'] * 10

# Create an imputer: each missing value is filled from the 20 nearest rows,
# all neighbours weighted equally.
knn_imputer = KNNImputer(n_neighbors=20, weights="uniform")

# Step 1: Apply KNN imputation to the score columns (returns a plain NumPy array).
imputed_data = knn_imputer.fit_transform(data_tmp_2[columns])

# Rebuild a DataFrame around the imputed array. The original index is passed
# explicitly because `combine_first` below aligns rows by index label; without it
# the frame would get a fresh 0..n-1 index and imputed values could be matched to
# the wrong rows whenever `raw_data` does not carry a default RangeIndex.
imputed_df = pd.DataFrame(imputed_data, columns=columns, index=data_tmp_2.index)

# Step 2: Fill missing values only — existing (non-NaN) scores are kept as they are.
for col in columns:
    data_tmp_2[col] = data_tmp_2[col].combine_first(imputed_df[col])

# Sanity check: remaining null count per column (the score columns should be 0).
data_tmp_2.isnull().sum()

Title                      0
Tomatoes CriticScore       0
Tomatoes UserScore         0
Link                       0
PlatformReleased           0
Cast                      32
Director                  35
Genre                     46
Rating                  1091
Runtime                   70
Studio                    47
Release Date              75
Production Budget          0
Domestic Gross             0
Worldwide Gross            0
Metascore                  0
Meta UserScore             0
dtype: int64

<div id = "categorical"> <h1> 3. Categorical columns </h1> </div>