# Table of contents
1. [Libraries](#libraries)
2. [Numerical columns](#numerical)
3. [Categorical columns](#categorical)



<h1 id = "libraries"> 1. Libraries </h1>

In [47]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from scipy.stats import sem

from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from collections import Counter

In [48]:
raw_data = pd.read_csv('../Data/retyped_data.csv')

<div id = "numerical"> <h1> 2. Numerical columns </h1> </div>

- Impute missing values using KNN strategy

In [49]:
# Prepare the data (numeric columns only)
columns = ['Tomatoes CriticScore', 'Tomatoes UserScore', 'Metascore', 'Meta UserScore']

# Copy raw data and scale 'Meta UserScore'
data_tmp_2 = raw_data.copy()
data_tmp_2['Meta UserScore'] = data_tmp_2['Meta UserScore'] * 10

# Create an imputer
knn_imputer = KNNImputer(n_neighbors=20, weights="uniform")

# Step 1: Apply KNN Imputation
# Impute all missing values
imputed_data = knn_imputer.fit_transform(data_tmp_2[columns])

# Reconstruct the DataFrame with imputed values
imputed_df = pd.DataFrame(imputed_data, columns=columns)

# Step 2: Fill missing values only
for col in columns:
    # Update only rows where the value is NaN
    data_tmp_2[col] = data_tmp_2[col].combine_first(imputed_df[col])

data_tmp_2.isnull().sum()

Title                      0
Tomatoes CriticScore       0
Tomatoes UserScore         0
Link                       0
PlatformReleased           0
Cast                      32
Director                  35
Genre                     46
Rating                  1091
Runtime                   70
Studio                    47
Release Date              75
Production Budget          0
Domestic Gross             0
Worldwide Gross            0
Metascore                  0
Meta UserScore             0
dtype: int64

<div id = "categorical"> <h1> 3. Categorical columns </h1> </div>

In [50]:
columns = ['Cast', 'Director', 'Genre', 'Rating', 'Studio', 'Release Date']

<h3>Cast Column<h3>

In [51]:
# First, let's look at the distribution of Cast data
print("Number of missing values in Cast:", data_tmp_2['Cast'].isnull().sum())

# Create helper functions to process Cast
def process_cast(cast_string):
    """Normalize Cast string"""
    if pd.isna(cast_string):
        return []
    # Handle incorrect format cases
    cast_string = str(cast_string)
    cast_string = cast_string.replace('[', '').replace(']', '')  # Remove square brackets
    # Split actors and remove extra whitespace
    return [actor.strip() for actor in cast_string.split(',')]

def get_similar_movies_for_cast(row, data, n_similar=5):
    """Find similar movies based on Director and Genre"""
    # If both Director and Genre are missing, return random sample
    if pd.isna(row['Director']) and pd.isna(row['Genre']):
        return data.sample(n=n_similar)
    
    # Create initial mask for all rows
    mask = pd.Series(True, index=data.index)
    
    # Filter by Director if available
    if not pd.isna(row['Director']):
        mask &= (data['Director'] == row['Director'])
    # Filter by Genre if available  
    if not pd.isna(row['Genre']):
        mask &= (data['Genre'] == row['Genre'])
    
    # Get movies matching both filters
    similar_movies = data[mask]

    # If not enough similar movies, get more movies with same Director
    if len(similar_movies) < n_similar and not pd.isna(row['Director']):
        director_movies = data[data['Director'] == row['Director']]
        # Combine and remove duplicates
        similar_movies = pd.concat([similar_movies, director_movies]).drop_duplicates()
    
    # Return top n similar movies
    return similar_movies.head(n_similar)

def fill_cast(row, data):
    """Fill missing Cast based on similar movies"""
    if pd.isna(row['Cast']):
        similar_movies = get_similar_movies_for_cast(row, data)
        # Get all casts from similar movies
        all_casts = []
        for _, movie in similar_movies.iterrows():
            if not pd.isna(movie['Cast']):
                all_casts.extend(process_cast(movie['Cast']))
        
        # Get the most frequent actors
        if all_casts:
            most_common = Counter(all_casts).most_common(3)
            # Create string list with correct format
            actors = [actor for actor, _ in most_common]
            return f"[{', '.join(f'{actor}' for actor in actors)}]"
        return 'Unknown Cast'
    return row['Cast']

# Save index of rows with missing Cast before filling
missing_cast_idx = data_tmp_2[data_tmp_2['Cast'].isnull()].index

# Apply missing values filling
data_tmp_2['Cast'] = data_tmp_2.apply(lambda row: fill_cast(row, data_tmp_2), axis=1)

print("Number of missing values in Cast after Preprocess:", data_tmp_2['Cast'].isnull().sum())

Number of missing values in Cast: 32
Number of missing values in Cast after Preprocess: 0


<h3>Director Column<h3>

In [52]:
# First look at the distribution of Director
print("Number of missing values in Director:", data_tmp_2['Director'].isnull().sum())

def get_similar_movies_for_director(row, data, n_similar=5):
    """Find similar movies based on Cast and Genre"""
    if pd.isna(row['Cast']) and pd.isna(row['Genre']):
        return data.sample(n=n_similar)
    
    mask = pd.Series(True, index=data.index)
    
    # Use filled Cast
    if not pd.isna(row['Cast']):
        # Get list of actors
        row_cast = set(process_cast(row['Cast']))
        # Find movies with at least 1 matching actor
        cast_mask = data['Cast'].apply(lambda x: bool(row_cast.intersection(set(process_cast(x)))))
        mask &= cast_mask
    
    if not pd.isna(row['Genre']):
        mask &= (data['Genre'] == row['Genre'])
    
    similar_movies = data[mask]
    
    # If not enough similar movies, expand by Genre
    if len(similar_movies) < n_similar and not pd.isna(row['Genre']):
        genre_movies = data[data['Genre'] == row['Genre']]
        similar_movies = pd.concat([similar_movies, genre_movies]).drop_duplicates()
    
    return similar_movies.head(n_similar)

def fill_director(row, data):
    """Fill missing Director based on similar movies"""
    if pd.isna(row['Director']):
        similar_movies = get_similar_movies_for_director(row, data)
        # Get directors from similar movies
        directors = []
        for _, movie in similar_movies.iterrows():
            if not pd.isna(movie['Director']):
                directors.append(movie['Director'])
        
        # Get most frequent director
        if directors:
            most_common = Counter(directors).most_common(1)
            return most_common[0][0]
        return 'Unknown Director'
    return row['Director']

# Save indices of rows with missing Director
missing_director_idx = data_tmp_2[data_tmp_2['Director'].isnull()].index

# Apply fill_director
data_tmp_2['Director'] = data_tmp_2.apply(lambda row: fill_director(row, data_tmp_2), axis=1)

# Print number of missing values after filling
print("Number of missing values in Director after Preprocess:", data_tmp_2['Director'].isnull().sum())

Number of missing values in Director: 35
Number of missing values in Director after Preprocess: 0


<h3>Genre Column<h3>

In [53]:
# First, let's look at the distribution of Genre data
print("Number of missing values in Genre:", data_tmp_2['Genre'].isnull().sum())

# Create helper functions to process Genre
def process_genre(genre_string):
    """Normalize Genre string"""
    if pd.isna(genre_string):
        return []
    # Handle incorrect format cases
    genre_string = str(genre_string)
    genre_string = genre_string.replace('[', '').replace(']', '')  # Remove square brackets
    # Split genres and remove extra whitespace
    return [genre.strip() for genre in genre_string.split(',')]

def get_similar_movies_for_genre(row, data, n_similar=5):
    """Find similar movies based on Cast and Director"""
    if pd.isna(row['Cast']) and pd.isna(row['Director']):
        return data.sample(n=n_similar)
    
    mask = pd.Series(True, index=data.index)
    
    # Filter by Cast if available
    if not pd.isna(row['Cast']):
        mask &= (data['Cast'] == row['Cast'])
    
    # Filter by Director if available
    if not pd.isna(row['Director']):
        mask &= (data['Director'] == row['Director'])
    
    similar_movies = data[mask]
    
    # If not enough similar movies, get more movies with same Director
    if len(similar_movies) < n_similar and not pd.isna(row['Director']):
        director_movies = data[data['Director'] == row['Director']]
        similar_movies = pd.concat([similar_movies, director_movies]).drop_duplicates()
    
    return similar_movies.head(n_similar)


def fill_genre(row, data):
    """Fill missing Genre based on similar movies"""
    if pd.isna(row['Genre']):
        similar_movies = get_similar_movies_for_genre(row, data)
        # Get all genres from similar movies
        all_genres = []
        for _, movie in similar_movies.iterrows():
            if not pd.isna(movie['Genre']):
                all_genres.extend(process_genre(movie['Genre']))
        
        # Get the most frequent genres
        if all_genres:
            most_common = Counter(all_genres).most_common(2)  # Get 2 most common genres
            # Create string list with correct format
            genres = [genre for genre, _ in most_common]
            return f"[{', '.join(f'{genre}' for genre in genres)}]"
        return 'Unknown Genre'
    return row['Genre']

# Save index of rows with missing Genre before filling
missing_genre_idx = data_tmp_2[data_tmp_2['Genre'].isnull()].index

# Apply missing values filling
data_tmp_2['Genre'] = data_tmp_2.apply(lambda row: fill_genre(row, data_tmp_2), axis=1)

print("Number of missing values in Genre after Preprocess:", data_tmp_2['Genre'].isnull().sum())

Number of missing values in Genre: 46
Number of missing values in Genre after Preprocess: 0


'Rating', 'Studio', 'Release Date'

<h3> Studio Column </h3>

In [54]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.metrics import accuracy_score, mean_absolute_error

In [55]:
# Initialize encoders for categorical columns
encoders = {col: LabelEncoder() for col in ['Cast', 'Director', 'Genre', 'Rating', 'Studio']}

# Define the similarity-based filling function
def process_studio(studio_string):
    """Normalize Studio string.""" 
    if pd.isna(studio_string): 
        return 'Unknown Studio'
    return str(studio_string).strip()

def get_similar_movies_for_studio(row, data, n_similar=5):
    """Find similar movies based on Cast, Director, Genre, and Rating."""
    mask = pd.Series(True, index=data.index)
    
    if not pd.isna(row['Genre']):
        mask &= (data['Genre'] == row['Genre'])
    if not pd.isna(row['Cast']):
        mask &= (data['Cast'] == row['Cast'])
    if not pd.isna(row['Director']):
        mask &= (data['Director'] == row['Director'])
    if not pd.isna(row['Rating']):
        mask &= (data['Rating'] == row['Rating'])
    
    similar_movies = data[mask]
    if len(similar_movies) < n_similar and not pd.isna(row['Genre']):
        genre_movies = data[data['Genre'] == row['Genre']]
        similar_movies = pd.concat([similar_movies, genre_movies]).drop_duplicates()
    return similar_movies.head(n_similar)

def fill_studio_with_similarity(row, data):
    """Fill missing Studio using similarity-based method."""
    if pd.isna(row['Studio']):
        similar_movies = get_similar_movies_for_studio(row, data)
        all_studios = [process_studio(movie['Studio']) for _, movie in similar_movies.iterrows()]
        if all_studios:
            most_common = Counter(all_studios).most_common(1)
            return most_common[0][0]
    return None

# Define the ML-based filling function
def fill_studio_with_ml(row, model, encoders):
    """Fill missing Studio using machine learning."""
    if pd.isna(row['Studio']):
        input_data = pd.DataFrame([{col: encoders[col].transform([row[col]])[0] for col in ['Cast', 'Director', 'Genre', 'Rating']}])
        prediction = model.predict(input_data)
        return encoders['Studio'].inverse_transform(prediction)[0]
    return row['Studio']

def hybrid_fill_studio(row, data, model, encoders):
    """Combine similarity-based and machine learning methods."""
    studio = fill_studio_with_similarity(row, data)
    if studio is not None:
        return studio
    return fill_studio_with_ml(row, model, encoders)

# Load data
studio_process_data = raw_data.copy()

# Prepare data for ML model
for col in ['Cast', 'Director', 'Genre', 'Rating', 'Studio']:
    studio_process_data[col] = studio_process_data[col].fillna('Unknown')
    encoders[col].fit(studio_process_data[col])
    studio_process_data[col] = encoders[col].transform(studio_process_data[col])

# Split data into known and unknown Studio values
known_studio = studio_process_data[studio_process_data['Studio'] != encoders['Studio'].transform(['Unknown'])[0]]
X = known_studio[['Cast', 'Director', 'Genre', 'Rating']]
y = known_studio['Studio']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a RandomForestClassifier on the known data
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Apply hybrid strategy to fill missing values
studio_process_data['Studio'] = studio_process_data.apply(
    lambda row: hybrid_fill_studio(row, studio_process_data, rf_model, encoders), axis=1
)

# Decode Studio column back to original values
studio_process_data['Studio'] = encoders['Studio'].inverse_transform(studio_process_data['Studio'])

# Output the results
print("Number of missing values in Studio after Preprocess:", studio_process_data['Studio'].isnull().sum())

# Evaluate the model's accuracy using the test set
accuracy = accuracy_score(y_test, rf_model.predict(X_test)) * 100
print(f"Accuracy of the RandomForest model on the test set: {accuracy:.2f}%")


Number of missing values in Studio after Preprocess: 0
Accuracy of the RandomForest model on the test set: 9.75%


## Hybrid Approach for Filling Missing Studio Values (also applied on rating column)

The algorithm fills missing `Studio` values in a dataset using a combination of two strategies:
1. **Similarity-Based Approach** (using Movie attributes like Cast, Director, Genre, and Rating).
2. **Machine Learning Model** (Random Forest Classifier) based on categorical features.

### Overview
We aim to impute missing values in the `Studio` column, which represents the production studio of a movie. The strategy involves two methods that work together:

1. **Similarity-Based Method:**
   - This approach tries to find movies that are similar to the one with the missing `Studio` value based on shared attributes (Cast, Director, Genre, and Rating).
   - For each movie with a missing `Studio`, we find other movies in the dataset with matching attributes. The most frequent `Studio` among these similar movies is used to fill in the missing value.

2. **Machine Learning Model (Random Forest):**
   - When no suitable similar movies can be found, we fall back on using a trained machine learning model to predict the `Studio`.
   - The model is trained on known records where the `Studio` is not missing. We use the `Cast`, `Director`, `Genre`, and `Rating` as features for predicting the `Studio` of a movie.

### Step-by-Step Explanation

#### 1. **Data Preprocessing and Encoding**
   - The categorical columns (e.g., `Cast`, `Director`, `Genre`, `Rating`, and `Studio`) are encoded using **Label Encoding**. This converts the text data into numerical values so that it can be used in machine learning models.
   - The missing values in the `Studio` column are initially marked as "Unknown" to handle cases where the `Studio` is missing.

#### 2. **Similarity-Based Approach**
   - For each movie with a missing `Studio`, the algorithm searches for similar movies based on shared attributes (`Cast`, `Director`, `Genre`, and `Rating`).
   - Movies that match on multiple attributes are considered similar, and the `Studio` value of the most common similar movies is used to fill the missing `Studio`.

   ##### Example:
   - If a movie has `Cast = Actor1`, `Director = Director1`, `Genre = Action`, and `Rating = PG`, the algorithm searches for movies with the same attributes. If it finds movies with the same attributes and known `Studio`, it returns the most frequent `Studio`.

#### 3. **Machine Learning Model (Random Forest)**
   - If the similarity-based approach does not find enough similar movies (or if the `Studio` is still missing), the algorithm uses a **Random Forest Classifier** to predict the missing `Studio`.
   - The model is trained using a dataset of movies where the `Studio` is not missing. The input features used for training are the `Cast`, `Director`, `Genre`, and `Rating`.
   - After training, the model can predict the `Studio` for movies with missing values based on these features.

   ##### Random Forest Model Training:
   - **Features**: `Cast`, `Director`, `Genre`, `Rating`
   - **Target**: `Studio`
   - The model is trained on the available data, and once trained, it can be used to predict the `Studio` for rows with missing values.

#### 4. **Hybrid Approach**
   - The algorithm first attempts to fill the missing `Studio` using the similarity-based method. If the `Studio` cannot be determined through similarity, it falls back on the Random Forest model for prediction.
   - This ensures that the algorithm works well in scenarios where similar movies exist and also uses machine learning when no direct similarity can be found.

#### 5. **Final Output**
   - The `Studio` column is fully populated with either the predicted values from the similarity-based method or the machine learning model.
   - The dataset is then decoded back to its original categorical values for `Studio`.

### Benefits of the Hybrid Approach:
- **Accuracy**: The similarity-based method leverages domain knowledge (shared attributes between movies) to make reasonable inferences. The Random Forest model adds another layer of prediction based on historical patterns.
- **Flexibility**: This hybrid approach can handle missing values in different scenarios, whether based on similarity or machine learning.
- **Robustness**: By combining both methods, the algorithm is more robust to varying data distributions and can adapt to different types of missingness in the `Studio` column.

### Conclusion
This approach allows us to fill in missing `Studio` data intelligently, utilizing both rule-based similarity matching and predictive modeling, leading to more accurate and meaningful imputations for the dataset.



<h3> Rating Column </h3>

In [56]:
# Initialize encoders for categorical columns
encoders = {col: LabelEncoder() for col in ['Cast', 'Director', 'Genre', 'Rating', 'Studio']}

# Define the similarity-based
def process_rating(rating_value):
    """Normalize Rating value."""
    if pd.isna(rating_value):
        return 'Unknown Rating'
    return str(rating_value).strip()

def get_similar_movies_for_rating(row, data, n_similar=5):
    """Find similar movies based on Cast, Director, Genre, and Studio."""
    mask = pd.Series(True, index=data.index)
    
    if not pd.isna(row['Genre']):
        mask &= (data['Genre'] == row['Genre'])
    if not pd.isna(row['Cast']):
        mask &= (data['Cast'] == row['Cast'])
    if not pd.isna(row['Director']):
        mask &= (data['Director'] == row['Director'])
    if not pd.isna(row['Studio']):
        mask &= (data['Studio'] == row['Studio'])
    
    similar_movies = data[mask]
    
    if len(similar_movies) < n_similar and not pd.isna(row['Genre']):
        genre_movies = data[data['Genre'] == row['Genre']]
        similar_movies = pd.concat([similar_movies, genre_movies]).drop_duplicates()
    
    return similar_movies.head(n_similar)

def fill_rating_with_similarity(row, data):
    """Fill missing Rating using similarity-based method."""
    if pd.isna(row['Rating']):
        similar_movies = get_similar_movies_for_rating(row, data)
        all_ratings = [process_rating(movie['Rating']) for _, movie in similar_movies.iterrows()]
        if all_ratings:
            most_common = Counter(all_ratings).most_common(1)
            return most_common[0][0]
    return None

# Define the ML-based
def fill_rating_with_ml(row, model, encoders):
    """Fill missing Rating using machine learning."""
    if pd.isna(row['Rating']):
        input_data = pd.DataFrame([{col: encoders[col].transform([row[col]])[0] for col in ['Cast', 'Director', 'Genre', 'Studio']}])
        prediction = model.predict(input_data)
        return encoders['Rating'].inverse_transform(prediction)[0]
    return row['Rating']

# Combine similarity-based and ML methods (Hybrid method)
def hybrid_fill_rating(row, data, model, encoders):
    """Combine similarity-based and machine learning methods for Rating."""
    rating = fill_rating_with_similarity(row, data)
    if rating is not None:
        return rating
    return fill_rating_with_ml(row, model, encoders)

rating_process_data = raw_data.copy()

for col in ['Cast', 'Director', 'Genre', 'Rating', 'Studio']:
    rating_process_data[col] = rating_process_data[col].fillna('Unknown')
    encoders[col].fit(rating_process_data[col])
    rating_process_data[col] = encoders[col].transform(rating_process_data[col])

# Prepare training data for Random Forest model
known_rating = rating_process_data[rating_process_data['Rating'] != encoders['Rating'].transform(['Unknown'])[0]]
X = known_rating[['Cast', 'Director', 'Genre', 'Studio']]
y = known_rating['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest Classifier model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Apply hybrid strategy
rating_process_data['Rating'] = rating_process_data.apply(
    lambda row: hybrid_fill_rating(row, rating_process_data, rf_model, encoders), axis=1
)

# Decode Rating column back to original values
rating_process_data['Rating'] = encoders['Rating'].inverse_transform(rating_process_data['Rating'])

# Check results
print("Accuracy score for Random Forest model:", accuracy_score(y_test, rf_model.predict(X_test)) * 100)


Accuracy score for Random Forest model: 61.2448132780083


<h3> Release Date Column </h3>

In [57]:
date_process_data = raw_data.copy()

# Fill missing values in 'Genre' with 'Unknown'
date_process_data['Genre'] = date_process_data['Genre'].fillna('Unknown')

# extract the year from the 'Release Date'
date_process_data['Release Year'] = pd.to_datetime(date_process_data['Release Date'], errors='coerce').dt.year

# Filter out rows where Release Year is NaN)
genre_popularity = date_process_data.dropna(subset=['Release Year']).groupby(['Genre', 'Release Year']).size().reset_index(name='Movie Count')

# predict based on Genre Popularity Trend
def predict_release_year(row, genre_popularity):
    """Predict the Release Year based on genre popularity trend and return date as 'yyyy-01-01'."""
    if pd.notna(row['Release Year']):
        return pd.to_datetime(f"{int(row['Release Year'])}-01-01")
    
    genre = row['Genre']
    
    # Filter the popularity data
    genre_trend = genre_popularity[genre_popularity['Genre'] == genre]
    
    # fallback
    if genre_trend.empty:
        # Use the median year
        median_year = int(data['Release Year'].median())
        return pd.to_datetime(f"{median_year}-01-01")
    
    # Sort by movie count per year and pick the year with the highest count for the genre
    peak_year = int(genre_trend.loc[genre_trend['Movie Count'].idxmax()]['Release Year'])
    
    return pd.to_datetime(f"{peak_year}-01-01")

# Apply the prediction function
date_process_data['Predicted Release Date'] = date_process_data.apply(lambda row: predict_release_year(row, genre_popularity), axis=1)


date_process_data['Release Date'] = pd.to_datetime(date_process_data['Release Date'], errors='coerce')
date_process_data['Release Date'] = date_process_data['Release Date'].dt.year.apply(lambda x: pd.to_datetime(f"{int(x)}-01-01") if pd.notna(x) else np.nan)

print("Accuracy:", accuracy_score(date_process_data['Release Date'], date_process_data['Predicted Release Date'])*100)

# Fill missing values in 'Release Date' with 'Predicted Release Date'
date_process_data['Release Date'] = date_process_data['Release Date'].combine_first(date_process_data['Predicted Release Date'])


Accuracy: 98.53113983548766


Handling Missing Release Year Based on Genre Popularity
----------------------------------------------------------------------

### Objective:

We aim to predict missing "Release Year" values in a movie dataset using a genre-based popularity trend. For each missing year, we determine the most likely year based on the genre’s peak movie release count. The predicted release year is then formatted as `yyyy-01-01` (the first day of the year).

### Steps to Process the Data:

1.  **Load and Preprocess the Dataset**:
    
    *   The dataset is first loaded, and missing values in the `Genre` column are filled with a placeholder, `'Unknown'`, if applicable.
    *   We extract the year from the `Release Date` column using `pd.to_datetime()`, which allows us to separate the date into year, month, and day.
2.  **Genre Popularity Trend by Year**:
    
    *   To determine which years are most popular for each genre, we group the dataset by `Genre` and `Release Year` and count how many movies were released each year for each genre.
    *   This count of movies per genre per year helps identify the most "popular" year for each genre.
3.  **Prediction of Missing Release Year**:
    
    *   For each movie with a missing release year, we attempt to predict the year based on the genre’s popularity trend. If the genre has a trend for multiple years, the year with the highest movie count is chosen as the predicted year.
    *   If a genre doesn’t have sufficient data or trends (e.g., a genre with no available release year data), we use the median year of all movies as a fallback prediction.
4.  **Handling of Year Format**:
    
    *   The `Release Year` column may sometimes contain float values (e.g., `1997.0`), which can cause errors during date processing. To handle this:
        *   We explicitly cast the `Release Year` as an integer (`int()`) to ensure the year is properly formatted.
        *   The final predicted release date is formatted as `yyyy-01-01`, representing the first day of the predicted year. This format ensures that the data can be processed without error.


In [None]:
# Fill missing values in 'Studio' with 'Studio in Processed Data'
raw_data['Studio'] = studio_process_data['Studio']
# Fill missing values in 'Rating' with 'Rating in Processed Data'
raw_data['Rating'] = rating_process_data['Rating']
# Fill missing values in 'Release Date' with 'Predicted Release Date'
raw_data['Release Date'] = date_process_data['Release Date']

# Check if there are any missing values left
print("Number of missing values of categorical columns in the final dataset:", raw_data[['Cast', 'Director', 'Genre', 'Rating', 'Studio', 'Release Date']].isnull().sum())

Number of missing values of categorical columns in the final dataset: Cast            32
Director        35
Genre           46
Rating           0
Studio           0
Release Date     0
dtype: int64
