# reel driver classifier analysis

## import dependencies

In [1]:
import json
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, Normalize
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns

## import data

In [3]:
# read the training table and the binomial classifier results from parquet
training = pl.read_parquet("../data/01_training.parquet")
results = pl.read_parquet("../data/03_binomial_classifier_results.parquet")

# preview the first rows of each table
display(training.head(), results.head())

imdb_id,tmdb_id,label,media_type,media_title,season,episode,release_year,genre,language,rt_score,metascore,imdb_rating,imdb_votes,created_at,updated_at
str,i64,cat,cat,str,i16,i16,i16,list[str],list[str],i16,i16,f64,i64,"datetime[μs, UTC]","datetime[μs, UTC]"
"""tt0002143""",130523,"""would_not_watch""","""movie""","""Dr. Jekyll and Mr. Hyde""",,,1912,"[""Horror""]","[""xx"", ""en""]",,,60.0,817,2025-05-19 14:46:11.601510 UTC,2025-05-19 14:46:11.601510 UTC
"""tt0006753""",200324,"""would_not_watch""","""movie""","""The Half-Breed""",,,1916,"[""Western""]","[""xx"", ""en""]",,,67.0,219,2025-05-19 14:46:11.601510 UTC,2025-05-19 14:46:11.601510 UTC
"""tt0011565""",27509,"""would_not_watch""","""movie""","""The Penalty""",,,1920,"[""Crime"", ""Drama"", ""Thriller""]","[""xx"", ""en""]",83.0,,73.0,2769,2025-05-19 14:46:11.601510 UTC,2025-05-19 14:46:11.601510 UTC
"""tt0012136""",300769,"""would_not_watch""","""movie""","""Enchantment""",,,1921,"[""Comedy""]","[""xx""]",,,63.0,323,2025-05-19 14:46:11.601510 UTC,2025-05-19 14:46:11.601510 UTC
"""tt0012190""",31432,"""would_not_watch""","""movie""","""The Four Horsemen of the Apoca…",,,1921,"[""War"", ""Romance"", ""Drama""]","[""en"", ""xx""]",83.0,,71.0,3594,2025-05-19 14:46:11.601510 UTC,2025-05-19 14:46:11.601510 UTC


actual,predicted,probability,imdb_id
i32,i64,f32,str
0,0,2.6e-05,"""tt0002143"""
0,0,0.000228,"""tt0006753"""
0,0,0.014118,"""tt0011565"""
0,0,8.7e-05,"""tt0012136"""
0,0,0.007678,"""tt0012190"""


## format data for analysis

In [None]:
# join tables
# Attach title metadata to every scored row. `results` and `training` both
# carry `imdb_id`, which is the only key the two loaded frames share; a left
# join keeps every prediction even if its metadata row is missing.
df = results.join(training, on="imdb_id", how="left")

# coerce actual and predicted to boolean
# Both are stored as 0/1 integers; casting lets them be used directly as
# `pl.when` predicates and as True/False categories when plotting.
df = df.with_columns(pl.col("actual", "predicted").cast(pl.Boolean))

# create all confusion matrix fields
# Branches are evaluated top to bottom, so each row receives exactly one of
# TP / FN / FP / TN.
df = df.with_columns(
    cm_value=pl.when(pl.col("actual") & pl.col("predicted"))
    .then(pl.lit("TP"))
    .when(pl.col("actual"))
    .then(pl.lit("FN"))
    .when(pl.col("predicted"))
    .then(pl.lit("FP"))
    .otherwise(pl.lit("TN"))
)

# select only relevant fields
# `imdb_id` is kept as the row identifier so outliers can be traced back to
# the source tables.
df = df.select(
    "media_title",
    "predicted",
    "actual",
    "cm_value",
    "probability",
    "release_year",
    "rt_score",
    "metascore",
    "imdb_rating",
    "imdb_votes",
    "imdb_id",
)

display(df.head())

# convert to pandas for compatibility with seaborn and the pandas Styler
pdf = df.to_pandas()

## visualize results

In [None]:
# distribution of numeric values by prediction label

# Numeric fields to compare across the predicted classes, one panel each.
features = ["rt_score", "metascore", "imdb_rating", "imdb_votes", "release_year"]

# Fields drawn on a log-scaled y axis.
log_scaled = {"imdb_votes"}

# Create a figure with one subplot per feature, side by side
fig, axes = plt.subplots(1, len(features), figsize=(20, 6))

# Define the colors for the classes
palette = {False: 'red', True: 'blue'}

# plot
for ax, feature in zip(axes, features):
    sns.violinplot(x='predicted', y=feature, data=pdf, ax=ax, hue='predicted', palette=palette)

    # `hue` duplicates the x variable, so a legend adds nothing. Depending on
    # the seaborn version the legend may not be drawn at all, in which case
    # get_legend() returns None and there is nothing to remove.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    if feature in log_scaled:
        ax.set_title(f'log({feature}) by predicted')
        ax.set_yscale('log')
    else:
        ax.set_title(f'{feature} by predicted')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# correlation matrix

# Numeric fields to correlate; rows with a null in any of them are dropped so
# every coefficient is computed over the same set of rows.
numeric_fields = [
    "probability",
    "release_year",
    "rt_score",
    "metascore",
    "imdb_rating",
    "imdb_votes",
]
corr_df = df.select(numeric_fields).drop_nulls()
corr_pdf = corr_df.to_pandas()

# Pairwise correlation between every pair of fields
corr_matrix = corr_pdf.corr()

# Hide the upper triangle (diagonal included): it mirrors the lower one
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Heatmap appearance, collected in one place
heatmap_options = dict(
    mask=mask,
    annot=True,       # print the coefficient in each cell
    cmap='Spectral',  # color scheme
    vmin=-1,
    vmax=1,           # full correlation range
    center=0,         # colormap midpoint at zero
    square=True,      # square cells
    fmt='.2f',        # two decimal places
    linewidths=0.5,   # thin separators between cells
    cbar_kws={'label': 'correlation coefficient'},
)

plt.figure(figsize=(12, 6))
sns.heatmap(corr_matrix, **heatmap_options)

# Tilt the x labels for readability; keep the y labels horizontal
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

# Fit the layout, then add the title and render
plt.tight_layout()
plt.title('correlation matrix', pad=20)
plt.show()

In [None]:
# show outliers
# note: the conditional format will not render in github

def style_probability_table(frame, caption):
    """Return a pandas Styler that highlights the `probability` column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to display; must contain a `probability` column.
    caption : str
        Caption shown above the rendered table.

    Returns
    -------
    pandas.io.formats.style.Styler
        The styled table, ready to pass to `display`.
    """
    return (
        frame.style
        .background_gradient(subset=['probability'], cmap='RdYlGn_r')  # reversed Red-Yellow-Green colormap
        .bar(subset=['probability'], color='#4a90e2', align='mid')  # Add bar charts
        # probability is a 0-1 fraction, so use the percent format spec
        # (0.0141 -> "1.41%") rather than appending "%" to the raw value.
        .format({'probability': '{:.2%}'})
        .set_caption(caption)
    )


# False negatives: lowest predicted probability first.
false_negatives = df.filter(pl.col("cm_value") == "FN").sort("probability").to_pandas()
styled_false_negatives = style_probability_table(false_negatives, 'false negatives')

display(styled_false_negatives)

# False positives: highest predicted probability first.
false_positives = df.filter(pl.col("cm_value") == "FP").sort("probability", descending=True).to_pandas()
styled_false_positives = style_probability_table(false_positives, 'false positives')

display(styled_false_positives)

In [None]:
# export false positives and false negatives for further investigation

def export_records(frame, path):
    """Write every row of a polars DataFrame to `path` as a JSON array.

    Each row becomes one JSON object keyed by column name.
    """
    with open(path, "w") as file:
        # Convert to records (list of dicts, each representing a row)
        records = list(frame.iter_rows(named=True))
        json.dump(records, file, indent=4)


# Sort directions match the outlier tables in the cell that displays them:
# false positives by descending probability, false negatives by ascending.
false_positives = df.filter(pl.col('cm_value') == "FP").sort("probability", descending=True)
export_records(false_positives, "../data/false_positives.json")

false_negatives = df.filter(pl.col('cm_value') == "FN").sort("probability")
export_records(false_negatives, "../data/false_negatives.json")


### issues discovered with training data

- parsing issue for some titles leading to issues with metadata collection
    - investigate parsing issues
- some items seem to have properly parsed values for media_title, but their metadata was still not successfully collected
    - re-run through OMDB API
