In [None]:
import json, gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
from pathlib import Path

data = Path('data')
pd.set_option('display.max_rows', None)

# Load the json file.
# The encoding is passed explicitly: in text mode gzip.open otherwise uses the
# platform's locale encoding, which can mis-decode non-ASCII text on systems
# whose default is not UTF-8.
with gzip.open(data / 'franchises.json.gz', 'rt', encoding='utf-8') as f:
    franchises = json.load(f)

# Convert the franchises list to a dataframe
df = pd.DataFrame(franchises)

# Show the dataframe, highest my_score first
df = df.sort_values(by=['my_score'], ascending=False)
df

In [None]:
# Create the figure and subplots for histograms
fig, axs = plt.subplots(1, 2, figsize=(14, 6))

# Histograms of 'score' and 'my_score'.
# histplot's default statistic is the number of observations per bin (the KDE
# curve is drawn on that same scale), so the y-axis is labelled 'Count'.
sns.histplot(df['score'], kde=True, bins=30, ax=axs[0])
axs[0].set_title('Score Distribution of Franchises')
axs[0].set_xlabel('Score')
axs[0].set_ylabel('Count')

# 'my_score' is rounded to whole numbers before binning; the rounded values are
# stored on df as 'my_score_rounded'.
df['my_score_rounded'] = df['my_score'].round(0)
sns.histplot(df['my_score_rounded'], kde=True, bins=30, ax=axs[1])
axs[1].set_title('My Score Distribution of Franchises')
axs[1].set_xlabel('My Score')
axs[1].set_ylabel('Count')

plt.tight_layout()
plt.show()

# Helper function for generating box plots
def box_plot(data, x, y, title, xlabel, ylabel, order=None):
    """Draw a box plot of column `y` grouped by the categories in column `x`.

    Nothing is drawn when column `x` contains no non-null values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    x, y : str
        Names of the grouping column and of the value column.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    order : sequence, optional
        Order in which the categories of `x` appear on the axis. When omitted,
        categories are sorted by descending median of `y`. Taking the order as
        a parameter keeps the helper independent of any variable that happens
        to exist in the notebook namespace.
    """
    if not data[x].notna().any():  # No category to plot
        return

    if order is None:
        order = data.groupby(x)[y].median().sort_values(ascending=False).index

    plt.figure(figsize=(14, 6))
    sns.boxplot(x=x, y=y, data=data, order=order)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=90)  # Category labels are rotated vertically
    plt.show()

# One box plot of my_score per list-valued column: each list is exploded so
# that every entry gets its own row before grouping.
for field in ('genres', 'themes', 'studios', 'demographics'):
    df_unnested = df.explode(field)

    # Categories ordered from highest to lowest median my_score
    median_by_category = df_unnested.groupby(field)['my_score'].median()
    order = median_by_category.sort_values(ascending=False).index

    label = field.capitalize()
    box_plot(
        df_unnested,
        field,
        'my_score',
        f'My Score Distribution by {label}',
        label,
        'My Score',
    )

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def draw_co_occurrence_matrix(data, title):
    """Plot a heatmap of the product data^T . data, labelled by data's columns.

    When `data` holds 0/1 indicator columns, entry (i, j) of that product is the
    number of rows in which columns i and j are both 1. Cells on and above the
    diagonal are masked, so only the lower triangle is drawn.
    """
    labels = data.columns
    pair_counts = pd.DataFrame(np.dot(data.T, data), index=labels, columns=labels)

    # np.triu keeps the main diagonal, so the diagonal is hidden along with the
    # upper triangle.
    hidden = np.triu(np.ones_like(pair_counts, dtype=bool))

    plt.figure(figsize=(10, 10))
    sns.heatmap(pair_counts, mask=hidden, cmap='coolwarm', center=0, annot=False, fmt='d')
    plt.title(title)
    plt.show()

# A single binarizer is reused; fit_transform refits it for each feature
mlb = MultiLabelBinarizer()

# For genres and themes: one indicator column per distinct label, then plot
# which labels appear together
for feature in ('genres', 'themes'):
    feature_list = [entry[feature] for entry in franchises]
    feature_encoded = mlb.fit_transform(feature_list)
    df_feature = pd.DataFrame(feature_encoded, columns=mlb.classes_)

    heading = f'{feature.capitalize()} Co-occurrence Matrix'
    draw_co_occurrence_matrix(df_feature, heading)

# Most unpopular opinions

In [None]:
# Rank scaled score
def scale_scores(scores: pd.Series) -> pd.Series:
    """Scale scores to a range of 0 to 1 using rank scaling.

    The highest score maps to 1 and, when there are no ties for last place,
    the lowest maps to 0. Tied scores share their average rank (the default of
    `rank`). Missing values stay missing but still count toward the length
    used as the denominator.

    Parameters
    ----------
    scores : pandas.Series
        Scores to scale. A DataFrame is also accepted, in which case each
        column is ranked on its own.

    Returns
    -------
    Same type and index as `scores`, holding the scaled values. With a single
    observation the result is 1.0 rather than the NaN that 0/0 would produce.
    """
    ranks = scores.rank(ascending=False)
    n = len(scores)
    if n <= 1:
        # (ranks - 1) / (n - 1) would be 0/0 here; the only value is top-ranked.
        return ranks.where(ranks.isna(), 1.0)
    return 1 - (ranks - 1) / (n - 1)

# Rank-scale both score columns so they are compared on the same 0-1 footing
# (this removes my own scoring bias)
for source_col in ('score', 'my_score'):
    df[f'{source_col}_scaled'] = scale_scores(df[source_col])

# Signed gap (mine minus the crowd's) and its magnitude
df['score_difference'] = df['my_score_scaled'].sub(df['score_scaled'])
df['score_difference_abs'] = df['score_difference'].abs()

# Largest disagreements first
df_sorted_abs = df.sort_values(by='score_difference_abs', ascending=False)
df_sorted_abs.loc[:, ['title', 'score_scaled', 'my_score_scaled', 'score_difference']]

# My opinion vs the world's

In [None]:
# (upper bound on the absolute score difference, colour) pairs, smallest bound first
color_thresholds = [(0.05, 'green'), (0.15, 'orange')]

# Define a function to get color based on score difference and the List of Tuples representation
def get_color(score_diff, thresholds, default='red'):
    """Return the colour of the smallest threshold that `score_diff` does not exceed.

    Parameters
    ----------
    score_diff : float
        Value to classify.
    thresholds : iterable of (float, str)
        (upper bound, colour) pairs. They may be given in any order; they are
        checked from the smallest bound to the largest.
    default : str, optional
        Colour returned when `score_diff` is above every bound. A NaN also ends
        up here, because every comparison with NaN is false.
    """
    # Sorting makes the result independent of the order the pairs are listed in.
    for threshold, color in sorted(thresholds, key=lambda pair: pair[0]):
        if score_diff <= threshold:
            return color
    return default

# Colour each franchise by the size of its absolute score difference
df['Color'] = df['score_difference_abs'].map(lambda diff: get_color(diff, color_thresholds))

# Scatter of my scaled score (x) against the crowd's scaled score (y)
ax = df.plot.scatter(
    x='my_score_scaled',
    y='score_scaled',
    c=df['Color'],
    figsize=(10, 10),
    alpha=0.7,
    s=50,
    edgecolor='k',
)

# Dashed y=x diagonal: points on it have equal scaled scores
ax.plot([0, 1], [0, 1], 'k--', linewidth=0.5)

ax.set_title('My Score vs Crowd Score')
ax.set_xlabel('My Score')
ax.set_ylabel('Crowd Score')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.show()