In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re
import numpy as np
import pandas as pd
import matplotlib.cm as cm
from matplotlib.colors import to_hex
import plotly.graph_objects as go
import textwrap
import matplotlib.colors as mcolors
import matplotlib

In [2]:
df = pd.read_csv('../data/8d7ebe8b-e59c-4c20-b5a5-fa075b20c70f.csv')
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.replace('-', '_')
df.columns = df.columns.str.replace('(', '')
df.columns = df.columns.str.replace(')', '')
df = df.dropna(axis=1, how='all')
REMOVE_COLS = [
    'const', 
    'original_title',
    'date_rated', 
    'url', 
    'imdb_rating', 
    'num_votes', 
    'release_date', 
    'directors'
    ]
df = df.drop(columns=REMOVE_COLS)
df['title_type'] = df['title_type'].apply(lambda x: 'TV' if 'TV' in x else x)
df['title_type'] = df['title_type'].apply(lambda x: x if x in ['Movie', 'TV'] else 'Other')
df = df.sort_values(by='your_rating', ascending=False)
df.rename(columns={'title_type': 'flag', 'genres': 'genre'}, inplace=True)
df['flag'] = df['flag'].replace('Other', 'Movie')

In [3]:
# Step 1: explode genres and compute weights per row
df_genres = df[['flag', 'genre', 'your_rating']].dropna(subset=['genre'])
df_genres = df_genres.assign(
    genre=df_genres['genre'].str.split(',')
)
df_exploded = df_genres.explode('genre')
df_exploded['genre'] = df_exploded['genre'].str.strip()

# Combine sci-fi and fantasy into "sci-fi fantasy"
df_exploded['genre'] = df_exploded['genre'].replace({
    'Sci-Fi': 'Sci-Fi / Fantasy', 'Fantasy': 'Sci-Fi / Fantasy',
    'History': 'History / Biography', 'Biography': 'History / Biography',
    'War': 'War / Western', 'Western': 'War / Western',
    'Mystery': 'Mystery / Thriller', 'Thriller': 'Mystery / Thriller',
    'Action': 'Action / Adventure', 'Adventure': 'Action / Adventure',
    'Music': 'Music / Musical', 'Musical': 'Music / Musical',
    })

# Compute weight = 1 / number of genres per original entry
df_exploded['weight'] = df_exploded.groupby(df_exploded.index)['genre'].transform(lambda x: 1 / len(x))

# Step 2: compute weighted counts and ratings
df_exploded['weighted_rating'] = df_exploded['your_rating'] * df_exploded['weight']

# Step 3: aggregate by flag + genre
df_flag_genre = (
    df_exploded.groupby(['flag', 'genre'], as_index=False)
    .agg(count=('weight', 'sum'),
         rating_sum=('weighted_rating', 'sum'))
)

# Step 4: compute average weighted rating
df_flag_genre['rating'] = df_flag_genre['rating_sum'] / df_flag_genre['count']
df_flag_genre.drop(columns='rating_sum', inplace=True)

# Step 5: optionally group small genres into "Other" per flag
threshold = 30
df_flag_genre['genre'] = df_flag_genre.apply(
    lambda row: row['genre'] if row['count'] >= threshold else 'Other', axis=1
)
df_flag_genre = (
    df_flag_genre.groupby(['flag', 'genre'], as_index=False)
    .agg(count=('count', 'sum'),
         rating=('rating', 'mean'))  # averaging ratings of merged genres
)

df_flag_genre = df_flag_genre.sort_values(['flag', 'count'], ascending=[True, False])

# Step 6: normalize ratings to 10
min_rating = df_flag_genre['rating'].min()
max_rating = df_flag_genre['rating'].max()
if max_rating != min_rating:
    df_flag_genre['rating'] = min_rating + (df_flag_genre['rating'] - min_rating) * (10 - min_rating) / (max_rating - min_rating)

df_flag_genre.sort_values('rating', ascending=False).round(1)

Unnamed: 0,flag,genre,count,rating
12,TV,Mystery / Thriller,32.6,10.0
14,TV,Sci-Fi / Fantasy,32.3,9.7
11,TV,Drama,48.0,9.6
10,TV,Action / Adventure,37.8,9.3
13,TV,Other,96.3,8.8
4,Movie,History / Biography,62.1,8.7
3,Movie,Drama,198.5,8.6
7,Movie,Romance,41.4,8.4
9,Movie,War / Western,36.7,8.3
8,Movie,Sci-Fi / Fantasy,178.6,8.0


In [5]:
flags = df_flag_genre['flag'].unique()
cmap = matplotlib.colormaps['rainbow']
flag_colors = {flag: mcolors.to_hex(cmap(i / (len(flags)-1))) for i, flag in enumerate(flags)}

# Step 4: manually assign colors per flag
flag_colors = {
    'Movie': 'royalblue',
    'TV': 'gold',
    'Other': 'white',
}

import textwrap

# Define min/max line length
min_line_length = 10
max_line_length = 30

# Normalize count to 0-1
count_min = df_flag_genre['count'].min()
count_max = df_flag_genre['count'].max()
df_flag_genre['norm_count'] = (df_flag_genre['count'] - count_min) / (count_max - count_min)

# Compute dynamic line length
df_flag_genre['line_length'] = df_flag_genre['norm_count'] * (max_line_length - min_line_length) + min_line_length
df_flag_genre['line_length'] = df_flag_genre['line_length'].astype(int)

def wrap_genre(row):
    genre = row['genre']
    line_length = row['line_length']
    
    # If the whole genre fits, return as is
    if len(genre) <= line_length:
        return genre
    
    # Split into words
    words = genre.split(' ')
    lines = []
    current_line = ''
    
    for word in words:
        # If adding this word exceeds line_length
        if len(current_line) + len(word) + (1 if current_line else 0) > line_length:
            # Push current line to lines
            if current_line:
                lines.append(current_line)
            current_line = word
        else:
            # Append word to current line
            current_line = f"{current_line} {word}" if current_line else word
    
    # Add last line
    if current_line:
        lines.append(current_line)
    
    return '<br>'.join(lines)

df_flag_genre['genre_wrapped'] = df_flag_genre.apply(wrap_genre, axis=1)
df_flag_genre['id'] = df_flag_genre['flag'] + ' | ' + df_flag_genre['genre_wrapped']  # unique per flag+genre
df_flag_genre['parent'] = ""  # flat hierarchy
df_flag_genre['label'] = df_flag_genre['genre_wrapped']  # show only genre text

# Step 6: create treemap
fig = go.Figure(go.Treemap(
    ids=df_flag_genre['id'],
    labels=df_flag_genre['label'],
    parents=df_flag_genre['parent'],
    values=df_flag_genre['count'],
    marker=dict(
        colors=[flag_colors.get(f, 'gray') for f in df_flag_genre['flag']],
        line=dict(color='black', width=1)
    ),
    textinfo="label+value+percent parent",
    texttemplate="%{label}<br>(%{value:.0f}, %{percentParent:.0%})",
    textposition="middle center",
    hoverinfo='none',  # <-- disables hover
    branchvalues='total'
))

fig.update_layout(
    margin=dict(l=1, r=1, t=1, b=1),
    template='plotly_dark',
    height=500,
    uniformtext=dict(minsize=8, mode='show')
)

# Step 7: compute total counts per flag, sort descending for title
flag_counts = (df_flag_genre.groupby('flag', as_index=False)['count'].sum().sort_values('count', ascending=False))
total_count = flag_counts['count'].sum()
flag_summaries = []
for _, row in flag_counts.iterrows():
    flag = row['flag']
    flag_count = row['count']
    flag_frac = flag_count / total_count
    color = flag_colors.get(flag, 'gray')
    flag_summaries.append(f"<span style='color:{color}'>{flag} ({int(flag_count)}, {flag_frac:.0%})</span>")

title_text = ", ".join(flag_summaries)

fig.update_layout(
    margin=dict(l=1, r=1, t=20, b=1),  # top margin increased for title
    template='plotly_dark',
    font=dict(color='white', size=11),
    height=560,
    uniformtext=dict(minsize=8, mode='show'),
    title=dict(
        text=title_text,
        x=0.5,
        xanchor='center',
        yanchor='top',
        y=0.99,  # stick to the very top inside the margin
        font=dict(size=16)
    )
)

fig.show()

import plotly.io as pio
config = {'displayModeBar': False, 'responsive': True}
pio.write_html(fig, file='../attachments/tree_movies.html', config=config, include_plotlyjs='cdn')

In [None]:
# # Normalize the genre strings
# df['genres'] = (df['genres'].str.lower().str.replace('-', '').str.replace(' ', '').str.replace('_', '').str.strip())
# # Then split and create dummy variables
# genres_dummies = df['genres'].str.get_dummies(sep=',')
# # Rename columns to have 'genre_' prefix
# genres_dummies.columns = ['genre_' + col for col in genres_dummies.columns]
# # Normalize row-wise
# genres_normalized = genres_dummies.div(genres_dummies.sum(axis=1), axis=0)
# # Merge into original df
# df = df.join(genres_normalized).drop(columns=['genres'])

# def merge_genre_columns(df, col1, col2, new_col):
#     df[new_col] = df[col1].fillna(0) + df[col2].fillna(0)
#     df = df.drop(columns=[col1, col2])
#     return df

# df = merge_genre_columns(df, col1='genre_action', col2='genre_adventure', new_col='genre_action/adventure')
# df = merge_genre_columns(df, col1='genre_scifi', col2='genre_fantasy', new_col='genre_scifi/fantasy')
# df = merge_genre_columns(df, col1='genre_mystery', col2='genre_thriller', new_col='genre_mystery/thriller')
# df = merge_genre_columns(df, col1='genre_history', col2='genre_biography', new_col='genre_history/biography')
# df = merge_genre_columns(df, col1='genre_war', col2='genre_western', new_col='genre_war/western')
# df = merge_genre_columns(df, col1='genre_music', col2='genre_musical', new_col='genre_music/musical')
# df = merge_genre_columns(df, col1='genre_realitytv', col2='genre_talkshow', new_col='genre_reality/talkshow')

In [2]:
df_ = df
genre_cols = [col for col in df_.columns if col.startswith('genre_')]

# Sum weighted counts per genre
df_by_genre = df_[genre_cols].sum(axis=0).reset_index()
df_by_genre.columns = ['genre', 'weighted_count']
df_by_genre['genre'] = df_by_genre['genre'].str.replace('^genre_', '', regex=True)

# Raw counts per genre
raw_counts = [(df_[col] > 0).sum() for col in genre_cols]
df_by_genre['raw_count'] = raw_counts

# Average rating per genre
means = []
for col in genre_cols:
    mask = df_[col] > 0
    mean_rating = df_.loc[mask, 'your_rating'].mean() if mask.sum() > 0 else None
    means.append(mean_rating)
df_by_genre['avg_rating'] = means

# Min and max rating per genre (needed for diamond vertical extent)
mins = []
maxs = []
for col in genre_cols:
    mask = df_[col] > 0
    min_rating = df_.loc[mask, 'your_rating'].min() if mask.sum() > 0 else None
    max_rating = df_.loc[mask, 'your_rating'].max() if mask.sum() > 0 else None
    mins.append(min_rating)
    maxs.append(max_rating)
df_by_genre['min_rating'] = mins
df_by_genre['max_rating'] = maxs

# Helper function to wrap long lines for hover text
def wrap_line(line, width=45):
    return '<br>'.join(textwrap.wrap(line, width=width))

# Format top titles with line breaks for wrapping
def format_top_titles(titles, is_truncated, top_n=15):
    displayed_titles = titles[:top_n] if len(titles) > top_n else titles
    lines = [f"Top {len(displayed_titles)}:"] + displayed_titles
    wrapped_lines = [wrap_line(line) for line in lines]
    text = '<br>'.join(wrapped_lines)
    return text + ("<br>...and more" if is_truncated else "")

# Precompute top movies per genre
top_n = 15
top_movies_per_genre = {}
for col in genre_cols:
    mask = df_[col] > 0
    filtered = df_.loc[mask, ['title', 'year', 'your_rating']]
    total_movies = filtered.shape[0]
    top_rated = filtered.sort_values('your_rating', ascending=False).head(top_n)
    top_sorted = top_rated.sort_values(['year', 'title'])
    formatted_titles = [
        f"{int(row['year'])}: {row['title']} ({row['your_rating']}/10)"
        for _, row in top_sorted.iterrows()
    ]
    is_truncated = total_movies > top_n
    top_movies_per_genre[col] = (formatted_titles, is_truncated)

df_by_genre['top'] = df_by_genre['genre'].apply(
    lambda g: format_top_titles(*top_movies_per_genre.get(f'genre_{g}', ([], False)))
)

# Sort descending by weighted_count (largest on top)
df_by_genre = df_by_genre.sort_values('weighted_count', ascending=False).reset_index(drop=True)
# Removed reversal: df_by_genre = df_by_genre[::-1]

# Rescale avg_rating, min_rating, max_rating to original rating scale for x-axis
min_orig = df_['your_rating'].min()
max_orig = df_['your_rating'].max()

# Marker area proportional to weighted count
size_factor = 4e-1
areas = np.sqrt(df_by_genre['weighted_count'] * size_factor)

# Assign colors using reversed rainbow colormap
n_genres = len(df_by_genre)
spectral = cm.get_cmap('rainbow')
color_vals = np.linspace(0, 1, n_genres)
genre_list = df_by_genre['genre'].tolist()
genre_colors = {
    genre: to_hex(spectral(val))
    for genre, val in zip(genre_list, color_vals)
}

# Prepare hover text strings
hover_texts = []
for _, row in df_by_genre.iterrows():
    hover_text = (
        f"Genre: {row['genre']}<br>"
        f"Raw count: {row['raw_count']}<br>"
        f"Weighted count: {row['weighted_count']:.0f}<br>"
        f"Avg rating: {row['avg_rating']:.1f}/10<br>"
        f"Range: {row['min_rating']:.0f} - {row['max_rating']:.0f}<br><br>"
        f"{row['top']}"
    )
    hover_texts.append(hover_text)

# Build diamond shapes as polygons, y is numeric index
diamond_traces = []
for i, row in df_by_genre.iterrows():
    genre = row['genre']
    avg = row['avg_rating']
    min_r = row['min_rating']
    max_r = row['max_rating']
    area = np.sqrt(row['weighted_count'] * size_factor)
    color = genre_colors[genre]

    y = i  # y-axis numeric position

    # Calculate diamond vertical height from rating range and area
    vertical_height = area/(max_r - min_r)
    if vertical_height == 0:
        vertical_height = 0.01  # fallback minimal height to avoid zero area
    # Area of diamond = width * height; width = area / height
    half_height = vertical_height / 2
    half_width = area / vertical_height / 2

    # Diamond vertices coordinates (clockwise):
    # Top vertex: (avg, y + half_height)
    # Right vertex: (avg + half_width, y)
    # Bottom vertex: (avg, y - half_height)
    # Left vertex: (avg - half_width, y)
    # Close polygon back to top vertex
    x_pts = [avg, max_r, avg, min_r, avg]
    y_pts = [y + half_height, y, y - half_height, y, y + half_height]

    diamond_trace = go.Scatter(
        x=x_pts,
        y=y_pts,
        mode='lines',
        fill='toself',
        line=dict(color=color),
        fillcolor=color,
        hoverinfo='text',
        text=hover_texts[i],
        showlegend=False,
        hovertemplate=hover_texts[i] + "<extra></extra>",
    )
    diamond_traces.append(diamond_trace)

# Prepare y-axis ticks but keep labels blank (annotations used instead)
y_positions = list(range(len(df_by_genre)))
y_labels = df_by_genre['genre'].tolist()

annotations = [
    dict(
        xref='paper',
        x=0.005,
        y=y_pos,
        xanchor='right',
        yanchor='middle',
        text=genre,
        font=dict(color=genre_colors[genre], size=12),
        showarrow=False,
        align='right'
    )
    for y_pos, genre in zip(y_positions, y_labels)
]

fig = go.Figure(data=diamond_traces)

fig.update_layout(
    title=dict(
        text=f'{df.title.nunique()} movies/TVs by genre and rating',
        x=0.5,
        y=0.98,
        xanchor='center',
        font=dict(size=16)
    ),
    xaxis=dict(
        range=[min_orig - 0.2, max_orig + 0.2],
        showgrid=False,
        fixedrange=True,
    ),
    yaxis=dict(
        tickvals=y_positions,
        ticktext=[''] * len(y_positions),
        showgrid=False,
        fixedrange=True,
        zeroline=False,
        autorange='reversed'  # Reverse y-axis so highest weighted count is at top
    ),
    annotations=annotations,
    template='plotly_dark',
    height=520,
    margin=dict(t=10, b=10, l=108, r=10),
    dragmode=False,
    hoverlabel=dict(
        align='left',
        font_size=12,
    )
)

fig.show(config=dict(scrollZoom=False, doubleClick=False, displayModeBar=True))


  spectral = cm.get_cmap('rainbow')
  vertical_height = area/(max_r - min_r)


In [6]:
df_by_genre

Unnamed: 0,genre,weighted_count,raw_count,avg_rating,min_rating,max_rating,top
0,action/adventure,395.761183,974,7.216632,3,10,Top 15:<br>1991: Terminator 2: Judgment Day (9...
1,drama,246.486544,776,7.463918,5,10,Top 15:<br>1994: The Shawshank Redemption (9/1...
2,mystery/thriller,240.983802,718,7.267409,5,10,Top 15:<br>2000: Memento (9/10)<br>2001: A Bea...
3,scifi/fantasy,210.872691,679,7.34757,3,10,Top 15:<br>1999: The Matrix (9/10)<br>2003: Bi...
4,comedy,106.9921,328,7.131098,5,9,Top 15:<br>1989: Dead Poets Society (9/10)<br>...
5,crime,91.445635,339,7.176991,5,10,Top 15:<br>1996: Eraser (8/10)<br>2002: Catch ...
6,history/biography,69.385714,183,7.464481,5,9,Top 15:<br>1998: The Thin Red Line (9/10)<br>2...
7,documentary,47.22619,61,7.639344,6,10,Top 15:<br>2006: Nova: Monster of the Milky Wa...
8,romance,44.573016,140,7.364286,5,9,Top 15:<br>1985: Out of Africa (8/10)<br>1990:...
9,war/western,40.277381,136,7.375,5,9,Top 15:<br>1987: Empire of the Sun (8/10)<br>1...


In [None]:
import plotly.io as pio
config = {'displayModeBar': False, 'responsive': True}
pio.write_html(fig, file='../attachments/IMDb.html', config=config, include_plotlyjs='cdn')