In [1]:
import pandas as pd
import sys
sys.path.append('/')
from utils_processing import transform_counts_to_calendar
from utils_plot import plot_bar_time_series, create_map, create_tree

In [2]:
# Load the ratings export and normalise its column names to snake_case.
movies_raw = pd.read_csv('../data/movie_record.csv')
movies_raw.columns = (
    movies_raw.columns
    .str.lower()
    # regex=False is explicit on purpose: '(' and ')' are regex
    # metacharacters and the default of `regex` differs between pandas
    # versions, so relying on the default is fragile.
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Columns that are not used anywhere further down in this notebook.
REMOVE_COLS = [
    'const',
    'original_title',
    'url',
    'imdb_rating',
    'num_votes',
    'release_date',
    'directors',
]

# Any title type mentioning 'TV' is flagged as 'TV'; every other type
# (including a missing one) is counted as 'Movie'. The match is a literal,
# case-sensitive substring test; na=False keeps missing values from raising.
is_tv = movies_raw['title_type'].str.contains('TV', regex=False, na=False)

df = (
    movies_raw
    # Drop the unwanted columns *before* removing all-empty ones, so that a
    # listed column which happens to be completely empty does not trigger a
    # KeyError, while a genuinely missing column still fails loudly.
    .drop(columns=REMOVE_COLS)
    .dropna(axis=1, how='all')
    .assign(
        title_type=is_tv.map({True: 'TV', False: 'Movie'}),
        # Only the year in which the rating was given is kept.
        date_rated=lambda d: pd.to_datetime(d['date_rated']).dt.year,
    )
    .sort_values(by='your_rating', ascending=False)
    .rename(columns={'title_type': 'flag', 'genres': 'genre'})
)
df

Unnamed: 0,your_rating,date_rated,title,flag,runtime_mins,year,genre
1577,10,2015,The Dark Knight,Movie,152.0,2008,"Drama, Crime, Thriller, Action"
909,10,2015,Home,Movie,118.0,2009,"Documentary, Family"
543,10,2019,Avengers: Endgame,Movie,181.0,2019,"Action, Sci-Fi, Adventure"
1573,10,2015,Inception,Movie,148.0,2010,"Sci-Fi, Thriller, Action, Adventure"
111,10,2024,Arrival,Movie,116.0,2016,"Sci-Fi, Drama, Mystery"
...,...,...,...,...,...,...,...
751,5,2017,Resident Evil: The Final Chapter,Movie,107.0,2016,"Action, Horror, Sci-Fi"
669,5,2017,Guardians,Movie,89.0,2017,"Sci-Fi, Action, Adventure, Fantasy, Thriller, ..."
790,4,2017,Starship Troopers 3: Marauder,Movie,105.0,2008,"Sci-Fi, Action, Adventure"
454,3,2020,Shanghai Fortress,Movie,107.0,2019,Sci-Fi


In [3]:
# Yearly bar chart of the cleaned ratings table.
# NOTE(review): select_type='book' looks like a leftover from a books
# notebook -- the `flag` column built above only holds 'Movie' / 'TV'.
# Confirm against plot_bar_time_series before changing it.
plot_bar_time_series(
    df,
    select_type='book',
    time_period='year',
    title='Movies by Release Year',
)

In [4]:
# Related genres that are reported together under one combined label.
GENRE_GROUPS = {
    'Sci-Fi': 'Sci-Fi / Fantasy', 'Fantasy': 'Sci-Fi / Fantasy',
    'History': 'History / Biography', 'Biography': 'History / Biography',
    'War': 'War / Western', 'Western': 'War / Western',
    'Mystery': 'Mystery / Thriller', 'Thriller': 'Mystery / Thriller',
    'Action': 'Action / Adventure', 'Adventure': 'Action / Adventure',
    'Music': 'Music / Musical', 'Musical': 'Music / Musical',
}

# Upper end of the rescaled rating range used in step 4.
RATING_SCALE_MAX = 10

# Step 1: split the comma-separated genre string into a list per title.
df_genres = (
    df[['flag', 'genre', 'your_rating']]
    .dropna(subset=['genre'])
    .assign(genre=lambda d: d['genre'].str.split(','))
)

# Step 2: one row per (title, genre). Every title carries a total weight of 1,
# split evenly across the genres it lists. The weight is taken from the list
# length *before* exploding, so it does not depend on the index being unique
# and needs no per-title Python callback.
df_exploded = (
    df_genres
    .assign(weight=1 / df_genres['genre'].str.len())
    .explode('genre')
    .assign(
        # Trim the blanks left by splitting on ',' and merge related genres.
        genre=lambda d: d['genre'].str.strip().replace(GENRE_GROUPS),
        weighted_rating=lambda d: d['your_rating'] * d['weight'],
    )
)

# Step 3: aggregate by flag + genre. `count` is the summed weight (so it can be
# fractional) and `rating` is the weight-averaged personal rating.
df_flag_genre = (
    df_exploded
    .groupby(['flag', 'genre'], as_index=False)
    .agg(count=('weight', 'sum'),
         rating_sum=('weighted_rating', 'sum'))
    .assign(rating=lambda d: d['rating_sum'] / d['count'])
    .drop(columns='rating_sum')
)

# Step 4: stretch the ratings linearly so that the best group scores exactly
# RATING_SCALE_MAX while the worst group keeps its raw average. Skipped when
# all groups share the same rating, which would otherwise divide by zero.
min_rating = df_flag_genre['rating'].min()
max_rating = df_flag_genre['rating'].max()
if max_rating != min_rating:
    df_flag_genre['rating'] = min_rating + (
        (df_flag_genre['rating'] - min_rating)
        * (RATING_SCALE_MAX - min_rating)
        / (max_rating - min_rating)
    )

df_flag_genre.sort_values('rating', ascending=False).round(1)

Unnamed: 0,flag,genre,count,rating
35,TV,Talk-Show,0.3,10.0
29,TV,News,0.3,10.0
27,TV,Music / Musical,0.3,10.0
18,TV,Animation,5.3,9.9
21,TV,Documentary,27.2,9.7
28,TV,Mystery / Thriller,32.6,9.6
34,TV,Sport,0.5,9.5
32,TV,Sci-Fi / Fantasy,32.3,9.4
22,TV,Drama,48.0,9.4
20,TV,Crime,17.2,9.3


In [5]:
# Colour assigned to each value of the `flag` column in the tree chart.
FLAG_COLORS = {'Movie': 'royalblue', 'TV': 'gold', 'Other': 'white'}

# Draw the genre tree from the aggregated table, using the weighted `count`
# as the plotted variable. Nothing is written to disk here; pass
# save_path='../attachments/tree_movies.html' instead of None to export it.
create_tree(
    df=df_flag_genre,
    feat='genre',
    var='count',
    flag='flag',
    threshold=30,
    threshold_global=False,
    color_dict=FLAG_COLORS,
    save_path=None,
)