In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import sys
sys.path.append('/')
from utils_processing import transform_counts_to_calendar
from utils_plot import plot_book_count, create_map

In [2]:
# Load the book record spreadsheet once; the cells below all read from `df_0`.
BOOK_RECORD_PATH = "../data/20251011_book_record.xlsx"
df_0 = pd.read_excel(BOOK_RECORD_PATH)

# Hand the raw frame to the plotting helper imported from utils_plot.
plot_book_count(df_0)

In [None]:
# --- STEP 1: Parse each book's comma-separated genre string exactly once ---
# Rows without a genre are dropped here, so no later step has to call
# `.split` on a NaN value or re-parse the same strings again.
df_genres = df_0['genre'].dropna().apply(lambda x: [g.strip() for g in x.split(',')])
has_genre = df_0['genre'].notna()   # row mask, positionally aligned with df_genres

# --- STEP 2: Collect weighted counts by genre and by type ---
genre_type_weights = {}   # genre -> {'Fiction': weight, 'Nonfiction': weight}

# A missing 'type' column is treated as "no book has a type" (all None).
if 'type' in df_0.columns:
    book_types = df_0.loc[has_genre, 'type']
else:
    book_types = [None] * len(df_genres)

for genres, book_type in zip(df_genres, book_types):
    # Every book contributes a total weight of 1, split evenly over its genres.
    weight = 1 / len(genres)

    for g in genres:
        type_weights = genre_type_weights.setdefault(g, {'Fiction': 0.0, 'Nonfiction': 0.0})

        # Books with any other (or missing) type register the genre but add no weight.
        if book_type in ('Fiction', 'Nonfiction'):
            type_weights[book_type] += weight


# --- STEP 3: Compute global totals and the legend labels derived from them ---
total_fiction = sum(v['Fiction'] for v in genre_type_weights.values())
total_nonfiction = sum(v['Nonfiction'] for v in genre_type_weights.values())
total = total_fiction + total_nonfiction

# Guard the percentages so an input without any typed book does not divide by zero.
fiction_share = total_fiction / total if total else 0.0
nonfiction_share = total_nonfiction / total if total else 0.0

fiction_flag = f'Fiction ({total_fiction:.0f}, {fiction_share:.0%})'
nonfiction_flag = f'Nonfiction ({total_nonfiction:.0f}, {nonfiction_share:.0%})'

# --- STEP 4: One row per (genre, type) pair that actually has weight ---
flag_by_type = {'Fiction': fiction_flag, 'Nonfiction': nonfiction_flag}
rows = [
    {'genre': g, 'count': w[book_type], 'flag': flag}   # type-specific count
    for g, w in genre_type_weights.items()
    for book_type, flag in flag_by_type.items()
    if w[book_type] > 0
]

# Explicit columns keep the frame usable even when `rows` is empty.
df_genre_counts = pd.DataFrame(rows, columns=['genre', 'count', 'flag'])


# --- STEP 5: Top titles (rating == 10) per genre, repeated for each genre row ---
# Built in a single pass over the books, in df_0 row order. The list is per genre,
# not per type, so the Fiction and Nonfiction rows of one genre share the same titles.
top_titles = {}
is_top_rated = df_0.loc[has_genre, 'rating'].eq(10)
for genres, top_rated, title in zip(df_genres, is_top_rated, df_0.loc[has_genre, 'title']):
    if top_rated:
        # dict.fromkeys de-duplicates a genre repeated within one book, keeping order.
        for g in dict.fromkeys(genres):
            top_titles.setdefault(g, []).append(title)

# Each row receives its own list copy so rows never share a mutable object.
df_genre_counts['top'] = df_genre_counts['genre'].map(lambda g: list(top_titles.get(g, [])))

df_genre_counts.sort_values(by='count', ascending=False)

Unnamed: 0,genre,count,flag,top
9,Science Fiction,50.0,"Fiction (85, 42%)","[The Dark Forest, Death's End, Stories of Your..."
1,Philosophy,36.833333,"Nonfiction (117, 58%)","[A History of Western Philosophy, The Social C..."
8,Science,31.166667,"Nonfiction (117, 58%)","[The Universe in a Nutshell, A Brief History o..."
3,History,18.833333,"Nonfiction (117, 58%)","[A History of Western Philosophy, The Art of G..."
4,Sociology,15.5,"Nonfiction (117, 58%)",[The Social Contract]
6,Classic,12.0,"Fiction (85, 42%)","[The Left Hand of Darkness, The Dispossessed, ..."
16,Speculative Fiction,11.0,"Fiction (85, 42%)","[Stories of Your Life and Others, Childhood's ..."
5,Architecture,6.833333,"Nonfiction (117, 58%)","[The Art of Gothic: Architecture, Sculpture, P..."
11,Thriller,4.0,"Fiction (85, 42%)",[]
10,Young Adult,3.5,"Fiction (85, 42%)",[]


In [4]:
# Genres whose weighted count is below this value are merged into a single
# 'Other' row within their flag group.
count_display_threshold = 5

grouped_list = []
for flag, genre_rows in df_genre_counts.groupby('flag'):
    is_major = genre_rows['count'] >= count_display_threshold
    shown = genre_rows.loc[is_major].copy()

    # Everything that did not reach the threshold is summed into one bucket.
    remainder = genre_rows.loc[~is_major, 'count'].sum()
    if remainder > 0:
        other_row = pd.DataFrame({'genre': 'Other', 'count': [remainder], 'flag': [flag]})
        shown = pd.concat([shown, other_row])

    grouped_list.append(shown)

df_genre_plot = pd.concat(grouped_list).reset_index(drop=True)

# Share of each row in the overall total (computed before rounding).
df_genre_plot['count_frac'] = df_genre_plot['count'] / df_genre_plot['count'].sum()

df_genre_plot = (
    df_genre_plot
    .round(2)
    .sort_values(by=['flag', 'count_frac'], ascending=[True, False])
)

# Constant label column; the total is taken from the already-rounded counts.
total_label = f'{df_genre_plot["count"].sum():.0f} books by genre'
df_genre_plot['dummy'] = [total_label] * len(df_genre_plot)
df_genre_plot

Unnamed: 0,genre,count,flag,top,count_frac,dummy
1,Science Fiction,50.0,"Fiction (85, 42%)","[The Dark Forest, Death's End, Stories of Your...",0.25,202 books by genre
0,Classic,12.0,"Fiction (85, 42%)","[The Left Hand of Darkness, The Dispossessed, ...",0.06,202 books by genre
3,Other,12.0,"Fiction (85, 42%)",,0.06,202 books by genre
2,Speculative Fiction,11.0,"Fiction (85, 42%)","[Stories of Your Life and Others, Childhood's ...",0.05,202 books by genre
4,Philosophy,36.83,"Nonfiction (117, 58%)","[A History of Western Philosophy, The Social C...",0.18,202 books by genre
8,Science,31.17,"Nonfiction (117, 58%)","[The Universe in a Nutshell, A Brief History o...",0.15,202 books by genre
5,History,18.83,"Nonfiction (117, 58%)","[A History of Western Philosophy, The Art of G...",0.09,202 books by genre
6,Sociology,15.5,"Nonfiction (117, 58%)",[The Social Contract],0.08,202 books by genre
9,Other,7.83,"Nonfiction (117, 58%)",,0.04,202 books by genre
7,Architecture,6.83,"Nonfiction (117, 58%)","[The Art of Gothic: Architecture, Sculpture, P...",0.03,202 books by genre


In [None]:
import plotly.express as px
import pandas as pd
import plotly.io as pio
import textwrap

def create_tree(
        df,
        feat,
        var,
        color_dict={fiction_flag: 'royalblue', nonfiction_flag: 'gold'},
        save_path=None
        ):
    """Draw a flat (single-level) treemap with one tile per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns named by ``feat`` and ``var`` plus a ``flag`` column.
        The frame is copied internally and is not modified.
    feat : str
        Column holding the tile labels (strings). Multi-word labels are wrapped
        with ``<br>``; single-word labels are left intact.
    var : str
        Numeric column that sizes the tiles and scales the label wrap width.
    color_dict : dict
        Maps a ``flag`` value to a tile colour; unmapped flags are drawn gray.
    save_path : str or None
        When truthy, the figure is also written to this path as HTML with
        plotly.js referenced from the CDN.

    Returns
    -------
    The result of ``fig.show(config=...)``; the figure is displayed as a side effect.

    Notes
    -----
    The title is built from the module-level ``fiction_flag`` and
    ``nonfiction_flag`` strings, coloured via ``color_dict`` (gray if unmapped).
    """
    # Wrap width bounds in characters: the smallest value wraps at
    # min_line_length, the largest at max_line_length.
    min_line_length = 10
    max_line_length = 30

    # Work on a copy so helper data never leaks into the caller's frame.
    data = df.copy()

    # Min-max normalise the sizing variable to 0-1. When all values are equal
    # (or there is nothing to compare) the range is not positive, so fall back
    # to 0 instead of dividing by zero and producing NaN.
    count_min = data[var].min()
    count_max = data[var].max()
    span = count_max - count_min
    if span > 0:
        norm = (data[var] - count_min) / span
    else:
        norm = pd.Series(0.0, index=data.index)

    # Per-row wrap width, truncated to whole characters.
    line_length = (norm * (max_line_length - min_line_length) + min_line_length).astype(int)

    def wrap_feat(label, width):
        """Wrap a multi-word label to ``width`` characters using <br> line breaks."""
        if ' ' in label:
            return '<br>'.join(textwrap.wrap(label, width=width))
        return label  # leave single-word labels intact

    labels = pd.Series(
        [wrap_feat(label, int(width)) for label, width in zip(data[feat], line_length)],
        index=data.index,
        dtype=object,
    )
    ids = data['flag'] + ' | ' + labels   # unique per flag + label
    parents = [""] * len(data)            # flat hierarchy: every tile is a root

    fig = go.Figure(go.Treemap(
        ids=ids,
        labels=labels,
        parents=parents,
        values=data[var],   # honour `var` rather than a hard-coded column name
        marker=dict(
            colors=[color_dict.get(f, 'gray') for f in data['flag']],
            line=dict(color='black', width=1)
        ),
        textinfo="label+value+percent parent",
        # texttemplate is what is actually rendered: label, value, share of the whole.
        texttemplate="%{label}<br>(%{value:.0f}, %{percentRoot:.0%})",
        textposition="middle center",
        hoverinfo='skip',   # disables hover
        branchvalues='total'
    ))

    # Same fallback colour as the tiles, so a custom palette cannot raise KeyError.
    title_text = (f'<span style="color:{color_dict.get(fiction_flag, "gray")}">{fiction_flag}</span>, '
                  f'<span style="color:{color_dict.get(nonfiction_flag, "gray")}">{nonfiction_flag}</span>')

    fig.update_layout(
        margin=dict(l=1, r=1, t=20, b=1),  # top margin leaves room for the title
        template='plotly_dark',
        font=dict(color='white', size=12),
        height=520,
        uniformtext=dict(minsize=8, mode='show'),
        title=dict(
            text=title_text,
            x=0.5,
            xanchor='center',
            yanchor='top',
            y=0.99,  # stick to the very top inside the margin
            font=dict(size=16)
        )
    )

    config = {'displayModeBar': False, 'responsive': True}
    if save_path:
        pio.write_html(fig, file=save_path, config=config, include_plotlyjs='cdn')

    return fig.show(config=config)

# Colour per flag label; the keys are the flag strings built in the genre-count cell.
genre_palette = {fiction_flag: 'royalblue', nonfiction_flag: 'gold'}

# Show the treemap inline only. To also export it, pass a path instead of None,
# e.g. save_path='../attachments/tree_book.html'.
create_tree(df_genre_plot, 'genre', 'count', color_dict=genre_palette, save_path=None)

In [7]:
# Column of the transformed frame that is passed to create_map as `var`.
VAR = 'total_books'
df = transform_counts_to_calendar(df_0)

# Fixed lower bin edges; the last edge is the largest observed value of VAR.
# NOTE(review): presumably create_map expects increasing edges — confirm for data whose max is <= 30.
map_bins = [0, 1, 10, 30, df[VAR].max()]

# Show the map inline only; pass e.g. save_path='../attachments/map_books.html' to export it.
create_map(
    df,
    var=VAR,
    code_convention='code3',
    bins=map_bins,
    color='royalblue',
    projection_type="orthographic",
    tooltip_mode='raw',
    save_path=None,
)