In [15]:
# --- Cell 0: Setup and Cleaning ---

import os
import re

import numpy as np
import pandas as pd
import plotly.express as px
import requests
from requests.exceptions import ChunkedEncodingError, RequestException

# Rebrickable API key. It is read from the environment so the secret never
# lives in the notebook source, its saved outputs, or version history.
# NOTE: a key that was previously committed in this cell should be treated as
# leaked and rotated.
API_KEY = os.environ.get("REBRICKABLE_API_KEY", "").strip()

if not API_KEY or "PASTE_YOUR_KEY_HERE" in API_KEY:
    raise ValueError(
        "Rebrickable API key not found. Set the REBRICKABLE_API_KEY "
        "environment variable before running this notebook."
    )

# Root of the Rebrickable v3 LEGO endpoints used by the fetch helpers below.
BASE_URL = "https://rebrickable.com/api/v3/lego"


def fetch_all_sets(max_pages=10, min_year=1950, page_size=500):
    """
    Fetch sets from the Rebrickable ``/lego/sets/`` endpoint, page by page.

    Paging ends at the first of: a request or decoding error (pages fetched so
    far are kept), an empty ``results`` list, a response without a ``next``
    link, or ``max_pages`` pages. When the page cap is what ended the loop
    while the last response still advertised a ``next`` page, a warning is
    printed so a truncated sample is not mistaken for the full catalogue.

    Parameters
    ----------
    max_pages : int or None
        Maximum number of pages to request. ``None`` means "keep going until
        the API reports no further page".
    min_year : int
        Sent to the API as the ``min_year`` query parameter.
    page_size : int
        Sent to the API as the ``page_size`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per set, with ``name`` renamed to ``set_name`` and restricted
        to whichever of ``set_num``, ``set_name``, ``year``, ``num_parts`` and
        ``theme_id`` are present in the response.

    Raises
    ------
    RuntimeError
        If no set could be fetched at all.
    """
    all_sets = []
    has_next = False          # did the most recent good page advertise another page?
    total_available = None    # "count" field of the response, when the API provides one
    page = 1

    while max_pages is None or page <= max_pages:
        params = {
            "key": API_KEY,
            "page": page,
            "page_size": page_size,
            "min_year": min_year,
        }
        print(f"Sets page {page} …", end=" ")

        try:
            r = requests.get(f"{BASE_URL}/sets/", params=params, timeout=30)
            print(r.status_code)
            r.raise_for_status()
            data = r.json()
        except ChunkedEncodingError:
            print(f"\n⚠️ Got truncated response on page {page}. Stopping here.")
            break
        except (RequestException, ValueError) as e:
            # ValueError also covers JSON decoding failures on requests
            # versions whose JSON error is not a RequestException subclass.
            print(f"\n⚠️ Request error on page {page}: {e}. Stopping here.")
            break

        results = data.get("results", [])
        if not results:
            break

        all_sets.extend(results)
        total_available = data.get("count", total_available)

        # If there is no "next" link, we've reached the end
        has_next = bool(data.get("next"))
        if not has_next:
            break
        page += 1

    # Only the page cap (not an error or the natural end) leaves has_next set
    # with page pushed past max_pages.
    if has_next and max_pages is not None and page > max_pages:
        total_note = f" of {total_available}" if total_available is not None else ""
        print(
            f"⚠️ Stopped after max_pages={max_pages} pages: fetched "
            f"{len(all_sets)}{total_note} sets and more pages are available. "
            "Raise max_pages (or pass None) to fetch everything."
        )

    df = pd.DataFrame(all_sets)
    if df.empty:
        raise RuntimeError("No sets fetched – check your API key or connection.")

    # Normalize column names
    df = df.rename(columns={"name": "set_name"})
    keep_cols = [c for c in ["set_num", "set_name", "year", "num_parts", "theme_id"] if c in df.columns]
    return df[keep_cols]


def fetch_themes(page_size=500):
    """
    Download the theme lookup table from the ``/lego/themes/`` endpoint.

    Pages are requested one after another until a response has no ``results``,
    has no ``next`` link, or a request error occurs (in which case whatever was
    collected so far is used).

    Returns a DataFrame with the columns ``id``, ``theme_name`` (the API's
    ``name`` field, renamed) and ``parent_id``. Raises ``RuntimeError`` when
    nothing was collected.
    """
    collected = []
    page = 1
    more_pages = True

    while more_pages:
        query = {"key": API_KEY, "page": page, "page_size": page_size}
        print(f"Themes page {page} …", end=" ")
        try:
            response = requests.get(f"{BASE_URL}/themes/", params=query, timeout=30)
            print(response.status_code)
            response.raise_for_status()
            payload = response.json()
        except ChunkedEncodingError:
            print("\n⚠️ Got truncated response while fetching themes. Stopping here.")
            break
        except RequestException as e:
            print(f"\n⚠️ Request error while fetching themes: {e}. Stopping here.")
            break

        batch = payload.get("results", [])
        if not batch:
            break

        collected.extend(batch)
        # A missing/empty "next" link marks the last page.
        more_pages = bool(payload.get("next"))
        page += 1

    themes = pd.DataFrame(collected)
    if themes.empty:
        raise RuntimeError("No themes fetched – check API or connection.")

    return themes.rename(columns={"name": "theme_name"})[["id", "theme_name", "parent_id"]]


# --- Fetch the raw tables, join them, and tidy the result ---

sets_raw = fetch_all_sets(max_pages=12, min_year=1950, page_size=500)
themes_df = fetch_themes(page_size=500)

print(f"\nFetched {len(sets_raw)} sets and {len(themes_df)} themes.")

# Left join so every set is kept even when its theme_id has no match;
# this brings in theme_name (plus the theme's id and parent_id).
sets_df = pd.merge(
    sets_raw,
    themes_df,
    how="left",
    left_on="theme_id",
    right_on="id",
)

# Basic cleaning: require year and num_parts, keep 1950–2025 inclusive,
# then renumber the rows.
sets_df = sets_df.dropna(subset=["year", "num_parts"])
sets_df = sets_df.loc[sets_df["year"].between(1950, 2025)].reset_index(drop=True)

def assign_era(year):
    """Map a release year (anything ``int()`` accepts) to its era label.

    Eras are checked from oldest to newest by their inclusive last year;
    anything after 2010 falls into the final era.
    """
    year = int(year)
    era_last_years = (
        (1990, "Classic System (1950–1990)"),
        (2000, "Transition (1991–2000)"),
        (2010, "Franchise Shift (2001–2010)"),
    )
    for last_year, era_label in era_last_years:
        if year <= last_year:
            return era_label
    return "AFOL & IP Era (2011–present)"

# Label every set with its era, based on the integer release year.
release_years = sets_df["year"].astype(int)
sets_df["era"] = release_years.apply(assign_era)

# Theme-name keywords that mark a set as "licensed / IP" in this analysis.
# NOTE(review): Ninjago, Ideas and The LEGO Movie are counted as licensed
# here — confirm that this is the intended definition.
licensed_keywords = [
    "Star Wars", "Harry Potter", "Marvel", "DC",
    "Disney", "Jurassic", "Super Heroes", "Lord of the Rings",
    "Hobbit", "Ninjago", "Ideas", "The LEGO Movie"
]

# Case-insensitive, whole-word matcher built once from licensed_keywords.
# Word boundaries stop short keywords such as "DC" from matching inside
# unrelated words (a plain substring test would accept any name containing
# the letters "dc").
_licensed_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(k) for k in licensed_keywords) + r")\b",
    flags=re.IGNORECASE,
)

def is_licensed(name):
    """Return True when ``name`` contains one of ``licensed_keywords`` as a whole word.

    Matching is case-insensitive. Missing values (``None`` / NaN) are treated
    as not licensed.
    """
    if pd.isna(name):
        return False
    return _licensed_pattern.search(str(name)) is not None

# Flag each set as licensed or not from its (leaf) theme name.
theme_names = sets_df["theme_name"]
sets_df["is_licensed"] = theme_names.apply(is_licensed)

# LEGO palette + theme helper
lego_palette = ["#ffcf00", "#e3000b", "#006cb7", "#00923f", "#333333"]

def apply_lego_theme(fig, title, subtitle=None):
    """Apply the notebook's shared look to a Plotly figure and return it.

    Sets the template, colorway, title (with an optional smaller grey
    subtitle on a second line), fonts, margins, axis styling, a horizontal
    legend below the plot, and a fixed height of 520. A thin separator line
    between plot and legend is *appended* with ``add_shape`` so that shapes
    already present on the figure are left intact; each call adds one such
    line.

    Parameters
    ----------
    fig : figure object exposing ``update_layout`` and ``add_shape``
    title : str
        Main title text (may contain Plotly-supported HTML such as ``<br>``).
    subtitle : str, optional
        Rendered under the title when truthy.

    Returns
    -------
    The same ``fig`` object, mutated in place.
    """
    # Use a nicer, modern font (browser will fall back to Arial if needed)
    base_font = dict(family="Roboto, Arial, sans-serif", size=13)

    if subtitle:
        full_title = (
            f"{title}"
            "<br><span style='font-size:12px;color:#666;'>"
            f"{subtitle}</span>"
        )
    else:
        full_title = title

    fig.update_layout(
        template="simple_white",
        colorway=lego_palette,
        title=dict(
            text=full_title,
            x=0.02,
            xanchor="left",
            y=0.97,
            font=dict(family="Roboto, Arial, sans-serif", size=22)
        ),
        font=base_font,
        margin=dict(l=70, r=40, t=80, b=70),
        plot_bgcolor="white",
        paper_bgcolor="white",
        xaxis=dict(
            showgrid=True,
            gridcolor="rgba(0,0,0,0.06)",
            zeroline=False,
            ticks="outside",
            showline=True,
            linecolor="rgba(0,0,0,0.25)",
            tickangle=0,
            tickfont=dict(size=11),
            automargin=True
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor="rgba(0,0,0,0.06)",
            zeroline=False,
            ticks="outside",
            showline=True,
            linecolor="rgba(0,0,0,0.25)",
            tickfont=dict(size=11),
            automargin=True
        ),
        legend=dict(
            title="",
            orientation="h",
            yanchor="top",
            y=-0.1,  # Moved legend below the plot
            xanchor="center",
            x=0.5, # Centered horizontally
            bgcolor="rgba(255,255,255,0.9)",
            font=dict(size=10) # Slightly smaller font for compactness
        ),
        height=520
    )

    # Separator between plot and legend. Added via add_shape rather than
    # update_layout(shapes=[...]) because the latter overwrites/merges into
    # shapes that callers placed on the figure earlier.
    fig.add_shape(
        type="line",
        xref="paper", yref="paper",
        x0=0, y0=-0.05, x1=1, y1=-0.05, # Positioned between plot and legend
        line=dict(color="rgba(0,0,0,0.1)", width=1)
    )
    return fig

# Preview the first rows of the merged, cleaned and annotated sets table.
sets_df.head()


Sets page 1 … 200
Sets page 2 … 200
Sets page 3 … 200
Sets page 4 … 200
Sets page 5 … 200
Sets page 6 … 200
Sets page 7 … 200
Sets page 8 … 200
Sets page 9 … 200
Sets page 10 … 200
Sets page 11 … 200
Sets page 12 … 200
Themes page 1 … 200

Fetched 6000 sets and 482 themes.


Unnamed: 0,set_num,set_name,year,num_parts,theme_id,id,theme_name,parent_id,era,is_licensed
0,0003977811-1,Ninjago: Book of Adventures,2022,1,761,761,Activity Books with LEGO Parts,497.0,AFOL & IP Era (2011–present),False
1,001-1,Gears,1965,43,756,756,Samsonite,365.0,Classic System (1950–1990),False
2,0011-2,Town Mini-Figures,1979,12,67,67,Classic Town,50.0,Classic System (1950–1990),False
3,0011-3,Castle 2 for 1 Bonus Offer,1987,0,199,199,Lion Knights,186.0,Classic System (1950–1990),False
4,0012-1,Space Mini-Figures,1979,12,143,143,Supplemental,126.0,Classic System (1950–1990),False


In [16]:
# Scatter of every set: release year vs. piece count, coloured by era.
# Plot from a copy so the shared frame is never touched by plotting code.
plot_df = sets_df.copy()

crowd_axis_labels = {
    "year": "Year<br>",
    "num_parts": "Pieces per set",
    "era": "Era",
}

# Hover text reads the three custom_data columns in the order given below.
crowd_hover = (
    "<b>%{customdata[0]}</b><br>"
    "Year: %{x}<br>"
    "Pieces: %{y}<br>"
    "Theme: %{customdata[1]}<br>"
    "Set #: %{customdata[2]}<extra></extra>"
)

fig2 = px.scatter(
    plot_df,
    x="year",
    y="num_parts",
    color="era",
    opacity=0.35,
    labels=crowd_axis_labels,
    custom_data=["set_name", "theme_name", "set_num"],
)

fig2.update_traces(
    marker={"size": 7, "symbol": "circle"},
    hovertemplate=crowd_hover,
)

fig2 = apply_lego_theme(
    fig2,
    "<br>A Crowd of Bricks",
    "Each dot is a LEGO set; later years show both more sets and more large builds<br>",
)

# Figure-specific overrides applied on top of the shared theme: push the
# legend further down and enlarge the top/bottom margins.
fig2.update_layout(
    xaxis={"title_standoff": 28},
    legend={
        "orientation": "h",
        "yanchor": "top",
        "y": -0.25,
        "xanchor": "center",
        "x": 0.5,
    },
    margin={"t": 140, "b": 120},
)

fig2.show()


In [17]:
# Piece-count buckets. Intervals are left-closed (right=False). The last edge
# is infinity so that sets with 10,000 or more pieces are counted in "2000+"
# instead of falling outside the bins and being dropped as NaN.
bins = [0, 200, 500, 1000, 2000, np.inf]
labels = ["<200", "200–499", "500–999", "1000–1999", "2000+"]

sets_df["size_bucket"] = pd.cut(
    sets_df["num_parts"],
    bins=bins,
    labels=labels,
    right=False,
    include_lowest=True
)

# Number of sets per (era, size bucket); observed=True skips empty buckets.
era_size = (
    sets_df
    .dropna(subset=["size_bucket"])
    .groupby(["era", "size_bucket"], observed=True)
    .size()
    .reset_index(name="num_sets")
)

# Share of each bucket within its era, in percent.
era_totals = era_size.groupby("era")["num_sets"].transform("sum")
era_size["pct_sets"] = era_size["num_sets"] / era_totals * 100

era_order = [
    "Classic System (1950–1990)",
    "Transition (1991–2000)",
    "Franchise Shift (2001–2010)",
    "AFOL & IP Era (2011–present)"
]

# Ordered categorical so sorting follows chronology rather than the alphabet.
era_size["era"] = pd.Categorical(era_size["era"], categories=era_order, ordered=True)
era_size = era_size.sort_values(["era", "size_bucket"])

bucket_colors = {
    "<200": "#ffcf00",
    "200–499": "#00923f",
    "500–999": "#006cb7",
    "1000–1999": "#e3000b",
    "2000+": "#333333"
}

fig3 = px.bar(
    era_size,
    x="era",
    y="pct_sets",
    color="size_bucket",
    barmode="stack",
    category_orders={
        "era": era_order,
        "size_bucket": labels
    },
    color_discrete_map=bucket_colors,
    labels={
        "era": "",
        "pct_sets": "% of sets in era",
        "size_bucket": "Set size (pieces)"
    }
)

fig3.update_traces(
    opacity=0.7,
    marker=dict(
        line=dict(width=0.6, color="white")
    ),
    # %{fullData.name} is the trace name, i.e. the size bucket of the segment.
    hovertemplate="<b>%{x}</b><br>Size: %{fullData.name}<br>% of sets: %{y:.1f}%<extra></extra>"
)

fig3 = apply_lego_theme(
    fig3,
    "<br>The Rise of Big LEGO Sets",
    "Stacked share of sets by size bucket in each era"
)

fig3.update_layout(
    bargap=0.25,
    xaxis=dict(title=""),
    yaxis=dict(title="% of sets in era")
)

fig3.show()


In [18]:
# Treemap of the 15 most prolific (theme, licensed-flag) groups since 2010.
recent = sets_df[sets_df["year"] >= 2010].dropna(subset=["theme_name"])

themes_recent = recent.groupby(["theme_name", "is_licensed"], as_index=False).agg(
    num_sets=("set_num", "size")
)

# Keep only the 15 largest groups.
themes_recent = themes_recent.sort_values("num_sets", ascending=False).head(15)

theme_type_names = {True: "Licensed / IP", False: "Original in-house"}
themes_recent["theme_type"] = themes_recent["is_licensed"].map(theme_type_names)

treemap_colors = {
    "Licensed / IP": "#e3000b",   # LEGO red
    "Original in-house": "#ffcf00",  # LEGO yellow
}

# Two-level hierarchy: theme type at the top, individual themes beneath.
fig4 = px.treemap(
    themes_recent,
    path=["theme_type", "theme_name"],
    values="num_sets",
    color="theme_type",
    color_discrete_map=treemap_colors,
)

fig4.update_traces(
    textinfo="label+value",
    texttemplate="<b>%{label}</b><br>%{value} sets",
    textfont={"size": 14},
    hovertemplate="<b>%{label}</b><br>%{parent}<br>Sets since 2010: %{value}<extra></extra>",
    marker={"line": {"width": 1.5, "color": "white"}},
    opacity=0.78,
)

fig4 = apply_lego_theme(
    fig4,
    "Who LEGO Is Building For in the 2010s+",
    "Each rectangle is a theme; size shows how many sets since 2010, grouped by licensed vs original",
)

# Treemap-specific layout on top of the shared theme.
fig4.update_layout(
    margin={"l": 30, "r": 30, "t": 90, "b": 50},
    uniformtext={"minsize": 11, "mode": "show"},
    legend={
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.04,
        "xanchor": "left",
        "x": 0,
    },
    height=550,
)

fig4.show()


In [19]:
# "Flagship" sets: those at or above the 98th percentile of piece count.
threshold = np.percentile(sets_df["num_parts"], 98)
flagships = sets_df.loc[sets_df["num_parts"] >= threshold].copy()

flagships["theme_type"] = flagships["is_licensed"].map(
    {True: "Licensed / IP", False: "Original in-house"}
)

flagship_hover = (
    "<b>%{customdata[0]}</b><br>Year: %{x}<br>Pieces: %{y}<br>"
    "Theme: %{customdata[1]}<br>Set #: %{customdata[2]}<extra></extra>"
)

fig5 = px.scatter(
    flagships,
    x="year",
    y="num_parts",
    color="theme_type",
    size="num_parts",
    size_max=32,
    opacity=0.7,
    labels={
        "year": "<br><br>Year",
        "num_parts": "Pieces",
        "theme_type": "Type",
    },
    custom_data=["set_name", "theme_name", "set_num"],
)

fig5.update_traces(hovertemplate=flagship_hover)

# Sets whose name contains any of these (case-insensitive) get a callout.
iconic_keywords = [
    "Millennium Falcon",
    "Colosseum",
    "Titanic",
    "Hogwarts",
    "Star Destroyer",
    "Porsche",
    "Bugatti",
    "Eiffel",
    "Liebherr"
]

iconic_regex = "|".join(iconic_keywords)
is_iconic = flagships["set_name"].str.contains(iconic_regex, case=False, na=False)
highlight_sets = flagships[is_iconic].copy()

# Label offsets (ax, ay) cycled through so neighbouring callouts are spread out.
offsets = [
    (40, -40),
    (-40, 40),
    (60, 20),
    (-60, -20),
    (30, 60),
    (-30, -60),
    (80, -10),
    (-80, 10)
]

# Styling shared by every callout annotation.
callout_style = {
    "showarrow": True,
    "arrowhead": 1,
    "arrowsize": 1,
    "arrowwidth": 1,
    "arrowcolor": "#777",
    "font": {"size": 11, "color": "#222"},
    "bgcolor": "rgba(255,255,255,0.9)",
    "bordercolor": "rgba(0,0,0,0.1)",
    "borderpad": 6,
}

for i, (_, row) in enumerate(highlight_sets.iterrows()):
    dx, dy = offsets[i % len(offsets)]
    fig5.add_annotation(
        x=row["year"],
        y=row["num_parts"],
        text=f"<b>{row['set_name']}</b>",
        ax=dx,
        ay=dy,
        **callout_style,
    )

fig5 = apply_lego_theme(
    fig5,
    "<br>LEGO's Flagship Beasts",
    "Culturally iconic mega-sets – where LEGO becomes display object and collector artifact",
)

fig5.update_layout(
    margin={"t": 120},
    legend={
        "orientation": "h",
        "yanchor": "bottom",
        "y": -0.25,
        "xanchor": "center",
        "x": 0.5,
    },
)

fig5.show()


In [20]:
# Count of sets per release year, ordered chronologically.
sets_by_year = sets_df.groupby("year", as_index=False)
num_sets_yearly = sets_by_year.agg(num_sets=("set_num", "size")).sort_values("year")

fig_sets_per_year = px.line(
    num_sets_yearly,
    x="year",
    y="num_sets",
    markers=True,
    labels={"year": "Year", "num_sets": "Number of Sets"},
)

fig_sets_per_year.update_traces(
    hovertemplate="<b>Year:</b> %{x}<br><b>Number of sets:</b> %{y}<extra></extra>",
)

fig_sets_per_year = apply_lego_theme(
    fig_sets_per_year,
    "The LEGO Catalog Expands",
    "Number of Sets Released Per Year",
)

fig_sets_per_year.show()

In [21]:
# Yearly share of "small" sets (fewer than 200 pieces).
small_sets_df = sets_df[sets_df["num_parts"] < 200]

small_sets_yearly = (
    small_sets_df
    .groupby("year", as_index=False)
    .agg(num_small_sets=("set_num", "size"))
    .sort_values("year")
)

# Attach the small-set counts to the per-year totals; the left join keeps
# every year that has any set at all.
merged_yearly_data = pd.merge(num_sets_yearly, small_sets_yearly, how="left", on="year")

# Years without any small set come back as NaN from the join; count them as 0.
merged_yearly_data["num_small_sets"] = merged_yearly_data["num_small_sets"].fillna(0)

# Percentage of that year's sets that are small.
small_share = merged_yearly_data["num_small_sets"] / merged_yearly_data["num_sets"]
merged_yearly_data["pct_small_sets"] = small_share * 100

merged_yearly_data.head()

Unnamed: 0,year,num_sets,num_small_sets,pct_small_sets
0,1955,35,35,100.0
1,1956,16,16,100.0
2,1957,18,18,100.0
3,1958,62,62,100.0
4,1959,3,3,100.0


In [22]:
# Line chart of the yearly percentage of sets with fewer than 200 pieces.
small_pct_labels = {
    "year": "Year",
    "pct_small_sets": "Percentage of Small Sets (<200 pieces)",
}

fig_small_sets_pct = px.line(
    merged_yearly_data,
    x="year",
    y="pct_small_sets",
    markers=True,
    labels=small_pct_labels,
)

fig_small_sets_pct.update_traces(
    hovertemplate="<b>Year:</b> %{x}<br><b>% Small Sets:</b> %{y:.2f}%<extra></extra>",
)

fig_small_sets_pct = apply_lego_theme(
    fig_small_sets_pct,
    "The Disappearing Small Set",
    "Proportion of Sets with Under 200 Pieces",
)

fig_small_sets_pct.show()

In [23]:
# Earliest release year seen for each theme_id in the fetched sets.
first_appearance_year = (
    sets_df
    .groupby("theme_id")["year"]
    .min()
    .rename("first_year_introduced")
    .reset_index()
)

# How many themes make their first appearance in each year.
themes_by_debut = first_appearance_year.groupby("first_year_introduced", as_index=False)
new_themes_yearly = (
    themes_by_debut
    .agg(num_new_themes=("theme_id", "size"))
    .sort_values("first_year_introduced")
)

new_themes_yearly.head()

Unnamed: 0,first_year_introduced,num_new_themes
0,1955,4
1,1957,1
2,1960,1
3,1961,1
4,1964,1


In [24]:
# Sets per (year, licensed flag), plus each group's share of that year's sets.
licensed_original_yearly = sets_df.groupby(["year", "is_licensed"], as_index=False).agg(
    num_sets=("set_num", "size")
)

# Year total broadcast back onto every row of that year.
total_sets_per_year = licensed_original_yearly.groupby("year")["num_sets"].transform("sum")

yearly_fraction = licensed_original_yearly["num_sets"] / total_sets_per_year
licensed_original_yearly["percentage"] = yearly_fraction * 100

licensed_flag_names = {True: "Licensed / IP", False: "Original in-house"}
licensed_original_yearly["theme_type"] = licensed_original_yearly["is_licensed"].map(
    licensed_flag_names
)

licensed_original_yearly.head()

Unnamed: 0,year,is_licensed,num_sets,percentage,theme_type
0,1955,False,35,100.0,Original in-house
1,1956,False,16,100.0,Original in-house
2,1957,False,18,100.0,Original in-house
3,1958,False,62,100.0,Original in-house
4,1959,False,3,100.0,Original in-house


In [25]:
# Stacked area of the yearly percentage of licensed vs. original sets.
# theme_type is passed through custom_data so that Plotly Express splits it
# per trace together with x/y; assigning the whole column via
# update_traces(customdata=...) would give every trace the same unsplit array
# and the hover text would no longer line up with the points.
fig_ip_invasion = px.area(
    licensed_original_yearly,
    x="year",
    y="percentage",
    color="theme_type",
    labels={
        "year": "Year",
        "percentage": "Percentage of Sets (%)",
        "theme_type": "Theme Type"
    },
    line_group="theme_type",
    color_discrete_map={
        "Licensed / IP": lego_palette[1], # Red
        "Original in-house": lego_palette[0] # Yellow
    },
    custom_data=["theme_type"]
)

fig_ip_invasion.update_traces(
    hovertemplate="<b>Year:</b> %{x}<br><b>Theme Type:</b> %{customdata[0]}<br><b>Percentage:</b> %{y:.2f}%<extra></extra>"
)

fig_ip_invasion = apply_lego_theme(
    fig_ip_invasion,
    "The IP Invasion",
    "The Shifting Balance of Licensed vs. Original Sets"
)

fig_ip_invasion.show()

In [26]:
# Every set as a small square: year vs. piece count on a log axis, by era.
complexity_labels = {
    "year": "<br>Year",
    "num_parts": "Number of Pieces (log scale)",
    "era": "Era",
}

# customdata indices follow the hover_data order: set_name, theme_name, set_num.
complexity_hover = "".join([
    "<b>%{customdata[0]}</b><br>",
    "Year: %{x}<br>",
    "Pieces: %{y}<br>",
    "Theme: %{customdata[1]}<br>",
    "Set #: %{customdata[2]}",
    "<extra></extra>",  # suppresses the secondary hover box
])

fig_complexity_distribution = px.scatter(
    sets_df,
    x="year",
    y="num_parts",
    color="era",
    log_y=True,
    opacity=0.4,
    labels=complexity_labels,
    hover_data=["set_name", "theme_name", "set_num"],
)

fig_complexity_distribution.update_traces(
    marker={"symbol": "square", "size": 4},
    hovertemplate=complexity_hover,
)

fig_complexity_distribution = apply_lego_theme(
    fig_complexity_distribution,
    "<br>The LEGO Landscape: Evolution of Set Complexity Distribution",
    "Full spectrum of set sizes released each year, logarithmic piece counts<br><br><br>",
)

fig_complexity_distribution.show()

In [27]:
# Total number of sets per theme, largest first.
sets_per_theme = sets_df.groupby("theme_name")["set_num"].size()
theme_set_counts = (
    sets_per_theme
    .reset_index(name="total_sets")
    .sort_values("total_sets", ascending=False)
)

# Names of the 15 themes with the most sets.
top_15_themes = theme_set_counts["theme_name"].head(15).tolist()

in_top_15 = sets_df["theme_name"].isin(top_15_themes)
filtered_top_themes_df = sets_df[in_top_15].copy()

# Yearly set counts for those themes, keeping the licensed flag alongside.
prolific_themes_yearly = filtered_top_themes_df.groupby(
    ["year", "theme_name", "is_licensed"], as_index=False
).agg(num_sets_yearly_theme=("set_num", "size"))

prolific_themes_yearly["theme_type"] = prolific_themes_yearly["is_licensed"].map(
    {True: "Licensed / IP", False: "Original in-house"}
)

prolific_themes_yearly = prolific_themes_yearly.sort_values(
    ["year", "theme_name"]
).reset_index(drop=True)

prolific_themes_yearly.head()

Unnamed: 0,year,theme_name,is_licensed,num_sets_yearly_theme,theme_type
0,1955,Supplemental,False,20,Original in-house
1,1956,Supplemental,False,8,Original in-house
2,1957,Supplemental,False,6,Original in-house
3,1958,Supplemental,False,35,Original in-house
4,1959,Supplemental,False,2,Original in-house


In [29]:
# Add a readable licensed/original label to the shared frame; it drives the
# facet columns below.
licensed_label_by_flag = {True: "Licensed / IP", False: "Original in-house"}
sets_df["theme_type"] = sets_df["is_licensed"].map(licensed_label_by_flag)

# customdata indices follow the hover_data order: set_name, theme_name, set_num.
brick_wall_hover = (
    "<b>%{customdata[0]}</b><br>"
    "Year: %{x}<br>"
    "Pieces: %{y}<br>"
    "Theme: %{customdata[1]}<br>"
    "Set #: %{customdata[2]}"
    "<extra></extra>"
)

# One panel per theme type; points coloured by era, piece counts on a log axis.
fig_complexity_volume_scatter = px.scatter(
    sets_df,
    x="year",
    y="num_parts",
    color="era",
    log_y=True,
    opacity=0.4,
    facet_col="theme_type",
    labels={
        "year": "Year",
        "num_parts": "Number of Pieces (log scale)",
        "era": "Era",
    },
    hover_data=["set_name", "theme_name", "set_num"],
)

fig_complexity_volume_scatter.update_traces(
    marker={"symbol": "square", "size": 4},
    hovertemplate=brick_wall_hover,
)

fig_complexity_volume_scatter = apply_lego_theme(
    fig_complexity_volume_scatter,
    "The LEGO Brick Wall<br>",
    "Complexity & Volume by Theme Type",
)

fig_complexity_volume_scatter.show()

In [30]:
import plotly.graph_objects as go
import plotly.express as px

# --------- Era assignment (aligned with your written story) ----------
def assign_era_story(year):
    """Return the three-era "story" label for a release year.

    The year is coerced with ``int()``; the newest era is checked first, and
    everything up to and including 1990 falls into the earliest one.
    """
    year = int(year)
    if year > 2010:
        return "AFOL & Collectible (2011–present)"
    if year > 1990:
        return "Complexity & Realism (1991–2010)"
    return "System of Play (≤1990)"

# Add the story-era column once; re-running the cell leaves it untouched.
if "era_story" not in sets_df.columns:
    sets_df["era_story"] = sets_df["year"].astype(int).apply(assign_era_story)

# --------- Aggregate: average pieces per set by year ----------
# set_num is listed too because the aggregation below counts rows through it.
required_cols = {"year", "num_parts", "set_num"}
missing = [c for c in required_cols if c not in sets_df.columns]
if missing:
    print(f"Skipping Chart 1: missing columns {missing}")
else:
    yearly_size = (
        sets_df
        .dropna(subset=["year", "num_parts"])
        .groupby("year", as_index=False)
        .agg(avg_pieces=("num_parts", "mean"),
             num_sets=("set_num", "size"))
        .sort_values("year")
    )

    # Restrict to plausible LEGO years
    yearly_size = yearly_size[yearly_size["year"] >= 1950]

    # --------- Build figure ----------
    fig1 = px.line(
        yearly_size,
        x="year",
        y="avg_pieces",
        markers=True,
        labels={
            "year": "Year",
            "avg_pieces": "Average Pieces per Set"
        }
    )

    # Era bands as vertical colored stripes (yref='paper' to span full height)
    min_year = int(yearly_size["year"].min())
    max_year = int(yearly_size["year"].max())

    era_bands = [
        dict(
            type="rect",
            xref="x",
            yref="paper",
            x0=min_year,
            x1=1990,
            y0=0,
            y1=1,
            fillcolor="#ffcf00",
            opacity=0.10,
            line=dict(width=0),
            layer="below"
        ),
        dict(
            type="rect",
            xref="x",
            yref="paper",
            x0=1991,
            x1=2010,
            y0=0,
            y1=1,
            fillcolor="#006cb7",
            opacity=0.08,
            line=dict(width=0),
            layer="below"
        ),
        dict(
            type="rect",
            xref="x",
            yref="paper",
            x0=2011,
            x1=max_year,
            y0=0,
            y1=1,
            fillcolor="#e3000b",
            opacity=0.07,
            line=dict(width=0),
            layer="below"
        ),
    ]

    fig1.update_traces(
        line=dict(width=3),
        marker=dict(size=5)
    )

    fig1.update_layout(hovermode="x unified")

    fig1 = apply_lego_theme(
        fig1,
        "From Bricks to Beasts",
        "Average LEGO pieces per set over time, with eras from 'System of Play' to 'AFOL collectible'"
    )

    # The bands are appended only after theming, one add_shape call each, so
    # a layout update that writes to layout.shapes cannot overwrite or merge
    # into them.
    for band in era_bands:
        fig1.add_shape(**band)

    fig1.show()


In [31]:
# --------- Licensed vs original share by year ----------
required_cols = {"year", "is_licensed"}
missing = [c for c in required_cols if c not in sets_df.columns]
if missing:
    print(f"Skipping Chart 4: missing columns {missing}")
else:
    df_lic = sets_df.dropna(subset=["year"]).copy()
    df_lic["era_story"] = df_lic["year"].astype(int).apply(assign_era_story)

    # Per year: total number of sets and how many of them are licensed
    # (summing the boolean flag counts the True values).
    yearly_lic = (
        df_lic
        .groupby("year", as_index=False)
        .agg(
            total_sets=("set_num", "size"),
            licensed_sets=("is_licensed", "sum")
        )
        .sort_values("year")
    )

    yearly_lic = yearly_lic[yearly_lic["year"] >= 1970]

    yearly_lic["original_sets"] = yearly_lic["total_sets"] - yearly_lic["licensed_sets"]

    # Long format: one row per (year, kind) so each kind becomes its own trace.
    long_lic = (
        yearly_lic
        .melt(
            id_vars=["year", "total_sets"],
            value_vars=["licensed_sets", "original_sets"],
            var_name="kind",
            value_name="count"
        )
    )

    kind_labels = {
        "licensed_sets": "Licensed / IP Themes",
        "original_sets": "Original LEGO Themes"
    }
    long_lic["kind_label"] = long_lic["kind"].map(kind_labels)

    long_lic["share"] = (long_lic["count"] / long_lic["total_sets"]) * 100

    # NOTE: this rebinds fig4, the name an earlier cell used for the treemap.
    fig4 = px.area(
        long_lic,
        x="year",
        y="share",
        color="kind_label",
        groupnorm=None,
        labels={
            "year": "Year",
            "share": "% of yearly releases",
            "kind_label": ""
        },
        color_discrete_map={
            "Licensed / IP Themes": "#e3000b",
            "Original LEGO Themes": "#006cb7"
        }
    )

    fig4.update_traces(
        # %{fullData.name} is the trace name, i.e. the kind_label of the area.
        hovertemplate="<b>%{x}</b><br>%{y:.1f}% %{fullData.name}<extra></extra>"
    )

    fig4 = apply_lego_theme(
        fig4,
        "From Town to Star Wars",
        "Licensed IP gradually takes up more of LEGO's yearly releases"
    )

    fig4.show()


In [32]:
# Files that were actually written, in the order they were saved.
output_filenames = []

# Notebook-level variable name of each figure -> HTML file to write it to.
# "fig4" is whatever that name is bound to when this cell runs; it is assigned
# by more than one earlier cell.
figures_to_save = {
    "fig_sets_per_year": "fig_sets_per_year.html",
    "fig_small_sets_pct": "fig_small_sets_pct.html",
    "fig_ip_invasion": "fig_ip_invasion.html",
    "fig_complexity_distribution": "fig_complexity_distribution.html",
    "fig1": "fig1_avg_pieces_per_set.html",
    "fig2": "fig2_crowd_of_bricks.html",
    "fig3": "fig3_big_lego_sets.html",
    "fig4": "fig4_licensed_vs_original_share.html",
    "fig5": "fig5_flagship_beasts.html",
    "fig_complexity_volume_scatter": "fig_complexity_volume_scatter.html"
}

# Best-effort export: a missing or failing figure is reported and skipped so
# the remaining figures are still written.
for fig_name, filename in figures_to_save.items():
    try:
        # Figures are looked up by name in the module-level namespace.
        fig = globals()[fig_name]
        fig.write_html(filename)
    except KeyError:
        print(f"Figure '{fig_name}' not found. Skipping.")
    except Exception as e:
        print(f"Error saving {fig_name} to {filename}: {e}")
    else:
        output_filenames.append(filename)
        print(f"Saved {fig_name} to {filename}")

print("\nAll saved filenames:")
for f in output_filenames:
    print(f)

Saved fig_sets_per_year to fig_sets_per_year.html
Saved fig_small_sets_pct to fig_small_sets_pct.html
Saved fig_ip_invasion to fig_ip_invasion.html
Saved fig_complexity_distribution to fig_complexity_distribution.html
Saved fig1 to fig1_avg_pieces_per_set.html
Saved fig2 to fig2_crowd_of_bricks.html
Saved fig3 to fig3_big_lego_sets.html
Saved fig4 to fig4_licensed_vs_original_share.html
Saved fig5 to fig5_flagship_beasts.html
Saved fig_complexity_volume_scatter to fig_complexity_volume_scatter.html

All saved filenames:
fig_sets_per_year.html
fig_small_sets_pct.html
fig_ip_invasion.html
fig_complexity_distribution.html
fig1_avg_pieces_per_set.html
fig2_crowd_of_bricks.html
fig3_big_lego_sets.html
fig4_licensed_vs_original_share.html
fig5_flagship_beasts.html
fig_complexity_volume_scatter.html


In [33]:
# Mount Google Drive when running on Colab. The import is guarded so that the
# notebook still runs end-to-end on kernels where google.colab is unavailable.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("google.colab not available – skipping Google Drive mount.")
else:
    IN_COLAB = True
    drive.mount('/content/drive')

KeyboardInterrupt: 