In [40]:
import pandas as pd

import matplotlib.pyplot as plt

import numpy as np

import ipywidgets as widgets

In [42]:
# Load the source table; every row is expected to carry "Country", "Region"
# and a "<discipline> - Female" / "<discipline> - Male" column pair.
df = pd.read_csv("phds_country.csv")

# Column-name prefixes of the disciplines that get reshaped below.
disciplines = [
    "All Ph.D.s",
    "All Science & Engineering",
    "Physical & Biological",
    "Math & Computer Science",
    "Agricultural",
    "Social & Behavioral",
    "Engineering",
    "Non-Science Ph.D.s",
]


def _share(part, whole):
    """Return ``part`` as a percentage of ``whole`` rounded to 2 places, or 0 unless ``whole > 0``."""
    if whole > 0:
        return round(100 * part / whole, 2)
    return 0


def _discipline_record(country_row, field):
    """Build the long-format record for one source row and one discipline."""
    women = country_row[f"{field} - Female"]
    men = country_row[f"{field} - Male"]
    headcount = women + men
    raw_name = country_row["Country"]
    return {
        "Country": raw_name,
        # Keep only the text before the first "(" and trim the whitespace.
        "Clean Country": raw_name.split("(")[0].strip(),
        "Region": country_row["Region"],
        "Discipline": field,
        "Female Count": women,
        "Male Count": men,
        "Total Count": headcount,
        "Female %": _share(women, headcount),
        "Male %": _share(men, headcount),
    }


# One record per (source row, discipline) pair, in row-major order.
records = [
    _discipline_record(country_row, field)
    for _, country_row in df.iterrows()
    for field in disciplines
]

phd_long = pd.DataFrame(records)

# Lookup: country -> total count of the "All Ph.D.s" discipline.
all_phds_total = (
    phd_long.loc[phd_long["Discipline"] == "All Ph.D.s", ["Country", "Total Count"]]
    .pipe(lambda totals: dict(zip(totals["Country"], totals["Total Count"])))
)

# Discipline selector: options are the distinct disciplines, sorted.
discipline_dropdown = widgets.Dropdown(
    options=sorted(phd_long['Discipline'].unique()),
    value="All Ph.D.s",
    description='View Type:',
    layout=widgets.Layout(width='300px'),
)

# Three checkboxes, all ticked initially: female series, male series, and
# the "All Ph.D.s" background bubbles.
show_female, show_male, show_background = (
    widgets.Checkbox(value=True, description=caption)
    for caption in ('Female', 'Male', 'All Ph.D.s')
)

# Ordering of the countries along the x axis.
sort_toggle = widgets.ToggleButtons(
    options=['Ascending % of Females', 'Region'],
    description='Order by:',
    button_style='',
)

# Stack the three checkboxes vertically.
checkboxes = widgets.VBox([show_female, show_male, show_background])

def _region_spans(region_labels):
    """Return ``(label, first_pos, last_pos)`` for each run of equal consecutive labels."""
    spans = []
    start = 0
    for pos in range(1, len(region_labels) + 1):
        if pos == len(region_labels) or region_labels[pos] != region_labels[start]:
            spans.append((region_labels[start], start, pos - 1))
            start = pos
    return spans


def update_plot(discipline, show_f, show_m, show_bg, sort_by):
    """Draw the female/male percentage bubble chart for one discipline.

    Reads the module-level ``phd_long`` frame and ``all_phds_total`` mapping.

    Parameters
    ----------
    discipline : str
        Value of the "Discipline" column to plot.
    show_f, show_m : bool
        Whether to draw the female / male series.
    show_bg : bool
        Whether to draw grey bubbles sized by each country's "All Ph.D.s"
        total behind the visible series.
    sort_by : str
        "Region" groups countries by region (then by "Female %"); any other
        value orders countries by ascending "Female %".

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If no row of the discipline
        has a positive "Total Count", a message is printed instead.
    """
    # Apply both filters first and copy once, so the column assignment below
    # targets an independent frame rather than a slice of a temporary.
    in_view = (phd_long["Discipline"] == discipline) & (phd_long["Total Count"] > 0)
    filtered = phd_long[in_view].copy()

    # idxmax/idxmin and the region grouping cannot work on an empty frame.
    if filtered.empty:
        print(f"No Ph.D. data available for {discipline!r}.")
        return

    if sort_by == "Region":
        region_order = ["Africa", "America", "Asia", "Europe", "Middle East", "Oceania"]
        # Regions missing from the preferred order are appended rather than
        # dropped: a value outside the categories would become NaN.
        unlisted = sorted(set(filtered["Region"].dropna()) - set(region_order), key=str)
        filtered["Region"] = pd.Categorical(
            filtered["Region"], categories=region_order + unlisted, ordered=True
        )
        filtered = filtered.sort_values(by=["Region", "Female %"])
    else:
        filtered = filtered.sort_values("Female %")

    # After this reset, an index label equals the row's x position.
    filtered = filtered.reset_index(drop=True)

    x = np.arange(len(filtered))
    # NOTE(review): tick labels use the raw "Country" values; phd_long also
    # carries a "Clean Country" column -- confirm which one was intended.
    x_labels = filtered["Country"].values

    fig, ax = plt.subplots(figsize=(18, 10))

    if show_bg and (show_f or show_m):
        # Grey reference bubbles sized by the country's "All Ph.D.s" total.
        background_sizes = [all_phds_total.get(c, 0) / 40 for c in filtered["Country"]]
        if show_m:
            ax.scatter(x, filtered["Male %"], s=background_sizes, color='#888888', alpha=0.4)
        if show_f:
            ax.scatter(x, filtered["Female %"], s=background_sizes, color='#888888', alpha=0.4)

    if show_m:
        ax.scatter(x, filtered["Male %"], s=filtered["Male Count"] / 15, color='#FF7D00', label='Male')
    if show_f:
        ax.scatter(x, filtered["Female %"], s=filtered["Female Count"] / 15, color='#31D3BE', label='Female')

    if show_f or show_m:
        gap = (filtered["Female %"] - filtered["Male %"]).abs()
        highlights = [
            (gap.idxmax(), "Largest PhD Gender Gap"),
            (gap.idxmin(), "Smallest PhD Gender Gap"),
        ]

        for pos, label in highlights:
            ax.axvline(x=pos, color='gray', linestyle='dashed')
            ax.text(pos + 0.75, 90, label, ha='left', va='bottom', fontsize=10)

            # Redraw the highlighted country's bubbles slightly larger and on top.
            if show_m:
                ax.scatter(pos, filtered.at[pos, "Male %"],
                           s=filtered.at[pos, "Male Count"] / 15 + 10,
                           facecolors='#FF7D00', linewidth=1.5, zorder=5)
            if show_f:
                ax.scatter(pos, filtered.at[pos, "Female %"],
                           s=filtered.at[pos, "Female Count"] / 15 + 10,
                           facecolors='#31D3BE', linewidth=1.5, zorder=6)

    if sort_by == "Region":
        # String labels make rows with a missing region compare equal, so
        # they form one band instead of one band per row.
        region_labels = filtered["Region"].astype(str).tolist()
        for region, start, end in _region_spans(region_labels):
            ax.axvline(x=start - 0.5, color='#BeBeBe')
            mid = (start + end) / 2
            ax.text(mid, 102, region, ha='center', va='bottom', fontsize=10, fontweight='bold')

    ax.set_xticks(x)
    ax.set_xticklabels(x_labels, rotation=90)
    ax.set_ylabel("PERCENT OF TOTAL", fontsize=12)
    ax.set_xlabel("PhD Granting Country", fontsize=12)
    ax.set_ylim(-5, 105)
    ax.set_yticks(np.arange(0, 101, 10))
    ax.set_title("Global Ph.D.s Gender Gap (2010)", weight='bold', fontsize=20, loc='left')
    plt.tight_layout()
    plt.show()

def toggle_background_visibility(*args):
    """Hide and untick the background checkbox while "All Ph.D.s" is selected.

    For any other dropdown value the checkbox's ``layout.display`` is reset to
    ``None``; its ticked state is left as it is. ``*args`` is accepted and
    ignored.
    """
    viewing_totals = discipline_dropdown.value == "All Ph.D.s"
    if not viewing_totals:
        show_background.layout.display = None
        return

    show_background.layout.display = 'none'
    show_background.value = False

# Apply the visibility rule once for the initial dropdown value, then again
# on every change of the dropdown's `value`.
toggle_background_visibility()
discipline_dropdown.observe(toggle_background_visibility, names='value')

# Controls stacked vertically: discipline selector, series checkboxes, sort order.
ui = widgets.VBox(children=[discipline_dropdown, checkboxes, sort_toggle])

# Bind each update_plot parameter (by name) to the widget that supplies it.
output = widgets.interactive_output(
    update_plot,
    dict(
        discipline=discipline_dropdown,
        show_f=show_female,
        show_m=show_male,
        show_bg=show_background,
        sort_by=sort_toggle,
    ),
)

display(ui, output)


VBox(children=(Dropdown(description='View Type:', index=1, layout=Layout(width='300px'), options=('Agricultura…

Output()