In [1]:
import os
import json
import pandas as pd
import numpy as np
import ipywidgets as W
from IPython.display import display, clear_output
import plotly.graph_objects as go

# 1. Load Data
# Everything below (widgets, export) reads the module-level `df`. If the CSV
# is missing we still print the familiar message, but then re-raise so the
# run stops here instead of failing later with an unrelated NameError.
try:
    df = pd.read_csv("clean_data.csv")
except FileNotFoundError:
    print("Error: clean_data.csv not found.")
    raise

print(f"Loaded data: {df.shape}")
# Treat the "PS" placeholder as missing in every column
# (presumably a privacy-suppression marker — confirm against the data dictionary).
df = df.replace("PS", pd.NA)
# Keep only (CIPDESC, CREDLEV) groups that have at least 1000 rows.
df = df.groupby(["CIPDESC", "CREDLEV"]).filter(lambda x: len(x) >= 1000)
print(f"removed unhelpful data: {df.shape}")
# Convert earnings to numeric, coercing errors to NaN
df["EARN_MDN_1YR"] = pd.to_numeric(df["EARN_MDN_1YR"], errors="coerce")
print(f"cleaned data: {df.shape}")


# 2. Helper Functions
def list_majors(df: pd.DataFrame) -> list[str]:
    """Return the distinct ``CIPDESC`` values of ``df`` as a sorted list.

    Missing values are skipped, every value is converted to ``str`` and
    stripped of surrounding whitespace, and the result is ordered
    case-insensitively.
    """
    cleaned = df["CIPDESC"].dropna().astype(str).str.strip()
    distinct = list(cleaned.unique())
    distinct.sort(key=str.lower)
    return distinct


def filter_majors(df: pd.DataFrame, query: str, limit: int = 200) -> list[str]:
    """Return at most ``limit`` major names that contain ``query``.

    The comparison is a case-insensitive substring test against the names
    produced by ``list_majors``. A falsy ``query`` selects every major.
    """
    candidates = list_majors(df)
    if query:
        needle = query.lower()
        candidates = [name for name in candidates if needle in name.lower()]
    return candidates[:limit]


def slice_series(df: pd.DataFrame, major: str, credlev: int | None) -> pd.Series:
    mask = df["CIPDESC"].astype(str).str.contains(major, case=False, na=False)
    if credlev is not None and "CREDLEV" in df.columns:
        mask &= (pd.to_numeric(df["CREDLEV"], errors="coerce") == credlev)
    return df.loc[mask, "EARN_MDN_1YR"].dropna()


# 3. Interactive Picker (Plotly in Notebook)
# Controls: free-text search, major dropdown (filled by the search handler),
# credential-level dropdown (None = all levels), and the button that
# triggers the analysis. `out` receives whatever the analysis prints or plots.
search_box = W.Text(
    value="biology",
    description="Search:",
    layout=W.Layout(width="300px"),
)
major_dd = W.Dropdown(
    options=[],
    description="Major:",
    layout=W.Layout(width="500px"),
)
cred_dd = W.Dropdown(
    options=[None, *range(1, 9)],
    value=3,
    description="CREDLEV:",
)
run_btn = W.Button(description="Analyze", button_style="primary")
out = W.Output()


def on_search_change(change):
    """Refresh the major dropdown whenever the search text changes.

    ``change`` is a trait-change mapping; only entries whose ``"name"`` is
    ``"value"`` are acted on. The dropdown options become the majors matching
    ``change["new"]`` and, if there are any, the first one is selected.
    """
    if change["name"] != "value":
        return
    matches = filter_majors(df, change["new"])
    major_dd.options = matches
    if matches:
        major_dd.value = matches[0]


def on_analyze_clicked(_):
    """Plot the earnings histogram for the currently selected major.

    Click handler for the analyze button; its argument is ignored. The ``out``
    widget is cleared first and then receives either a short message (no
    major selected / no matching rows) or a Plotly histogram with vertical
    markers at the 10th, 25th, 50th, 75th and 90th percentiles.
    """
    with out:
        clear_output()
        selected_major = major_dd.value
        selected_cred = cred_dd.value

        if not selected_major:
            print("Please select a major.")
            return

        earnings = slice_series(df, selected_major, selected_cred)
        if earnings.empty:
            print(f"No data found for {selected_major} (CREDLEV={selected_cred})")
            return

        # Histogram of the selected earnings values.
        histogram = go.Histogram(
            x=earnings,
            nbinsx=50,
            marker_color='rgba(31, 119, 180, 0.7)',
            name='Count',
        )
        fig = go.Figure()
        fig.add_trace(histogram)

        # Percentile markers: the median is a solid crimson line, the rest
        # are thin grey dashed lines.
        percentiles = [10, 25, 50, 75, 90]
        for pct, value in zip(percentiles, np.percentile(earnings, percentiles)):
            is_median = pct == 50
            fig.add_vline(
                x=value,
                line_width=2 if is_median else 1,
                line_dash="solid" if is_median else "dash",
                line_color="crimson" if is_median else "#888",
                annotation_text=f"p{pct}",
                annotation_position="top",
            )

        layout_options = {
            "title": f"{selected_major} (CredLev {selected_cred}) — 1-Year Median Earnings",
            "xaxis_title": "Earnings ($)",
            "yaxis_title": "Count",
            "bargap": 0.05,
            "height": 500,
            "margin": {"l": 40, "r": 40, "t": 60, "b": 40},
        }
        fig.update_layout(**layout_options)
        fig.show()


# Subscribe to edits of the search text only; without `names` the handler is
# called for every trait change on the widget and has to discard most of them.
search_box.observe(on_search_change, names="value")
run_btn.on_click(on_analyze_clicked)

# Initialize: populate the major dropdown from the default search text, then
# show the controls in one row with the output area underneath.
on_search_change({"name": "value", "new": search_box.value})
display(W.VBox([W.HBox([search_box, major_dd, cred_dd, run_btn]), out]))


# 4. Export Data for GitHub Pages (docs/data.json)
# This generates the JSON file required by docs/graph.html

def to_payload(s: pd.Series, bins: int = 50) -> dict:
    """Summarise a numeric series as histogram bins plus percentiles.

    Args:
        s: Values to summarise. Missing entries are ignored.
        bins: Number of equal-width histogram bins.

    Returns:
        ``{"bins": [...], "pcts": {...}}`` where each bin is a dict with the
        rounded edges ``x0``/``x1`` and an integer ``count``, and ``pcts`` maps
        ``"p10"``, ``"p25"``, ``"p50"``, ``"p75"``, ``"p90"`` to values rounded
        to two decimals. Both parts are empty when there is nothing to
        summarise.
    """
    # Drop missing values up front: np.histogram cannot derive a range from
    # data containing NaN and would raise.
    values = s.dropna().to_numpy(dtype=float)
    if values.size == 0:
        return {"bins": [], "pcts": {}}

    counts, edges = np.histogram(values, bins=bins)
    # Consecutive edges delimit each bin: (edges[i], edges[i + 1]).
    bins_list = [
        {"x0": round(float(lo), 2), "x1": round(float(hi), 2), "count": int(n)}
        for lo, hi, n in zip(edges[:-1], edges[1:], counts)
    ]

    levels = (10, 25, 50, 75, 90)
    pcts = {
        f"p{level}": round(float(value), 2)
        for level, value in zip(levels, np.percentile(values, levels))
    }
    return {"bins": bins_list, "pcts": pcts}


print("Generating JSON export (this may take a few seconds)...")
majors_all = list_majors(df)
credlevs_all = sorted(df["CREDLEV"].dropna().astype(int).unique().tolist())

# Build one entry per major: a payload for every credential level that has
# data, followed by a combined "__ALL__" payload covering all levels.
# Majors without any data still get an (empty) entry.
payload_by_major = {}
for major_name in majors_all:
    combined = slice_series(df, major_name, None)

    per_level = {}
    for level in credlevs_all:
        level_series = slice_series(df, major_name, level)
        if level_series.empty:
            # Skip empty slices to keep the file small.
            continue
        per_level[int(level)] = to_payload(level_series)

    if not combined.empty:
        per_level["__ALL__"] = to_payload(combined)

    payload_by_major[major_name] = per_level

export_data = {
    "majors": majors_all,
    "credlevs": credlevs_all,
    "payload": payload_by_major,
}

# Write the result to docs/data.json, creating the directory if needed.
os.makedirs("docs", exist_ok=True)
json_path = os.path.join("docs", "data.json")
with open(json_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(export_data))

print(f"Wrote {json_path}. Ready to commit and push.")

Loaded data: (229188, 32)
removed unhelpful data: (45260, 32)
cleaned data: (45260, 32)


VBox(children=(HBox(children=(Text(value='biology', description='Search:', layout=Layout(width='300px')), Drop…

Generating JSON export (this may take a few seconds)...
Wrote docs/data.json. Ready to commit and push.
