In [1]:
from dash import Dash, html, dcc, Input, Output, State, callback_context
import dash_bootstrap_components as dbc
import polars as pl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import json

In [2]:

# Load datasets
df = pl.read_parquet("/Users/tanishq/Documents/STA631/Project_Information_Visualization/data/processed/jobs_cleaned.parquet")
rent_df = pd.read_csv("/Users/tanishq/Documents/STA631/Project_Information_Visualization/data/raw/county_average_rates.csv")
rent_df["Average_Rent"] = rent_df["Average_Rent"].str.replace(",", "").str.strip().astype(float)
rent_df["County"] = rent_df["County"].str.strip()
rent_df = rent_df[rent_df["County"].notna()]
rent_df["GeoCounty"] = rent_df["County"] + " County"

In [3]:

crime_df = pd.read_csv("/Users/tanishq/Documents/STA631/Project_Information_Visualization/data/raw/crime_data_w_population_and_crime_rate.csv")
ny_crime_df = crime_df[crime_df["county_name"].str.contains(", NY")].copy()
ny_crime_df["county_name"] = ny_crime_df["county_name"].str.replace(" County, NY", "", regex=False).str.strip()
ny_crime_df["GeoCounty"] = ny_crime_df["county_name"] + " County"

In [4]:

with open("/Users/tanishq/Documents/STA631/Project_Information_Visualization/data/raw/new-york-counties.geojson", "r") as f:
    geojson_data = json.load(f)

In [5]:

# Process skills
df_skills = df.select([
    pl.col("Salary Range From").alias("Salary From"),
    pl.col("Salary Range To").alias("Salary To"),
    pl.col("Preferred Skills").alias("Skills")
]).drop_nulls().with_columns([
    pl.col("Skills").str.split(",").alias("Skill List")
]).explode("Skill List").with_columns([
    pl.col("Skill List").str.strip_chars().str.to_lowercase().alias("Skill"),
    ((pl.col("Salary From") + pl.col("Salary To")) / 2).alias("Avg Salary")
])

skill_salary_summary = df_skills.group_by("Skill").agg([
    pl.col("Avg Salary").mean().round(0).alias("Avg Salary"),
    pl.len().alias("Job Count")
]).sort("Avg Salary", descending=True)

In [6]:

# Get summary statistics
total_jobs = len(df)
avg_salary = int(df.select((pl.col("Salary Range From") + pl.col("Salary Range To")) / 2).mean()[0, 0])
top_hiring_dept = df.group_by("Agency").agg(pl.count()).sort("count", descending=True)[0, "Agency"]

  top_hiring_dept = df.group_by("Agency").agg(pl.count()).sort("count", descending=True)[0, "Agency"]


In [7]:

# App
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# UI Components
header = dbc.Navbar(
    dbc.Container([
        dbc.NavbarBrand("NYC Job Explorer", className="fw-bold"),
    ]),
    color="primary",
    dark=True,
    className="mb-4"
)

filters = dbc.Card([
    dbc.CardHeader("Filter Jobs", className="fw-bold"),
    dbc.CardBody([
        dbc.Input(
            id="job-search",
            placeholder="Search by job title...",
            type="text",
            className="mb-3"
        ),
        dbc.Label("Career Level"),
        dcc.Dropdown(
            id="career-level",
            options=[
                {"label": level, "value": level} for level in [
                    "Student", "Entry-Level", "Experienced (non-manager)", "Executive", "Manager"
                ]
            ],
            placeholder="Select level",
            className="mb-3"
        ),
        dbc.Label("Salary Range ($)"),
        dcc.RangeSlider(
            id="salary-slider",
            min=40000, max=250000, step=10000, value=[50000, 150000],
            marks={i: f"${int(i/1000)}k" for i in range(50000, 275000, 50000)},
            className="mb-3",
            tooltip={"placement": "bottom", "always_visible": True}
        ),
        dbc.Label("Job Type"),
        dbc.RadioItems(
            id="job-type",
            options=[
                {"label": "All", "value": "all"},
                {"label": "Full-time", "value": "F"},
                {"label": "Part-time", "value": "P"}
            ],
            value="all",
            inline=True,
            className="mb-3"
        ),
        dbc.Label("Data Overlay"),
        dbc.RadioItems(
            id="overlay-type",
            options=[
                {"label": "Average Rent", "value": "rent"},
                {"label": "Crime Rate", "value": "crime"}
            ],
            value="rent",
            className="mb-4"
        ),
        dbc.Button("Apply Filters", id="search-btn", color="primary", className="me-2"),
        dbc.Button("Reset", id="reset-btn", color="secondary", outline=True)
    ])
], className="mb-4")

summary_cards = dbc.Row([
    dbc.Col(
        dbc.Card([
            dbc.CardBody([
                html.H6("Total Jobs", className="card-title text-center text-muted"),
                html.H3(id="total-jobs-display", children=f"{total_jobs:,}", className="text-center")
            ])
        ]),
        md=4, className="mb-3"
    ),
    dbc.Col(
        dbc.Card([
            dbc.CardBody([
                html.H6("Top Hiring Department", className="card-title text-center text-muted"),
                html.H5(id="top-dept-display", children=top_hiring_dept, className="text-center")
            ])
        ]),
        md=4, className="mb-3"
    ),
    dbc.Col(
        dbc.Card([
            dbc.CardBody([
                html.H6("Average Salary", className="card-title text-center text-muted"),
                html.H3(id="avg-salary-display", children=f"${avg_salary:,}", className="text-center")
            ])
        ]),
        md=4, className="mb-3"
    )
])

tabs = dbc.Tabs([
    dbc.Tab([
        dcc.Loading(
            dcc.Graph(id="job-map", style={"height": "70vh"}),
            type="circle"
        )
    ], label="Map View", tab_id="map-tab"),
    dbc.Tab([
        html.H5("Top Paying Skills", className="mt-3 mb-3"),
        dcc.Loading(
            dcc.Graph(id="skill-ranking-chart", style={"height": "60vh"}),
            type="circle"
        )
    ], label="Skills Analysis", tab_id="skills-tab"),
    dbc.Tab([
        html.H5("Jobs by Career Level", className="mt-3 mb-3"),
        dcc.Loading(
            dcc.Graph(id="career-level-chart", style={"height": "60vh"}),
            type="circle"
        )
    ], label="Career Levels", tab_id="career-tab")
], id="tabs", active_tab="map-tab", className="mb-4")

# App Layout
app.layout = dbc.Container([
    header,
    dbc.Row([
        dbc.Col(filters, md=3),
        dbc.Col([
            summary_cards,
            tabs
        ], md=9)
    ])
], fluid=True)

In [8]:

# Charts and maps
def create_top_skills_chart(filtered_df=None):
    if filtered_df is not None and len(filtered_df) > 0:
        df_skills_filtered = filtered_df.select([
            pl.col("Salary Range From").alias("Salary From"),
            pl.col("Salary Range To").alias("Salary To"),
            pl.col("Preferred Skills").alias("Skills")
        ]).drop_nulls().with_columns([
            pl.col("Skills").str.split(",").alias("Skill List")
        ]).explode("Skill List").with_columns([
            pl.col("Skill List").str.strip_chars().str.to_lowercase().alias("Skill"),
            ((pl.col("Salary From") + pl.col("Salary To")) / 2).alias("Avg Salary")
        ])

        skill_summary = df_skills_filtered.group_by("Skill").agg([
            pl.col("Avg Salary").mean().round(0).alias("Avg Salary"),
            pl.len().alias("Job Count")
        ]).sort("Avg Salary", descending=True).filter(pl.col("Job Count") > 2)[:15]

        data = skill_summary.to_pandas()
    else:
        data = skill_salary_summary.filter(pl.col("Job Count") > 5)[:15].to_pandas()

    # Format for better display
    data["Avg Salary"] = data["Avg Salary"].round(0).astype(int)

    fig = px.bar(
        data,
        x="Skill",
        y="Avg Salary",
        color="Job Count",
        text=data["Avg Salary"].apply(lambda x: f"${x:,}"),
        labels={"Avg Salary": "Average Salary ($)", "Skill": ""},
        color_continuous_scale=px.colors.sequential.Blues
    )

    fig.update_traces(
        texttemplate='%{text}',
        textposition='outside'
    )

    fig.update_layout(
        template="plotly_white",
        margin=dict(t=30, b=50, l=50, r=30),
        yaxis_title="Average Salary ($)",
        xaxis={'categoryorder':'total descending'}
    )

    return fig

In [9]:

def create_career_level_chart(filtered_df=None):


    if filtered_df is not None and len(filtered_df) > 0:
        # Get job counts from filtered data
        job_counts = filtered_df.group_by("Career Level").agg([
            pl.count().alias("Job Count")
        ]).filter(pl.col("Career Level").is_not_null())

        # Get job types that are in the filter
        job_types = filtered_df.select(pl.col("Full-Time/Part-Time indicator")).unique()
        job_type_values = job_types.to_series().to_list()

        # Get salary info from original data but filtered by job type if specified
        if "all" not in job_type_values and len(job_type_values) > 0:
            salary_data = df.filter(pl.col("Full-Time/Part-Time indicator").is_in(job_type_values))
        else:
            salary_data = df
    else:
        # If no filters, use all data
        job_counts = df.group_by("Career Level").agg([
            pl.count().alias("Job Count")
        ]).filter(pl.col("Career Level").is_not_null())
        salary_data = df

    # Calculate salary statistics from consistent dataset
    salary_stats = salary_data.group_by("Career Level").agg([
        pl.col("Salary Range From").mean().round(0).alias("Avg Min Salary"),
        pl.col("Salary Range To").mean().round(0).alias("Avg Max Salary")
    ]).filter(pl.col("Career Level").is_not_null())

    # Merge job counts with salary stats
    career_summary = job_counts.join(salary_stats, on="Career Level", how="left")
    career_summary = career_summary.sort("Job Count", descending=True)

    career_summary = career_summary.to_pandas()

    # Create the figure
    fig = go.Figure()

    colors = px.colors.qualitative.Bold

    for i, level in enumerate(career_summary["Career Level"]):
        fig.add_trace(go.Bar(
            name=level,
            x=["Min Salary", "Max Salary"],
            y=[career_summary.loc[i, "Avg Min Salary"], career_summary.loc[i, "Avg Max Salary"]],
            text=[f"${career_summary.loc[i, 'Avg Min Salary']:,.0f}", f"${career_summary.loc[i, 'Avg Max Salary']:,.0f}"],
            textposition="auto",
            marker_color=colors[i % len(colors)]
        ))

    fig.update_layout(
        barmode='group',
        template="plotly_white",
        margin=dict(t=30, b=50, l=50, r=30),
        yaxis_title="Salary ($)"
    )

    # Add job count annotation
    annotations = []
    for i, level in enumerate(career_summary["Career Level"]):
        annotations.append(dict(
            x=0.5,
            y=career_summary.loc[i, "Avg Max Salary"] + 5000,
            text=f"{career_summary.loc[i, 'Job Count']} jobs",
            showarrow=False,
            font=dict(size=12),
            xref="x",
            yref="y"
        ))

    fig.update_layout(annotations=annotations)

    return fig

In [10]:

def create_overlay_map(filtered_df, overlay_type):
    if filtered_df is None or len(filtered_df) == 0:
        filtered_df = df.sample(n=100, seed=42).to_pandas()
    else:
        filtered_df = filtered_df.to_pandas() if isinstance(filtered_df, pl.DataFrame) else filtered_df

    filtered_df["Salary Range"] = filtered_df["Salary Range From"].astype(str) + " - " + filtered_df["Salary Range To"].astype(str)

    # Color map for career levels
    color_map = {
        "Student": "#4285F4",
        "Entry-Level": "#34A853",
        "Experienced (non-manager)": "#FBBC05",
        "Executive": "#EA4335",
        "Manager": "#8E24AA"
    }

    # Create hover text
    hover_text = [
        f"<b>{title}</b><br>" +
        f"Level: {level}<br>" +
        f"Salary: ${int(sal_from):,} - ${int(sal_to):,}"
        for title, level, sal_from, sal_to in zip(
            filtered_df["Business Title"],
            filtered_df["Career Level"],
            filtered_df["Salary Range From"],
            filtered_df["Salary Range To"]
        )
    ]

    # Use color map
    colors = [color_map.get(level, "#757575") for level in filtered_df["Career Level"]]

    # Job locations
    scatter = go.Scattermapbox(
        lat=filtered_df["Latitude"],
        lon=filtered_df["Longitude"],
        mode="markers",
        marker=go.scattermapbox.Marker(
            size=10,
            color=colors,
            opacity=0.8
        ),
        text=hover_text,
        hoverinfo="text",
        name="Jobs"
    )

    # County overlay
    if overlay_type == "rent":
        z = rent_df.set_index("GeoCounty")["Average_Rent"]
        colorbar_title = "Avg Rent ($)"
        colorscale = "YlOrRd"
        title = "Job Locations with Average Rent by County"
    else:
        z = ny_crime_df.set_index("GeoCounty")["crime_rate_per_100000"]
        colorbar_title = "Crime per 100k"
        colorscale = "Reds"
        title = "Job Locations with Crime Rate by County"

    choropleth = go.Choroplethmapbox(
        geojson=geojson_data,
        locations=z.index,
        z=z.values,
        featureidkey="properties.name",
        colorscale=colorscale,
        colorbar_title=colorbar_title,
        marker_opacity=0.5,
        marker_line_width=0
    )

    # Legend for career levels
    legend_traces = []
    for level, color in color_map.items():
        legend_traces.append(
            go.Scattermapbox(
                lat=[0],
                lon=[0],
                mode="markers",
                marker=dict(size=10, color=color),
                name=level,
                showlegend=True,
                visible=True,
                hoverinfo="none"
            )
        )

    fig = go.Figure(data=[choropleth, scatter] + legend_traces)

    fig.update_layout(
        mapbox_style="carto-positron",
        mapbox_zoom=10,
        mapbox_center={"lat": 40.7128, "lon": -74.0060},
        margin={"r": 0, "t": 40, "l": 0, "b": 0},
        title=title,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=0.02,
            xanchor="center",
            x=0.5
        )
    )

    return fig

In [11]:

# Callbacks
@app.callback(
    [Output("job-map", "figure"),
     Output("skill-ranking-chart", "figure"),
     Output("career-level-chart", "figure"),
     Output("total-jobs-display", "children"),
     Output("top-dept-display", "children"),
     Output("avg-salary-display", "children")],
    [Input("search-btn", "n_clicks"),
     Input("reset-btn", "n_clicks"),
     Input("overlay-type", "value"),
     Input("tabs", "active_tab")],
    [State("job-search", "value"),
     State("career-level", "value"),
     State("salary-slider", "value"),
     State("job-type", "value")]
)
def update_dashboard(search_clicks, reset_clicks, overlay_type, active_tab,
                     search_query, career_level, salary_range, job_type):
    ctx = callback_context

    if not ctx.triggered:
        # Default state
        filtered_df = df
    else:
        button_id = ctx.triggered[0]["prop_id"].split(".")[0]

        if button_id == "reset-btn":
            filtered_df = df
        else:
            filtered_df = df.clone()

            # Apply filters
            if search_query:
                filtered_df = filtered_df.filter(pl.col("Business Title").str.to_lowercase().str.contains(search_query.lower()))

            if career_level:
                filtered_df = filtered_df.filter(pl.col("Career Level") == career_level)

            if salary_range:
                min_salary, max_salary = salary_range
                filtered_df = filtered_df.filter(
                    (pl.col("Salary Range From") <= max_salary) &
                    (pl.col("Salary Range To") >= min_salary)
                )

            if job_type and job_type != "all":
                filtered_df = filtered_df.filter(pl.col("Full-Time/Part-Time indicator") == job_type)

    # Calculate summary stats
    if len(filtered_df) > 0:
        total_jobs_display = f"{len(filtered_df):,}"

        top_dept = filtered_df.group_by("Agency").agg(pl.count()).sort("count", descending=True)
        top_dept_display = top_dept[0, "Agency"] if len(top_dept) > 0 else "N/A"

        avg_salary = filtered_df.select(
            ((pl.col("Salary Range From") + pl.col("Salary Range To")) / 2).alias("avg")
        ).mean()[0, 0]
        avg_salary_display = f"${int(avg_salary):,}" if not pd.isna(avg_salary) else "N/A"
    else:
        total_jobs_display = "0"
        top_dept_display = "N/A"
        avg_salary_display = "N/A"

    # Create visualizations
    map_fig = create_overlay_map(filtered_df, overlay_type)
    skills_fig = create_top_skills_chart(filtered_df)
    career_fig = create_career_level_chart(filtered_df)

    return map_fig, skills_fig, career_fig, total_jobs_display, top_dept_display, avg_salary_display

In [13]:

if __name__ == '__main__':
    app.run(debug=True)


`pl.count()` is deprecated. Please use `pl.len()` instead.


`pl.count()` is deprecated. Please use `pl.len()` instead.


`pl.count()` is deprecated. Please use `pl.len()` instead.


`pl.count()` is deprecated. Please use `pl.len()` instead.


`pl.count()` is deprecated. Please use `pl.len()` instead.


`pl.count()` is deprecated. Please use `pl.len()` instead.

