In [9]:
# Core libraries
import pandas as pd
import numpy as np
import os
import re
import json
from typing import List, Dict, Optional, Tuple, Any
from pathlib import Path
import plotly.express as px
from collections import Counter



## Load Data

In [2]:
def load_scholarship_data(workspace_path="workspace/Data"):
    """Load scholarship records from ``scholarships.json`` into a DataFrame.

    Parameters
    ----------
    workspace_path : str or os.PathLike, default "workspace/Data"
        Directory expected to contain ``scholarships.json``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed JSON wrapped in a DataFrame, or ``None`` (after printing a
        notice) when the file does not exist.
    """
    json_path = os.path.join(workspace_path, "scholarships.json")
    if not os.path.exists(json_path):
        # Report the directory that was actually searched rather than always
        # naming the default location.
        searched = str(workspace_path)
        if not searched.endswith(("/", os.sep)):
            searched += "/"
        print(f"❌ No scholarships.json found in {searched}")
        return None

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Read the scholarship records; `df` is None when scholarships.json is missing.
df = load_scholarship_data()

# Preview the first rows only when a DataFrame actually came back.
if isinstance(df, pd.DataFrame):
    display(df.head())

Unnamed: 0,Scholarship Name,Provider,Eligibility,Deadline,Benefits,Link
0,African Excellence Award,University of Glasgow,African students applying for MSc Data Science,2026-03-15,Full tuition + living stipend,https://www.gla.ac.uk/scholarships/african-exc...
1,Global Leaders Scholarship,University of Oxford,International students with leadership experience,2026-01-30,Partial tuition + mentorship program,https://www.ox.ac.uk/scholarships/global-leaders
2,Women in Tech Fellowship,MIT,Female students pursuing Computer Science or E...,2026-02-20,Full tuition + research grant,https://www.mit.edu/scholarships/women-in-tech
3,Climate Action Scholarship,ETH Zurich,Students working on climate-resilient agricult...,2026-04-10,Full tuition + living stipend + research funding,https://ethz.ch/scholarships/climate-action


## Key Metrics

In [3]:
# Headline counts for the loaded scholarship table.
# `load_scholarship_data` returns None when the JSON file is missing, so the
# summary is skipped in that case instead of failing on `len(None)`.
if df is not None:
    print("✅ Total Scholarships:", len(df))

    # Each metric is printed only when its column exists in the loaded data.
    if "Provider" in df.columns:
        print("✅ Providers:", df["Provider"].nunique())
    if "Scholarship Name" in df.columns:
        print("✅ Scholarship Programs:", df["Scholarship Name"].nunique())
    if "Eligibility" in df.columns:
        avg_length = df["Eligibility"].str.len().mean()
        # The mean is NaN when no row has eligibility text, and round(NaN)
        # raises ValueError, so print only when there is a real number.
        if pd.notna(avg_length):
            print("✅ Avg Eligibility Text Length:", round(avg_length), "chars")

✅ Total Scholarships: 4
✅ Providers: 4
✅ Scholarship Programs: 4
✅ Avg Eligibility Text Length: 54 chars


## Scholarship Overview

### Deadline Timeline

In [5]:
# Plot how many scholarships close on each date.
# Skipped when no data was loaded (`df` is None) or there is no "Deadline" column.
if df is not None and "Deadline" in df.columns:
    # Unparseable deadlines become NaT (errors="coerce"); value_counts() drops
    # missing values by default, so they do not appear on the chart.
    df["Deadline"] = pd.to_datetime(df["Deadline"], errors="coerce")
    deadline_counts = df["Deadline"].value_counts().sort_index()

    fig = px.line(
        x=deadline_counts.index,
        y=deadline_counts.values,
        labels={"x": "Deadline", "y": "Number of Scholarships"},
        title="Scholarship Deadlines Over Time",
    )
    # A lines-only trace draws nothing when there is a single deadline date;
    # markers keep every date visible.
    fig.update_traces(mode="lines+markers")
    fig.show()

### Top Providers

In [6]:
# Horizontal bar chart of the providers with the most scholarships.
# Skipped when no data was loaded (`df` is None) or there is no "Provider" column.
if df is not None and "Provider" in df.columns:
    # value_counts() sorts by frequency, so head(10) keeps the ten most common.
    top_providers = df["Provider"].value_counts().head(10)

    fig = px.bar(
        x=top_providers.values,
        y=top_providers.index,
        orientation="h",
        labels={"x": "Number of Scholarships", "y": "Provider"},
        title="Top Scholarship Providers",
    )
    # Order the category axis by bar total.
    fig.update_layout(yaxis={"categoryorder": "total ascending"})
    fig.show()

## Eligibility Analysis

In [7]:
# Summarise how long the eligibility descriptions are.
# Skipped when no data was loaded (`df` is None) or there is no "Eligibility" column.
if df is not None and "Eligibility" in df.columns:
    # Character length and whitespace-separated word count of each description,
    # stored as new columns on `df`.
    df["Eligibility Length"] = df["Eligibility"].str.len()
    df["Eligibility Word Count"] = df["Eligibility"].str.split().str.len()

    # The printed statistics are about character length, not word count.
    print("Average Eligibility Length:", df["Eligibility Length"].mean())
    print("Median Eligibility Length:", df["Eligibility Length"].median())
    print("Max Eligibility Length:", df["Eligibility Length"].max())

    fig = px.histogram(
        df,
        x="Eligibility Word Count",
        nbins=20,
        title="Distribution of Eligibility Word Counts",
    )
    fig.show()

Average Eligibility Length: 54.5
Median Eligibility Length: 52.5
Max Eligibility Length: 67


## Benefits Analysis

In [10]:
# Count how often each individual benefit appears across scholarships.
# Skipped when no data was loaded (`df` is None) or there is no "Benefits" column.
if df is not None and "Benefits" in df.columns:
    # Each string cell is split on "+" into raw fragments; missing and
    # non-string cells are ignored.
    all_benefits = [
        fragment
        for benefits in df["Benefits"].dropna()
        if isinstance(benefits, str)
        for fragment in benefits.split("+")
    ]

    # Trim surrounding whitespace and drop empty fragments (left by a leading,
    # trailing or doubled "+") so they are not counted as a benefit.
    benefit_counts = Counter(
        name for name in (fragment.strip() for fragment in all_benefits) if name
    )
    benefit_df = pd.DataFrame.from_dict(benefit_counts, orient="index", columns=["Count"])
    benefit_df = benefit_df.sort_values("Count", ascending=False)

    # Expose the benefit names as a real "Benefit" column so the axis title
    # comes from the column name instead of relying on `labels` keys that do
    # not match the plotted columns.
    fig = px.bar(
        benefit_df.rename_axis("Benefit").reset_index(),
        x="Benefit",
        y="Count",
        title="Top Scholarship Benefits",
    )
    fig.update_layout(xaxis_tickangle=-45)
    fig.show()

✓ Loaded 14,760 job postings
Columns: ['job_id', 'job_text_cleaned']

First few rows:
   job_id                                   job_text_cleaned
0       0  Job Title:\r\nDigital Marketing Specialist\r\n...
1       1  Job Title:\r\nWeb Developer\r\nResponsibilitie...
2       2  Job Title:\r\nOperations Manager\r\nResponsibi...
