# Scholarship Cleaning Code

In [1]:
import html
import json
import os
import re
import unicodedata

import pandas as pd

## Define Cleaning Functions

In [3]:
def build_scholarship_text(row):
    """Assemble one labelled text block from a scholarship record.

    The fields listed below are visited in a fixed order. Every field that is
    present and non-null in ``row`` contributes a section made of the field
    name, a colon, a line break and the value converted with ``str``.
    Sections are joined with a single line break; a record with none of the
    fields yields an empty string.
    """
    field_order = ("Scholarship Name", "Eligibility", "Benefits", "Provider")
    sections = [
        field + ":\n" + str(row[field])
        for field in field_order
        if pd.notnull(row.get(field))
    ]
    return "\n".join(sections)

def clean_text(text):
    """General text cleaning for scholarship descriptions.

    Steps, in order:
      1. strip HTML tags and decode HTML entities;
      2. apply Unicode NFKD normalisation and drop zero-width characters;
      3. remove e-mail addresses, phone-number-like digit runs and URLs;
      4. turn bullet glyphs into "- " and en/em dashes into "-";
      5. collapse tabs and repeated spaces, trim every line, and squeeze
         runs of blank lines down to a single blank line.

    Args:
        text: Raw text. Anything that is not a ``str`` is treated as empty.

    Returns:
        The cleaned text with no leading or trailing whitespace, or ``""``
        when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Markup goes first. The e-mail and URL patterns below end in \S+, so on
    # input such as <a href="https://x.org">here</a> they would swallow the
    # closing '">', the link text and '</a>', leaving a dangling '<a href="'.
    text = re.sub(r"<[^>]+>", " ", text)                          # HTML tags
    # Decode entities instead of blanking them: "&amp;" keeps its "&", and
    # numeric forms such as "&#39;" are handled too. Doing this before the
    # normalisation lets NFKD fold a decoded "&nbsp;" into a plain space.
    text = html.unescape(text)                                    # HTML entities

    # NFKD is a compatibility *decomposition*: accented letters come out as
    # base letter + combining mark.
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r"[\u200b\u200c\u200d\u2060\ufeff]", "", text)  # zero-width chars
    text = re.sub(r"\S+@\S+", " ", text)                          # emails
    # NOTE(review): this heuristic also matches other long digit runs, e.g. an
    # ISO date like 2026-03-15 - confirm that is acceptable for this data.
    text = re.sub(r"\+?\d[\d\-\s\(\)]{7,}\d", " ", text)          # phone numbers
    text = re.sub(r"(https?:\/\/\S+|www\.\S+)", " ", text)        # URLs
    text = re.sub(r"&[a-z]+;", " ", text)                         # entities unescape() did not recognise
    text = re.sub(r"[•●▪■◆▶►▸⦿⦾]", "- ", text)                   # bullets
    text = text.replace("–", "-").replace("—", "-")
    text = text.replace("\t", " ")
    text = re.sub(r" {2,}", " ", text)

    kept = []
    for line in (raw.strip() for raw in text.split("\n")):
        # A blank line is kept only directly after a non-blank one, so the
        # joined result can never contain more than one blank line in a row.
        if line or (kept and kept[-1]):
            kept.append(line)
    return "\n".join(kept).strip()

## Load Raw Scholarship Data

In [8]:
def load_scholarship_data(workspace_path="../../workspace/Data"):
    """Load ``scholarships.json`` from ``workspace_path`` into a DataFrame.

    Args:
        workspace_path: Directory that is expected to contain
            ``scholarships.json``.

    Returns:
        A ``pandas.DataFrame`` built from the parsed JSON content, or ``None``
        (after printing a message) when the file does not exist.
    """
    json_path = os.path.join(workspace_path, "scholarships.json")
    if not os.path.exists(json_path):
        print("❌ scholarships.json not found")
        return None
    # Decode explicitly as UTF-8 rather than with the platform default, so
    # non-ASCII names read the same on every OS and match the UTF-8 encoding
    # used when the cleaned file is written.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Load the raw records; `df` is None when scholarships.json is missing.
df = load_scholarship_data()
# Preview only when loading succeeded, using the same `df is not None` guard
# as the cleaning and saving cells, so a missing file does not raise
# AttributeError here.
df.head() if df is not None else None

Unnamed: 0,Scholarship Name,Provider,Eligibility,Deadline,Benefits,Link
0,African Excellence Award,University of Glasgow,African students applying for MSc Data Science,2026-03-15,Full tuition + living stipend,https://www.gla.ac.uk/scholarships/african-exc...
1,Global Leaders Scholarship,University of Oxford,International students with leadership experience,2026-01-30,Partial tuition + mentorship program,https://www.ox.ac.uk/scholarships/global-leaders
2,Women in Tech Fellowship,MIT,Female students pursuing Computer Science or E...,2026-02-20,Full tuition + research grant,https://www.mit.edu/scholarships/women-in-tech
3,Climate Action Scholarship,ETH Zurich,Students working on climate-resilient agricult...,2026-04-10,Full tuition + living stipend + research funding,https://ethz.ch/scholarships/climate-action


## Apply Cleaning

In [9]:
if df is not None:
    # Combine the descriptive fields of every record into one labelled text block
    df["scholarship_text_raw"] = df.apply(build_scholarship_text, axis=1)

    # Clean the combined text
    df["scholarship_text_cleaned"] = df["scholarship_text_raw"].apply(clean_text)

    # Deduplicate on the cleaned text, keeping the first occurrence.
    # reset_index(drop=True) returns a fresh frame with a contiguous 0..n-1
    # index: the row index then agrees with scholarship_id below, and the
    # column assignment is not made on the direct result of a row selection
    # (which can raise SettingWithCopyWarning when rows were dropped).
    initial_count = len(df)
    df = (
        df.drop_duplicates(subset=["scholarship_text_cleaned"], keep="first")
        .reset_index(drop=True)
    )
    print(f"Deduplicated: reduced from {initial_count} to {len(df)} scholarships")

    # Add scholarship_id after deduplication so the ids are contiguous
    df["scholarship_id"] = range(len(df))

    # Preview
    print(df.head())

Deduplicated: reduced from 4 to 4 scholarships
             Scholarship Name               Provider  \
0    African Excellence Award  University of Glasgow   
1  Global Leaders Scholarship   University of Oxford   
2    Women in Tech Fellowship                    MIT   
3  Climate Action Scholarship             ETH Zurich   

                                         Eligibility    Deadline  \
0     African students applying for MSc Data Science  2026-03-15   
1  International students with leadership experience  2026-01-30   
2  Female students pursuing Computer Science or E...  2026-02-20   
3  Students working on climate-resilient agricult...  2026-04-10   

                                           Benefits  \
0                     Full tuition + living stipend   
1              Partial tuition + mentorship program   
2                     Full tuition + research grant   
3  Full tuition + living stipend + research funding   

                                                Link  \

## Save Cleaned Data

In [11]:
if df is not None:
    save_path = "../../workspace/Data/cleaned_scholarships.json"
    # A missing value shows up in the records as float NaN, which json.dump
    # would write as a bare NaN token - not valid JSON. Map it to None so it
    # is written as null. NaN is the only float that differs from itself.
    json_data = [
        {
            key: (None if (isinstance(value, float) and value != value) else value)
            for key, value in record.items()
        }
        for record in df.to_dict("records")
    ]
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {len(df)} cleaned scholarships to {save_path}")

✅ Saved 4 cleaned scholarships to ../../workspace/Data/cleaned_scholarships.json
