In [7]:
import os
import json
import requests
import pandas as pd
import re

# --------------------------------------------------
# Config
# --------------------------------------------------

MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"
PLAYERS_JSON = os.path.join(MAIN_DIR, "players.json")
GIRLS_PLAYERS_CSV = os.path.join(MAIN_DIR, "girls_players.csv")
SCHOOLS_CSV = os.path.join(MAIN_DIR, "schools.csv")

PLAYERS_URL = "https://api.v2.tennisreporting.com/players"


# --------------------------------------------------
# Helpers
# --------------------------------------------------

def clean_lastname(name: str) -> str:
    """Clean last names like '\"Lastname,\"' → 'Lastname'."""
    if pd.isna(name):
        return name
    s = str(name).strip()
    # Remove surrounding quotes
    s = s.strip('"\'')
    # Remove trailing commas
    s = s.rstrip(',')
    return s


def can_encode_utf8(value) -> bool:
    """Return False if the value cannot be encoded as UTF-8."""
    if pd.isna(value):
        return True
    try:
        str(value).encode("utf-8")
        return True
    except UnicodeEncodeError:
        return False


# --------------------------------------------------
# Main ETL
# --------------------------------------------------

def main():
    # 1) Fetch players JSON and store as players.json
    print("Downloading player data...")
    resp = requests.get(PLAYERS_URL)
    resp.raise_for_status()
    players_data = resp.json()

    print(f"Writing JSON to {PLAYERS_JSON} ...")
    with open(PLAYERS_JSON, "w", encoding="utf-8") as f:
        json.dump(players_data, f, ensure_ascii=False, indent=2)

    # 2) Read JSON into DataFrame
    print("Reading players.json into DataFrame...")
    with open(PLAYERS_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)

    df = pd.DataFrame(data)

    # 3) Load schools.csv and get valid schoolIDs
    print(f"Reading schools from {SCHOOLS_CSV} ...")
    schools = pd.read_csv(SCHOOLS_CSV, dtype={"SchoolID": "Int64"})
    valid_school_ids = set(schools["SchoolID"].dropna().astype(int))

    # 4) Clean last names
    print("Cleaning lastName values...")
    df["lastName"] = df["lastName"].apply(clean_lastname)

    # 5) Create fullname = firstname + ' ' + lastname
    print("Creating fullname field...")
    df["fullName"] = df["firstName"].fillna("").str.strip() + " " + df["lastName"].fillna("").str.strip()
    df["fullName"] = df["fullName"].str.strip()

    # 6) Filtering according to rules

    # 6.1) Only include players whose schoolId is present in schools.csv
    before = len(df)
    df = df[df["schoolId"].isin(valid_school_ids)]
    print(f"Filtered on schoolId in schools.csv: {before} -> {len(df)} rows")

    # 6.2) Only include records where genderId = 2
    before = len(df)
    df = df[df["genderId"] == 2]
    print(f"Filtered on genderId == 2: {before} -> {len(df)} rows")

    # 6.3) Exclude records where firstname=Default and lastname=Default
    before = len(df)
    mask_default = (df["firstName"] == "Default") & (df["lastName"] == "Default")
    df = df[~mask_default]
    print(f"Excluded Default/Default names: {before} -> {len(df)} rows")

    # 6.4) Exclude records where graduatedDate is NOT null and year < 2025
    #      Keep rows where graduatedDate is null OR year >= 2025
    before = len(df)
    grad_dates = pd.to_datetime(df["graduatedDate"], errors="coerce", utc=True)
    keep_mask = grad_dates.isna() | (grad_dates.dt.year >= 2025)
    df = df[keep_mask]
    print(f"Filtered on graduatedDate (exclude < 2025): {before} -> {len(df)} rows")

    # 7) Select and rename columns for output CSV
    #    Only keep: id, schoolID, firstname, lastname, grade, fullname
    print("Selecting and renaming columns for girls_players.csv...")
    df_out = df[["id", "schoolId", "firstName", "lastName", "grade", "fullName"]].copy()
    df_out = df_out.rename(
        columns={
            "id": "playerID",
            "schoolId": "schoolID",
            "firstName": "firstname",
            "lastName": "lastname",
            "fullName": "fullname",
        }
    )

    # 8) Write girls_players.csv
    print(f"Writing {GIRLS_PLAYERS_CSV} ...")
    df_out.to_csv(GIRLS_PLAYERS_CSV, index=False, encoding="utf-8")
    print("girls_players.csv written successfully.")

    # --------------------------------------------------
    # Data Quality Checks
    # --------------------------------------------------
    print("\n--- DATA QUALITY CHECKS ---")

    # 1) Confirm there are no duplicate values for playerID
    dup_mask = df_out["playerID"].duplicated(keep=False)
    dup_rows = df_out[dup_mask].sort_values("playerID")
    if dup_rows.empty:
        print("1) No duplicate playerID values found.")
    else:
        print("1) Duplicate playerID values found:")
        print(dup_rows[["playerID", "firstname", "lastname"]])

    # 2) Show all firstname values that have non-alphabetic characters
    #    (A-Z or a-z only)
    fname_nonalpha_mask = df_out["firstname"].astype(str).str.contains(r"[^A-Za-z]", na=False)
    bad_firstnames = df_out.loc[fname_nonalpha_mask, "firstname"].dropna().unique()
    print("\n2) Firstnames with non-alphabetic characters:")
    if len(bad_firstnames) == 0:
        print("   None found.")
    else:
        for name in sorted(bad_firstnames):
            print("  ", name)

    # 3) Show all lastname values that have non-alphabetic characters
    lname_nonalpha_mask = df_out["lastname"].astype(str).str.contains(r"[^A-Za-z]", na=False)
    bad_lastnames = df_out.loc[lname_nonalpha_mask, "lastname"].dropna().unique()
    print("\n3) Lastnames with non-alphabetic characters:")
    if len(bad_lastnames) == 0:
        print("   None found.")
    else:
        for name in sorted(bad_lastnames):
            print("  ", name)

    # 4) Check for values that will cause utf-8 encoding problems
    print("\n4) Checking for UTF-8 encoding issues...")
    problem_cells = []
    for col in df_out.columns:
        if df_out[col].dtype == "object":
            for idx, val in df_out[col].items():
                if not can_encode_utf8(val):
                    problem_cells.append((idx, col, repr(val)))

    if not problem_cells:
        print("   No UTF-8 encoding problems detected.")
    else:
        print("   Potential UTF-8 encoding problems found in the following cells:")
        for idx, col, val_repr in problem_cells:
            print(f"   Row {idx}, Column {col}, Value {val_repr}")

    # 5) Frequency report on first name and show 5 most common
    print("\n5) Top 5 most common firstnames:")
    freq_first = df_out["firstname"].value_counts().head(5)
    print(freq_first)


if __name__ == "__main__":
    main()


Downloading player data...
Writing JSON to C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\players.json ...
Reading players.json into DataFrame...
Reading schools from C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\schools.csv ...
Cleaning lastName values...
Creating fullname field...
Filtered on schoolId in schools.csv: 139612 -> 9614 rows
Filtered on genderId == 2: 9614 -> 5365 rows
Excluded Default/Default names: 5365 -> 5342 rows
Filtered on graduatedDate (exclude < 2025): 5342 -> 2928 rows
Selecting and renaming columns for girls_players.csv...
Writing C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\girls_players.csv ...
girls_players.csv written successfully.

--- DATA QUALITY CHECKS ---
1) No duplicate playerID values found.

2) Firstnames with non-alphabetic characters:
    Ana
    Ashley
   Abbigail 
   Abby 
   Abigail 
   Ada Elizabeth 
   Ada Gene
   Adalia 
   Addy 
   Adelyn 
   Adriana 
   Adrianne 
   Alejandra Deni