### Gather the team scores from the downloaded JSON files

In [1]:
import os
import json
import pandas as pd

# --------------------------------------------------
# Config
# --------------------------------------------------

# Root folder for this project; every other path below is built from it.
MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

# Folder of per-school JSON reports, CSV listing the report filenames,
# and the combined output CSV -- in that order.
REPORT_DIR, FILES_LIST, OUTPUT_CSV = (
    os.path.join(MAIN_DIR, leaf)
    for leaf in (
        "school_reports_2025_girls",
        "schools_with_json_files.csv",
        "all_team_matches.csv",
    )
)


def _first_school(entries):
    """Return the first school dict in *entries*, or ``{}`` if there is none."""
    if not entries:
        return {}
    # A null placeholder in the list is treated the same as a missing entry.
    return entries[0] or {}


def extract_meet_data(json_data):
    """Flatten the meets of one school report into one record per meet.

    Args:
        json_data: Parsed JSON of a single school report. Meets are read
            from its ``"meets"`` entry.

    Returns:
        A list of dicts, one per meet, with the keys ``team_match_id``,
        ``winning_team_name``, ``winning_team_id``, ``winning_team_win_cnt``,
        ``losing_team_name``, ``losing_team_id``, ``losing_team_win_cnt`` and
        ``match_date`` (the raw ``meetDateTime`` value, not yet parsed).
        Any value absent from the source is ``None``. An empty list is
        returned when ``"meets"`` is missing or null.

    Only the first entry of ``schools.winners`` and ``schools.losers`` is
    used; additional entries are ignored.
    """
    if "meets" not in json_data or json_data["meets"] is None:
        return []

    results = []
    for meet in json_data["meets"]:
        # A JSON null arrives as None, which bypasses the default argument
        # of dict.get(), so fall back explicitly instead of relying on it.
        schools = meet.get("schools") or {}
        winner = _first_school(schools.get("winners"))
        loser = _first_school(schools.get("losers"))

        results.append({
            "team_match_id": meet.get("id"),
            "winning_team_name": winner.get("name"),
            "winning_team_id": winner.get("id"),
            "winning_team_win_cnt": winner.get("score"),
            "losing_team_name": loser.get("name"),
            "losing_team_id": loser.get("id"),
            "losing_team_win_cnt": loser.get("score"),
            # Raw timestamp; conversion to a date happens downstream.
            "match_date": meet.get("meetDateTime"),
        })

    return results


def main():
    """Combine every listed school report into one de-duplicated match CSV.

    Reads the ``filename`` column of ``FILES_LIST``, loads each named JSON
    file from ``REPORT_DIR``, collects the records produced by
    ``extract_meet_data``, converts ``match_date`` to a calendar date, drops
    fully identical rows and writes the result to ``OUTPUT_CSV``.

    Files that are missing or cannot be read/parsed are reported on stdout
    and skipped. If no meets are found at all, a header-only CSV is written.
    """
    print(f"Loading list of school JSON files from {FILES_LIST}")
    df_files = pd.read_csv(FILES_LIST)

    all_meets = []

    for filename in df_files["filename"]:
        full_path = os.path.join(REPORT_DIR, filename)

        if not os.path.exists(full_path):
            print(f"  WARNING: Missing file: {full_path}")
            continue

        try:
            with open(full_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError; OSError covers filesystem failures.
            print(f"  ERROR reading {full_path}: {e}")
            continue

        all_meets.extend(extract_meet_data(data))

    if all_meets:
        df_all = pd.DataFrame(all_meets)
    else:
        # A DataFrame built from an empty list has no columns, so the
        # match_date conversion below would raise KeyError. Build the
        # expected (empty) columns instead so a header-only CSV is written.
        print("  WARNING: No meets found in any file")
        df_all = pd.DataFrame(columns=[
            "team_match_id",
            "winning_team_name",
            "winning_team_id",
            "winning_team_win_cnt",
            "losing_team_name",
            "losing_team_id",
            "losing_team_win_cnt",
            "match_date",
        ])

    # ------------------------------------------------------
    # Convert match_date timestamp → date
    # (unparseable timestamps become NaT rather than raising)
    # ------------------------------------------------------
    df_all["match_date"] = pd.to_datetime(df_all["match_date"], errors="coerce").dt.date

    # Deduplicate full match records. Only rows identical in every column
    # are dropped -- presumably the same meet shows up in the report of each
    # participating school (TODO confirm).
    before = len(df_all)
    df_all = df_all.drop_duplicates()
    after = len(df_all)
    print(f"Deduped matches: {before} → {after}")

    df_all.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"Saved all team matches to: {OUTPUT_CSV}")


# Run the full gather-and-export pipeline when this cell/script is executed
# directly; nothing runs if the code is imported as a module.
if __name__ == "__main__":
    main()


Loading list of school JSON files from C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\schools_with_json_files.csv
Deduped matches: 2120 → 934
Saved all team matches to: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\all_team_matches.csv


### Check ten top/bottom rows for expected data

In [2]:
# Reload the combined match file from disk so the preview reflects exactly
# what was written, then show both ends of it.
df = pd.read_csv(OUTPUT_CSV)

for label, preview in (
    ("TOP 10 RECORDS", df.head(10)),
    ("LAST 10 RECORDS", df.tail(10)),
):
    print(f"\n=== {label} ===")
    print(preview.to_string(index=False))



=== TOP 10 RECORDS ===
 team_match_id winning_team_name  winning_team_id  winning_team_win_cnt      losing_team_name  losing_team_id  losing_team_win_cnt match_date
        199638   Marist Catholic           124728                     7  St Mary's of Medford        124823.0                  7.0 2025-05-22
        199636             Baker            74728                     4             Riverdale         75619.0                  3.0 2025-05-22
        199467              Vale            74596                     4             Pendleton         75454.0                  2.0 2025-05-15
        199176             Baker            74728                    12                  Vale         74596.0                  8.0 2025-05-15
        198945              Vale            74596                     2                 Nyssa         75412.0                  1.0 2025-05-13
        197883              Vale            74596                     6 LA GRANDE HIGH SCHOOL         75227.0               

### Run some data quality checks
##### A match for St. Helen's (team_match_id 193353) has been found with no losing team - keep the record

In [4]:
from function_scripts.data_quality import check_csv

# Column that should identify each match exactly once.
key_fields = ["team_match_id"]

# Columns that are reported if they contain any null values.
required_fields = [
    "team_match_id",
    "winning_team_id",
    "losing_team_id",
    "winning_team_win_cnt",
    "losing_team_win_cnt",
    "match_date",
]

results = check_csv(
    csv_path=OUTPUT_CSV,
    unique_fields=key_fields,
    null_fields=required_fields,
)


===== DATA QUALITY CHECKS =====

Reading CSV: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\all_team_matches.csv
Loaded 934 rows.

Checking for duplicate values...

✅ No duplicates in 'team_match_id'

---------------------------------------

Checking for null values...

✅ No null values in 'team_match_id'
✅ No null values in 'winning_team_id'
❌ Null values found in 'losing_team_id': 1 rows
✅ No null values in 'winning_team_win_cnt'
❌ Null values found in 'losing_team_win_cnt': 1 rows
✅ No null values in 'match_date'

---------------------------------------

No whitespace checks requested.

