In [9]:
import os
import json
import pandas as pd

# --------------------------------------------------
# Config
# --------------------------------------------------

MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"
REPORT_DIR = os.path.join(MAIN_DIR, "school_reports_2025_girls")
FILES_LIST = os.path.join(MAIN_DIR, "schools_with_json_files.csv")
OUTPUT_CSV = os.path.join(MAIN_DIR, "singles_matches_raw.csv")


# --------------------------------------------------
# Extract Singles Matches (safe version)
# --------------------------------------------------

def extract_singles_from_meet(meet):

    # Protect against null or malformed meet objects
    if meet is None or not isinstance(meet, dict):
        return []

    results = []

    team_match_id = meet.get("id")

    # Convert match date safely
    match_date_raw = meet.get("meetDateTime")
    try:
        match_date = pd.to_datetime(match_date_raw).date()
    except:
        match_date = None

    matches_obj = meet.get("matches", {})
    if matches_obj is None or not isinstance(matches_obj, dict):
        return []

    singles_list = matches_obj.get("Singles", [])
    if singles_list is None or not isinstance(singles_list, list):
        singles_list = []

    # ------------------------------------------------------
    # Loop through each singles match
    # ------------------------------------------------------
    for match in singles_list:

        match_type = "Singles"
        match_flight = match.get("flight")
        singles_match_id = match.get("id")

        match_teams = match.get("matchTeams", [])
        if match_teams is None or not isinstance(match_teams, list):
            match_teams = []

        # ==============================================================
        #                     PLAYER 0
        # ==============================================================
        try:
            p0_team = match_teams[0]
            p0_id_in_match = p0_team.get("id")
            p0_is_winner = p0_team.get("isWinner")

            p0_players = p0_team.get("players", [])
            if p0_players is None:
                p0_players = []

            p0_real_id = p0_players[0].get("id") if len(p0_players) > 0 else None

        except:
            p0_id_in_match = None
            p0_is_winner = None
            p0_real_id = None

        # ==============================================================
        #                     PLAYER 1
        # ==============================================================
        try:
            p1_team = match_teams[1]
            p1_id_in_match = p1_team.get("id")

            p1_players = p1_team.get("players", [])
            if p1_players is None:
                p1_players = []

            p1_real_id = p1_players[0].get("id") if len(p1_players) > 0 else None

        except:
            p1_id_in_match = None
            p1_real_id = None

        # ==============================================================
        #                     SET SCORES
        # ==============================================================
        sets = match.get("sets", [])
        if sets is None or not isinstance(sets, list):
            sets = []

        set_scores = [None, None, None]  # set1, set2, set3

        p0_key = str(p0_id_in_match) if p0_id_in_match is not None else None
        p1_key = str(p1_id_in_match) if p1_id_in_match is not None else None

        for idx, s in enumerate(sets):
            if idx > 2:
                break
            try:
                p0_score = s.get(p0_key)
                p1_score = s.get(p1_key)
                if p0_score is not None and p1_score is not None:
                    set_scores[idx] = f"{p0_score}*{p1_score}"
            except:
                pass

        # ==============================================================
        #                     BUILD RESULT ROW
        # ==============================================================
        results.append({
            "team_match_id": team_match_id,
            "match_date": match_date,
            "match_type": match_type,
            "match_flight": match_flight,
            "singles_match_id": singles_match_id,
            "singles_player0_id_in_match": p0_id_in_match,
            "singles_player0_is_winner": p0_is_winner,
            "singles_player0_real_id": p0_real_id,
            "singles_player1_id_in_match": p1_id_in_match,
            "singles_player1_real_id": p1_real_id,
            "set1_score": set_scores[0],
            "set2_score": set_scores[1],
            "set3_score": set_scores[2]
        })

    return results


# --------------------------------------------------
# Main program (safe version)
# --------------------------------------------------

def main():
    df_files = pd.read_csv(FILES_LIST)

    # Full run
    filenames = df_files["filename"].tolist()

    all_rows = []

    for filename in filenames:
        full_path = os.path.join(REPORT_DIR, filename)

        if not os.path.exists(full_path):
            print(f"Missing file: {full_path}")
            continue

        # Load JSON safely
        try:
            with open(full_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            print(f"Error loading JSON {filename}: {e}")
            continue

        # ------------------------------------------------------
        # FIX: Normalize meets into a list
        # ------------------------------------------------------
        meets = data.get("meets")
        if not isinstance(meets, list):
            meets = []

        for meet in meets:
            rows = extract_singles_from_meet(meet)
            if rows:
                all_rows.extend(rows)

    df_out = pd.DataFrame(all_rows)
    df_out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

    print("\nSingles match extraction complete.")
    print(f"Output written to: {OUTPUT_CSV}")
    print(f"Total singles matches extracted: {len(df_out)}")


if __name__ == "__main__":
    main()


Singles match extraction complete.
Output written to: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\singles_matches_raw.csv
Total singles matches extracted: 14051


In [7]:
MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"
REPORT_DIR = os.path.join(MAIN_DIR, "school_reports_2025_girls")
FILES_LIST = os.path.join(MAIN_DIR, "schools_with_json_files.csv")
OUTPUT_CSV = os.path.join(MAIN_DIR, "doubles_matches_raw.csv")


# --------------------------------------------------
# Extract Doubles Matches (with set scores)
# --------------------------------------------------

def extract_doubles_from_meet(meet):

    # Protect against malformed meet entries
    if meet is None or not isinstance(meet, dict):
        return []

    results = []

    team_match_id = meet.get("id")

    # Convert match date safely
    match_date_raw = meet.get("meetDateTime")
    try:
        match_date = pd.to_datetime(match_date_raw).date()
    except:
        match_date = None

    matches_obj = meet.get("matches", {})
    if matches_obj is None or not isinstance(matches_obj, dict):
        return []

    doubles_list = matches_obj.get("Doubles", [])
    if doubles_list is None or not isinstance(doubles_list, list):
        doubles_list = []

    for match in doubles_list:

        match_type = "Doubles"
        match_flight = match.get("flight")
        doubles_match_id = match.get("id")

        match_teams = match.get("matchTeams", [])
        if match_teams is None or not isinstance(match_teams, list):
            match_teams = []

        # ==============================================================
        #             TEAM 0  (Players 1 & 2)
        # ==============================================================

        try:
            t0_team = match_teams[0]
            t0_id_in_match = t0_team.get("id")
            t0_is_winner = t0_team.get("isWinner")

            t0_players = t0_team.get("players", [])
            if t0_players is None:
                t0_players = []

            t0_p1_real = t0_players[0].get("id") if len(t0_players) > 0 else None
            t0_p2_real = t0_players[1].get("id") if len(t0_players) > 1 else None

        except:
            t0_id_in_match = None
            t0_is_winner = None
            t0_p1_real = None
            t0_p2_real = None

        # ==============================================================
        #             TEAM 1  (Players 3 & 4)
        # ==============================================================

        try:
            t1_team = match_teams[1]
            t1_id_in_match = t1_team.get("id")

            t1_players = t1_team.get("players", [])
            if t1_players is None:
                t1_players = []

            t1_p1_real = t1_players[0].get("id") if len(t1_players) > 0 else None
            t1_p2_real = t1_players[1].get("id") if len(t1_players) > 1 else None

        except:
            t1_id_in_match = None
            t1_p1_real = None
            t1_p2_real = None

        # ==============================================================
        #                     SET SCORES
        # ==============================================================

        sets = match.get("sets", [])
        if sets is None or not isinstance(sets, list):
            sets = []

        set_scores = [None, None, None]

        t0_key = str(t0_id_in_match) if t0_id_in_match is not None else None
        t1_key = str(t1_id_in_match) if t1_id_in_match is not None else None

        for idx, s in enumerate(sets):
            if idx > 2:
                break
            try:
                t0_score = s.get(t0_key)
                t1_score = s.get(t1_key)

                if t0_score is not None and t1_score is not None:
                    set_scores[idx] = f"{t0_score}*{t1_score}"
            except:
                pass

        # ==============================================================

        results.append({
            "team_match_id": team_match_id,
            "match_date": match_date,
            "match_type": match_type,
            "match_flight": match_flight,
            "doubles_match_id": doubles_match_id,

            # Team 0 player info
            "doubles_team0_id_in_match": t0_id_in_match,
            "doubles_team0_is_winner": t0_is_winner,
            "doubles_team0_player1_real_id": t0_p1_real,
            "doubles_team0_player2_real_id": t0_p2_real,

            # Team 1 player info
            "doubles_team1_id_in_match": t1_id_in_match,
            "doubles_team1_player1_real_id": t1_p1_real,
            "doubles_team1_player2_real_id": t1_p2_real,

            # Set scores
            "set1_score": set_scores[0],
            "set2_score": set_scores[1],
            "set3_score": set_scores[2]
        })

    return results


# --------------------------------------------------
# Main Program
# --------------------------------------------------

def main():
    print(f"Loading file list from: {FILES_LIST}")
    df_files = pd.read_csv(FILES_LIST)

    # Process ALL files (remove .head(5) for full run)
    file_list = df_files["filename"].tolist()

    all_rows = []

    for filename in file_list:
        full_path = os.path.join(REPORT_DIR, filename)

        if not os.path.exists(full_path):
            print(f"Missing file: {full_path}")
            continue

        try:
            with open(full_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            print(f"Error reading {filename}: {e}")
            continue

        # -----------------------
        # FIX: normalize meets
        # -----------------------
        meets = data.get("meets")
        if not isinstance(meets, list):
            meets = []

        for meet in meets:
            rows = extract_doubles_from_meet(meet)
            if rows:
                all_rows.extend(rows)

    df_out = pd.DataFrame(all_rows)
    df_out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

    print("\nDoubles match extraction complete.")
    print(f"File written: {OUTPUT_CSV}")
    print(f"Total doubles matches extracted: {len(df_out)}")


if __name__ == "__main__":
    main()

Loading file list from: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\schools_with_json_files.csv

Doubles match extraction complete.
File written: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\doubles_matches_raw.csv
Total doubles matches extracted: 13663


In [10]:
import os
import pandas as pd

# --------------------------------------------------
# Config
# --------------------------------------------------

MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"
INPUT_FILE = os.path.join(MAIN_DIR, "singles_matches_raw.csv")
OUTPUT_FILE = os.path.join(MAIN_DIR, "singles_matches_cleaner.csv")


# --------------------------------------------------
# Helper to reverse scores "3*6" → "6*3"
# --------------------------------------------------
def reverse_score(score):
    if pd.isna(score) or score is None:
        return None
    try:
        a, b = score.split("*")
        return f"{b}*{a}"
    except:
        return None


# --------------------------------------------------
# Main cleaning process
# --------------------------------------------------

def main():

    df = pd.read_csv(INPUT_FILE)

    # --------------------------------------------------
    # Create new field: flight = S + match_flight
    # --------------------------------------------------
    df["flight"] = "S" + df["match_flight"].astype(str)

    # --------------------------------------------------
    # Create winner1 and loser1
    # --------------------------------------------------
    df["winner1"] = df.apply(
        lambda row: row["singles_player0_real_id"]
        if row["singles_player0_is_winner"] == True
        else row["singles_player1_real_id"],
        axis=1
    )

    df["loser1"] = df.apply(
        lambda row: row["singles_player0_real_id"]
        if row["singles_player0_is_winner"] == False
        else row["singles_player1_real_id"],
        axis=1
    )

    # --------------------------------------------------
    # Build set1, set2, set3 (with reversing)
    # --------------------------------------------------
    def choose_score(row, col):
        """Return the correct orientation of the set score."""
        score = row[col]
        if row["singles_player0_is_winner"] == True:
            return score
        else:
            return reverse_score(score)

    df["set1"] = df.apply(lambda r: choose_score(r, "set1_score"), axis=1)
    df["set2"] = df.apply(lambda r: choose_score(r, "set2_score"), axis=1)
    df["set3"] = df.apply(lambda r: choose_score(r, "set3_score"), axis=1)

    # --------------------------------------------------
    # Drop unwanted columns
    # --------------------------------------------------
    df = df.drop(columns=[
        "match_type",
        "singles_player0_id_in_match",
        "singles_player1_id_in_match",
        "singles_player0_real_id",
        "singles_player1_real_id",
        "set1_score",
        "set2_score",
        "set3_score",
        "singles_player0_is_winner",
        "match_flight"
    ], errors="ignore")

    # --------------------------------------------------
    # Drop records where set1_score == '2*0'
    # --------------------------------------------------
    df = df[df["set1"] != "2*0"]

    # --------------------------------------------------
    # Drop records where all three sets are null
    # --------------------------------------------------
    df = df[~(df["set1"].isna() & df["set2"].isna() & df["set3"].isna())]

    # --------------------------------------------------
    # Shift sets upward if earlier ones are blank
    # --------------------------------------------------
    def shift_sets(row):
        sets = [row["set1"], row["set2"], row["set3"]]
        # Remove None values but keep order
        non_null = [s for s in sets if pd.notna(s)]

        # Pad with None back to length 3
        non_null += [None] * (3 - len(non_null))

        return pd.Series({
            "set1": non_null[0],
            "set2": non_null[1],
            "set3": non_null[2]
        })

    df[["set1", "set2", "set3"]] = df.apply(shift_sets, axis=1)

    # --------------------------------------------------
    # Replace any remaining nulls with "0*0"
    # --------------------------------------------------
    df["set1"] = df["set1"].fillna("0*0")
    df["set2"] = df["set2"].fillna("0*0")
    df["set3"] = df["set3"].fillna("0*0")

    # --------------------------------------------------
    # Write output
    # --------------------------------------------------
    df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
    print(f"Cleaned singles matches written to: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()

Cleaned singles matches written to: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\singles_matches_cleaner.csv


In [8]:
MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"
INPUT_FILE = os.path.join(MAIN_DIR, "doubles_matches_raw.csv")
OUTPUT_FILE = os.path.join(MAIN_DIR, "doubles_matches_cleaner.csv")


# --------------------------------------------------
# Helper to reverse scores "3*6" → "6*3"
# --------------------------------------------------
def reverse_score(score):
    if pd.isna(score) or score is None:
        return None
    try:
        a, b = score.split("*")
        return f"{b}*{a}"
    except:
        return None


# --------------------------------------------------
# Main cleaning process
# --------------------------------------------------

def main():

    df = pd.read_csv(INPUT_FILE)

    # --------------------------------------------------
    # Create new field: flight = D + match_flight
    # --------------------------------------------------
    df["flight"] = "D" + df["match_flight"].astype(str)

    # --------------------------------------------------
    # Create winners & losers
    # team0 is winner if doubles_team0_is_winner == TRUE
    # --------------------------------------------------

    df["winner1"] = df.apply(
        lambda r: r["doubles_team0_player1_real_id"]
        if r["doubles_team0_is_winner"] == True
        else r["doubles_team1_player1_real_id"],
        axis=1
    )

    df["winner2"] = df.apply(
        lambda r: r["doubles_team0_player2_real_id"]
        if r["doubles_team0_is_winner"] == True
        else r["doubles_team1_player2_real_id"],
        axis=1
    )

    df["loser1"] = df.apply(
        lambda r: r["doubles_team0_player1_real_id"]
        if r["doubles_team0_is_winner"] == False
        else r["doubles_team1_player1_real_id"],
        axis=1
    )

    df["loser2"] = df.apply(
        lambda r: r["doubles_team0_player2_real_id"]
        if r["doubles_team0_is_winner"] == False
        else r["doubles_team1_player2_real_id"],
        axis=1
    )

    # --------------------------------------------------
    # Build set1, set2, set3 (with reversing)
    # --------------------------------------------------
    def choose_score(row, col):
        score = row[col]
        if row["doubles_team0_is_winner"] == True:
            return score
        else:
            return reverse_score(score)

    df["set1"] = df.apply(lambda r: choose_score(r, "set1_score"), axis=1)
    df["set2"] = df.apply(lambda r: choose_score(r, "set2_score"), axis=1)
    df["set3"] = df.apply(lambda r: choose_score(r, "set3_score"), axis=1)

    # --------------------------------------------------
    # Drop unwanted fields
    # --------------------------------------------------
    df = df.drop(columns=[
        "match_type",
        "doubles_team0_id_in_match",
        "doubles_team1_id_in_match",
        "doubles_team0_player1_real_id",
        "doubles_team0_player2_real_id",
        "doubles_team1_player1_real_id",
        "doubles_team1_player2_real_id",
        "set1_score",
        "set2_score",
        "set3_score",
        "doubles_team0_is_winner",
        "match_flight"
    ], errors="ignore")

    # --------------------------------------------------
    # Drop records where set1 = “2*0”
    # --------------------------------------------------
    df = df[df["set1"] != "2*0"]

    # --------------------------------------------------
    # Drop records where all 3 sets are null
    # --------------------------------------------------
    df = df[~(df["set1"].isna() & df["set2"].isna() & df["set3"].isna())]

    # --------------------------------------------------
    # Shift sets upward if needed
    # --------------------------------------------------
    def shift_sets(row):
        sets = [row["set1"], row["set2"], row["set3"]]
        non_null = [s for s in sets if pd.notna(s)]
        non_null += [None] * (3 - len(non_null))
        return pd.Series({
            "set1": non_null[0],
            "set2": non_null[1],
            "set3": non_null[2]
        })

    df[["set1", "set2", "set3"]] = df.apply(shift_sets, axis=1)

    # --------------------------------------------------
    # Replace any remaining nulls with “0*0”
    # --------------------------------------------------
    df["set1"] = df["set1"].fillna("0*0")
    df["set2"] = df["set2"].fillna("0*0")
    df["set3"] = df["set3"].fillna("0*0")

    # --------------------------------------------------
    # Save output
    # --------------------------------------------------
    df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
    print(f"Doubles matches cleaned → {OUTPUT_FILE}")


if __name__ == "__main__":
    main()

Doubles matches cleaned → C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\doubles_matches_cleaner.csv


In [20]:
MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

SINGLES_INPUT  = MAIN_DIR + r"\singles_matches_cleaner.csv"
DOUBLES_INPUT  = MAIN_DIR + r"\doubles_matches_cleaner.csv"

SINGLES_OUTPUT = MAIN_DIR + r"\singles_cleaner2.csv"
DOUBLES_OUTPUT = MAIN_DIR + r"\doubles_cleaner2.csv"

SINGLES_TRANS  = MAIN_DIR + r"\singles_score_transformations.csv"
DOUBLES_TRANS  = MAIN_DIR + r"\doubles_score_transformations.csv"


# --------------------------------------------------
# Helper functions
# --------------------------------------------------

def parse_score(s):
    try:
        a, b = s.split("*")
        return int(a), int(b)
    except:
        return None


def total_games(row):
    tot = 0
    for col in ["set1", "set2", "set3"]:
        sc = parse_score(str(row[col]))
        if sc:
            tot += sc[0] + sc[1]
    return tot



# ---------------- RULE 3 (UPDATED): fix 60–76 values ----------------

def fix_large_values(s):
    sc = parse_score(str(s))
    if not sc:
        return s

    a, b = sc

    # Apply to 60–76 on EITHER side
    if 60 <= a <= 76:
        a_new = int(str(a)[0])
        b_new = int(str(a)[1])
        return f"{a_new}*{b_new}"

    if 60 <= b <= 76:
        a_new = int(str(b)[1])
        b_new = int(str(b)[0])
        return f"{a_new}*{b_new}"

    return s



# ---------------- RULE 4 (UPDATED): Set3 big values ----------------

def fix_set3_big(s):
    sc = parse_score(str(s))
    if not sc:
        return s

    a, b = sc
    if a >= 10 or b >= 10:
        # If a > b, winner is left → return 1*0
        if a > b:
            return "1*0"
        else:
            return "0*1"
    return s



# ---------------- RULE 6 (NEW): Large values for sets 1–2 ----------------

def fix_big_set_1_2(s):
    sc = parse_score(str(s))
    if not sc:
        return s

    a, b = sc

    if a >= 10 or b >= 10:
        if a > b:
            return "7*6"   # winner on left
        else:
            return "6*7"   # winner on right

    return s



# ---------------- RULE 7 (NEW): 7*0–7*4 and 0*7–4*7 ----------------

def fix_7x_low(s):
    sc = parse_score(str(s))
    if not sc:
        return s

    a, b = sc

    # 7 beating low scores
    if a == 7 and b in [0,1,2,3,4]:
        return "7*6"

    # losing to 7
    if b == 7 and a in [0,1,2,3,4]:
        return "6*7"

    return s



# --------------------------------------------------
# Main processing function
# --------------------------------------------------

def process(df, idcol, transfile):
    df = df.copy()

    # 1) Drop low-total-game rows
    df["tg"] = df.apply(total_games, axis=1)
    df = df[df.tg >= 6].drop(columns=["tg"])

    # 2) Reverse entire match if last valid set is backwards
    for idx, row in df.iterrows():
        for col in ["set3", "set2", "set1"]:
            sc = parse_score(str(row[col]))
            if sc:
                if sc[0] < sc[1]:
                    # reverse all sets
                    for c in ["set1", "set2", "set3"]:
                        s = parse_score(str(row[c]))
                        if s:
                            df.at[idx, c] = f"{s[1]}*{s[0]}"
                break

    # 3–7) Transformations WITH LOGGING
    logs = []

    # ---- Rule 3
    for idx, row in df.iterrows():
        for col in ["set1", "set2", "set3"]:
            orig = row[col]
            new  = fix_large_values(orig)
            if new != orig:
                logs.append([row[idcol], col, orig, new])
                df.at[idx, col] = new

    # ---- Rule 4 (set3 only)
    for idx, row in df.iterrows():
        col = "set3"
        orig = row[col]
        new  = fix_set3_big(orig)
        if new != orig:
            logs.append([row[idcol], col, orig, new])
            df.at[idx, col] = new

    # ---- Rule 6 (sets 1 and 2)
    for idx, row in df.iterrows():
        for col in ["set1", "set2"]:
            orig = row[col]
            new  = fix_big_set_1_2(orig)
            if new != orig:
                logs.append([row[idcol], col, orig, new])
                df.at[idx, col] = new

    # ---- Rule 7
    for idx, row in df.iterrows():
        for col in ["set1", "set2", "set3"]:
            orig = row[col]
            new  = fix_7x_low(orig)
            if new != orig:
                logs.append([row[idcol], col, orig, new])
                df.at[idx, col] = new

    # Save transformation log
    tf = pd.DataFrame(logs, columns=["match_id", "set", "before", "after"])
    tf.to_csv(transfile, index=False)

    return df, tf



# --------------------------------------------------
# Apply to singles and doubles
# --------------------------------------------------

df_sing = pd.read_csv(SINGLES_INPUT)
df_doub = pd.read_csv(DOUBLES_INPUT)

sing2, t1 = process(df_sing, "singles_match_id", SINGLES_TRANS)
doub2, t2 = process(df_doub, "doubles_match_id", DOUBLES_TRANS)

sing2.to_csv(SINGLES_OUTPUT, index=False)
doub2.to_csv(DOUBLES_OUTPUT, index=False)

print("Cleaner2 generation complete.")
print(f"Singles written to: {SINGLES_OUTPUT}")
print(f"Doubles written to: {DOUBLES_OUTPUT}")
print(f"Transformation logs written to {SINGLES_TRANS} and {DOUBLES_TRANS}")

Cleaner2 generation complete.
Singles written to: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\singles_cleaner2.csv
Doubles written to: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\doubles_cleaner2.csv
Transformation logs written to C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\singles_score_transformations.csv and C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\doubles_score_transformations.csv


In [21]:
BASE = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

singles_file = BASE + r"\singles_cleaner2.csv"
doubles_file = BASE + r"\doubles_cleaner2.csv"

# Load files
df_sing = pd.read_csv(singles_file)
df_doub = pd.read_csv(doubles_file)

def audit_set_frequencies(df, label):
    print(f"\n==============================")
    print(f" Frequency Report for {label}")
    print(f"==============================")

    for col in ["set1", "set2", "set3"]:
        print(f"\n--- {col} Frequencies ---")
        freq = df[col].value_counts(dropna=False).sort_index()
        print(freq.to_string())

# Run audits
audit_set_frequencies(df_sing, "Singles Cleaner 2")
audit_set_frequencies(df_doub, "Doubles Cleaner 2")



 Frequency Report for Singles Cleaner 2

--- set1 Frequencies ---
set1
0*6      78
0*8       8
1*0       2
1*6     161
1*8       5
2*4       1
2*6     193
2*8       4
3*6     178
3*8       3
4*0      19
4*1       6
4*2       6
4*3       5
4*6     255
4*8      16
5*1       2
5*7     120
5*8       2
6*0    3514
6*1    2203
6*2    1370
6*3    1367
6*4     956
6*5       2
6*6       4
6*7     108
7*5     371
7*6     278
7*9       4
8*0     491
8*1     489
8*2     374
8*3     240
8*4     383
8*5     258
8*6     167
8*7      30
8*8       2
8*9       4
9*7      81
9*8      41

--- set2 Frequencies ---
set2
0*0    2643
0*1       6
0*4       1
0*6      97
1*0       8
1*1       2
1*6     117
2*1       2
2*4       2
2*6      94
3*0       4
3*2       2
3*4       3
3*5       2
3*6     172
4*0      16
4*1       8
4*2       7
4*3      17
4*6     188
5*2       2
5*5       2
5*7      76
6*0    3554
6*1    2084
6*2    1816
6*3    1138
6*4    1052
6*6       4
6*7      64
7*5     253
7*6     363
7*9      

In [22]:
import pandas as pd

MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

# Input files
SINGLES_IN  = MAIN_DIR + r"\singles_cleaner2.csv"
DOUBLES_IN  = MAIN_DIR + r"\doubles_cleaner2.csv"

# Output files
SINGLES_OUT = MAIN_DIR + r"\singles_matches_clean.csv"
DOUBLES_OUT = MAIN_DIR + r"\doubles_matches_clean.csv"


def dedupe_file(input_csv, output_csv):
    print(f"\nProcessing: {input_csv}")

    df = pd.read_csv(input_csv)

    before = len(df)
    df = df.drop_duplicates()
    after = len(df)

    df.to_csv(output_csv, index=False)

    print(f"  Records before dedupe: {before}")
    print(f"  Records after  dedupe: {after}")
    print(f"  Duplicates removed:    {before - after}")
    print(f"  Clean file written to: {output_csv}")


# Run the dedupe operations
dedupe_file(SINGLES_IN, SINGLES_OUT)
dedupe_file(DOUBLES_IN, DOUBLES_OUT)

print("\nAll clean files generated successfully.")



Processing: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\singles_cleaner2.csv
  Records before dedupe: 13801
  Records after  dedupe: 4107
  Duplicates removed:    9694
  Clean file written to: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\singles_matches_clean.csv

Processing: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\doubles_cleaner2.csv
  Records before dedupe: 13469
  Records after  dedupe: 4024
  Duplicates removed:    9445
  Clean file written to: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\doubles_matches_clean.csv

All clean files generated successfully.
