### Reduce the match results files down to just season-end tournament results

In [1]:
import pandas as pd

MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

# Inputs: the cleaned match files plus the list of tournament team-match IDs.
SINGLES_IN  = MAIN_DIR + r"\singles_matches_clean.csv"
DOUBLES_IN  = MAIN_DIR + r"\doubles_matches_clean.csv"
TOURNEY_IN  = MAIN_DIR + r"\district_state_tournament_match_ids.csv"

# Outputs: the same match files restricted to tournament rows.
SINGLES_OUT = MAIN_DIR + r"\Tourney_Singles.csv"
DOUBLES_OUT = MAIN_DIR + r"\Tourney_Doubles.csv"


def _read_with_numeric_match_id(csv_path):
    """Read ``csv_path`` and coerce its ``team_match_id`` column to numeric.

    Values that cannot be parsed as numbers become NaN (``errors="coerce"``).
    """
    frame = pd.read_csv(csv_path)
    frame["team_match_id"] = pd.to_numeric(frame["team_match_id"], errors="coerce")
    return frame


# Tournament IDs: unparseable IDs are dropped, the rest are held as ints in a
# set that is used for the membership filter below.
tourney_ids = _read_with_numeric_match_id(TOURNEY_IN)
tourney_set = set(tourney_ids["team_match_id"].dropna().astype(int))

# Singles: keep only rows whose team_match_id is a tournament ID, then save.
sing = _read_with_numeric_match_id(SINGLES_IN)
sing_tourney = sing.loc[sing["team_match_id"].isin(tourney_set)]
sing_tourney.to_csv(SINGLES_OUT, index=False)

# Doubles: same filter, separate output file.
doub = _read_with_numeric_match_id(DOUBLES_IN)
doub_tourney = doub.loc[doub["team_match_id"].isin(tourney_set)]
doub_tourney.to_csv(DOUBLES_OUT, index=False)


print("Tournament match filtering complete.")
print(f"Tourney singles written to: {SINGLES_OUT}")
print(f"Tourney doubles written to: {DOUBLES_OUT}")

Tournament match filtering complete.
Tourney singles written to: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\Tourney_Singles.csv
Tourney doubles written to: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\Tourney_Doubles.csv


### ChatGPT comparison code (singles): did the OSAA ratings by Todd (System A) or the singles_player_ratings by ChatGPT (System B) predict the tournament results more accurately?

In [3]:
import pandas as pd

# --------------------------------------------------------
# File locations
# --------------------------------------------------------
MAIN = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

FILE_SYS_A = MAIN + r"\OSAA_Ratings_2025.csv"                    # System A (Todd)
FILE_SYS_B = MAIN + r"\singles_player_ratings_pre_tourneys.csv"  # System B (ChatGPT)
FILE_MATCH = MAIN + r"\Tourney_Singles.csv"

OUTPUT_CSV = MAIN + r"\comparison_results.csv"

# --------------------------------------------------------
# Load datasets
# --------------------------------------------------------
sysA = pd.read_csv(FILE_SYS_A)
sysB = pd.read_csv(FILE_SYS_B)
matches = pd.read_csv(FILE_MATCH)

# Give both rating tables the same key column ("playerID") and a
# system-specific rating column name. System B's key is already "playerID".
sysA = sysA.rename(columns={"player": "playerID", "sing_rating": "Todd_Rating"})
sysB = sysB.rename(columns={"rating": "ChatGPT_Rating"})

# Convert IDs to numeric types; unparseable values become NaN.
for col in ["winner1", "loser1"]:
    matches[col] = pd.to_numeric(matches[col], errors="coerce")

sysA["playerID"] = pd.to_numeric(sysA["playerID"], errors="coerce")
sysB["playerID"] = pd.to_numeric(sysB["playerID"], errors="coerce")

# Drop matches with a missing or non-positive player ID. The explicit .copy()
# makes the filtered frame independent before its columns are reassigned
# (same pattern as the doubles cell).
matches = matches.dropna(subset=["winner1", "loser1"])
matches = matches[(matches["winner1"] > 0) & (matches["loser1"] > 0)].copy()
matches["winner1"] = matches["winner1"].astype(int)
matches["loser1"]  = matches["loser1"].astype(int)


# --------------------------------------------------------
# Merge ratings from both systems
# --------------------------------------------------------
def _attach_rating(frame, ratings, rating_col, player_col, out_col):
    """Left-join one player's rating onto ``frame`` as column ``out_col``.

    Only ``playerID`` and ``rating_col`` are taken from ``ratings``. Merging
    the whole ratings table would copy every unrelated column into the result
    on each merge (with ``_x``/``_y`` suffixes from the second merge onward).

    NOTE(review): a ``playerID`` that appears more than once in ``ratings``
    would duplicate the matching match rows - confirm IDs are unique.
    """
    lookup = ratings[["playerID", rating_col]].rename(columns={rating_col: out_col})
    merged = frame.merge(lookup, left_on=player_col, right_on="playerID", how="left")
    return merged.drop(columns=["playerID"])


# Column order produced: winner/loser for System A (Todd), then winner/loser
# for System B (ChatGPT).
df = matches
df = _attach_rating(df, sysA, "Todd_Rating", "winner1", "winner_Todd_Rating")
df = _attach_rating(df, sysA, "Todd_Rating", "loser1", "loser_Todd_Rating")
df = _attach_rating(df, sysB, "ChatGPT_Rating", "winner1", "winner_ChatGPT_Rating")
df = _attach_rating(df, sysB, "ChatGPT_Rating", "loser1", "loser_ChatGPT_Rating")

# --------------------------------------------------------
# Remove records where ratings are missing
# --------------------------------------------------------
# A match is only comparable when both systems rated both players.
df = df.dropna(subset=[
    "winner_Todd_Rating", "loser_Todd_Rating",
    "winner_ChatGPT_Rating", "loser_ChatGPT_Rating"
])

# --------------------------------------------------------
# Compute prediction correctness
# --------------------------------------------------------
# A system is "correct" when it rated the actual winner strictly higher.
# Because the comparison is strict, equal ratings count as an incorrect call.
df["Todd_correct"] = df["winner_Todd_Rating"] > df["loser_Todd_Rating"]
df["ChatGPT_correct"] = df["winner_ChatGPT_Rating"] > df["loser_ChatGPT_Rating"]

# --------------------------------------------------------
# Create the comparison result field
# --------------------------------------------------------
def compare(row):
    """Label one match by which rating system(s) picked the actual winner.

    ``row`` is indexed with the keys ``"Todd_correct"`` and
    ``"ChatGPT_correct"``; each value is interpreted by truthiness.

    Returns one of ``"Both Right"``, ``"Only Todd was Right"``,
    ``"Only ChatGPT was Right"`` or ``"Both Wrong"``.
    """
    todd_hit = bool(row["Todd_correct"])
    gpt_hit = bool(row["ChatGPT_correct"])

    if todd_hit:
        return "Both Right" if gpt_hit else "Only Todd was Right"
    return "Only ChatGPT was Right" if gpt_hit else "Both Wrong"

# Label every match with compare(). Only the two boolean flags are passed,
# one plain dict per row. Unlike df.apply(compare, axis=1), this also works
# when no match survived the rating filter: on an empty frame apply() hands
# back a DataFrame rather than a Series, which cannot be stored in one column.
flag_rows = df[["Todd_correct", "ChatGPT_correct"]].to_dict("records")
df["comparison_outcome"] = [compare(flags) for flags in flag_rows]

# --------------------------------------------------------
# Save Final Output
# --------------------------------------------------------
df.to_csv(OUTPUT_CSV, index=False)

print("comparison_results.csv created:")
print(OUTPUT_CSV)

comparison_results.csv created:
C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\comparison_results.csv


### Singles Miss Analysis

In [4]:
MAIN = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"
FILE_COMPARE = MAIN + r"\comparison_results.csv"

# Per-match comparison table written by the singles comparison cell.
df = pd.read_csv(FILE_COMPARE)

# --------------------------------------------------
# Rating gap per system = loser rating minus winner rating.
# A positive gap means the system rated the eventual loser higher.
# --------------------------------------------------
df["Todd_gap"] = df["loser_Todd_Rating"].sub(df["winner_Todd_Rating"])
df["ChatGPT_gap"] = df["loser_ChatGPT_Rating"].sub(df["winner_ChatGPT_Rating"])

# --------------------------------------------------
# Keep only the matches each system got wrong
# --------------------------------------------------
df_todd_wrong = df.loc[df["Todd_correct"].eq(False)].copy()
df_chatgpt_wrong = df.loc[df["ChatGPT_correct"].eq(False)].copy()


def _print_miss_summary(heading, misses, gap_col):
    """Print the miss count and ``describe()`` of ``gap_col`` for one system."""
    print(heading)
    print(f"Count: {len(misses)}")
    print("Gap Summary:")
    print(misses[gap_col].describe())


# --------------------------------------------------
# Summary statistics
# --------------------------------------------------
print("\n==============================")
print("   WRONG PREDICTION ANALYSIS")
print("==============================")

_print_miss_summary("\n--- Todd Wrong Predictions ---", df_todd_wrong, "Todd_gap")
_print_miss_summary("\n--- ChatGPT Wrong Predictions ---", df_chatgpt_wrong, "ChatGPT_gap")

# --------------------------------------------------
# Optional: write the per-match details of each system's misses
# --------------------------------------------------
todd_detail_cols = [
    "winner1", "loser1",
    "winner_Todd_Rating", "loser_Todd_Rating",
    "Todd_gap", "comparison_outcome",
]
chatgpt_detail_cols = [
    "winner1", "loser1",
    "winner_ChatGPT_Rating", "loser_ChatGPT_Rating",
    "ChatGPT_gap", "comparison_outcome",
]

df_todd_wrong_out = df_todd_wrong.loc[:, todd_detail_cols]
df_chatgpt_wrong_out = df_chatgpt_wrong.loc[:, chatgpt_detail_cols]

df_todd_wrong_out.to_csv(MAIN + r"\Todd_wrong_prediction_details.csv", index=False)
df_chatgpt_wrong_out.to_csv(MAIN + r"\ChatGPT_wrong_prediction_details.csv", index=False)

print("\nFiles created:")
print(" - Todd_wrong_prediction_details.csv")
print(" - ChatGPT_wrong_prediction_details.csv")

print("\nAnalysis complete.\n")


   WRONG PREDICTION ANALYSIS

--- Todd Wrong Predictions ---
Count: 88
Gap Summary:
count    88.000000
mean      1.787498
std       2.398446
min       0.018865
25%       0.537150
50%       1.148892
75%       2.109822
max      18.147273
Name: Todd_gap, dtype: float64

--- ChatGPT Wrong Predictions ---
Count: 125
Gap Summary:
count    125.000000
mean       2.732722
std        2.091542
min        0.009117
25%        1.149466
50%        2.409232
75%        3.766138
max       14.043622
Name: ChatGPT_gap, dtype: float64

Files created:
 - Todd_wrong_prediction_details.csv
 - ChatGPT_wrong_prediction_details.csv

Analysis complete.



## Doubles

In [5]:
# --------------------------------------------------------
# File paths
# --------------------------------------------------------
MAIN = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

FILE_SYS_A = MAIN + r"\OSAA_Ratings_2025.csv"                      # Todd ratings
FILE_SYS_B = MAIN + r"\doubles_player_ratings_pre_tourneys.csv"    # ChatGPT ratings
FILE_MATCH = MAIN + r"\Tourney_Doubles.csv"                        # Doubles matches

OUTPUT_COMPARE = MAIN + r"\doubles_comparison.csv"
OUTPUT_TODD_WRONG = MAIN + r"\Doubles_Todd_wrong_prediction_details.csv"
OUTPUT_GPT_WRONG = MAIN + r"\Doubles_ChatGPT_wrong_prediction_details.csv"

# --------------------------------------------------------
# Load and prepare the two rating tables
# --------------------------------------------------------
# Each table ends up keyed by a numeric "playerID" column and carries a
# system-specific rating column name; unparseable IDs become NaN.
sysA = (
    pd.read_csv(FILE_SYS_A)
    .rename(columns={"player": "playerID", "doub_rating": "Todd_Rating"})
    .assign(playerID=lambda t: pd.to_numeric(t["playerID"], errors="coerce"))
)

sysB = (
    pd.read_csv(FILE_SYS_B)
    .rename(columns={"playerID": "playerID", "rating": "ChatGPT_Rating"})
    .assign(playerID=lambda t: pd.to_numeric(t["playerID"], errors="coerce"))
)

# --------------------------------------------------------
# Load and prepare the doubles matches
# --------------------------------------------------------
PLAYER_COLS = ["winner1", "winner2", "loser1", "loser2"]

matches = pd.read_csv(FILE_MATCH)

# Coerce all four player-ID columns to numeric, then drop any match with a
# missing ID.
# NOTE(review): the singles cell additionally drops non-positive IDs; that
# filter is not applied here - confirm this difference is intended.
matches = matches.assign(
    **{c: pd.to_numeric(matches[c], errors="coerce") for c in PLAYER_COLS}
)
matches = matches.dropna(subset=PLAYER_COLS).copy()

# --------------------------------------------------------
# Merge ratings for all 4 players
# --------------------------------------------------------
def merge_ratings(df, ratings, rating_col, suffix,
                  slots=("winner1", "winner2", "loser1", "loser2")):
    """Attach one rating column per player slot to a match table.

    For every column name in ``slots``, the player's ``rating_col`` value is
    looked up in ``ratings`` (keyed by ``playerID``) with a left join and
    stored in a new column named ``f"{slot}_{suffix}"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Match rows; must contain every column named in ``slots``.
    ratings : pandas.DataFrame
        Must contain ``playerID`` and ``rating_col``; other columns are ignored.
    rating_col : str
        Name of the rating column in ``ratings``.
    suffix : str
        Tag appended to each slot name to form the output column name.
    slots : iterable of str, optional
        Player-ID columns of ``df`` to look up. Defaults to the four doubles
        slots, so existing four-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` followed by one rating column
        per slot, in ``slots`` order. Players without a rating get NaN.
    """
    # The lookup key is renamed to a private temporary name so that a
    # ``playerID`` column already present in ``df`` neither collides with it
    # during the merge nor gets dropped afterwards.
    key = "__rating_lookup_key__"
    lookup = ratings[["playerID", rating_col]].rename(columns={"playerID": key})

    for slot in slots:
        slot_lookup = lookup.rename(columns={rating_col: f"{slot}_{suffix}"})
        df = (
            df.merge(slot_lookup, left_on=slot, right_on=key, how="left")
              .drop(columns=[key])
        )

    return df

# Attach per-player ratings from both systems: Todd (system A) first, then
# ChatGPT (system B) on top of that result.
df = merge_ratings(
    merge_ratings(matches.copy(), sysA, "Todd_Rating", "Todd"),
    sysB, "ChatGPT_Rating", "GPT",
)

# A match is only usable when all four players have a rating in both systems.
required_rating_cols = [
    "winner1_Todd", "winner2_Todd", "loser1_Todd", "loser2_Todd",
    "winner1_GPT", "winner2_GPT", "loser1_GPT", "loser2_GPT",
]
df = df.dropna(subset=required_rating_cols)

# --------------------------------------------------------
# Pair rating = plain average of the two partners' ratings
# --------------------------------------------------------
df["winner_Todd_avg"] = df["winner1_Todd"].add(df["winner2_Todd"]).div(2)
df["loser_Todd_avg"]  = df["loser1_Todd"].add(df["loser2_Todd"]).div(2)

df["winner_GPT_avg"] = df["winner1_GPT"].add(df["winner2_GPT"]).div(2)
df["loser_GPT_avg"]  = df["loser1_GPT"].add(df["loser2_GPT"]).div(2)

# --------------------------------------------------------
# A system is correct when it rated the winning pair strictly higher
# --------------------------------------------------------
df["Todd_correct"] = df["winner_Todd_avg"].gt(df["loser_Todd_avg"])
df["GPT_correct"]  = df["winner_GPT_avg"].gt(df["loser_GPT_avg"])

# --------------------------------------------------------
# Create summary field
# --------------------------------------------------------
def compare(row):
    """Label one doubles match by which rating system(s) picked the winner.

    ``row`` is indexed with the keys ``"Todd_correct"`` and ``"GPT_correct"``;
    each value is interpreted by truthiness.

    Returns one of ``"Both Right"``, ``"Only Todd was Right"``,
    ``"Only ChatGPT was Right"`` or ``"Both Wrong"``.
    """
    # Outcome label keyed by (Todd correct?, ChatGPT correct?).
    labels = {
        (True, True): "Both Right",
        (True, False): "Only Todd was Right",
        (False, True): "Only ChatGPT was Right",
        (False, False): "Both Wrong",
    }
    return labels[bool(row["Todd_correct"]), bool(row["GPT_correct"])]

# Label every match with compare(). Only the two boolean flags are passed,
# one plain dict per row. Unlike df.apply(compare, axis=1), this also works
# when no match survived the rating filter: on an empty frame apply() hands
# back a DataFrame rather than a Series, which cannot be stored in one column.
flag_rows = df[["Todd_correct", "GPT_correct"]].to_dict("records")
df["comparison_outcome"] = [compare(flags) for flags in flag_rows]

# --------------------------------------------------------
# Save comparison output
# --------------------------------------------------------
# Written before the gap columns are added, so this file does not contain them.
df.to_csv(OUTPUT_COMPARE, index=False)
print(f"Created: {OUTPUT_COMPARE}")

# --------------------------------------------------------
# Compute rating gaps (for every match), then isolate the misses
# --------------------------------------------------------

# Gap = loser_avg - winner_avg; positive means the system rated the losing
# pair higher than the winning pair.
df["Todd_gap"] = df["loser_Todd_avg"] - df["winner_Todd_avg"]
df["GPT_gap"] = df["loser_GPT_avg"] - df["winner_GPT_avg"]

# Wrong matches only (the *_correct columns are boolean, so ~ negates them).
df_todd_wrong = df[~df["Todd_correct"]].copy()
df_gpt_wrong = df[~df["GPT_correct"]].copy()

# --------------------------------------------------------
# Save wrong prediction details
# --------------------------------------------------------
df_todd_wrong_out = df_todd_wrong[[
    "winner1", "winner2", "loser1", "loser2",
    "winner_Todd_avg", "loser_Todd_avg",
    "Todd_gap", "comparison_outcome"
]]
df_todd_wrong_out.to_csv(OUTPUT_TODD_WRONG, index=False)

df_gpt_wrong_out = df_gpt_wrong[[
    "winner1", "winner2", "loser1", "loser2",
    "winner_GPT_avg", "loser_GPT_avg",
    "GPT_gap", "comparison_outcome"
]]
df_gpt_wrong_out.to_csv(OUTPUT_GPT_WRONG, index=False)

print(f"Created: {OUTPUT_TODD_WRONG}")
print(f"Created: {OUTPUT_GPT_WRONG}")

# --------------------------------------------------------
# Print accuracy and basic stats on wrong predictions
# --------------------------------------------------------
print("\n==============================")
print("   DOUBLES RATING ANALYSIS")
print("==============================")

# Accuracy = share of rated matches where the system's flag is True.
accuracy_todd = df["Todd_correct"].mean()
accuracy_gpt  = df["GPT_correct"].mean()

print(f"Todd accuracy:   {accuracy_todd:.4f} ({accuracy_todd*100:.2f}%)")
print(f"ChatGPT accuracy:{accuracy_gpt:.4f} ({accuracy_gpt*100:.2f}%)")

print("\n--- Wrong Prediction Gap Summary ---")
print("\nTodd wrong gap:")
print(df_todd_wrong["Todd_gap"].describe())

print("\nChatGPT wrong gap:")
print(df_gpt_wrong["GPT_gap"].describe())

print("\nAnalysis complete.")

Created: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\doubles_comparison.csv
Created: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\Doubles_Todd_wrong_prediction_details.csv
Created: C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite\Doubles_ChatGPT_wrong_prediction_details.csv

   DOUBLES RATING ANALYSIS
Todd accuracy:   0.8507 (85.07%)
ChatGPT accuracy:0.7745 (77.45%)

--- Wrong Prediction Gap Summary ---

Todd wrong gap:
count    90.000000
mean      1.689078
std       1.951129
min       0.003394
25%       0.535767
50%       1.058507
75%       2.077204
max       9.351905
Name: Todd_gap, dtype: float64

ChatGPT wrong gap:
count    136.000000
mean       2.498210
std        2.050913
min        0.016369
25%        0.874832
50%        1.945953
75%        3.652013
max        9.993370
Name: GPT_gap, dtype: float64

Analysis complete.
