## Singles Prep and Scoring

In [1]:
import pandas as pd
import numpy as np

### ----------------------------------------------------------
### 1. LOAD DATA
### ----------------------------------------------------------

# NOTE: absolute, machine-specific Windows folder; edit MAIN_DIR to run elsewhere.
MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

# Singles match file read from that folder.
singles_csv_path = MAIN_DIR + r"\Non_Tourney_Singles.csv"
sing = pd.read_csv(singles_csv_path)


In [2]:
### ----------------------------------------------------------
### 2. CLEAN winner1 / loser1
### ----------------------------------------------------------

# Player-ID columns that must hold a positive number for a match to be kept.
id_cols = ["winner1", "loser1"]

# Coerce to numeric; anything unparseable (blank strings, bad values) becomes NaN.
for col in id_cols:
    sing[col] = pd.to_numeric(sing[col], errors="coerce")

# Keep only matches where both IDs are present and strictly positive.
sing = sing.dropna(subset=id_cols)
sing = sing[(sing[id_cols] > 0).all(axis=1)]

# With the NaNs gone, the IDs can be stored as integers.
for col in id_cols:
    sing[col] = sing[col].astype(int)



In [3]:
### ----------------------------------------------------------
### 3. SCORE MARGIN (use set1, set2, set3)
### ----------------------------------------------------------

def score_to_margin(row):
    """Sum the per-set game differentials of one match.

    Reads ``row["set1"]``, ``row["set2"]`` and ``row["set3"]``. A set counts
    only if its value is a string containing ``*``; it is split on ``*`` into
    two integers and contributes ``first - second``. Any other value (NaN,
    numbers, strings without ``*``) is skipped.

    Presumably the first number is the match winner's games -- TODO confirm
    against the data export.

    Returns:
        int: total differential over the sets that counted (0 if none did).

    Raises:
        ValueError: if a counted set does not split into exactly two
            integer-parsable parts.
    """
    margin = 0
    for set_name in ("set1", "set2", "set3"):
        score = row[set_name]
        # Unplayed or non-text sets contribute nothing.
        if not isinstance(score, str) or "*" not in score:
            continue
        first, second = score.split("*")
        margin += int(first) - int(second)
    return margin

# Row-wise total game differential for every singles match.
singles_margins = sing.apply(score_to_margin, axis=1)
sing["margin"] = singles_margins



In [4]:
### ----------------------------------------------------------
### 4. FLIGHT WEIGHTS
###    S1 > S2 > S3 > S4
### ----------------------------------------------------------

# Multipliers for the four listed flights. Any other flight label (the data
# also contains S5, S6, S7 and S10) has no entry and falls back to 1.00.
flight_weight = {
    "S1": 1.30,
    "S2": 1.15,
    "S3": 1.05,
    "S4": 1.00,
}
listed_weights = sing["flight"].map(flight_weight)
sing["flight_weight"] = listed_weights.fillna(1.00)



### Rating Calculation Steps

In [5]:
### ----------------------------------------------------------
### 5. INITIALIZE RATINGS
### ----------------------------------------------------------

# Every ID that appears as a winner or a loser, each listed once.
singles_id_values = sing[["winner1", "loser1"]].values
all_players = pd.unique(singles_id_values.ravel("K"))

# All players start from the same neutral rating of 20.0.
ratings = dict.fromkeys(all_players, 20.0)


### ----------------------------------------------------------
### 6. ELO FUNCTION
### ----------------------------------------------------------

def expected_score(rA, rB):
    """Elo-style expected score of a side rated ``rA`` against one rated ``rB``.

    Base-10 logistic curve with a scale of 12 rating points: equal ratings
    give 0.5, and a 12-point advantage gives 10:1 odds (about 0.909). A
    smaller scale would make the result more sensitive to rating gaps.

    Returns:
        float: value strictly between 0 and 1 for finite inputs.
    """
    scaled_gap = (rB - rA) / 12
    return 1 / (1 + 10**scaled_gap)


K = 6.0   # base update size; each match scales it by flight weight and margin


### ----------------------------------------------------------
### 7. ITERATIVE TRAINING
### ----------------------------------------------------------

# The per-match step, K * flight_weight * (1 + 0.10 * |margin|), does not
# depend on the current ratings, so it is computed once here instead of on
# every pass. Iterating plain Python lists also avoids building a Series per
# row (which iterrows does) on each of the 12 passes. The arithmetic order is
# the same as the row-by-row form, so the resulting ratings are unchanged.
match_steps = (K * sing["flight_weight"] * (1 + sing["margin"].abs() * 0.10)).tolist()
matches = list(zip(sing["winner1"].tolist(), sing["loser1"].tolist(), match_steps))

for epoch in range(12):
    for winner_id, loser_id, step in matches:
        # Expected score uses both ratings as they stand before this match.
        Ew = expected_score(ratings[winner_id], ratings[loser_id])

        # The winner gains step * (1 - Ew) and the loser gives up the same
        # amount, so a match leaves the sum of the two ratings unchanged.
        shift = step * (1 - Ew)
        ratings[winner_id] += shift
        ratings[loser_id] -= shift


### ----------------------------------------------------------
### 8. NORMALIZE TO RANGE 5–39
### ----------------------------------------------------------

# Trained ratings as a Series indexed by player ID, plus the two extremes
# that anchor the rescaling below.
rating_series = pd.Series(ratings)
min_r, max_r = rating_series.min(), rating_series.max()

def scale_5_39(x):
    """Linearly map a raw rating onto the 5-39 scale.

    Uses the module-level ``min_r`` and ``max_r``: ``min_r`` maps to 5 and
    ``max_r`` maps to 39 (a span of 34). The result is undefined when
    ``max_r == min_r``, because the span divides by zero.
    """
    points_per_unit = 34 / (max_r - min_r)
    return 5 + (x - min_r) * points_per_unit

# Rescale every trained rating onto the 5-39 range.
scaled = rating_series.apply(scale_5_39)


### ----------------------------------------------------------
### 9. BUILD FINAL OUTPUT TABLE
### ----------------------------------------------------------

# One row per player, highest rating first.
singles_table = pd.DataFrame({"playerID": scaled.index, "rating": scaled.values})
player_ratings = singles_table.sort_values(by="rating", ascending=False)

# Saved without the DataFrame index, so the file holds just the two columns.
singles_out_path = MAIN_DIR + r"\singles_player_ratings_pre_tourneys.csv"
player_ratings.to_csv(singles_out_path, index=False)

player_ratings.head(20)

Unnamed: 0,playerID,rating
714,123263,39.0
413,123420,38.582126
235,164183,37.969482
54,140879,37.749162
352,145172,37.397177
73,104618,36.839484
295,143730,36.147894
26,141640,35.861083
204,143078,35.565851
715,163631,35.420209


In [6]:
# Sanity checks on the singles frame. When this cell runs after the cleaning
# cell, `sing` has already been filtered and its ID columns cast to int, so
# "Rows loaded" is the cleaned row count and the missing counts are 0.
winner_ids = sing["winner1"]
loser_ids = sing["loser1"]

print("Rows loaded:", len(sing))
print("Rows with missing winner1:", winner_ids.isna().sum())
print("Rows with missing loser1:", loser_ids.isna().sum())

print("Sample winner1 values:", winner_ids.head())
print("Sample loser1 values:", loser_ids.head())

# Flight labels present in the data (only S1-S4 have explicit weights).
print("Unique flights:", sing["flight"].unique())


Rows loaded: 3415
Rows with missing winner1: 0
Rows with missing loser1: 0
Sample winner1 values: 0    162432
1    163020
2    163020
3    141167
4    163020
Name: winner1, dtype: int32
Sample loser1 values: 0    121539
1    104187
2    162432
3    121019
4    129948
Name: loser1, dtype: int32
Unique flights: ['S1' 'S2' 'S3' 'S4' 'S5' 'S7' 'S10' 'S6']


## Doubles

In [7]:
### ----------------------------------------------------------
### 1. LOAD DATA
### ----------------------------------------------------------

# NOTE: absolute, machine-specific Windows folder; edit MAIN_DIR to run elsewhere.
MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

# Doubles match file read from that folder.
doubles_csv_path = MAIN_DIR + r"\Non_Tourney_Doubles.csv"
doub = pd.read_csv(doubles_csv_path)


In [8]:
### ----------------------------------------------------------
### 2. CLEAN ID FIELDS — drop any match with missing players
### ----------------------------------------------------------

# The four player-ID columns of a doubles match.
player_cols = ["winner1", "winner2", "loser1", "loser2"]

# Coerce to numeric; unparseable entries become NaN.
for col in player_cols:
    doub[col] = pd.to_numeric(doub[col], errors="coerce")

# A match is kept only if all four IDs are present and strictly positive.
doub = doub.dropna(subset=player_cols)
doub = doub[(doub[player_cols] > 0).all(axis=1)]

# With the NaNs gone, the IDs can be stored as integers.
for col in player_cols:
    doub[col] = doub[col].astype(int)

In [9]:
### ----------------------------------------------------------
### 3. SCORE MARGIN (same method as singles)
### ----------------------------------------------------------

def score_to_margin(row):
    """Sum the per-set game differentials of one match.

    Reads ``row["set1"]``, ``row["set2"]`` and ``row["set3"]``. A set counts
    only if its value is a string containing ``*``; it is split on ``*`` into
    two integers and contributes ``first - second``. Every other value
    contributes 0. This repeats the definition used in the singles section.

    Presumably the first number is the winning team's games -- TODO confirm.

    Returns:
        int: total differential across the three set columns.

    Raises:
        ValueError: if a counted set does not split into exactly two
            integer-parsable parts.
    """
    def set_differential(score):
        # Unplayed or non-text sets add nothing to the total.
        if not (isinstance(score, str) and "*" in score):
            return 0
        left, right = score.split("*")
        return int(left) - int(right)

    return sum(set_differential(row[name]) for name in ("set1", "set2", "set3"))

# Total game differential per doubles match, evaluated one row at a time.
doub["margin"] = doub.apply(score_to_margin, axis="columns")


In [10]:
### ----------------------------------------------------------
### 4. FLIGHT WEIGHTS (same pattern as singles)
### ----------------------------------------------------------

# Multipliers for the four listed doubles flights; any flight label without
# an entry here falls back to 1.00.
flight_weight = {
    "D1": 1.30,
    "D2": 1.15,
    "D3": 1.05,
    "D4": 1.00,
}
listed_weights = doub["flight"].map(flight_weight)
doub["flight_weight"] = listed_weights.fillna(1.00)

### Start Doubles Rating Process

In [11]:
### ----------------------------------------------------------
### 5. INITIALIZE RATINGS
### ----------------------------------------------------------

# Every ID that appears in any of the four player columns, each listed once.
doubles_id_values = doub[["winner1", "winner2", "loser1", "loser2"]].values
all_players = pd.unique(doubles_id_values.ravel("K"))

# Ratings are kept per individual; everyone starts at a neutral 20.0.
ratings = dict.fromkeys(all_players, 20.0)


### ----------------------------------------------------------
### 6. ELO EXPECTED SCORE FUNCTION
### ----------------------------------------------------------

def expected_score(rA, rB):
    """Elo-style expected score of a side rated ``rA`` against one rated ``rB``.

    Base-10 logistic curve with a scale of 12 points, the same constant the
    singles section uses. The doubles training loop passes team ratings (the
    sum of the two partners' ratings), so a 12-point gap in team totals
    corresponds to 10:1 odds (about 0.909).

    Returns:
        float: value strictly between 0 and 1 for finite inputs.
    """
    odds_against = 10 ** ((rB - rA) / 12)
    return 1 / (1 + odds_against)

K = 6.0   # base update size; each match scales it by flight weight and margin


### ----------------------------------------------------------
### 7. ITERATIVE TRAINING LOOP FOR DOUBLES
### ----------------------------------------------------------

# The per-match step, K * flight_weight * (1 + 0.10 * |margin|), does not
# depend on the current ratings, so it is computed once here instead of on
# every pass. Iterating plain Python lists also avoids building a Series per
# row (which iterrows does) on each of the 12 passes. The arithmetic order is
# the same as the row-by-row form, so the resulting ratings are unchanged.
match_steps = (K * doub["flight_weight"] * (1 + doub["margin"].abs() * 0.10)).tolist()
matches = list(zip(
    doub["winner1"].tolist(), doub["winner2"].tolist(),
    doub["loser1"].tolist(), doub["loser2"].tolist(),
    match_steps,
))

for epoch in range(12):
    for w1, w2, l1, l2, delta in matches:
        # Team rating = sum of the two partners' individual ratings, read
        # before any of the four players is updated.
        team_w = ratings[w1] + ratings[w2]
        team_l = ratings[l1] + ratings[l2]

        Ew = expected_score(team_w, team_l)

        # The winning team gains delta * (1 - Ew) and the losing team gives up
        # the same amount; each team's change is split evenly between partners.
        half_change = delta * (1 - Ew) / 2
        ratings[w1] += half_change
        ratings[w2] += half_change
        ratings[l1] -= half_change
        ratings[l2] -= half_change


### ----------------------------------------------------------
### 8. NORMALIZE RATINGS TO RANGE 5–39
### ----------------------------------------------------------

# Trained doubles ratings as a Series indexed by player ID.
rating_series = pd.Series(ratings)

# The two extremes that anchor the rescaling below.
min_r = rating_series.min()
max_r = rating_series.max()

def scale_5_39(x):
    """Linearly map a raw rating onto the 5-39 scale.

    Uses the module-level ``min_r`` and ``max_r``: ``min_r`` maps to 5 and
    ``max_r`` maps to 39 (a span of 34). The result is undefined when
    ``max_r == min_r``, because the span divides by zero.
    """
    raw_span = max_r - min_r
    return 5 + (x - min_r) * (34 / raw_span)

# Rescale every trained doubles rating onto the 5-39 range.
scaled = rating_series.apply(scale_5_39)


### ----------------------------------------------------------
### 9. BUILD FINAL OUTPUT TABLE
### ----------------------------------------------------------

# One row per player, highest rating first.
doubles_table = pd.DataFrame({"playerID": scaled.index, "rating": scaled.values})
player_ratings = doubles_table.sort_values(by="rating", ascending=False)

# Saved without the DataFrame index, so the file holds just the two columns.
doubles_out_path = MAIN_DIR + r"\doubles_player_ratings_pre_tourneys.csv"
player_ratings.to_csv(doubles_out_path, index=False)

player_ratings.head(20)

Unnamed: 0,playerID,rating
316,106168,39.0
306,100762,36.915838
61,121168,35.952202
237,107511,35.794512
242,121900,35.650678
547,131711,35.590633
320,165255,34.494139
327,100675,34.255306
241,99666,33.976907
352,145002,33.930493


## Bring together ratings and names

In [12]:
import pandas as pd

# NOTE: absolute, machine-specific Windows folder (same location as MAIN_DIR above).
BASE = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

# Ratings tables written by the singles and doubles sections.
sing_ratings = pd.read_csv(BASE + r"\singles_player_ratings_pre_tourneys.csv")
doub_ratings = pd.read_csv(BASE + r"\doubles_player_ratings_pre_tourneys.csv")

# Player roster, reduced to the ID and the display name.
players = pd.read_csv(BASE + r"\girls_players.csv")[["playerID", "fullname"]]

# Give all three frames a numeric playerID so the join keys compare consistently;
# unparseable IDs become NaN.
for frame in (sing_ratings, doub_ratings, players):
    frame["playerID"] = pd.to_numeric(frame["playerID"], errors="coerce")

# Left joins keep every rated player, with a missing name where the roster
# has no matching ID.
sing_with_names = pd.merge(sing_ratings, players, on="playerID", how="left")
doub_with_names = pd.merge(doub_ratings, players, on="playerID", how="left")

# Twenty highest-rated players in each discipline.
top20_sing = sing_with_names.sort_values(by="rating", ascending=False).head(20)
top20_doub = doub_with_names.sort_values(by="rating", ascending=False).head(20)

for banner, table in (
    ("\n=== TOP 20 SINGLES PLAYERS ===\n", top20_sing),
    ("\n=== TOP 20 DOUBLES PLAYERS ===\n", top20_doub),
):
    print(banner)
    print(table)



=== TOP 20 SINGLES PLAYERS ===

    playerID     rating              fullname
0     123263  39.000000         Sonya Drayton
1     123420  38.582126           Raegan Farm
2     164183  37.969482  Nikhitha Raji Suresh
3     140879  37.749162      Carlyta Barfield
4     145172  37.397177          Kate Bonetto
5     104618  36.839484        Kennedy Harris
6     143730  36.147894          Sitara Soman
7     141640  35.861083        Whitney Hedden
8     143078  35.565851        Alexis Uschold
9     163631  35.420209              Leah Lup
10    121899  35.346808        Sabrina Larsen
11    145001  35.242848       Elizabeth Sacks
12    164184  35.113096           Vivienne Lu
13    100675  34.192776      Brooklyn Lindsey
14    119281  33.418249          Amanda Perez
15    102472  32.920215            Hana Yokoi
16    122723  32.784533        Hayden Kearney
17    124077  32.604686        Malavika Rahul
18    144711  32.572237  Claudia Stadtmueller
19    144531  32.441309            Isa Llados

