## Singles Prep and Scoring

In [1]:
import pandas as pd
import numpy as np

### ----------------------------------------------------------
### 1. LOAD DATA
### ----------------------------------------------------------

# Folder the match CSVs are read from and the ratings CSVs are written to.
MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

# Singles match results; each row is later treated as one match.
singles_csv_path = MAIN_DIR + r"\singles_matches_clean.csv"
sing = pd.read_csv(singles_csv_path)


In [2]:
### ----------------------------------------------------------
### 2. CLEAN winner1 / loser1
### ----------------------------------------------------------

# Player-ID columns that must both hold a valid, positive integer.
id_cols = ["winner1", "loser1"]

# Convert to numeric (blank strings and other bad values become NaN)
for col in id_cols:
    sing[col] = pd.to_numeric(sing[col], errors="coerce")

# Keep only matches whose IDs are all present and positive.
# NaN > 0 evaluates to False, so this single mask also drops missing IDs.
has_valid_ids = (sing[id_cols] > 0).all(axis=1)

# Filter and cast in one expression. astype() returns a new frame, so later
# column assignments operate on an independent object instead of on a filtered
# slice (the filter-then-assign sequence can raise SettingWithCopyWarning).
# An explicit "int64" avoids the platform-dependent width of plain `int`.
sing = sing.loc[has_valid_ids].astype({c: "int64" for c in id_cols})



In [3]:
### ----------------------------------------------------------
### 3. SCORE MARGIN (use set1, set2, set3)
### ----------------------------------------------------------

def score_to_margin(row):
    """Return the net margin of a match summed over set1, set2 and set3.

    Each set cell is expected to look like "<left>*<right>"; its contribution
    is int(left) - int(right). Cells that are not strings, or that contain no
    "*", contribute 0.

    Presumably <left> is the winner's game count -- confirm against the CSV.

    Raises:
        ValueError: if a cell contains "*" but does not split into exactly two
            integer-parsable parts.
    """
    def set_margin(cell):
        # Anything that is not a "*"-separated string counts as no set played.
        if not (isinstance(cell, str) and "*" in cell):
            return 0
        left, right = cell.split("*")
        return int(left) - int(right)

    return sum(set_margin(row[set_col]) for set_col in ("set1", "set2", "set3"))

# Row-wise pass over the frame: one net margin per match.
singles_margins = sing.apply(score_to_margin, axis=1)
sing["margin"] = singles_margins



In [4]:
### ----------------------------------------------------------
### 4. FLIGHT WEIGHTS
###    S1 > S2 > S3 > S4
### ----------------------------------------------------------

# Multiplier applied to rating updates, keyed by flight code.
flight_weight = dict(S1=1.30, S2=1.15, S3=1.05, S4=1.00)

# Flight codes missing from the table (and missing values) map to NaN first,
# then fall back to a weight of 1.00.
singles_weight_lookup = sing["flight"].map(flight_weight)
sing["flight_weight"] = singles_weight_lookup.fillna(1.00)



### Rating Calculation Steps

In [6]:
### ----------------------------------------------------------
### 5. INITIALIZE RATINGS
### ----------------------------------------------------------

# Every distinct ID that appears as a winner or a loser.
singles_ids = sing[["winner1", "loser1"]].values
all_players = pd.unique(singles_ids.ravel("K"))

# Everyone starts from the same rating.
ratings = dict.fromkeys(all_players, 20.0)   # neutral starting point


### ----------------------------------------------------------
### 6. ELO FUNCTION
### ----------------------------------------------------------

def expected_score(rA, rB, scale=12.0):
    """Return the Elo-style expected score of side A against side B.

    Computes 1 / (1 + 10 ** ((rB - rA) / scale)).

    Args:
        rA: rating of side A.
        rB: rating of side B.
        scale: rating gap that corresponds to 10:1 odds. Defaults to 12, the
            value previously hard-coded here. A smaller scale makes the result
            react more sharply to a given rating gap.

    Returns:
        A value in (0, 1): 0.5 for equal ratings, 10/11 when A leads by
        exactly `scale` points. expected_score(a, b) + expected_score(b, a)
        equals 1 up to floating-point rounding.
    """
    return 1 / (1 + 10 ** ((rB - rA) / scale))


K = 6.0   # base update size


### ----------------------------------------------------------
### 7. ITERATIVE TRAINING
### ----------------------------------------------------------

N_EPOCHS = 12        # number of full passes over the match list
MARGIN_COEF = 0.10   # extra update weight per game of absolute net margin

# A match's step size depends only on its flight weight and margin, never on
# the ratings, so it is computed once here instead of once per row per epoch.
# NOTE(review): abs() means a negative net margin enlarges the update exactly
# like a positive one of the same size -- confirm that is intended.
singles_steps = K * sing["flight_weight"] * (1 + sing["margin"].abs() * MARGIN_COEF)

# Materialize plain (winner, loser, step) tuples once; iterating these is far
# cheaper than rebuilding a Series per row with iterrows() on every epoch.
singles_matches = list(zip(sing["winner1"], sing["loser1"], singles_steps))

for epoch in range(N_EPOCHS):
    for winner_id, loser_id, step in singles_matches:
        # Expected score of the actual winner, from the ratings as they stand
        # before this match is applied.
        Ew = expected_score(ratings[winner_id], ratings[loser_id])

        # Zero-sum update: the winner gains exactly what the loser gives up.
        shift = step * (1 - Ew)
        ratings[winner_id] += shift
        ratings[loser_id] -= shift


### ----------------------------------------------------------
### 8. NORMALIZE TO RANGE 5–39
### ----------------------------------------------------------

rating_series = pd.Series(ratings)

# Extremes of the raw ratings; scale_5_39 reads these two module-level names.
min_r, max_r = rating_series.min(), rating_series.max()

def scale_5_39(x, lo=None, hi=None):
    """Linearly map x from the interval [lo, hi] onto [5, 39].

    Args:
        x: raw rating to rescale.
        lo: raw value that maps to 5. When omitted, the module-level `min_r`
            is used.
        hi: raw value that maps to 39. When omitted, the module-level `max_r`
            is used.

    Returns:
        5 + (x - lo) * 34 / (hi - lo). When hi equals lo there is no spread to
        stretch, so the midpoint 22.0 is returned instead of dividing by zero.
    """
    lo = min_r if lo is None else lo
    hi = max_r if hi is None else hi

    span = hi - lo
    if span == 0:
        # All raw ratings identical: place everyone in the middle of the range.
        return 22.0
    return 5 + (x - lo) * (34 / span)

# Rescale every raw rating element by element.
scaled = rating_series.map(scale_5_39)


### ----------------------------------------------------------
### 9. BUILD FINAL OUTPUT TABLE
### ----------------------------------------------------------

# One row per player, highest rating first.
singles_table_unsorted = pd.DataFrame(
    {"playerID": scaled.index, "rating": scaled.values}
)
player_ratings = singles_table_unsorted.sort_values(by="rating", ascending=False)

# Write without the DataFrame index so the file holds only the two columns.
singles_out_path = MAIN_DIR + r"\player_ratings.csv"
player_ratings.to_csv(singles_out_path, index=False)

# Last expression of the cell: rendered as the cell output.
player_ratings.head(20)

Unnamed: 0,playerID,rating
75,163631,39.0
4,123420,37.108171
249,145172,37.057582
70,164183,36.752026
80,141982,36.271394
125,104618,35.204742
84,143078,34.785011
9,141640,34.604943
82,123263,34.27346
320,121899,34.163175


In [6]:
# Sanity report on the singles frame as it currently stands.
# NOTE(review): in document order this cell comes after the cleaning cell, so
# the missing-ID counts describe the already-filtered frame.
singles_report = [
    ("Rows loaded:", len(sing)),
    ("Rows with missing winner1:", sing["winner1"].isna().sum()),
    ("Rows with missing loser1:", sing["loser1"].isna().sum()),
    ("Sample winner1 values:", sing["winner1"].head()),
    ("Sample loser1 values:", sing["loser1"].head()),
    ("Unique flights:", sing["flight"].unique()),
]

for label, value in singles_report:
    print(label, value)


Rows loaded: 4081
Rows with missing winner1: 0
Rows with missing loser1: 0
Sample winner1 values: 0    143027
1    171234
2    141639
3     99031
4    123420
Name: winner1, dtype: int32
Sample loser1 values: 0    165951
1    143025
2     99025
3    141570
4     99367
Name: loser1, dtype: int32
Unique flights: ['S1' 'S2' 'S3' 'S4' 'S5' 'S7' 'S10' 'S6']


## Doubles

In [7]:
### ----------------------------------------------------------
### 1. LOAD DATA
### ----------------------------------------------------------

# Folder the match CSVs are read from and the ratings CSVs are written to.
MAIN_DIR = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

# Doubles match results; each row is later treated as one match.
doubles_csv_path = MAIN_DIR + r"\doubles_matches_clean.csv"
doub = pd.read_csv(doubles_csv_path)


In [8]:
### ----------------------------------------------------------
### 2. CLEAN ID FIELDS — drop any match with missing players
### ----------------------------------------------------------

# The four player-ID columns that must all hold a valid, positive integer.
id_cols = ["winner1", "winner2", "loser1", "loser2"]

# Convert to numeric safely (unparseable values become NaN)
for col in id_cols:
    doub[col] = pd.to_numeric(doub[col], errors="coerce")

# Drop rows where ANY doubles player is missing or invalid.
# NaN > 0 evaluates to False, so this single mask also drops missing IDs.
has_valid_ids = (doub[id_cols] > 0).all(axis=1)

# Filter and cast in one expression. astype() returns a new frame, so later
# column assignments operate on an independent object instead of on a filtered
# slice (the filter-then-assign sequence can raise SettingWithCopyWarning).
# An explicit "int64" avoids the platform-dependent width of plain `int`.
doub = doub.loc[has_valid_ids].astype({c: "int64" for c in id_cols})

In [9]:
### ----------------------------------------------------------
### 3. SCORE MARGIN (same method as singles)
### ----------------------------------------------------------

def score_to_margin(row):
    """Return the net margin of a match summed over set1, set2 and set3.

    Each set cell is expected to look like "<left>*<right>"; its contribution
    is int(left) - int(right). Cells that are not strings, or that contain no
    "*", contribute 0.

    Presumably <left> is the winning side's game count -- confirm against the CSV.

    Raises:
        ValueError: if a cell contains "*" but does not split into exactly two
            integer-parsable parts.
    """
    def set_margin(cell):
        # Anything that is not a "*"-separated string counts as no set played.
        if not (isinstance(cell, str) and "*" in cell):
            return 0
        left, right = cell.split("*")
        return int(left) - int(right)

    return sum(set_margin(row[set_col]) for set_col in ("set1", "set2", "set3"))

# Row-wise pass over the frame: one net margin per match.
doubles_margins = doub.apply(score_to_margin, axis=1)
doub["margin"] = doubles_margins


In [10]:
### ----------------------------------------------------------
### 4. FLIGHT WEIGHTS (same pattern as singles)
### ----------------------------------------------------------

# Multiplier applied to rating updates, keyed by flight code.
flight_weight = dict(D1=1.30, D2=1.15, D3=1.05, D4=1.00)

# Flight codes missing from the table (and missing values) map to NaN first,
# then fall back to a weight of 1.00.
doubles_weight_lookup = doub["flight"].map(flight_weight)
doub["flight_weight"] = doubles_weight_lookup.fillna(1.00)

### Start Doubles Rating Process

In [11]:
### ----------------------------------------------------------
### 5. INITIALIZE RATINGS
### ----------------------------------------------------------

# Collect every distinct individual who appears in any of the four player slots
doubles_ids = doub[["winner1", "winner2", "loser1", "loser2"]].values
all_players = pd.unique(doubles_ids.ravel("K"))

# Everyone starts from the same rating.
ratings = dict.fromkeys(all_players, 20.0)   # neutral starting point


### ----------------------------------------------------------
### 6. ELO EXPECTED SCORE FUNCTION
### ----------------------------------------------------------

def expected_score(rA, rB, scale=12.0):
    """Return the Elo-style expected score of side A against side B.

    Computes 1 / (1 + 10 ** ((rB - rA) / scale)).

    Args:
        rA: rating of side A.
        rB: rating of side B.
        scale: rating gap that corresponds to 10:1 odds. Defaults to 12, the
            value previously hard-coded here. A smaller scale makes the result
            react more sharply to a given rating gap.

    Returns:
        A value in (0, 1): 0.5 for equal ratings, 10/11 when A leads by
        exactly `scale` points. expected_score(a, b) + expected_score(b, a)
        equals 1 up to floating-point rounding.

    NOTE(review): the earlier comment here said 12 was chosen "because doubles
    spreads are smaller", but it is the same divisor the singles definition
    uses -- confirm whether doubles should have its own scale.
    """
    return 1 / (1 + 10 ** ((rB - rA) / scale))

K = 6.0   # base update size


### ----------------------------------------------------------
### 7. ITERATIVE TRAINING LOOP FOR DOUBLES
### ----------------------------------------------------------

N_EPOCHS = 12        # number of full passes over the match list
MARGIN_COEF = 0.10   # extra update weight per game of absolute net margin

# A match's update amount depends only on its flight weight and margin, never
# on the ratings, so it is computed once here instead of once per row per epoch.
# NOTE(review): abs() means a negative net margin enlarges the update exactly
# like a positive one of the same size -- confirm that is intended.
doubles_steps = K * doub["flight_weight"] * (1 + doub["margin"].abs() * MARGIN_COEF)

# Materialize plain tuples once; iterating these is far cheaper than
# rebuilding a Series per row with iterrows() on every epoch.
doubles_matches = list(
    zip(doub["winner1"], doub["winner2"], doub["loser1"], doub["loser2"], doubles_steps)
)

for epoch in range(N_EPOCHS):
    for w1, w2, l1, l2, delta in doubles_matches:

        # Team ratings = sum of individual ratings
        # NOTE(review): summing puts team gaps on a wider scale than individual
        # gaps while expected_score keeps the same divisor -- confirm intended.
        rw = ratings[w1] + ratings[w2]
        rl = ratings[l1] + ratings[l2]

        # Expected score of the winning team, from the ratings before this match
        Ew = expected_score(rw, rl)

        # Zero-sum team update, split evenly between the two partners:
        # each winner gains half of the team change, each loser gives up half.
        half_change = delta * (1 - Ew) / 2

        ratings[w1] += half_change
        ratings[w2] += half_change
        ratings[l1] -= half_change
        ratings[l2] -= half_change


### ----------------------------------------------------------
### 8. NORMALIZE RATINGS TO RANGE 5–39
### ----------------------------------------------------------

rating_series = pd.Series(ratings)

# Extremes of the raw ratings; scale_5_39 reads these two module-level names.
min_r = rating_series.min()
max_r = rating_series.max()

def scale_5_39(x, lo=None, hi=None):
    """Linearly map x from the interval [lo, hi] onto [5, 39].

    Args:
        x: raw rating to rescale.
        lo: raw value that maps to 5. When omitted, the module-level `min_r`
            is used.
        hi: raw value that maps to 39. When omitted, the module-level `max_r`
            is used.

    Returns:
        5 + (x - lo) * 34 / (hi - lo). When hi equals lo there is no spread to
        stretch, so the midpoint 22.0 is returned instead of dividing by zero.
    """
    lo = min_r if lo is None else lo
    hi = max_r if hi is None else hi

    span = hi - lo
    if span == 0:
        # All raw ratings identical: place everyone in the middle of the range.
        return 22.0
    return 5 + (x - lo) * (34 / span)

# Rescale every raw rating element by element.
scaled = rating_series.map(scale_5_39)


### ----------------------------------------------------------
### 9. BUILD FINAL OUTPUT TABLE
### ----------------------------------------------------------

# One row per player, highest rating first.
doubles_table_unsorted = pd.DataFrame(
    {"playerID": scaled.index, "rating": scaled.values}
)
player_ratings = doubles_table_unsorted.sort_values(by="rating", ascending=False)

# Write without the DataFrame index so the file holds only the two columns.
doubles_out_path = MAIN_DIR + r"\doubles_player_ratings.csv"
player_ratings.to_csv(doubles_out_path, index=False)

# Last expression of the cell: rendered as the cell output.
player_ratings.head(20)

Unnamed: 0,playerID,rating
983,121901,39.0
78,107511,37.750494
990,124077,37.537987
64,121899,36.214898
300,100762,35.684072
367,106168,35.60524
217,102363,35.492528
9,119281,35.316553
74,103807,35.250399
371,165255,34.464865


## Bring together ratings and names

In [12]:
import pandas as pd

BASE = r"C:\Users\toddw\Desktop\Python Rating Code and Files\GenAI_Rewrite"

# Load ratings files
sing_ratings = pd.read_csv(BASE + r"\player_ratings.csv")
doub_ratings = pd.read_csv(BASE + r"\doubles_player_ratings.csv")

# Convert rating IDs to numeric consistently
sing_ratings["playerID"] = pd.to_numeric(sing_ratings["playerID"], errors="coerce")
doub_ratings["playerID"] = pd.to_numeric(doub_ratings["playerID"], errors="coerce")

# Load the playerID -> fullname lookup and make it safe to join against:
#   * usecols reads just the two needed columns into a standalone frame
#   * IDs that cannot be parsed become NaN and are dropped (they could never
#     match a rated player), which lets the key be a clean int64
#   * repeated IDs keep only their first row, so the left merges below cannot
#     duplicate a rated player
players = (
    pd.read_csv(BASE + r"\girls_players.csv", usecols=["playerID", "fullname"])
    .assign(playerID=lambda df: pd.to_numeric(df["playerID"], errors="coerce"))
    .dropna(subset=["playerID"])
    .astype({"playerID": "int64"})
    .drop_duplicates(subset="playerID")
)

# Merge names into ratings. how="left" keeps every rated player even when no
# name is found; validate asserts the lookup really has one row per ID.
sing_with_names = sing_ratings.merge(
    players, on="playerID", how="left", validate="many_to_one"
)
doub_with_names = doub_ratings.merge(
    players, on="playerID", how="left", validate="many_to_one"
)

# Sort and show top 20
top20_sing = sing_with_names.sort_values("rating", ascending=False).head(20)
top20_doub = doub_with_names.sort_values("rating", ascending=False).head(20)

print("\n=== TOP 20 SINGLES PLAYERS ===\n")
print(top20_sing)

print("\n=== TOP 20 DOUBLES PLAYERS ===\n")
print(top20_doub)



=== TOP 20 SINGLES PLAYERS ===

    playerID     rating              fullname
0     163631  39.000000              Leah Lup
1     123420  37.108171           Raegan Farm
2     145172  37.057582          Kate Bonetto
3     164183  36.752026  Nikhitha Raji Suresh
4     141982  36.271394        Sofia Sorokina
5     104618  35.204742        Kennedy Harris
6     143078  34.785011        Alexis Uschold
7     141640  34.604943        Whitney Hedden
8     123263  34.273460         Sonya Drayton
9     121899  34.163175        Sabrina Larsen
10    140879  33.967050      Carlyta Barfield
11    124077  33.331987        Malavika Rahul
12    164184  33.196413           Vivienne Lu
13    119281  33.114099          Amanda Perez
14    143730  32.224519          Sitara Soman
15    122723  31.905471        Hayden Kearney
16    145001  31.571343       Elizabeth Sacks
17    123192  31.520223           Rosalyn Cho
18    101022  31.312867       Veronica Miller
19    123078  31.302610     Vasilisa Blinkova

