In [1]:
import pandas as pd 

In [4]:
# Season being processed; every season-specific path below is derived from it
# so that changing `year` cannot silently mix files from different seasons.
year = 2025
base_path = f"/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/DataPreparation/CompData/march-machine-learning-mania-{year}/"
feature_eng_path = "/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/Feature_Eng/"

# Competition files: regular-season game results, tournament seeds, team list
result_regular = pd.read_csv(base_path + "WRegularSeasonCompactResults.csv")
seed = pd.read_csv(base_path + "WNCAATourneySeeds.csv")
teams = pd.read_csv(base_path + "WTeams.csv")

# Elo ratings file; its name embeds the season, so build it from `year`
# instead of repeating the literal 2025.
elo_rating = pd.read_csv(feature_eng_path + f"mine_{year}_EloRating_womens_10.csv")


In [12]:

def create_features(team_id, result_regular, seed):
    """Summarise one team's games per season and attach its numeric seed.

    Parameters
    ----------
    team_id
        Value matched against ``WTeamID`` / ``LTeamID`` in ``result_regular``
        and against ``TeamID`` in ``seed``.
    result_regular : pd.DataFrame
        Game results with at least the columns ``Season``, ``WTeamID``,
        ``WScore``, ``LTeamID`` and ``LScore``.
    seed : pd.DataFrame
        Seed table with ``Season``, ``TeamID`` and a string ``Seed`` column;
        characters at positions 1-2 of ``Seed`` are parsed as the integer seed.

    Returns
    -------
    pd.DataFrame
        One row per season in which the team has games, with columns
        ``Season``, ``count`` (games), ``win_count``, ``win_rate``,
        ``gap_avg`` (mean score margin), the seed columns (``Seed``) and
        ``TeamID``. ``Seed`` is missing for seasons without a seed row.
    """
    # Seed rows belonging to this team, reduced to Season + integer Seed.
    team_seeds = seed.loc[seed["TeamID"] == team_id].copy()
    team_seeds["Seed"] = team_seeds["Seed"].str[1:3].astype(int)
    team_seeds = team_seeds.drop(columns=["TeamID"])

    # Re-express each game from this team's point of view: games it won
    # first, followed by games it lost, with a common column layout.
    won_mask = result_regular["WTeamID"] == team_id
    lost_mask = result_regular["LTeamID"] == team_id
    games_won = result_regular.loc[won_mask].rename(
        columns={
            "WTeamID": "TeamID",
            "WScore": "Score",
            "LTeamID": "Opp_TeamID",
            "LScore": "Opp_Score",
        }
    )
    games_lost = result_regular.loc[lost_mask].rename(
        columns={
            "LTeamID": "TeamID",
            "LScore": "Score",
            "WTeamID": "Opp_TeamID",
            "WScore": "Opp_Score",
        }
    )
    team_games = pd.concat([games_won, games_lost])

    # Score margin (positive means this team won) and a 0/1 win flag.
    margin = team_games["Score"] - team_games["Opp_Score"]
    team_games = team_games.assign(
        diff_score=margin,
        win=(margin > 0).astype(int),
    )

    # Collapse to one row per season.
    season_stats = (
        team_games.groupby("Season")
        .agg(
            count=("win", "size"),
            win_count=("win", "sum"),
            win_rate=("win", "mean"),
            gap_avg=("diff_score", "mean"),
        )
        .reset_index()
    )

    # Left join keeps seasons in which the team has no seed row.
    features = season_stats.merge(team_seeds, on="Season", how="left")
    features["TeamID"] = team_id

    return features

# Build the per-season feature table for every team listed in `teams`
res_list = [
    create_features(team_id, result_regular, seed)
    for team_id in teams["TeamID"]
]

# Stack the per-team tables into one DataFrame with a fresh integer index
res_df = pd.concat(res_list, ignore_index=True)

# Attach Elo ratings. The left join keeps team-seasons that have no rating.
# Rows are ordered newest season first, then by ascending seed.
df_feat = (
    res_df
    .merge(elo_rating, on=["TeamID", "Season"], how="left")
    .sort_values(by=["Season", "Seed"], ascending=[False, True])
)

# Sanity check for the configured season (uses `year` rather than a literal
# so it follows the data that was loaded): (team rows, rows with a seed)
season_rows = df_feat[df_feat["Season"] == year]
len(season_rows), len(season_rows[season_rows["Seed"].notnull()])



(362, 68)

In [18]:
# Restrict to seasons up to and including 2024 and preview the last rows
up_to_2024 = df_feat["Season"].le(2024)
my_peek = df_feat.loc[up_to_2024]
my_peek.tail()

Unnamed: 0,Season,count,win_count,win_rate,gap_avg,Seed,TeamID,Rating,TeamName
9258,1998,26,3,0.115385,-10.307692,,3459,,
9286,1998,25,7,0.28,-4.6,,3460,,
9314,1998,23,8,0.347826,-4.347826,,3461,,
9342,1998,27,16,0.592593,3.666667,,3462,,
9370,1998,24,13,0.541667,-1.666667,,3463,,


In [13]:
# Persist the merged feature table as CSV; the DataFrame index is not written
output_csv = "/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/Train_Set/WOMEN/flaty_elo_W.csv"
df_feat.to_csv(output_csv, index=False)


In [None]:
# Load the reference feature file and order it newest season first,
# then by ascending seed, before previewing the last rows.
BASE_PATH = "/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/Feature_Eng/Flaty_Data_Ref/"
flaty_W_elo_ref = (
    pd.read_csv(BASE_PATH + 'features_womens.csv')
    .sort_values(by=['Season', 'Seed'], ascending=[False, True])
)
flaty_W_elo_ref.tail()

Unnamed: 0,TeamID,TeamName,Rating,num,Season,count,win_count,win_rate,gap_avg,Seed
371,3474,Queens NC,50.0,372,2010,,,,,
372,3475,Southern Indiana,50.0,373,2010,,,,,
373,3476,Stonehill,50.0,374,2010,,,,,
374,3477,TX A&M Commerce,50.0,375,2010,,,,,
375,3478,Le Moyne,50.0,376,2010,,,,,
