In [304]:
import pandas as pd 
import os 
import numpy as np
import re

In [305]:
INPUT_PATH = "/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/DataPreparation/Kenpom/KenpomDataCollectionFinal/"
OUTPUT_PATH = "/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/DataPreparation/Kenpom/KenPomCleanHeaders/"

In [306]:

def remove_rk_rows(df):
    rk_indices = df.index[df.iloc[:, 0] == "Rk"].tolist()
    if len(rk_indices) > 0:
        rk_indices_to_remove = [idx for idx in rk_indices if idx != rk_indices[0]]
    else:
        rk_indices_to_remove = []
    rows_to_remove = set(rk_indices_to_remove + [idx - 1 for idx in rk_indices_to_remove if idx - 1 >= 0])
    df_cleaned = df.drop(rows_to_remove).reset_index(drop=True)
    return df_cleaned

# Ensure the output directory exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Iterate over all CSV files in the input directory
for filename in os.listdir(INPUT_PATH):
    if filename.endswith(".csv"):  # Process only .csv files
        print(f"Processing file: {filename}")
        file_path = os.path.join(INPUT_PATH, filename)
        df = pd.read_csv(file_path)
        
        # Step 1: Clean the dataframe by removing the "Rk" rows
        cleaned_df = remove_rk_rows(df)
        cleaned_df = cleaned_df.iloc[:, [0, 1, 3, 4, 6, 8]]
        cleaned_df.columns = ['Rk','Team','NetRtg','ORtg','DRtg','AdjT']
        cleaned_df = cleaned_df.drop(0, axis=0)


    if cleaned_df.shape[1] > 1 and cleaned_df.iloc[:, 1].dtype == 'object':  # Check if it's a string
        def remove_last_part_if_number(string):
            parts = string.split(' ')  # Split the string into parts
            if parts[-1].isdigit():  # Check if the last part is a number
                return ' '.join(parts[:-1])  # Return the string without the last part
            else:
                return string  # Return the string unchanged if the last part is not a number
        
        cleaned_df.iloc[:, 1] = cleaned_df.iloc[:, 1].apply(remove_last_part_if_number)


        

        cleaned_df.rename(columns={'Team': 'TeamName'}, inplace=True)
        
        # Save the cleaned DataFrame to a new file
        filename = "ColumnCleaned_" + filename
        output_path = os.path.join(OUTPUT_PATH, filename)
        cleaned_df.to_csv(output_path, index=False)

print("Processing complete. Cleaned files saved to the output directory.")


Processing file: Kenpom_2021_03_19.csv
Processing file: Kenpom_2022_03_17.csv
Processing file: Kenpom_2019_03_21.csv
Processing file: Kenpom_2025-03-19.csv
Processing file: Kenpom_2023_03_16.csv
Processing file: Kenpom_2018_03_15.csv
Processing file: Kenpom_2024_03_21.csv
Processing complete. Cleaned files saved to the output directory.


In [307]:
CLEAN_PATH = "/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/DataPreparation/Kenpom/KenPomCleanHeaders/"


all_dfs = []

for filename in os.listdir(CLEAN_PATH):
    if '.DS_Store' in filename:
        continue
    filepath = os.path.join(CLEAN_PATH, filename)
    
    if os.path.isfile(filepath):
        temp = pd.read_csv(filepath)
        temp['Season'] = re.search(r'\d{4}', filename).group()
        print(filename, "shape:", temp.shape)
        all_dfs.append(temp)

Kenpom_stacked = pd.concat(all_dfs, ignore_index=True)
print('Final Shape:',Kenpom_stacked.shape)

Kenpom_stacked = Kenpom_stacked.sort_values(by=["Season", "Rk"], ascending=[False, True])
Kenpom_stacked.head()

ColumnCleaned_Kenpom_2023_03_16.csv shape: (363, 7)
ColumnCleaned_Kenpom_2025-03-19.csv shape: (364, 7)
ColumnCleaned_Kenpom_2019_03_21.csv shape: (353, 7)
ColumnCleaned_Kenpom_2022_03_17.csv shape: (358, 7)
ColumnCleaned_Kenpom_2021_03_19.csv shape: (357, 7)
ColumnCleaned_Kenpom_2024_03_21.csv shape: (362, 7)
ColumnCleaned_Kenpom_2018_03_15.csv shape: (351, 7)
Final Shape: (2508, 7)


Unnamed: 0,Rk,TeamName,NetRtg,ORtg,DRtg,AdjT,Season
363,1,Duke,38.25,128.0,89.8,65.8,2025
364,2,Florida,36.19,128.6,92.5,69.6,2025
365,3,Houston,35.41,123.2,87.8,61.5,2025
366,4,Auburn,35.11,128.5,93.4,67.6,2025
367,5,Tennessee,31.13,120.3,89.1,63.7,2025


In [308]:
def replace_names(df, column_name, replacements):
    df[column_name] = df[column_name].map(replacements).fillna(df[column_name])
    return df


replacements = {

    'Coastal Carolina': 'Coastal Car',
    'Western Carolina': 'W Carolina',
    'Eastern Washington': 'E Washington',
    'Western Illinois': 'W Illinois',
    'Nebraska Omaha': 'NE Omaha',
    'The Citadel': 'Citadel',
    'FIU': 'Florida Intl',
    'San Diego St.': 'San Diego St',
    'Florida Atlantic': 'FL Atlantic',
    'Purdue Fort Wayne': 'PFW',
    'Arkansas Pine Bluff': 'Ark Pine Bluff',
    'North Carolina Central': 'NC Central',
    'Abilene Christian': 'Abilene Chr',
    'Charleston Southern': 'Charleston So',
    'Illinois Chicago': 'IL Chicago',
    'George Washington': 'G Washington',
    "Texas A&M Commerce": "TX A&M Commerce",
    "Saint Mary's": "St Mary's CA",
    "Georgia Southern": "Ga Southern",
    "Mississippi Valley St.": "MS Valley St",
    "East Tennessee St": "ESTU",
    "UMass Lowell": "MA Lowell",
    "Grambling St": "Grambling",
    "Fairleigh Dickinson": "F Dickinson",
    "USC Upstate": "SC Upstate",
    "Eastern Illinois": "E Illinois",
    "South Carolina St": "S Carolina St",
    "Louisiana Monroe": "la-monroe",
    "Tennessee Martin": "TN Martin",
    "Kennesaw St": "Kennesaw",
    "UT Rio Grande Valley": "UTRGV",
    "Texas A&M Corpus Chris": "TAM C. Christi",
    "North Carolina A&T": "NC A&T",
    "Texas Southern": "TX Southern",
    "Saint Louis": "St Louis",
    "Little Rock": "Ark Little Rock",
    "Eastern Michigan": "E Michigan",
    "Sacramento St": "CS Sacramento",
    "College of Charleston": "Col Charleston",
    "Fort Wayne": "PFW",
    "Milwaukee": "WI Milwaukee",
    "Cal St. Bakersfield": "Bakersfield",
    "Loyola Marymount": "Loy Marymount",
    "Louisiana Lafayette": "Louisiana",
    "St Thomas": "St Thomas MN",
    "LIU": "LIU Brooklyn",
    "Cal St Northridge": "CS Northridge",
    "Green Bay": "WI Green Bay",
    "Maryland Eastern Shore": "MD E Shore",
    "Houston Baptist": "Houston Chr",
    "Houston Christian": "Houston Chr",
    "American": "American Univ",
    "Prairie View A&M": "Prairie View",
    "Loyola Chicago": "Loyola-Chicago",
    "Mount St Mary's": "Mt St Mary's",
    "Monmouth": "Monmouth NJ",
    "Western Michigan": "W Michigan",
    "Florida Gulf Coast": "FL Gulf Coast",
    "Saint Joseph's": "St Joseph's PA",
    "Western Kentucky": "WKU",
    "Southeast Missouri": "se missouri st",
    "Southeast Missouri St.": "se missouri st",
    "Cal St Fullerton": "CS Fullerton",
    "Central Connecticut": "Central Conn",
    "South Dakota St": "S Dakota St",
    "Central Michigan": "C Michigan",
    "Stephen F. Austin": "SF Austin",
    "UTSA": "UT San Antonio",
    "Bethune Cookman": "Bethune-Cookman",
    "Northern Kentucky": "N Kentucky",
    "Middle Tennessee": "MTSU",
    "Eastern Kentucky": "E Kentucky",
    "Central Arkansas": "Cent Arkansas",
    "Southern": "Southern Univ",
    "Southern Illinois": "S Illinois",
    "N.C. State": "NC State",
    "Queens": "Queens NC",
    "Boston University": "Boston Univ",
    "Northern Illinois": "N Illinois",
    "Detroit Mercy": "Detroit",
    "Albany": "SUNY Albany",
    "Northern Colorado": "N Colorado",
    "Saint Peter's": "St Peter's",
    "Southeastern Louisiana": "SE Louisiana",
    "Kent St": "Kent",
    "Saint Francis": "St Francis PA"

}

Kenpom_stacked = replace_names(Kenpom_stacked, 'TeamName', replacements)


In [309]:

M_teams = pd.read_csv('/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/DataPreparation/CompData/march-machine-learning-mania-2025/MTeams.csv')
spellings = pd.read_csv('/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/DataPreparation/CompData/march-machine-learning-mania-2025/MTeamSpellings.csv')
spellings = spellings.rename(columns={"TeamNameSpelling": "TeamName"})
Kenpom_stacked['TeamName'] = Kenpom_stacked['TeamName'].str.lower()
spellings['TeamName'] = spellings['TeamName'].str.lower()
final_kenpom = Kenpom_stacked.merge(spellings, on='TeamName', how='left')
mask = final_kenpom['TeamID'].isnull() & final_kenpom['TeamName'].str.contains(r'st\.', regex=True)
final_kenpom.loc[mask, 'TeamName'] = final_kenpom.loc[mask, 'TeamName'].str.replace(r'st\.', 'st', regex=True)
final_kenpom = final_kenpom.drop(columns='TeamID')
final_kenpom = final_kenpom.merge(spellings, on='TeamName', how='left')
missing = final_kenpom[final_kenpom['TeamID'].isnull()]
print(missing)


Empty DataFrame
Columns: [Rk, TeamName, NetRtg, ORtg, DRtg, AdjT, Season, TeamID]
Index: []


In [310]:
names = final_kenpom[final_kenpom['TeamID'].isnull()]['TeamName']
names = names.tolist()
names

[]

In [311]:

final_kenpom.to_csv("/Users/jimmyhe/Desktop/ML/KaggleCompetitions/NCAA/Train_Set/MEN/Kenpom_final.csv")

In [312]:
counts = final_kenpom['TeamID'].value_counts()
numbers_with_6_occurrences = counts[counts == 6].index.tolist()

counts

TeamID
1181    7
1343    7
1262    7
1283    7
1436    7
       ..
1475    3
1478    2
1366    2
1480    1
1479    1
Name: count, Length: 367, dtype: int64

In [313]:
len(numbers_with_6_occurrences)


2