In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the four CSV inputs in one pass; the names on the left line up, in order,
# with the paths listed below.
historicalTourneySeeds, seeds24, mRegDetail, tourneyCompact = (
    pd.read_csv(csvPath)
    for csvPath in (
        'data/MNCAATourneySeeds.csv',
        'data/2024_tourney_seeds.csv',
        'data/MRegularSeasonDetailedResults.csv',
        'data/MNCAATourneyCompactResults.csv',
    )
)
display(mRegDetail)

In [None]:
# Show the column names of the regular-season detailed results as an array.
regSeasonColumnNames = mRegDetail.columns.values
display(regSeasonColumnNames)

In [None]:
# Empty frames that later cells fill with one row per game, once from each side.
winTeams = pd.DataFrame()
LoseTeams = pd.DataFrame()

# Box-score stats recorded for a team; each one also appears with an 'Opp' prefix.
boxScoreStats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
                 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']

# Team-neutral column names: game info, then the team's stats, then the opponent's.
columns = (
    ['Season', 'TeamID', 'Score', 'OppScore', 'Loc', 'NumOT']
    + boxScoreStats
    + ['Opp' + statName for statName in boxScoreStats]
)

In [None]:
# Stat suffixes shared by the winner ('W…') and loser ('L…') columns of mRegDetail.
boxStatNames = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
                'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
winnerStatCols = ['W' + statName for statName in boxStatNames]
loserStatCols = ['L' + statName for statName in boxStatNames]

# Winner's view of each game: own stats come from the W columns, opponent stats from the L columns.
winTeams[columns] = mRegDetail[
    ['Season', 'WTeamID', 'WScore', 'LScore', 'WLoc', 'NumOT']
    + winnerStatCols
    + loserStatCols
]
winTeams['Wins'] = 1
winTeams['Losses'] = 0

# Loser's view of the same games: the L and W columns swap roles.
# 'Loc' is filled from 'WLoc' here, so at this point it still holds the winner's location.
LoseTeams[columns] = mRegDetail[
    ['Season', 'LTeamID', 'LScore', 'WScore', 'WLoc', 'NumOT']
    + loserStatCols
    + winnerStatCols
]

def changeLocation(location):
    """Swap a location code to the other side's value.

    'H' becomes 'A' and 'A' becomes 'H'; every other value yields 'N'.
    """
    for ownSide, otherSide in (('H', 'A'), ('A', 'H')):
        if location == ownSide:
            return otherSide
    return 'N'
    
# Every row in the loser table counts as one loss and no win.
LoseTeams['Wins'] = 0
LoseTeams['Losses'] = 1

# 'Loc' was filled from the winner's location, so swap it to the loser's side.
LoseTeams['Loc'] = LoseTeams['Loc'].map(changeLocation)

# Stack both perspectives: each game now contributes one row per participating team.
mAllRegDetail = pd.concat([winTeams, LoseTeams])

In [None]:
# Sum the numeric columns per (Season, TeamID), then derive games played
# from the summed win and loss counts.
seasonDetails = (
    mAllRegDetail
    .groupby(['Season', 'TeamID'])
    .sum(numeric_only=True)
    .assign(NumGames=lambda totals: totals['Wins'] + totals['Losses'])
)
display(seasonDetails.columns.values)

In [None]:
# Per-(Season, TeamID) totals and games played, named once for the formulas below.
totals = seasonDetails
gamesPlayed = totals['NumGames']

# One feature per key; dict order fixes the column order of the resulting frame.
mRegSeasonInput = pd.DataFrame({
    # Features not found already in the dataset
    'WinRatio': totals['Wins'] / gamesPlayed,
    'PointsPerGame': totals['Score'] / gamesPlayed,
    'PointsAllowedPerGame': totals['OppScore'] / gamesPlayed,
    'PointsRatio': totals['Score'] / totals['OppScore'],
    'OTsPerGame': totals['NumOT'] / gamesPlayed,

    # Field goals: made per game, made/attempted, and allowed per game
    'FGMPerGame': totals['FGM'] / gamesPlayed,
    'FGMRatio': totals['FGM'] / totals['FGA'],
    'FGAllowedPerGame': totals['OppFGM'] / gamesPlayed,

    # Same three measures for the FGM3 / FGA3 columns
    'FG3MPerGame': totals['FGM3'] / gamesPlayed,
    'FG3MRatio': totals['FGM3'] / totals['FGA3'],
    'FG3AllowedPerGame': totals['OppFGM3'] / gamesPlayed,

    # Same three measures for the FTM / FTA columns
    'FTMPerGame': totals['FTM'] / gamesPlayed,
    'FTMRatio': totals['FTM'] / totals['FTA'],
    'FTAllowedPerGame': totals['OppFTM'] / gamesPlayed,

    # Share of OR (resp. DR) out of the team's own count plus the opponent's opposite count
    'ORRatio': totals['OR'] / (totals['OR'] + totals['OppDR']),
    'DRRatio': totals['DR'] / (totals['DR'] + totals['OppOR']),

    # Remaining box-score stats as plain per-game averages
    'AstPerGame': totals['Ast'] / gamesPlayed,
    'TOPerGame': totals['TO'] / gamesPlayed,
    'StlPerGame': totals['Stl'] / gamesPlayed,
    'BlkPerGame': totals['Blk'] / gamesPlayed,
    'PFPerGame': totals['PF'] / gamesPlayed,
})

display(mRegSeasonInput, mRegSeasonInput.describe())

In [None]:
# Compare teams in previous tournaments
seedDict = historicalTourneySeeds.set_index(['Season', 'TeamID'])

# Each tournament game is listed twice: once with the winner as Team1 (Result 1)
# and once with the loser as Team1 (Result 0).
winnersTourney = pd.DataFrame()
winnersTourney[['Season', 'Team1', 'Team2']] = tourneyCompact[['Season', 'WTeamID', 'LTeamID']]
winnersTourney['Result'] = 1

lossersTourney = pd.DataFrame()
lossersTourney[['Season', 'Team1', 'Team2']] = tourneyCompact[['Season', 'LTeamID', 'WTeamID']]
lossersTourney['Result'] = 0

tourneyInput = pd.concat([winnersTourney, lossersTourney])
tourneyInput = tourneyInput[tourneyInput['Season'] >= 2003].reset_index(drop=True)


def _seedNumber(seedCode):
    """Return the integer seed embedded in a seed code string.

    The first character is always dropped; a four-character code also drops
    its last character (e.g. 'W01' -> 1, 'X16a' -> 16).
    """
    digits = seedCode[1:-1] if len(seedCode) == 4 else seedCode[1:]
    return int(digits)


def _lookupSeeds(teamColumn):
    """Return the seed numbers for tourneyInput[teamColumn], one per row, as a list.

    Raises KeyError if any (Season, team) pair has no entry in seedDict.
    """
    # First (only expected) value column of seedDict, keyed by (Season, TeamID).
    seedCodes = seedDict.iloc[:, 0]
    # One index-based lookup for the whole column instead of one .loc call per row.
    keys = pd.MultiIndex.from_frame(tourneyInput[['Season', teamColumn]])
    found = seedCodes.reindex(keys)
    isMissing = found.isna().to_numpy()
    if isMissing.any():
        missingKeys = keys[isMissing].unique().tolist()
        raise KeyError(f'No tournament seed for (Season, TeamID): {missingKeys}')
    return [_seedNumber(code) for code in found]


team1Seeds = _lookupSeeds('Team1')
team2Seeds = _lookupSeeds('Team2')

tourneyInput['Team1Seed'] = team1Seeds
tourneyInput['Team2Seed'] = team2Seeds

display(tourneyInput)

In [None]:
# One Series per tourneyInput row: Team1's regular-season features (plus seed)
# minus Team2's, with the game's Result attached as the last entry.
outscores = []

gameRows = zip(
    tourneyInput['Season'], tourneyInput['Team1'], tourneyInput['Team2'],
    tourneyInput['Team1Seed'], tourneyInput['Team2Seed'], tourneyInput['Result'],
)

for season, team1, team2, seed1, seed2, result in gameRows:
    # .copy() gives each team its own Series, so adding 'Seed' never writes to
    # (or warns about) a row that may still be tied to mRegSeasonInput.
    team1Score = mRegSeasonInput.loc[(season, team1)].copy()
    team1Score['Seed'] = seed1

    team2Score = mRegSeasonInput.loc[(season, team2)].copy()
    team2Score['Seed'] = seed2

    outscore = team1Score - team2Score
    outscore['Result'] = result
    outscores.append(outscore)

In [None]:
# Turn the list of per-game difference Series into a frame (one row per entry),
# then show the frame followed by its summary statistics.
outscores = pd.DataFrame(outscores)
display(outscores, outscores.describe())

In [None]:
# Pairwise correlations between all columns, rounded to two decimals.
corrs = outscores.corr().round(2)

# Strength (sign dropped) of each column's correlation with the game result.
display(corrs['Result'].abs())

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corrs, ax=ax)
plt.show()

In [None]:
# Features are every column except the last ('Result'), which is the label.
X = outscores[outscores.columns[:-1]].values
y = outscores['Result'].values

# Split the data
# Each game contributes two rows that are exact mirrors of each other (Team1/Team2
# swapped, features negated, Result flipped). Shuffling individual rows would put
# one mirror in train and the other in test, leaking the answer, so whole games
# are held out instead.
# Assumes `outscores` keeps tourneyInput's row order: all winner-as-Team1 rows
# first, then the same games in the same order with the loser as Team1, so
# rows i and i + numGames describe the same game.
numGames, leftover = divmod(len(X), 2)
if leftover:
    raise ValueError('Expected an even number of rows (two mirrored rows per game).')

np.random.seed(1)
gameOrder = np.random.permutation(numGames)

# Hold out 20% of the games. A positive split point avoids the slice-by-negative-zero
# trap where a truncated count of 0 would leave the training set empty.
numTestGames = int(.2 * numGames)
splitAt = numGames - numTestGames
trainGames, testGames = gameOrder[:splitAt], gameOrder[splitAt:]

train_idx = np.concatenate([trainGames, trainGames + numGames])
test_idx = np.concatenate([testGames, testGames + numGames])
idx = np.concatenate([train_idx, test_idx])  # full row order, train rows first

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

#Scale the data
# Min-max statistics come from the training rows only and are reused for the test rows.
mins = X_train.min(axis=0)
maxs = X_train.max(axis=0)

X_train = (X_train - mins) / (maxs - mins)
X_test = (X_test - mins) / (maxs - mins)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Fit a random forest (fixed seed) on the training rows; fit() returns the fitted estimator.
model = RandomForestClassifier(random_state=1).fit(X_train, y_train)

# Mean accuracy on the held-out rows, shown as the cell output.
model.score(X_test, y_test)

In [None]:
# Keep only the 2024 seed rows whose 'Tournament' column equals 'M'.
tournamentIsM = seeds24['Tournament'] == 'M'
mTourneySeeds24 = seeds24[tournamentIsM]
display(mTourneySeeds24)

In [None]:
# Regular-season features for the 2024 season only, shown with summary statistics.
mRegSeasonInput24 = mRegSeasonInput.loc[2024]
display(mRegSeasonInput24, mRegSeasonInput24.describe())

In [None]:
# Convert each seed code to its number by taking the first run of digits
# (e.g. 'W01' -> 1, 'X16a' -> 16). Done on the whole column at once, so it does not
# depend on the frame's row labels and does not write through a filtered slice.
# astype(str) keeps the cell re-runnable after 'Seed' has already become numeric.
seedNumbers24 = (
    mTourneySeeds24['Seed']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)  # raises if any code contains no digits
)

# Rebind to a new frame instead of assigning into the slice taken from seeds24.
mTourneySeeds24 = mTourneySeeds24.assign(Seed=seedNumbers24)

mTourneySeedDict24 = mTourneySeeds24.set_index('TeamID')['Seed'].to_dict()
print(len(mTourneySeedDict24))
print(mTourneySeedDict24)

In [None]:
# Pair every team against every other team for 2024
teams = list(mTourneySeedDict24.keys())

# Each team is matched only with the teams listed after it, so every
# unordered pairing appears exactly once.
pairs = [
    (firstTeam, mTourneySeedDict24[firstTeam], secondTeam, mTourneySeedDict24[secondTeam])
    for position, firstTeam in enumerate(teams)
    for secondTeam in teams[position + 1:]
]

tourneyInput24 = pd.DataFrame(pairs, columns=['Team1', 'Seed1', 'Team2', 'Seed2'])
print(len(tourneyInput24))

In [None]:
# One Series per 2024 pairing: Team1's regular-season features (plus seed) minus Team2's.
outScores24 = []

# Slice the 2024 season once; the result is indexed by TeamID alone, so the
# per-team lookups below cannot be mistaken for a (row, column) selection.
seasonStats24 = mRegSeasonInput.loc[2024]

pairRows = zip(
    tourneyInput24['Team1'], tourneyInput24['Seed1'],
    tourneyInput24['Team2'], tourneyInput24['Seed2'],
)

for team1ID, seed1, team2ID, seed2 in pairRows:
    # .copy() so adding 'Seed' works on a private Series rather than a selected row.
    team1Score = seasonStats24.loc[team1ID].copy()
    team2Score = seasonStats24.loc[team2ID].copy()

    team1Score['Seed'] = int(seed1)
    team2Score['Seed'] = int(seed2)

    outscore = team1Score - team2Score
    outScores24.append(outscore)

In [None]:
# Turn the list of per-pairing difference Series into a frame (one row per entry),
# then show the frame followed by its summary statistics.
outScores24 = pd.DataFrame(outScores24)
display(outScores24, outScores24.describe())

In [None]:
# All columns of the 2024 difference frame, as a plain array.
X24 = outScores24.to_numpy()

# Scale the new data
# Reuses the min/max computed from the training rows so 2024 features share their scale.
featureRange = maxs - mins
XScaled24 = (X24 - mins) / featureRange

predictions = model.predict(XScaled24)
tourneyInput24['PredictedResult'] = predictions

# First 100 pairings with their predicted result, shown as the cell output.
tourneyInput24.head(100)

In [None]:
# Add team names to predictions
teamSpellingsDf = pd.read_excel('data/MTeamSpellings.xlsx')

# One name per TeamID: the first spelling listed for that team, which is the same
# row that filtering on TeamID and taking .iloc[0] would pick.
firstSpellingByTeam = (
    teamSpellingsDf
    .drop_duplicates(subset='TeamID')
    .set_index('TeamID')['TeamNameSpelling']
)

# Fail loudly if any paired team has no spelling, rather than writing blank names.
teamsWithoutName = (
    set(tourneyInput24['Team1'])
    .union(tourneyInput24['Team2'])
    .difference(firstSpellingByTeam.index)
)
if teamsWithoutName:
    raise KeyError(f'No spelling found for TeamID(s): {sorted(teamsWithoutName)}')

# assign() builds a new frame, so tourneyInput24 is left untouched; a single
# Series.map per column replaces two table scans per row.
finalOutput24 = tourneyInput24.assign(
    Team1Spelling=tourneyInput24['Team1'].map(firstSpellingByTeam),
    Team2Spelling=tourneyInput24['Team2'].map(firstSpellingByTeam),
)

display(finalOutput24)
finalOutput24.to_csv('output.csv')