In [None]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

Read in the Kaggle datasets and clean them up

In [None]:
# Load the compact tournament results, keep seasons from 2010 onward, drop the
# columns that are not used later, and renumber the rows from zero.
tourney_results = (
    pd.read_csv('/content/MNCAATourneyCompactResults.csv')
    .loc[lambda games: games['Season'] >= 2010]
    .drop(columns=['DayNum', 'WLoc', 'NumOT'])
    .reset_index(drop=True)
)
tourney_results.head()

Unnamed: 0,Season,WTeamID,WScore,LTeamID,LScore
0,2010,1115,61,1457,44
1,2010,1124,68,1358,59
2,2010,1139,77,1431,59
3,2010,1140,99,1196,92
4,2010,1242,90,1250,74


In [None]:
seeds = pd.read_csv('/content/MNCAATourneySeeds.csv')
# .copy() makes the filtered rows an independent frame, so the column
# assignment below does not write into a slice of the full table
# (which triggers pandas' SettingWithCopyWarning).
seeds = seeds[seeds['Season'] >= 2010].copy()

# convert seeds to usable numbers
# Characters 1-2 of the seed string hold the numeric seed; character 0 and any
# trailing fourth character are discarded (presumably a region letter and a
# play-in suffix, e.g. 'W01' / 'X16a' -- confirm against the data).
seeds['Seed'] = seeds['Seed'].str[1:3].astype(int)

seeds = seeds.reset_index(drop=True)
seeds.tail()

Unnamed: 0,Season,Seed,TeamID
740,2021,12,1457
741,2021,13,1317
742,2021,14,1159
743,2021,15,1331
744,2021,16,1216


In [None]:
ratings = pd.read_csv('/content/MMasseyOrdinals.csv')

# Ranking systems whose ordinals are used for prediction.
systems = ['POM', 'SAG', 'MOR', 'COL', 'MAS', 'LMC']
wanted_rows = (ratings['Season'] >= 2010) & ratings['SystemName'].isin(systems)
ratings = ratings[wanted_rows]


def latest_ordinal_rank(group):
  """Return the OrdinalRank entries recorded on the group's latest RankingDayNum."""
  is_latest = group['RankingDayNum'] == group['RankingDayNum'].max()
  return group.loc[is_latest]['OrdinalRank']


# One group per (season, system, team); keep only the most recent ranking.
# The result is a Series indexed by Season, SystemName, TeamID and the row
# label carried over from the CSV.
ratings = ratings.groupby(['Season', 'SystemName', 'TeamID']).apply(latest_ordinal_rank)

ratings.head()

Season  SystemName  TeamID         
2010    COL         1102    1240945    229
                    1103    1240946     77
                    1104    1240947     88
                    1105    1240948    323
                    1106    1240949    274
Name: OrdinalRank, dtype: int64

In [None]:
# ratings test: look up Gonzaga's (TeamID 1211) most recent KenPom (POM) rank for the 2021 season.
# NOTE(review): this comment originally said 2022, but the lookup below uses season 2021 -- confirm which was intended.
print(ratings[2021]['POM'][1211])

4333821    1
Name: OrdinalRank, dtype: int64


Build new data set that has Pomeroy, Sagarin, Moore, Colley, Massey, LRMC, and Seed for each team in each matchup of each year

In [None]:
def seed_from_season_team(season, team):
  """Select the Seed entries in `seeds` for `team` in `season`.

  Returns a Series (the matching slice of the 'Seed' column, still carrying
  its row labels from `seeds`), not a bare number.
  """
  is_match = (seeds['Season'] == season) & (seeds['TeamID'] == team)
  return seeds.loc[is_match, 'Seed']

def rating_from_season_team(season, team, system):
  """Look up `team`'s entry in `ratings` for the given season and ranking system.

  Drills into the `ratings` index one level at a time: season, then system,
  then team. Whatever remains after those three selections is returned.
  """
  season_ratings = ratings[season]
  system_ratings = season_ratings[system]
  return system_ratings[team]

def collapse_nans(df):
  """Reduce each row of `df` to its first non-null value.

  Returns a Series aligned to `df.index`; a row with no non-null value
  comes back as NaN.
  """
  long_form = df.stack()
  first_per_row = long_form.groupby(level=0).first()
  return first_per_row.reindex(df.index)

def generate_ratings_column(system, team_col='WTeamID'):
  """Build a per-game column of `system` rankings for one side of each matchup.

  Args:
    system: ranking system name as it appears in `ratings` (e.g. 'POM').
    team_col: column of `tourney_results` holding the team IDs to look up.
      Defaults to 'WTeamID' (the winning team), which is the original
      behaviour; pass 'LTeamID' to get the losing team's rankings instead.

  Returns:
    The per-game lookups passed through collapse_nans, i.e. one value per
    row of `tourney_results`.
  """
  lookups = tourney_results[['Season', team_col]].apply(
      lambda game: rating_from_season_team(game['Season'], game[team_col], system),
      axis=1,
  )
  # Each lookup keeps its own row label from `ratings`, so the row-wise apply
  # is collapsed back down to a single value per game.
  return collapse_nans(lookups)

In [None]:
# Look up the winning team's seed for every game, then collapse the row-wise
# lookups down to one seed per game.
SEED = tourney_results[['Season', 'WTeamID']].apply(lambda x: seed_from_season_team(x['Season'], x['WTeamID']), axis=1)
SEED = collapse_nans(SEED)
# DataFrame.insert raises ValueError when the column already exists, so only
# insert on the first run; this keeps the cell safe to re-execute.
if 'WSeed' not in tourney_results.columns:
  tourney_results.insert(2, 'WSeed', SEED)
tourney_results.head()

Unnamed: 0,Season,WTeamID,WSeed,WScore,LTeamID,LScore
0,2010,1115,16.0,61,1457,44
1,2010,1124,3.0,68,1358,59
2,2010,1139,5.0,77,1431,59
3,2010,1140,7.0,99,1196,92
4,2010,1242,1.0,90,1250,74


In [None]:
# Add one winner-side column per ranking system. Every column is inserted at
# position 2, so walking `systems` in its defined order (POM ... LMC) leaves
# the columns laid out as WLMC, WMAS, WCOL, WMOR, WSAG, WPOM, WSeed.
for system in systems:
  column = 'W' + system
  # DataFrame.insert raises ValueError on an existing column; skipping it
  # keeps the cell safe to re-run.
  if column not in tourney_results.columns:
    tourney_results.insert(2, column, generate_ratings_column(system))

tourney_results.head()

Unnamed: 0,Season,WTeamID,WLMC,WMAS,WCOL,WMOR,WSAG,WPOM,WSeed,WScore,LTeamID,LScore
0,2010,1115,198.0,232.0,215.0,269.0,239.0,238.0,16.0,61,1457,44
1,2010,1124,13.0,11.0,13.0,11.0,8.0,12.0,3.0,68,1358,59
2,2010,1139,15.0,13.0,11.0,24.0,22.0,26.0,5.0,77,1431,59
3,2010,1140,4.0,14.0,17.0,14.0,9.0,7.0,7.0,99,1196,92
4,2010,1242,1.0,1.0,1.0,1.0,1.0,2.0,1.0,90,1250,74


Next, add the same stats for the losing team.

In [None]:
# Look up the losing team's seed for every game, then collapse the row-wise
# lookups down to one seed per game.
LSEED = tourney_results[['Season', 'LTeamID']].apply(lambda x: seed_from_season_team(x['Season'], x['LTeamID']), axis=1)
LSEED = collapse_nans(LSEED)
# DataFrame.insert raises ValueError when the column already exists, so only
# insert on the first run; this keeps the cell safe to re-execute.
if 'LSeed' not in tourney_results.columns:
  tourney_results.insert(11, 'LSeed', LSEED)
tourney_results.head()

Unnamed: 0,Season,WTeamID,WLMC,WMAS,WCOL,WMOR,WSAG,WPOM,WSeed,WScore,LTeamID,LSeed,LScore
0,2010,1115,198.0,232.0,215.0,269.0,239.0,238.0,16.0,61,1457,16.0,44
1,2010,1124,13.0,11.0,13.0,11.0,8.0,12.0,3.0,68,1358,14.0,59
2,2010,1139,15.0,13.0,11.0,24.0,22.0,26.0,5.0,77,1431,12.0,59
3,2010,1140,4.0,14.0,17.0,14.0,9.0,7.0,7.0,99,1196,10.0,92
4,2010,1242,1.0,1.0,1.0,1.0,1.0,2.0,1.0,90,1250,16.0,74


In [None]:
def generate_Lratings_column(system):
  """Build a per-game column of `system` rankings for the losing team (LTeamID)."""
  def lookup(game):
    return rating_from_season_team(game['Season'], game['LTeamID'], system)

  per_game = tourney_results[['Season', 'LTeamID']].apply(lookup, axis=1)
  return collapse_nans(per_game)

In [None]:
# Add one loser-side column per ranking system. Every column is inserted at
# position 11 (directly after LTeamID), so walking `systems` in its defined
# order (POM ... LMC) leaves them as LLMC, LMAS, LCOL, LMOR, LSAG, LPOM, LSeed.
for system in systems:
  column = 'L' + system
  # DataFrame.insert raises ValueError on an existing column; skipping it
  # keeps the cell safe to re-run.
  if column not in tourney_results.columns:
    tourney_results.insert(11, column, generate_Lratings_column(system))

tourney_results.head()

Unnamed: 0,Season,WTeamID,WLMC,WMAS,WCOL,WMOR,WSAG,WPOM,WSeed,WScore,LTeamID,LLMC,LMAS,LCOL,LMOR,LSAG,LPOM,LSeed,LScore
0,2010,1115,198.0,232.0,215.0,269.0,239.0,238.0,16.0,61,1457,203.0,192.0,178.0,199.0,210.0,212.0,16.0,44
1,2010,1124,13.0,11.0,13.0,11.0,8.0,12.0,3.0,68,1358,79.0,96.0,89.0,112.0,106.0,102.0,14.0,59
2,2010,1139,15.0,13.0,11.0,24.0,22.0,26.0,5.0,77,1431,30.0,28.0,31.0,38.0,33.0,34.0,12.0,59
3,2010,1140,4.0,14.0,17.0,14.0,9.0,7.0,7.0,99,1196,58.0,55.0,54.0,40.0,48.0,49.0,10.0,92
4,2010,1242,1.0,1.0,1.0,1.0,1.0,2.0,1.0,90,1250,171.0,201.0,166.0,149.0,204.0,180.0,16.0,74


In [None]:
# Show the last rows as a spot check of the assembled winner/loser columns.
tourney_results.tail()

Unnamed: 0,Season,WTeamID,WLMC,WMAS,WCOL,WMOR,WSAG,WPOM,WSeed,WScore,LTeamID,LLMC,LMAS,LCOL,LMOR,LSAG,LPOM,LSeed,LScore
728,2021,1211,3.0,1.0,1.0,1.0,1.0,1.0,1.0,85,1425,9.0,20.0,13.0,14.0,27.0,14.0,6.0,66
729,2021,1417,44.0,43.0,39.0,49.0,44.0,44.0,11.0,51,1276,4.0,4.0,3.0,4.0,4.0,2.0,1.0,49
730,2021,1124,8.0,3.0,2.0,3.0,2.0,4.0,1.0,78,1222,7.0,6.0,6.0,6.0,6.0,6.0,2.0,59
731,2021,1211,3.0,1.0,1.0,1.0,1.0,1.0,1.0,93,1417,44.0,43.0,39.0,49.0,44.0,44.0,11.0,90
732,2021,1124,8.0,3.0,2.0,3.0,2.0,4.0,1.0,86,1211,3.0,1.0,1.0,1.0,1.0,1.0,1.0,70


Next, randomly split the rows into two groups: one where "Team 1" is the winner and one where "Team 1" is the loser. That way the model has a roughly equal number of wins and losses to learn from.

In [None]:
# Shuffle with a fixed seed so the win/loss partition, and everything built
# from it, is reproducible after a kernel restart.
tourney_results = tourney_results.sample(frac=1, random_state=42) # shuffle
# Data21 = tourney_results[tourney_results['Season'] == 2021] # store before we remove
# tourney_results = tourney_results[tourney_results['Season'] != 2021] # no current season data

# Split by position into two halves; the first half takes the extra row when
# the row count is odd, matching the sizes np.array_split(..., 2) produces.
# Plain iloc slices are used because np.array_split on a DataFrame relies on
# DataFrame.swapaxes, which recent pandas versions deprecate.
midpoint = (len(tourney_results) + 1) // 2
partition = [tourney_results.iloc[:midpoint], tourney_results.iloc[midpoint:]]  # split

In [None]:
# WINNING TEAM - LOSING TEAM | TO BE CLASSIFIED AS A WIN

# First half of the shuffled games: features are (winner - loser), label 1.
first_half = partition[0]
winner_diffs = {
    name + '_diff': first_half['W' + name] - first_half['L' + name]
    for name in ('LMC', 'MAS', 'COL', 'MOR', 'SAG', 'POM')
}
winner_diffs['SEED_diff'] = first_half['WSeed'] - first_half['LSeed']
winners = pd.DataFrame(winner_diffs)

winners['Outcome'] = 1
winners.head()

Unnamed: 0,LMC_diff,MAS_diff,COL_diff,MOR_diff,SAG_diff,POM_diff,SEED_diff,Outcome
568,-161.0,-234.0,-246.0,-243.0,-230.0,-235.0,-15.0,1
311,-8.0,-17.0,-23.0,1.0,-14.0,-2.0,-3.0,1
457,20.0,36.0,22.0,11.0,18.0,13.0,-1.0,1
486,-13.0,-6.0,-15.0,6.0,-12.0,-15.0,-1.0,1
198,-16.0,-33.0,-71.0,-3.0,-18.0,-25.0,0.0,1


In [None]:
# LOSING TEAM - WINNING TEAM | TO BE CLASSIFIED AS A LOSS

# Second half of the shuffled games: features are (loser - winner), label 0.
second_half = partition[1]
loser_diffs = {
    name + '_diff': second_half['L' + name] - second_half['W' + name]
    for name in ('LMC', 'MAS', 'COL', 'MOR', 'SAG', 'POM')
}
loser_diffs['SEED_diff'] = second_half['LSeed'] - second_half['WSeed']
losers = pd.DataFrame(loser_diffs)

losers['Outcome'] = 0
losers.head()

Unnamed: 0,LMC_diff,MAS_diff,COL_diff,MOR_diff,SAG_diff,POM_diff,SEED_diff,Outcome
266,-24.0,10.0,-14.0,12.0,-19.0,2.0,0.0,0
325,47.0,51.0,39.0,54.0,55.0,48.0,10.0,0
244,25.0,31.0,20.0,24.0,27.0,22.0,8.0,0
180,16.0,13.0,10.0,31.0,19.0,30.0,7.0,0
675,211.0,158.0,133.0,129.0,136.0,142.0,15.0,0


In [None]:
# Stack the win-labelled and loss-labelled rows, then interleave them.
all_games = pd.concat([winners, losers])
# A fixed seed keeps the shuffled row order identical from run to run.
all_games = all_games.sample(frac=1, random_state=42)
all_games.head()

Unnamed: 0,LMC_diff,MAS_diff,COL_diff,MOR_diff,SAG_diff,POM_diff,SEED_diff,Outcome
234,70.0,71.0,90.0,54.0,82.0,89.0,8.0,0
548,19.0,33.0,27.0,1.0,-9.0,-2.0,3.0,0
102,14.0,10.0,10.0,5.0,3.0,6.0,3.0,0
175,23.0,13.0,17.0,6.0,13.0,10.0,3.0,1
248,46.0,26.0,32.0,19.0,30.0,28.0,5.0,0


Split the data into training and testing

In [None]:
# Hold out 20% of the games for testing; random_state fixes which rows land in
# each split so the reported scores can be reproduced.
train, test = train_test_split(all_games, test_size=0.2, random_state=42)

Training the Model

In [None]:
# random_state pins the solver's internal data shuffling so repeated runs fit
# the same model.
lsvc = svm.LinearSVC(random_state=42)
# Six features, in this order. LMC_diff is not among them; the 2022 prediction
# cell builds its input vector in this same order.
training_features = train[['MAS_diff', 'COL_diff', 'MOR_diff', 'SAG_diff', 'POM_diff', 'SEED_diff']].to_numpy()
training_labels = train['Outcome'].to_numpy()

In [None]:
# Fit the linear SVM on the training split.
lsvc.fit(training_features, training_labels)



LinearSVC()

In [None]:
# Accuracy on the training split itself (not a held-out estimate).
print(lsvc.score(training_features, training_labels))

0.7184300341296929


Now for testing

In [None]:
# Same six feature columns, in the same order, as the training arrays.
feature_columns = ['MAS_diff', 'COL_diff', 'MOR_diff', 'SAG_diff', 'POM_diff', 'SEED_diff']
test_features = test[feature_columns].to_numpy()
test_labels = test['Outcome'].to_numpy()

In [None]:
# Evaluate on the held-out games: per-class precision / recall / F1 plus
# overall accuracy. classification_report is imported with the other
# dependencies in the notebook's first cell.
pred_labels = lsvc.predict(test_features)
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           0       0.69      0.71      0.70        76
           1       0.68      0.66      0.67        71

    accuracy                           0.69       147
   macro avg       0.69      0.69      0.69       147
weighted avg       0.69      0.69      0.69       147



Predicting 2022 Tournament

In [None]:
# Team lookup table (TeamID <-> TeamName), used to resolve team names to IDs.
teams = pd.read_csv('/content/MTeams.csv')
teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2021
1,1102,Air Force,1985,2021
2,1103,Akron,1985,2021
3,1104,Alabama,1985,2021
4,1105,Alabama A&M,2000,2021


In [None]:
def team_id_from_name(name):
  """Return the scalar TeamID of the single row in `teams` whose TeamName equals `name`.

  Raises:
    ValueError: if zero or several rows match, instead of silently carrying an
      empty or multi-element selection into the rating lookups.
  """
  matches = teams.loc[teams['TeamName'] == name, 'TeamID']
  if len(matches) != 1:
    raise ValueError(f"expected exactly one team named {name!r}, found {len(matches)}")
  return int(matches.iloc[0])


def latest_rank(season, system, team_id):
  """Return one team's rank from `ratings` as a plain int."""
  # The lookup yields a one-element Series; take its value explicitly rather
  # than calling int() on the Series itself.
  return int(ratings[season][system][team_id].iloc[0])


team1ID = team_id_from_name("Gonzaga")
team2ID = team_id_from_name("Arizona")

# Tournament seeds of the two teams. The seed difference was previously
# hard-coded as `1 - 16`, which describes a 1-seed vs 16-seed game.
# NOTE(review): Gonzaga and Arizona were both No. 1 seeds in the 2022 bracket
# -- confirm, and update these when predicting a different pairing.
team1_seed = 1
team2_seed = 1

# Feature order must match the columns the classifier was fitted on:
# MAS, COL, MOR, SAG, POM rank differences, then the seed difference.
diff_vec = [latest_rank(2022, system, team1ID) - latest_rank(2022, system, team2ID)
            for system in ('MAS', 'COL', 'MOR', 'SAG', 'POM')]
diff_vec.append(team1_seed - team2_seed)

# Label 1 means team 1 is predicted to win, 0 that team 1 is predicted to lose.
print(lsvc.predict(np.reshape(diff_vec, (1, -1))))

[0]


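In the run shown above the model outputs 0, meaning "Team 1" (Gonzaga) is predicted to lose this matchup, so Arizona is predicted to be the National Champion.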