# Generating Predictions

Using the Logistic Regression model that we chose in the Selecting a Model notebook, we will create predictions for the 2024 NCAA Tournament.

In [34]:
# Import packages
import sys
# Add the current directory to the module search path
# (presumably so the local collegebasketball package resolves -- confirm)
sys.path.append('./')

import pandas as pd
from sklearn.linear_model import LogisticRegression
import collegebasketball as cbb

import warnings
# Suppress every warning for the rest of the session; note that this also
# hides library warnings that could point at real problems
warnings.filterwarnings('ignore')

# Show the collegebasketball package version as the cell output
cbb.__version__

'2023'

## Train the Model

Using the same method as before, we will train the model. To understand how I arrived at this model, please look at the Selecting a Model notebook for more information.

However, there is one major difference in how we will train the model this time. Before, we split the data into training and testing sets, but since we are predicting for new games, we will use all of the training data to train the model.

In [35]:
# Read the training set (the scores/kenpom data) from disk
path = './Data/Training/training.csv'
train = pd.read_csv(filepath_or_buffer=path)

# Report how many rows the training set holds
print('Length of training data: {}'.format(train.shape[0]))

Length of training data: 10828


In [36]:
# Preview the first three rows of the training set
train.iloc[:3]

Unnamed: 0,Favored,Underdog,Year,Tournament,Label,Win_Loss_Fav,Win_Loss,AdjEM_Fav,AdjEM,AdjO_Fav,...,FT%_opp_Fav,FT%_opp,AST_Fav,AST,AST_opp_Fav,AST_opp,BLK_Fav,BLK,BLK_opp_Fav,BLK_opp
0,Kansas,North Carolina Central,2024,,0,0.6875,0.580645,18.96,-6.74,113.2,...,0.701,0.727,18.8125,12.83871,12.375,12.290323,3.875,2.677419,2.75,3.419355
1,Duke,Dartmouth,2024,,0,0.75,0.222222,24.84,-16.98,121.8,...,0.69,0.706,15.40625,12.296296,12.65625,14.111111,3.6875,3.518519,4.375,2.814815
2,Purdue,Samford,2024,,0,0.878788,0.852941,29.07,9.87,125.0,...,0.724,0.688,18.393939,17.529412,14.424242,13.529412,3.787879,3.823529,2.272727,3.764706


In [37]:
# Columns that are not passed to the model as inputs
exclude = ['Favored', 'Underdog', 'Year', 'Tournament', 'Label']

# Start from every column of the training frame and strip the excluded ones;
# list.index raises ValueError if an excluded column is absent
features = train.columns.tolist()
for col in exclude:
    del features[features.index(col)]

In [55]:
# Train the classifier on every row of the training data.
# L2-penalised logistic regression with the liblinear solver; random_state
# pins the solver's randomness so repeated runs give the same model.
# (C=10 is presumably the value picked in the Selecting a Model notebook.)
log = LogisticRegression(penalty='l2', C=10, solver='liblinear', random_state=77)

# Pass the target as a 1-D Series: scikit-learn expects y with shape
# (n_samples,). A single-column DataFrame is a column vector, which only
# works after an internal flatten and raises a DataConversionWarning.
log.fit(train[features], train['Label'])

## Get Input Data for this Year

Next, we'll need to get the input data for this year so we can use it to predict game results for tournament games. We'll retrieve data from each source for this year, clean the data and combine it into a single data set.

In [56]:
# Season to predict; the Kenpom/TRank paths and the Year column further
# down are built from this value as well
year = 2024

# Load the Sports Reference stats, rename 'School' to 'Team', and pass the
# frame through cbb.update_basic
stats_path = './Data/SportsReference/{}_stats.csv'.format(year)
# stats = cbb.load_stats_dataframe(year=year, csv_file_path=stats_path)
stats = cbb.update_basic(
    pd.read_csv(stats_path).rename(index=str, columns={'School': 'Team'})
)

# Fix absolute stats to be per game: divide each listed column by 'G'
cols_to_fix = ['3PA', '3PA_opp',  'AST', 'AST_opp', 'BLK', 'BLK_opp']
stats[cols_to_fix] = stats[cols_to_fix].div(stats['G'], axis=0)

# Spot-check one team's row
stats.loc[stats['Team'] == 'Sam Houston']

Unnamed: 0,Team,G,SRS,SOS,Tm.,Opp.,MP,FG_opp,FGA_opp,FG%_opp,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
258,Sam Houston,33,-0.15,-1.5,2381,2282,1330,794,1862,0.426,...,480,663,0.724,393,1255,12.878788,226,2.272727,435,596


In [57]:
# Load the Kenpom file for the season and pass it through cbb.update_kenpom
kp_path = './Data/Kenpom/{}_kenpom.csv'.format(year)
# kenpom = cbb.load_kenpom_dataframe(year=year, csv_file_path=kp_path)
kenpom = cbb.update_kenpom(pd.read_csv(kp_path))

# Spot-check one team's row
# NOTE(review): the saved output of this cell has no rows for this name,
# while the Sports Reference frame does -- confirm the team naming matches
kenpom.loc[kenpom['Team'] == 'Sam Houston']

Unnamed: 0,Rank,Team,Seed,Conf,Wins,Losses,AdjEM,AdjO,AdjO Rank,AdjD,...,Luck,Luck Rank,OppAdjEM,OppAdjEM Rank,OppO,OppO Rank,OppD,OppD Rank,NCSOS AdjEM,NCSOS AdjEM Rank


In [58]:
# Load the TRank file for the season and pass it through cbb.update_TRank
TRank_path = './Data/TRank/{}_TRank.csv'.format(year)
# TRank = cbb.load_TRank_dataframe(year=year, csv_file_path=TRank_path)
TRank = cbb.update_TRank(pd.read_csv(TRank_path))

# Spot-check one team's row
TRank.loc[TRank['Team'] == 'Marquette']

Unnamed: 0,Rk,Team,Conf,G,Wins,Losses,AdjOE,AdjOE Rank,AdjDE,AdjDE Rank,...,3P%D,3P%D Rank,3PR,3PR Rank,3PRD,3PRD Rank,Adj T.,Adj T. Rank,WAB,WAB Rank
8,9,Marquette,BE,34,25,9,118.6,19,94.9,19,...,33.6,154,40.5,95,43.1,340,69.1,86,6.1,6


In [59]:
# Merge the data from each source on the team name, dropping the columns
# that would otherwise be repeated. Both merges use the default inner join,
# so a team that is missing from any one source is left out of team_stats.
team_stats = pd.merge(kenpom, TRank.drop(columns=['Conf', 'Wins', 'Losses']), on='Team', sort=False)
# 'ORB' was listed twice in the original drop list; once is enough
team_stats = pd.merge(team_stats, stats.drop(columns=['G', 'ORB', '3P%']), on='Team', sort=False)

# Spot-check one team's merged row
team_stats[team_stats['Team'] == 'Marquette']

Unnamed: 0,Rank,Team,Seed,Conf,Wins,Losses,AdjEM,AdjO,AdjO Rank,AdjD,...,3PA,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,PF
12,13,Marquette,2.0,BE,25,9,22.19,118.1,22,95.9,...,24.705882,379,530,0.715,1115,15.823529,291,3.235294,338,522


In [60]:
# Load Tournament games for the season being predicted.
# The path is built from `year` (assigned in the team-stats cell) rather than
# a repeated literal, so changing the season in one place keeps the bracket
# file in sync with the team data.
games_path = './Data/Tourney/{}.csv'.format(year)
games = pd.read_csv(games_path)
games.head(3)

Unnamed: 0,Home,Away
0,UConn,Stetson
1,Florida Atlantic,Northwestern
2,San Diego State,UAB


In [61]:
# Every team named in the bracket, home sides first and then away sides
all_teams = games['Home'].tolist() + games['Away'].tolist()
# Team names available in the merged stats
stats_teams = team_stats['Team'].tolist()

# Bracket teams with no stats row; an empty list means every team matched
[team for team in all_teams if team not in stats_teams]

[]

In [62]:
# Join the team data with the game data: attach the home team's stats
# first, then the away team's. The second merge is where the column names
# collide, so that is where the _Home/_Away suffixes get applied.
data = (
    games
    .merge(team_stats, left_on='Home', right_on='Team', sort=False)
    .merge(team_stats, left_on='Away', right_on='Team',
           suffixes=('_Home', '_Away'), sort=False)
)

# Add the constant Year and Tournament columns at fixed positions
data.insert(loc=0, column='Year', value=year)
data.insert(loc=3, column='Tournament', value='NCAA Tournament')
data.head(3)

Unnamed: 0,Year,Home,Away,Tournament,Rank_Home,Team_Home,Seed_Home,Conf_Home,Wins_Home,Losses_Home,...,3PA_Away,FT_Away,FTA_Away,FT%_Away,TRB_Away,AST_Away,STL_Away,BLK_Away,TOV_Away,PF_Away
0,2024,UConn,Stetson,NCAA Tournament,1,UConn,1.0,BE,31,3,...,24.617647,459,599,0.766,1186,13.5,181,3.058824,355,504
1,2024,Florida Atlantic,Northwestern,NCAA Tournament,39,Florida Atlantic,8.0,Amer,25,8,...,21.09375,410,548,0.748,997,15.6875,222,3.28125,280,568
2,2024,San Diego State,UAB,NCAA Tournament,19,San Diego State,5.0,MWC,24,10,...,18.764706,604,810,0.746,1296,13.617647,229,4.617647,392,537


In [63]:
# Save the joined game/team data for this season. The file name is built
# from `year` instead of a hard-coded 2024 so it always matches the season
# whose data was loaded.
data.to_csv('./Data/Training/{}.csv'.format(year), index=False)

## Predict Games Using the Classifier

Now that we have a trained model and data for this year's tournament games, we can use the model to predict games in the 2024 NCAA Tournament.

In [64]:
# Make Predictions
predictions = cbb.predict(log, data, features)

# Write the predictions to disk. The file name is built from `year` instead
# of a hard-coded 2024 so it always matches the season being predicted.
# The file is written before the Upset column is added, so the CSV holds
# only what cbb.predict returned.
predictions.to_csv('./Data/predictions/predictions_{}.csv'.format(year), index=False)

# Flag the games where the predicted winner is the underdog
predictions['Upset'] = predictions['Underdog'] == predictions['Predicted Winner']

In [65]:
# First Round: prediction rows 0 through 31
predictions.iloc[:32]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
0,UConn,Stetson,UConn,0.011366,False
1,Florida Atlantic,Northwestern,Northwestern,0.592245,True
2,San Diego State,UAB,San Diego State,0.255148,False
3,Auburn,Yale,Auburn,0.093424,False
4,BYU,Duquesne,BYU,0.250135,False
5,Illinois,Morehead State,Illinois,0.053337,False
6,Washington State,Drake,Drake,0.480677,True
7,Iowa State,South Dakota State,Iowa State,0.028889,False
8,UNC,Wagner,UNC,0.012724,False
9,Michigan State,Mississippi State,Mississippi State,0.536321,True


In [66]:
# Round of 32: prediction rows 32 through 47
predictions.iloc[32:48]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
32,UConn,Northwestern,UConn,0.115215,False
33,Auburn,San Diego State,San Diego State,0.330147,True
34,Illinois,BYU,BYU,0.430779,True
35,Iowa State,Drake,Iowa State,0.223223,False
36,UNC,Mississippi State,Mississippi State,0.359173,True
37,Alabama,Grand Canyon,Alabama,0.247983,False
38,Baylor,Clemson,Baylor,0.287978,False
39,Arizona,Nevada,Nevada,0.403035,True
40,Houston,Nebraska,Houston,0.175922,False
41,Duke,Wisconsin,Wisconsin,0.448813,True


In [67]:
# Round of 16: prediction rows 48 through 55
predictions.iloc[48:56]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
48,UConn,San Diego State,UConn,0.159473,False
49,Iowa State,BYU,BYU,0.346103,True
50,Alabama,Mississippi State,Mississippi State,0.375932,True
51,Baylor,Nevada,Nevada,0.355323,True
52,Houston,Wisconsin,Houston,0.184451,False
53,Texas Tech,Florida,Florida,0.486485,True
54,Purdue,Kansas,Purdue,0.278721,False
55,Colorado State,South Carolina,South Carolina,0.588502,True


In [68]:
# Elite 8: prediction rows 56 through 59
predictions.iloc[56:60]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
56,UConn,BYU,UConn,0.19499,False
57,Mississippi State,Nevada,Nevada,0.575223,True
58,Houston,Florida,Houston,0.173428,False
59,Purdue,South Carolina,Purdue,0.246353,False


In [69]:
# Final 4: prediction rows 60 and 61
predictions.iloc[60:62]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
60,UConn,Nevada,UConn,0.147238,False
61,Houston,Purdue,Purdue,0.528831,True


In [70]:
# Later Rounds: every prediction row from 62 onward
predictions.iloc[62:]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
62,UConn,Purdue,Purdue,0.492682,True
