### **Model Run Setup**

In [1]:
# Select folder path based on user input
gender = 'M' #input('Enter gender (W for women, M for men): ')

# Assign the appropriate folder path based on the input
MAIN_DIR = './'
USE_DIR = MAIN_DIR + 'womens/' if gender.upper() == 'W' else MAIN_DIR + 'mens/'
PRE = 'W' if gender.upper() == 'W' else 'M'
NAME = 'womens' if gender.upper() == 'W' else 'mens'

In [2]:
import pandas as pd
from datetime import datetime, timedelta
import re
import random
import numpy as np
import pickle

### **Load Data**

In [3]:
# Set the year to get predictions
season = 2023

In [4]:
# Load 'Sample' data into variable
sample = pd.read_csv('sample-mens.csv')
sample.shape

(65703, 5)

In [5]:
# Load 'Sample' data into variable
games = pd.read_csv('games-mens.csv')

# Duplicate Games dataframe to get summary statistics
df1 = games[games['Season'] == season].drop(columns=['Team1']).copy()
df2 = games[games['Season'] == season].drop(columns=['Team0']).copy()

# Rename the TeamID columns in dataframe
df1.rename(columns = {'Team0':'TeamID'}, inplace = True)
df2.rename(columns = {'Team1':'TeamID'}, inplace = True)

# Concatenate duplicate dataframes along axis=0 (i.e., add as a new column)
df3 = pd.concat([df1, df2], ignore_index=True, sort=False)

# Group the dataframe by 'Season' and 'TeamID', and get the mean values
team_means = df3.groupby(['Season', 'TeamID'], as_index=False).mean()

# Zero out outcome
team_means['Outcome'] = None

# Show sample output
print('team_means:', team_means.shape)
team_means.head()

team_means: (363, 19)


  team_means = df3.groupby(['Season', 'TeamID'], as_index=False).mean()


Unnamed: 0,Season,TeamID,Seed,Site,MOV,FG2M,FG2A,FG3M,FG3A,FT1M,FT1A,ORB,DRB,AST,TOVR,STL,BLK,PFL,Outcome
0,2023,1101,0.0,0.153846,-3.692308,-0.423077,6.923077,0.115385,2.230769,-2.961538,-4.615385,0.269231,-5.538462,1.230769,-3.923077,2.576923,-1.230769,1.730769,
1,2023,1102,0.0,0.15625,-0.125,-0.25,-2.5625,2.96875,5.21875,-2.59375,-2.90625,-2.3125,-0.875,5.21875,0.21875,0.03125,0.84375,-0.03125,
2,2023,1103,0.0,0.419355,5.83871,0.677419,0.645161,1.741935,5.096774,2.741935,3.387097,1.064516,1.935484,1.516129,-0.483871,0.064516,-0.419355,-2.451613,
3,2023,1104,0.0,0.264706,13.676471,3.823529,-2.176471,4.529412,10.058824,1.5,1.735294,1.764706,6.147059,5.205882,2.176471,-1.235294,1.352941,-1.470588,
4,2023,1105,0.0,0.033333,-3.066667,0.633333,1.2,-1.066667,-3.6,-3.266667,-2.8,-1.2,-1.233333,-0.533333,-0.433333,0.0,0.266667,1.066667,


In [6]:
# Create Seeds dataframe
seeding = pd.read_csv(USE_DIR + PRE + 'NCAATourneySeeds.csv')

# Reduce size of Seeds dataframe
seeds = seeding[seeding['Season'] == season].copy()

# Create a regex to pull out number
regex = r'\d+'

# Apply the regular expression to the 'col1' column to extract the numeric value
seeds['Seed'] = seeds['Seed'].apply(lambda x: re.search(regex, x).group())

# Convert the numeric values to integers
seeds['Seed'] = seeds['Seed'].astype(int)

# Set order sequence
order = [0, 2, 1]

# Reorder and rename dataframe
seeds = seeds.iloc[:,order]
seeds.rename(columns = {'Seed':'Slot'}, inplace = True)

In [7]:
# Merge the 'Sample' and 'Seeds' dataframes on 'Season' and 'TeamID' columns
seeds1 = pd.merge(sample, seeds, how='left', left_on=['Season', 'Team0'], right_on=['Season', 'TeamID'])
seeds1.drop(columns='TeamID', inplace=True)

# Merge the 'Sample' and 'Seeds' dataframes on 'Season' and 'TeamID' columns
seeds2 = pd.merge(seeds1, seeds, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
seeds2.drop(columns='TeamID', inplace=True)

# Fill missing values
seeds2['Slot_x'].fillna(17, inplace = True)
seeds2['Slot_y'].fillna(17, inplace = True)

# Tournament Seeding Consideration (Differential)
seeds2.loc[:, 'Seed'] = np.where(seeds2['Slot_x'] < seeds2['Slot_y'], 
                                 seeds2['Slot_x'] - seeds2['Slot_y'], 
                                 seeds2['Slot_y'] - seeds2['Slot_x'])
# Convert Seed to int64
seeds2['Seed'] = seeds2['Seed'].astype(int)

# Drop unwanted columns from games dataframe
seeds = seeds2.drop(columns=['Slot_x', 'Slot_y'])

# Add in the remaining columns the model was trained on an set these values equal to zero
seeds = seeds.assign(Site=0, MOV=0, FG2M=0, FG2A=0, FG3M=0, FG3A=0, FT1M=0, FT1A=0, ORB=0, 
                     DRB=0, AST=0, TOVR=0, STL=0, BLK=0, PFL=0)
# Set order sequence
order = [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 4]

# Reorder columns from games dataframe
seeds = seeds.iloc[:,order].copy()

# Show sample output
print('seeds:', seeds.shape)
print(seeds.columns)

seeds: (65703, 20)
Index(['Season', 'Team0', 'Team1', 'Seed', 'Site', 'MOV', 'FG2M', 'FG2A',
       'FG3M', 'FG3A', 'FT1M', 'FT1A', 'ORB', 'DRB', 'AST', 'TOVR', 'STL',
       'BLK', 'PFL', 'Outcome'],
      dtype='object')


In [8]:
# Merge 'Seeds' with 'Team_means' for Team0
df4 = seeds.merge(team_means.add_suffix('_0'), left_on=['Season', 'Team0'], right_on=['Season_0', 'TeamID_0'], how='left')

In [9]:
# Merge 'Seeds' with 'Team_means' for Team1
df4 = seeds.merge(team_means.add_suffix('_1'), left_on=['Season', 'Team1'], right_on=['Season_1', 'TeamID_1'], how='left')

In [10]:
# Calculate difference for each column
cols = ['Seed', 'Site', 'MOV', 'FG2M', 'FG2A', 'FG3M', 'FG3A', 'FT1M', 'FT1A', 'ORB', 'DRB', 'AST', 'TOVR', 'STL', 'BLK', 'PFL', 'Outcome']
for col in cols:
    df4[col] = df4[col] - df4[col + '_1']

In [11]:
# Drop redundant columns
df4 = df4.drop(columns=[col + '_1' for col in cols])
df4 = df4.drop(columns=['Season_1', 'TeamID_1'], axis=1)

data = df4.copy()

In [12]:
# Show sample output
print('merged_means:', data.shape)
print(data.columns)

merged_means: (65703, 20)
Index(['Season', 'Team0', 'Team1', 'Seed', 'Site', 'MOV', 'FG2M', 'FG2A',
       'FG3M', 'FG3A', 'FT1M', 'FT1A', 'ORB', 'DRB', 'AST', 'TOVR', 'STL',
       'BLK', 'PFL', 'Outcome'],
      dtype='object')


### **Load Model**

In [13]:
# Load the saved model from a file
loaded_model = pickle.load(open('./models/gnb-model.sav', 'rb'))

### **Run Model on Sample Data**

In [14]:
# Set X,y variable on sample data
X = data.drop(columns=['Outcome'], axis=1)
y = data['Outcome']

# Make predictions on sample data
y_pred = loaded_model.predict(X)

In [15]:
# Get probabilities on sample data
y_proba = loaded_model.predict_proba(X)

In [16]:
len(y_proba)

65703

### **Add Probabilities to Sample Dataframe**

In [17]:
# Create 'ID' column for sample submission
sample['ID'] = sample['Season'].astype(str) + "_" + sample['Team0'].astype(str) + "_" + sample['Team1'].astype(str)

# Input the classification results into 'Outcome'
sample['Outcome'] = y_pred

# Create holder column for 'Pred' (probability predictions)
sample['Pred'] = 0.0

In [18]:
# Iterate over y_pred array and set sample['Outcome'] values
for i in range(len(y_pred)):
    if y_pred[i] == 0:
        sample.at[i, 'Pred'] = y_proba[i][0].round(3)
    else:
        sample.at[i, 'Pred'] = y_proba[i][1].round(3)

# Drop unwanted columns from games dataframe
sample = sample.drop(columns=['Season', 'Team0', 'Team1', 'Seed', 'Outcome'], axis=1)

# Show sample output
print('sample:', sample.shape)
sample.head()

sample: (65703, 2)


Unnamed: 0,ID,Pred
0,2023_1101_1102,0.62
1,2023_1101_1103,0.972
2,2023_1101_1104,1.0
3,2023_1101_1105,0.916
4,2023_1101_1106,0.999


In [19]:
# Save dataframes as csv files
sample_submission = sample.to_csv('submission-' + NAME + '.csv',index=False)

In [20]:
sample.Pred.unique()

array([0.62 , 0.972, 1.   , 0.916, 0.999, 0.975, 0.756, 0.504, 0.845,
       0.979, 0.958, 0.996, 0.814, 0.688, 0.978, 0.919, 0.952, 0.812,
       0.988, 0.969, 0.891, 0.835, 0.844, 0.656, 0.676, 0.94 , 0.869,
       0.875, 0.92 , 0.948, 0.993, 0.99 , 0.59 , 0.834, 0.892, 0.987,
       0.808, 0.992, 0.957, 0.974, 0.781, 0.964, 0.997, 0.915, 0.865,
       0.897, 0.986, 0.868, 0.98 , 0.994, 0.925, 0.629, 0.809, 0.614,
       0.973, 0.55 , 0.743, 0.537, 0.947, 0.591, 0.666, 0.963, 0.649,
       0.894, 0.678, 0.782, 0.953, 0.797, 0.806, 0.991, 0.765, 0.744,
       0.664, 0.938, 0.848, 0.515, 0.878, 0.609, 0.708, 0.586, 0.911,
       0.699, 0.857, 0.551, 0.912, 0.867, 0.566, 0.702, 0.579, 0.705,
       0.989, 0.735, 0.674, 0.715, 0.856, 0.88 , 0.711, 0.832, 0.939,
       0.885, 0.819, 0.871, 0.771, 0.503, 0.529, 0.902, 0.704, 0.732,
       0.623, 0.692, 0.941, 0.583, 0.767, 0.849, 0.946, 0.679, 0.728,
       0.668, 0.763, 0.524, 0.896, 0.684, 0.929, 0.636, 0.841, 0.671,
       0.936, 0.703,