## Model Building

Build logistic regression, random forest, and XGBoost models to predict the probability of an upset in the NCAA tournament

Based on data output in M1_DataCleaning.ipynb, model used in M3_Predictions.py

In [1]:
import pandas as pd
import random
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tqdm import tqdm
import pickle

In [2]:
# matchup data outputted from M1_DataCleaning.ipynb
matchups = pd.read_csv('mydata/mens/matchups.csv')
matchups.head()

Unnamed: 0,Season,TeamrankRating_x,TrankRating_x,OE_x,DE_x,Tempo_x,Seed_x,3ptRate_x,Ast%_x,FT%_x,...,xOffyDefFTRateAvg,yOffxDefFTRateAvg,AbsxOffyDefAstDiff,AbsyOffxDefAstDiff,xOffyDefAstAvg,yOffxDefAstAvg,TotalPossVarSum,GameScoreVarSum,TrankNaiveUpsetProbability,TeamrankNaiveUpsetProbability
0,2008.0,28.5,0.960742,117.213494,88.761128,73.7,2,0.390996,0.522826,0.694618,...,0.396759,0.345179,0.004257,0.109056,0.520698,0.531006,295.048054,0.047594,0.312558,0.323143
1,2008.0,23.6,0.927155,115.187217,92.329124,65.8,3,0.371802,0.610714,0.750341,...,0.403042,0.34281,0.092145,0.018604,0.564642,0.594836,269.3279,0.072884,0.459461,0.493502
2,2008.0,32.4,0.981585,120.970641,85.610492,69.5,1,0.291796,0.627572,0.707756,...,0.35093,0.31638,0.080697,0.071639,0.587224,0.57123,309.871774,0.046553,0.084577,0.069552
3,2008.0,32.4,0.981585,120.970641,85.610492,69.5,1,0.291796,0.627572,0.707756,...,0.417042,0.345843,0.045503,0.015533,0.60482,0.543177,297.352054,0.045505,0.066621,0.065752
4,2008.0,23.3,0.931374,113.555688,90.514646,69.6,5,0.327184,0.536017,0.623003,...,0.381549,0.357344,0.046052,0.062629,0.559043,0.519629,465.450547,0.04449,0.244308,0.25054


In [3]:
y = matchups['Upset']
X = matchups.drop(columns = ['Upset', 'ScorePerPossDiff'])

In [4]:
# read in previous results to not duplicate params that have already been tested
prev_results = pd.read_csv('mydata/training_results_prob.csv')
prev_results.head()

Unnamed: 0,Type,Num_Features,Features,Model,Model_Coef,2008_Score,2009_Score,2010_Score,2011_Score,2012_Score,2013_Score,2014_Score,2015_Score,2016_Score,2017_Score,2018_Score,2019_Score
0,log,62,"Index(['Season', 'TeamrankRating_x', 'Tempo_x'...","LogisticRegression(C=0.0003, max_iter=10000, r...","{'lasso_coef': 0.35, 'ridge_coef': 0.0003}",0.551392,0.552778,0.551759,0.565558,0.566366,0.560903,0.561768,0.562362,0.560164,0.554967,0.561638,0.563243
1,log,62,"Index(['Season', 'TeamrankRating_x', 'Tempo_x'...","LogisticRegression(C=0.001, max_iter=10000, ra...","{'lasso_coef': 0.35, 'ridge_coef': 0.001}",0.549022,0.553265,0.550056,0.574036,0.576793,0.562657,0.565542,0.566164,0.560347,0.546249,0.56422,0.568604
2,log,62,"Index(['Season', 'TeamrankRating_x', 'Tempo_x'...","LogisticRegression(C=0.003, max_iter=10000, ra...","{'lasso_coef': 0.35, 'ridge_coef': 0.003}",0.546229,0.556918,0.548946,0.587634,0.595416,0.56975,0.576705,0.574224,0.564091,0.53503,0.570633,0.58018
3,log,62,"Index(['Season', 'TeamrankRating_x', 'Tempo_x'...","LogisticRegression(C=0.01, max_iter=10000, ran...","{'lasso_coef': 0.35, 'ridge_coef': 0.01}",0.546655,0.570925,0.553785,0.605493,0.626102,0.59232,0.603184,0.587836,0.581075,0.527508,0.584872,0.60222
4,log,62,"Index(['Season', 'TeamrankRating_x', 'Tempo_x'...","LogisticRegression(C=0.03, max_iter=10000, ran...","{'lasso_coef': 0.35, 'ridge_coef': 0.03}",0.555047,0.594391,0.566325,0.617832,0.66047,0.626808,0.634556,0.60008,0.615269,0.530635,0.605035,0.626556


### Logistic Regression

In [22]:
# lasso regularization for feature selection
# coefficients range from 0.02 to ~2000
# each coefficient is 3% larger than last, there are 400 coefficients in total
lasso_reg = [0.02 * (1.03) ** i for i in range(400)]

# ridge regularization to prevent overfitting
ridge_reg = [1e-7, 3e-7, 1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3]

# scale used to scale columns before applying regularization
scale = StandardScaler()

def train_logistic_regression(features = None):
    
    # list/dictionaries to store results
    feature_list = []
    scores = {}
    model_coefs = []
    models = []
    num_features = []
    
    # for each possible lasso regularization coefficient
    for c in tqdm(lasso_reg):
        
        # dont use season as a feature, use inputted features if given
        if features:
            X_lasso = X[features]
        else:
            X_lasso = X.drop(columns = ['Season'])
        
        # scale columns before regularization
        X_lasso = pd.DataFrame(scale.fit_transform(X_lasso), columns = X_lasso.columns)
        
        # fit L1 logistic regression for feature selection
        lasso = LogisticRegression(penalty = 'l1', C = c, random_state = 0, solver = 'saga', max_iter = 10000).fit(X_lasso, y)
        
        # filter for the columns with nonzero coefficients and the season column
        zero_cols = []
        if features:
            zero_cols = list(set(X.drop(columns = ['Season']).columns) - set(features))
        for i in range(len(lasso.coef_[0])):
            if lasso.coef_[0][i] == 0.0:
                zero_cols.append(X_lasso.columns[i])
        nonzero_X = X.drop(columns = zero_cols)
        
        # if there is at least 1 nonzero column and the same amount of features hasn't already been built
        if (len(nonzero_X.columns) > 1 and (len(nonzero_X.columns) - 1) not in num_features):
            
            # try each L2 regularization coefficient
            for c2 in ridge_reg:
                
                # cross validate over each season
                for season in list(X['Season'].unique()):
                    
                    # add season to scores dictionary
                    if season not in scores:
                        scores[season] = []
                        
                    # split into train and validation sets
                    X_train = nonzero_X[nonzero_X['Season'] != season].drop(columns = ['Season'])
                    X_val = nonzero_X[nonzero_X['Season'] == season].drop(columns = ['Season'])
                    X_train = pd.DataFrame(scale.fit_transform(X_train), columns = X_train.columns)
                    X_val = pd.DataFrame(scale.transform(X_val), columns = X_val.columns)
                    y_train = y[X_train.index]
                    y_val = y[X_val.index]
                    
                    # fit logistic regression
                    log_model = LogisticRegression(penalty = 'l2', C = c2, max_iter = 10000, random_state = 0, solver = "sag").fit(X_train, y_train)
                    
                    # predict win probabilities
                    predictions = log_model.predict_proba(X_val)
                    
                    # calculate log loss and store
                    val_score = log_loss(y_val, predictions)
                    scores[season].append(val_score)
                    
                # retrain model on full dataset for coefficients
                log_model = LogisticRegression(penalty = 'l2', C = c2, max_iter = 10000, random_state = 0, solver = "sag").fit(pd.DataFrame(scale.fit_transform(nonzero_X), columns = nonzero_X.columns).drop(columns = ['Season']), y)
                
                # store model details
                feature_list.append(nonzero_X.drop(columns = ['Season']).columns)
                num_features.append(len(nonzero_X.drop(columns = ['Season']).columns))
                models.append(log_model)
                model_coefs.append({'lasso_coef': c, 'ridge_coef': c2})
                
    # return dataframe of results
    return pd.DataFrame({'Type': ['log' for i in range(len(models))],
                         'Num_Features': num_features,
                         'Features': feature_list,
                         'Model': models,
                         'Model_Coef': model_coefs,
                         '2008_Score': scores[2008],
                         '2009_Score': scores[2009],
                         '2010_Score': scores[2010],
                         '2011_Score': scores[2011],
                         '2012_Score': scores[2012],
                         '2013_Score': scores[2013],
                         '2014_Score': scores[2014],
                         '2015_Score': scores[2015],
                         '2016_Score': scores[2016],
                         '2017_Score': scores[2017],
                         '2018_Score': scores[2018],
                         '2019_Score': scores[2019]})

### Random Forest

In [37]:
# random forest paramaters
max_features_list = [5, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 35, 40]
rf_max_depths = [5, 10, 15, 20, None]
min_samples = [1, 0.01, 0.03, 0.1]
criterions = ['gini', 'entropy']

def train_random_forest():
    
    # list/dictionaries to store results
    scores = {}
    models = []
    model_coefs = []
    
    # for repeatable randomization
    random.seed(0)
    
    # fit 1 models
    for i in tqdm(range(50)):
        
        # randomly generate random forest params
        rf_params = {'max_features': max_features_list[random.randint(0, len(max_features_list) - 1)],
                     'max_depth': rf_max_depths[random.randint(0, len(rf_max_depths) - 1)],
                     'min_samples_leaf': min_samples[random.randint(0, len(min_samples) - 1)],
                     'criterion': criterions[random.randint(0, len(criterions) - 1)]
                    }
        
        # keep generating params if they've already been tested
        while rf_params in list(prev_results['Model_Coef']):
            rf_params = {'max_features': max_features_list[random.randint(0, len(max_features_list) - 1)],
                     'max_depth': rf_max_depths[random.randint(0, len(rf_max_depths) - 1)],
                     'min_samples_leaf': min_samples[random.randint(0, len(min_samples) - 1)],
                     'criterion': criterions[random.randint(0, len(criterions) - 1)]
                    }
        
        # cross validate over each season
        for season in list(X['Season'].unique()):
            
            # add season to scores dictionary
            if season not in scores:
                scores[season] = []
            
            # split into train and validation sets
            X_train = X[X['Season'] != season].drop(columns = ['Season'])
            X_val = X[X['Season'] == season].drop(columns = ['Season'])
            y_train = y[X_train.index]
            y_val = y[X_val.index]
            
            # fit random forest model
            rf_model = RandomForestClassifier(n_estimators = 500,
                                             criterion = rf_params['criterion'],
                                             max_depth = rf_params['max_depth'],
                                             min_samples_leaf = rf_params['min_samples_leaf'],
                                             max_features = rf_params['max_features'],
                                             random_state = 0).fit(X_train, y_train)
            
            # predict win probabilities
            predictions = rf_model.predict_proba(X_val)
            
            # calculate log loss and store score
            val_score = log_loss(y_val, predictions)
            scores[season].append(val_score)
            
        # retrain model on full dataset for feature importances
        rf_model = RandomForestClassifier(n_estimators = 1000,
                                         criterion = rf_params['criterion'],
                                         max_depth = rf_params['max_depth'],
                                         min_samples_leaf = rf_params['min_samples_leaf'],
                                         max_features = rf_params['max_features'],
                                         random_state = 0).fit(X.drop(columns = ['Season']), y)
        
        # store model details
        model_coefs.append(rf_params)
        models.append(rf_model)
        
    # return dataframe of results
    return pd.DataFrame({'Type': ['rf' for i in range(len(models))],
                         'Num_Features': ['All' for i in range(len(models))],
                         'Features': ['All' for i in range(len(models))],
                         'Model': models,
                         'Model_Coef': model_coefs,
                         '2008_Score': scores[2008],
                         '2009_Score': scores[2009],
                         '2010_Score': scores[2010],
                         '2011_Score': scores[2011],
                         '2012_Score': scores[2012],
                         '2013_Score': scores[2013],
                         '2014_Score': scores[2014],
                         '2015_Score': scores[2015],
                         '2016_Score': scores[2016],
                         '2017_Score': scores[2017],
                         '2018_Score': scores[2018],
                         '2019_Score': scores[2019]})

### XGBoost

In [7]:
# XGBoost parameters
etas = [0.003, 0.01, 0.03, 0.1, 0.3, 1, 3]
xgb_max_depths = [2, 6, 10, 14, 18]
min_child_weights = [0, 1, 3, 6, 10, 14]
gammas = [0, 0.01, 0.03, 0.1, 0.3, 1, 3]
subsamples = [0.8, 0.9, 1]
lambdas = [0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]

def train_xgboost():
    
    # list/dictionaries to store results
    scores = {}
    models = []
    model_coefs = []
    
    # for repeatable randomization
    random.seed(0)
    
    # fit 1 models
    for i in tqdm(range(1)):
        
        # randomly generate XGBoost params
        xgb_params = {'eta': etas[random.randint(0, len(etas) - 1)],
                      'max_depth': xgb_max_depths[random.randint(0, len(xgb_max_depths) - 1)],
                      'min_child_weight': min_child_weights[random.randint(0, len(min_child_weights) - 1)],
                      'gamma': gammas[random.randint(0, len(gammas) - 1)],
                      'subsample': subsamples[random.randint(0, len(subsamples) - 1)],
                      'lambda': lambdas[random.randint(0, len(lambdas) - 1)],
                      'random_state': 0
                     }
        # keep randomly generating if params have already been tested
        while xgb_params in list(prev_results['Model_Coef']):
            # randomly generate XGBoost params
            xgb_params = {'eta': etas[random.randint(0, len(etas) - 1)],
                          'max_depth': xgb_max_depths[random.randint(0, len(xgb_max_depths) - 1)],
                          'min_child_weight': min_child_weights[random.randint(0, len(min_child_weights) - 1)],
                          'gamma': gammas[random.randint(0, len(gammas) - 1)],
                          'subsample': subsamples[random.randint(0, len(subsamples) - 1)],
                          'lambda': lambdas[random.randint(0, len(lambdas) - 1)],
                          'random_state': 0
                         }
        
        # cross validate over each season
        for season in list(X['Season'].unique()):
            
            # add season to scores dictionary
            if season not in scores:
                scores[season] = []
                
            # split into train and validation sets
            X_train = X[X['Season'] != season].drop(columns = ['Season'])
            X_val = X[X['Season'] == season].drop(columns = ['Season'])
            y_train = y[X_train.index]
            y_val = y[X_val.index]
            
            # fit XGBoost model
            xgb_model = XGBClassifier(**xgb_params).fit(X_train, y_train)
            
            # predict win probabilities
            predictions = xgb_model.predict_proba(X_val)
            
            # calculate log loss and store score
            val_score = log_loss(y_val, predictions)
            scores[season].append(val_score)
            
        # retrain model on full dataset for feature importances
        xgb_model = XGBClassifier(**xgb_params).fit(X.drop(columns = ['Season']), y)
        
        # store model details
        model_coefs.append(xgb_params)
        models.append(xgb_model)
        
    # return dataframe of results
    return pd.DataFrame({'Type': ['xgb' for i in range(len(models))],
                         'Num_Features': ['All' for i in range(len(models))],
                         'Features': ['All' for i in range(len(models))],
                         'Model': models,
                         'Model_Coef': model_coefs,
                         '2008_Score': scores[2008],
                         '2009_Score': scores[2009],
                         '2010_Score': scores[2010],
                         '2011_Score': scores[2011],
                         '2012_Score': scores[2012],
                         '2013_Score': scores[2013],
                         '2014_Score': scores[2014],
                         '2015_Score': scores[2015],
                         '2016_Score': scores[2016],
                         '2017_Score': scores[2017],
                         '2018_Score': scores[2018],
                         '2019_Score': scores[2019]})

### Results

In [8]:
# train the models
log_results = train_logistic_regression()
final_results = log_results.copy()
#rf_results = train_random_forest()
#xgb_results = train_xgboost()
#final_results = pd.concat([prev_results, log_results, rf_results, xgb_results], ignore_index = True)

100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [23:02<00:00,  3.46s/it]


In [52]:
# calculate percentiles of scores in relation to other models
percentile_cols = []
score_cols = []
for col in final_results.columns:
    if col.endswith("Score"):
        score_cols.append(col)
        final_results[col[0:4] + "_Percentile"] = final_results[col].rank(pct = True, ascending = False)
        percentile_cols.append(col[0:4] + "_Percentile")

In [53]:
# calculate average score and average percentile
final_results['Avg_Score'] = final_results[score_cols].mean(axis = 1)
final_results['Avg_Percentile'] = final_results[percentile_cols].mean(axis = 1)

In [54]:
# function to show feature importances of given model by results row index
def features(row_index):
    model = final_results.loc[row_index, 'Model']
    
    # if model is logistic regression, return coefficients
    if final_results.loc[row_index, 'Type'] == 'log':
        model_features = list(final_results.loc[row_index, 'Features'])
        sign_list = []
        coef_list = []
        for i in range(len(model_features)):
            coef = round(model.coef_[0][i], 3)
            coef_list.append(coef)
            if coef > 0:
                sign_list.append('+')
            else:
                sign_list.append('-')
        
        return_df =  pd.DataFrame({'Feature': model_features,
                             'Sign': sign_list,
                            'Coefficient': coef_list})
        return return_df.reindex(return_df['Coefficient'].abs().sort_values(ascending = False).index)
        
    # if model is random forest or XGBoost, return feature importance plot
    else:
        feat_importances = pd.Series(model.feature_importances_, index = X.drop(columns = ['Season']).columns)
        return feat_importances.nlargest(15).plot(kind = 'barh')

In [55]:
# function to show params of given model by results row index
def params(row_index):
    return final_results.loc[row_index, 'Model_Coef']

In [56]:
# print results
pd.set_option('display.max_rows', 100)
final_results[['Type', 'Num_Features', 'Features', 'Avg_Score', 'Avg_Percentile']].sort_values(by = ['Avg_Score'], ascending = True).head(100)

Unnamed: 0,Type,Num_Features,Features,Avg_Score,Avg_Percentile
1158,log,10,"Index(['TrankRating_x', 'ORB%_x', 'FTRVar_x', ...",0.557906,0.58185
1157,log,10,"Index(['TrankRating_x', 'ORB%_x', 'FTRVar_x', ...",0.558051,0.700636
308,log,40,"Index(['TeamrankRating_x', 'TrankRating_x', 'O...",0.558136,0.661604
938,log,10,"Index(['TrankRating_x', 'ORB%_x', 'FTRVar_x', ...",0.558146,0.596799
307,log,40,"Index(['TeamrankRating_x', 'TrankRating_x', 'O...",0.558173,0.704354
937,log,10,"Index(['TrankRating_x', 'ORB%_x', 'FTRVar_x', ...",0.558217,0.674621
1148,log,9,"Index(['TrankRating_x', 'ORB%_x', 'FTRVar_x', ...",0.558218,0.557203
1018,log,9,"Index(['TrankRating_x', 'ORB%_x', 'FTRVar_x', ...",0.558218,0.590977
1017,log,9,"Index(['TrankRating_x', 'ORB%_x', 'FTRVar_x', ...",0.558234,0.677476
1147,log,9,"Index(['TrankRating_x', 'ORB%_x', 'FTRVar_x', ...",0.558234,0.650989


In [57]:
features(1158)

Unnamed: 0,Feature,Sign,Coefficient
8,TrankNaiveUpsetProbability,+,0.06
5,TrankPredictedSpread,-,-0.06
9,TeamrankNaiveUpsetProbability,+,0.056
4,SeedDiff,+,0.04
6,xOffyOffTODiff,+,0.032
0,TrankRating_x,-,-0.031
3,OR%Var_y,+,0.029
7,xOffyDefAstAvg,+,0.027
2,FTRVar_x,+,0.027
1,ORB%_x,-,-0.021


In [58]:
features(1157)

Unnamed: 0,Feature,Sign,Coefficient
8,TrankNaiveUpsetProbability,+,0.024
9,TeamrankNaiveUpsetProbability,+,0.023
5,TrankPredictedSpread,-,-0.023
4,SeedDiff,+,0.017
0,TrankRating_x,-,-0.013
6,xOffyOffTODiff,+,0.011
7,xOffyDefAstAvg,+,0.009
3,OR%Var_y,+,0.009
2,FTRVar_x,+,0.009
1,ORB%_x,-,-0.007


In [66]:
final_results[['Type', 'Num_Features', 'Features', 'Model', 'Model_Coef', '2008_Score',
               '2009_Score', '2010_Score', '2011_Score', '2012_Score', '2013_Score',
               '2014_Score', '2015_Score', '2016_Score', '2017_Score', '2018_Score',
               '2019_Score']].to_csv('mydata/training_results_prob.csv', index = False)

In [64]:
final_results.loc[1157, 'Features']

Index(['TrankRating_x', 'ORB%_x', 'FTRVar_x', 'OR%Var_y', 'SeedDiff',
       'TrankPredictedSpread', 'xOffyOffTODiff', 'xOffyDefAstAvg',
       'TrankNaiveUpsetProbability', 'TeamrankNaiveUpsetProbability'],
      dtype='object')

In [65]:
pickle.dump(final_results.loc[1157, 'Model'], open('models/MProb.sav', 'wb'))