## Model Building

Build logistic regression, random forest, and XGBoost models to predict the probability of an upset in the NCAA tournament

Uses the data output by MDataCleaning.ipynb; the resulting model is used in MPredictions.ipynb.

In [2]:
import pandas as pd
import random
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tqdm import tqdm

In [None]:
# matchup data output by MDataCleaning.ipynb
# (the cells below rely on its 'Upset' and 'Season' columns)
matchups = pd.read_csv('mydata/matchups.csv')
# preview the first rows to confirm the file loaded as expected
matchups.head()

In [None]:
# features: every column except the label; 'Season' stays in X because the
# training functions use it to build the cross-validation folds
X = matchups.drop(columns = 'Upset')

# label: the 'Upset' column
y = matchups['Upset']

### Logistic Regression

In [None]:
# C values tried for the L1 (lasso) feature-selection fit: 0.001, 0.002, ..., 0.349
# (scikit-learn's C is the inverse of regularization strength, so smaller
# values push more coefficients to zero)
lasso_reg = [step / 1000 for step in range(1, 350)]

# C values tried for the final L2 (ridge) fit, which guards against overfitting
ridge_reg = [0.0003, 0.001, 0.003, 0.01, 0.03]

# shared scaler: columns are standardized before any regularized fit
scale = StandardScaler()

def train_logistic_regression():
    """Select features with L1 logistic regression, then cross-validate L2 models.

    For every value in ``lasso_reg`` an L1-penalized logistic regression is fit
    on all standardized features (``Season`` excluded) and only the features
    with nonzero coefficients are kept. Each distinct feature count is
    evaluated once: for it, every value in ``ridge_reg`` is scored with
    leave-one-season-out cross-validation using log loss.

    Reads the module-level ``X``, ``y``, ``lasso_reg``, ``ridge_reg`` and
    ``scale``.

    Returns:
        pandas.DataFrame with one row per (lasso C, ridge C) combination and
        the columns ``Type`` (always ``'log'``), ``Num_Features``,
        ``Features``, ``Model``, ``Model_Coef`` and one ``<season>_Score``
        column (validation log loss) per season found in ``X['Season']``.
        ``Model`` is the estimator fit in the last fold, i.e. trained on every
        season except the last one iterated.
    """
    # list/dictionaries to store results
    feature_list = []
    scores = {}
    model_coefs = []
    models = []
    num_features = []

    seasons = list(X['Season'].unique())

    # dont use season as a feature; the scaled matrix does not depend on the
    # lasso coefficient, so it is built once instead of once per iteration
    X_lasso = X.drop(columns = ['Season'])
    X_lasso = pd.DataFrame(scale.fit_transform(X_lasso), columns = X_lasso.columns)

    # for each possible lasso regularization coefficient
    for c in tqdm(lasso_reg):

        # fit L1 logistic regression for feature selection
        lasso = LogisticRegression(penalty = 'l1', C = c, random_state = 0,
                                   solver = 'saga', max_iter = 10000).fit(X_lasso, y)

        # keep the columns with nonzero coefficients plus the season column
        zero_cols = X_lasso.columns[lasso.coef_[0] == 0.0]
        nonzero_X = X.drop(columns = zero_cols)
        selected = nonzero_X.columns.drop('Season')

        # skip if nothing was selected or this feature count was already built
        if len(selected) < 1 or len(selected) in num_features:
            continue

        # try each L2 regularization coefficient
        for c2 in ridge_reg:

            # cross validate over each season
            for season in seasons:
                scores.setdefault(season, [])

                # split into train and validation sets
                train_raw = nonzero_X[nonzero_X['Season'] != season].drop(columns = ['Season'])
                val_raw = nonzero_X[nonzero_X['Season'] == season].drop(columns = ['Season'])

                # look the labels up with the ORIGINAL row labels; the scaler
                # returns bare arrays, so a frame rebuilt from them without an
                # explicit index would point at the wrong rows of y
                y_train = y.loc[train_raw.index]
                y_val = y.loc[val_raw.index]

                # fit the scaler on the training fold only, then apply to both
                X_train = pd.DataFrame(scale.fit_transform(train_raw),
                                       columns = train_raw.columns, index = train_raw.index)
                X_val = pd.DataFrame(scale.transform(val_raw),
                                     columns = val_raw.columns, index = val_raw.index)

                # fit logistic regression
                log_model = LogisticRegression(penalty = 'l2', C = c2, max_iter = 10000,
                                               random_state = 0, solver = "sag").fit(X_train, y_train)

                # predict win probabilities
                predictions = log_model.predict_proba(X_val)

                # calculate log loss and store
                scores[season].append(log_loss(y_val, predictions))

            # store model details
            feature_list.append(selected)
            num_features.append(len(selected))
            models.append(log_model)
            model_coefs.append({'lasso_coef': c, 'ridge_coef': c2})

    # return dataframe of results, one score column per cross-validated season
    results = {'Type': ['log'] * len(models),
               'Num_Features': num_features,
               'Features': feature_list,
               'Model': models,
               'Model_Coef': model_coefs}
    for season in sorted(scores):
        results[f'{int(season)}_Score'] = scores[season]
    return pd.DataFrame(results)

### Random Forest

In [None]:
# random forest parameters: candidate values for each hyperparameter

# max_features candidates
max_features_list = [
    5, 10, 12, 14, 16, 18, 20,
    22, 24, 26, 28, 30, 35, 40,
]

# max_depth candidates (None is one of the options)
rf_max_depths = [
    5, 10, 15, 20, None,
]

# min_samples_leaf candidates
min_samples = [
    1, 0.01, 0.03,
]

# split criterion candidates
criterions = [
    'gini', 'entropy',
]

def train_random_forest():
    """Random-search 10 random forest configurations with season-wise CV.

    Each of the 10 iterations draws one value per hyperparameter from the
    module-level grids (``max_features_list``, ``rf_max_depths``,
    ``min_samples``, ``criterions``) and scores the configuration with
    leave-one-season-out cross-validation using log loss. The draws are
    seeded with ``random.seed(0)``.

    Reads the module-level ``X`` and ``y``; ``Season`` is used only to form
    the folds and is dropped from the features.

    Returns:
        pandas.DataFrame with one row per configuration and the columns
        ``Type`` (always ``'rf'``), ``Num_Features`` and ``Features`` (both
        ``'All'``), ``Model``, ``Model_Coef`` (the drawn hyperparameters) and
        one ``<season>_Score`` column (validation log loss) per season found
        in ``X['Season']``. ``Model`` is the estimator fit in the last fold.
    """
    # list/dictionaries to store results
    scores = {}
    models = []
    model_coefs = []

    random.seed(0)

    seasons = list(X['Season'].unique())

    # fit 10 models
    for _ in tqdm(range(10)):

        # randomly draw an actual VALUE for each parameter; a list holding a
        # random index is not a valid estimator argument
        rf_params = {'max_features': random.choice(max_features_list),
                     'max_depth': random.choice(rf_max_depths),
                     'min_samples_leaf': random.choice(min_samples),
                     'criterion': random.choice(criterions)
                    }

        # cross validate over each season
        for season in seasons:
            scores.setdefault(season, [])

            # split into train and validation sets
            X_train = X[X['Season'] != season].drop(columns = ['Season'])
            X_val = X[X['Season'] == season].drop(columns = ['Season'])
            y_train = y.loc[X_train.index]
            y_val = y.loc[X_val.index]

            # fit random forest model
            rf_model = RandomForestClassifier(n_estimators = 1000,
                                              random_state = 0,
                                              **rf_params).fit(X_train, y_train)

            # predict win probabilities
            predictions = rf_model.predict_proba(X_val)

            # calculate log loss and store score
            scores[season].append(log_loss(y_val, predictions))

        # store model details
        model_coefs.append(rf_params)
        models.append(rf_model)

    # return dataframe of results, one score column per cross-validated season
    results = {'Type': ['rf'] * len(models),
               'Num_Features': ['All'] * len(models),
               'Features': ['All'] * len(models),
               'Model': models,
               'Model_Coef': model_coefs}
    for season in sorted(scores):
        results[f'{int(season)}_Score'] = scores[season]
    return pd.DataFrame(results)

### XGBoost

In [None]:
# XGBoost parameters: candidate values for each hyperparameter

# 'eta' candidates
etas = [
    0.001, 0.003, 0.01, 0.03, 0.1, 0.3,
]

# 'max_depth' candidates
xgb_max_depths = [
    2, 6, 10, 14, 18,
]

# 'min_child_weight' candidates
min_child_weights = [
    0, 1, 3, 6, 10, 14,
]

# 'gamma' candidates
gammas = [
    0, 0.01, 0.03, 0.1, 0.3, 1, 3,
]

# 'subsample' candidates
subsamples = [
    0.8, 0.9, 1,
]

# 'lambda' candidates
lambdas = [
    0.0001, 0.0003, 0.001, 0.003,
    0.01, 0.03, 0.1, 0.3,
    1, 3, 10, 30,
    100, 300, 1000, 3000,
]

def train_xgboost():
    """Random-search 10 XGBoost configurations with season-wise CV.

    Each of the 10 iterations draws one value per hyperparameter from the
    module-level grids (``etas``, ``xgb_max_depths``, ``min_child_weights``,
    ``gammas``, ``subsamples``, ``lambdas``) and scores the configuration with
    leave-one-season-out cross-validation using log loss. The draws are
    seeded with ``random.seed(0)``.

    Reads the module-level ``X`` and ``y``; ``Season`` is used only to form
    the folds and is dropped from the features.

    Returns:
        pandas.DataFrame with one row per configuration and the columns
        ``Type`` (always ``'xgb'``), ``Num_Features`` and ``Features`` (both
        ``'All'``), ``Model``, ``Model_Coef`` (the keyword arguments passed to
        ``XGBClassifier``) and one ``<season>_Score`` column (validation log
        loss) per season found in ``X['Season']``. ``Model`` is the estimator
        fit in the last fold.
    """
    # list/dictionaries to store results
    scores = {}
    models = []
    model_coefs = []

    random.seed(0)

    seasons = list(X['Season'].unique())

    # fit 10 models
    for _ in tqdm(range(10)):

        # randomly draw an actual VALUE for each parameter; a list holding a
        # random index is not a valid estimator argument
        xgb_params = {'eta': random.choice(etas),
                      'max_depth': random.choice(xgb_max_depths),
                      'min_child_weight': random.choice(min_child_weights),
                      'gamma': random.choice(gammas),
                      'subsample': random.choice(subsamples),
                      'lambda': random.choice(lambdas),
                      'random_state': 0
                     }

        # cross validate over each season
        for season in seasons:
            scores.setdefault(season, [])

            # split into train and validation sets
            X_train = X[X['Season'] != season].drop(columns = ['Season'])
            X_val = X[X['Season'] == season].drop(columns = ['Season'])
            y_train = y.loc[X_train.index]
            y_val = y.loc[X_val.index]

            # fit XGBoost model
            xgb_model = XGBClassifier(**xgb_params).fit(X_train, y_train)

            # predict win probabilities
            predictions = xgb_model.predict_proba(X_val)

            # calculate log loss and store score
            scores[season].append(log_loss(y_val, predictions))

        # store model details
        model_coefs.append(xgb_params)
        models.append(xgb_model)

    # return dataframe of results, one score column per cross-validated season
    results = {'Type': ['xgb'] * len(models),
               'Num_Features': ['All'] * len(models),
               'Features': ['All'] * len(models),
               'Model': models,
               'Model_Coef': model_coefs}
    for season in sorted(scores):
        results[f'{int(season)}_Score'] = scores[season]
    return pd.DataFrame(results)

### Results

In [None]:
# train the models
# each call cross-validates one model family and returns a results DataFrame;
# they are kept as separate statements so a result that has already finished
# stays bound if a later call fails
log_results = train_logistic_regression()
rf_results = train_random_forest()
xgb_results = train_xgboost()
# stack the three result tables into one, renumbering the rows 0..n-1
final_results = pd.concat([log_results, rf_results, xgb_results], ignore_index = True)

In [None]:
# rank every model against the others within each score column; ranking in
# descending order means the model with the lowest log loss gets the highest
# percentile
score_cols = [name for name in final_results.columns if name.endswith("Score")]

# percentile column names reuse the first four characters (the year) of the score column
percentile_cols = [name[0:4] + "_Percentile" for name in score_cols]

for score_col, pct_col in zip(score_cols, percentile_cols):
    final_results[pct_col] = final_results[score_col].rank(pct = True, ascending = False)

In [None]:
# row-wise mean of the per-season scores and of the per-season percentiles
for avg_col, source_cols in (('Avg_Score', score_cols), ('Avg_Percentile', percentile_cols)):
    final_results[avg_col] = final_results[source_cols].mean(axis = 1)

In [None]:
# function to show feature importances of given model by results row index
def features(row_index):
    """Show which features drive the model stored in a row of ``final_results``.

    Args:
        row_index: index label of the row in the module-level ``final_results``.

    Returns:
        None for a logistic regression row (``Type == 'log'``); the feature
        names and their coefficients are printed instead. For any other row,
        the result of plotting the 15 largest ``feature_importances_`` as a
        horizontal bar chart.
    """
    model = final_results.loc[row_index, 'Model']

    # if model is logistic regression, print coefficients
    if final_results.loc[row_index, 'Type'] == 'log':
        model_features = final_results.loc[row_index, 'Features']

        # coefficients are floats, so they are formatted rather than
        # concatenated directly onto the feature name
        lines = [f'{name}: {coef}\n' for name, coef in zip(model_features, model.coef_[0])]
        print('Feature Coefficients: \n\n' + ''.join(lines))
        return None

    # if model is random forest or XGBoost, return feature importance plot
    # (those models are trained on every column of X except 'Season')
    feat_importances = pd.Series(model.feature_importances_, index = X.drop(columns = ['Season']).columns)
    return feat_importances.nlargest(15).plot(kind = 'barh')

In [None]:
# show every row instead of pandas' truncated view
pd.set_option('display.max_rows', None)

# summary columns only (the per-season scores and percentiles are left out),
# best model first: lowest average log loss at the top
summary_cols = ['Type', 'Num_Features', 'Features', 'Model', 'Model_Coef', 'Avg_Score', 'Avg_Percentile']
final_results[summary_cols].sort_values(by = 'Avg_Score')