In [1]:
import random

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
# Matchup-level game data *with* team stats, produced by W1_DataCleaning.ipynb.
# Per the cleaning notebook this file goes back to the 2010 season.
# The CSV's unnamed first column is read in as 'Unnamed: 0' (see the preview below).
game_data = pd.read_csv('mydata/womens/matchups.csv')
game_data.head(n = 5)

Unnamed: 0,Season,MooreRating_x,MooreRating_y,Seed_x,3ptRate_x,Ast%_x,FT%_x,OppFT%_x,Opp3ptRate_x,OppAst%_x,...,xOffyOffFTRateDiff,AbsxOffyDefAstDiff,AbsyOffxDefAstDiff,xOffyDefAstAvg,yOffxDefAstAvg,xOffyOffAstDiff,TotalPossVarSum,GameScoreVarSum,AvgTotalPoss,MooreNaiveUpsetProbability
0,2010.0,91.81,78.17,4,0.153633,0.562874,0.726562,0.687204,0.318363,0.449704,...,0.103332,0.046362,0.116533,0.539693,0.50797,0.031723,323.329209,249.79526,142.390152,0.111138
1,2010.0,91.81,82.29,4,0.153633,0.562874,0.726562,0.687204,0.318363,0.449704,...,0.137651,0.109423,0.21603,0.617586,0.557719,0.059867,498.758776,240.489802,139.451,0.192651
2,2010.0,95.45,91.81,1,0.230377,0.550165,0.678383,0.679487,0.302521,0.512938,...,-0.085583,0.100461,0.049937,0.499935,0.537906,-0.037971,356.530636,259.872359,139.373125,0.374739
3,2010.0,92.39,91.81,2,0.226961,0.53085,0.662016,0.665517,0.309621,0.603648,...,-0.118697,0.081146,0.040774,0.490277,0.583261,-0.092984,359.897075,306.88258,141.7125,0.481327
4,2010.0,114.8,91.81,1,0.276465,0.623,0.723077,0.630556,0.288594,0.486111,...,-0.052871,0.173296,0.076763,0.536352,0.524493,0.011859,372.720173,234.122715,141.959375,0.016799


In [3]:
# number of matchups (rows) in the stats dataframe
game_data.shape[0]

630

In [4]:
# Matchup-level game data *without* team stats, produced by W1_DataCleaning.ipynb.
# Per the cleaning notebook this file goes back to the 2005 season.
# The CSV's unnamed first column is read in as 'Unnamed: 0' (see the preview below).
game_data_no_stats = pd.read_csv('mydata/womens/matchups_no_stats.csv')
game_data_no_stats.head(n = 5)

Unnamed: 0,Season,MooreRating_x,MooreRating_y,Seed_x,Seed_y,Upset,ScoreDiff,SeedDiff,MooreRatingPredictedSpread
0,2005.0,86.08,66.75,5,12,0,22.0,-7,19.33
1,2005.0,86.08,84.71,5,4,0,9.0,1,1.37
2,2005.0,95.16,66.2,2,15,0,21.0,-13,28.96
3,2005.0,95.16,79.34,2,10,0,23.0,-8,15.82
4,2005.0,82.85,79.34,7,10,1,-3.0,-3,3.51


In [5]:
# number of matchups (rows) in the no-stats dataframe
game_data_no_stats.shape[0]

945

In [6]:
# Columns that must never reach a model as features:
#   'Upset'      - the prediction target itself
#   'ScoreDiff'  - NOTE(review): looks like the realised game margin; in the preview
#                  rows Upset is 1 exactly when ScoreDiff is negative, so keeping it
#                  would leak the target. Confirm against W1_DataCleaning.ipynb.
#   'Unnamed: 0' - the CSV's unnamed first column (0, 1, 2, ... in the previews),
#                  a row counter rather than a property of the game
NON_FEATURE_COLS = ['Upset', 'ScoreDiff', 'Unnamed: 0']

# errors = 'ignore' because a given frame is not guaranteed to contain every one of
# these columns; the target lookup below still fails loudly if 'Upset' is missing
y_stats = game_data['Upset']
X_stats = game_data.drop(columns = NON_FEATURE_COLS, errors = 'ignore')
y_no_stats = game_data_no_stats['Upset']
X_no_stats = game_data_no_stats.drop(columns = NON_FEATURE_COLS, errors = 'ignore')

### Logistic Regression

In [None]:
# Grid of C values for the L1 ("lasso") logistic regression used for feature selection.
# These are passed to LogisticRegression as C, which scikit-learn defines as the
# INVERSE of regularization strength (smaller C = stronger penalty).
# 200 values on a geometric grid: the first is 0.02, each one is 6% larger than the
# previous, and the last is roughly 2000.
lasso_reg = [0.02 * (1.06) ** step for step in range(200)]

# Grid of C values for the L2 ("ridge") logistic regression that guards against
# overfitting (again inverse strength, so these are all strong penalties).
ridge_reg = [
    1e-7, 3e-7,
    1e-6, 3e-6,
    1e-5, 3e-5,
    1e-4, 3e-4,
    1e-3, 3e-3,
]

# Shared scaler: columns are standardised before any regularised fit.
# It is re-fit in place by every training function that uses it.
scale = StandardScaler()

def train_logistic_regression(stats, features = None):
    """Train L1-selected, L2-regularised logistic regressions with leave-one-season-out CV.

    For every C in the lasso grid an L1 logistic regression picks the columns with
    non-zero coefficients. For each distinct number of selected columns, one L2
    logistic regression per value in ``ridge_reg`` is cross-validated by holding
    out one season at a time, then refit on all rows.

    Parameters
    ----------
    stats : bool
        True  -> use ``X_stats`` / ``y_stats`` and run the lasso grid ``lasso_reg``.
        False -> use ``X_no_stats`` / ``y_no_stats`` with no lasso step (all columns kept).
    features : list of str, optional
        Restrict the lasso step to these columns. Only honoured when ``stats`` is
        True; the no-lasso path always keeps every column.

    Returns
    -------
    pandas.DataFrame
        One row per fitted model with columns Type, Num_Features, Features, Model,
        Model_Coef, Stats and ``<year>_Score`` for 2005-2019 (validation log loss;
        NaN for seasons not present in the chosen dataframe).

    Notes
    -----
    * Every column of X other than 'Season' is treated as a feature, so X must not
      contain outcome columns.
    * The lasso selection is fit on all seasons, including the one later held out,
      so the CV scores are slightly optimistic with respect to feature selection.
    * The module-level ``scale`` is re-fit repeatedly and left fit on the last data seen.
    """

    # pick the stats dataframe or the no-stats dataframe
    if stats:
        X, y = X_stats, y_stats
        lasso_coef = lasso_reg
    else:
        # this branch used to reuse X_stats / y_stats, so the no-stats data was never trained on
        X, y = X_no_stats, y_no_stats
        lasso_coef = [None]

    # seasons that get a '<year>_Score' column in the output
    score_seasons = range(2005, 2020)
    seasons = list(X['Season'].unique())

    # list/dictionaries to store results
    feature_list = []
    scores = {}
    model_coefs = []
    models = []
    num_features = []

    # for each possible lasso regularization coefficient
    for c in tqdm(lasso_coef):

        if c is not None:
            # dont use season as a feature, use inputted features if given
            if features:
                X_lasso = X[features]
            else:
                X_lasso = X.drop(columns = ['Season'])

            # scale columns before regularization
            X_lasso = pd.DataFrame(scale.fit_transform(X_lasso),
                                   columns = X_lasso.columns,
                                   index = X_lasso.index)

            # fit L1 logistic regression for feature selection
            lasso = LogisticRegression(penalty = 'l1', C = c, random_state = 0,
                                       solver = 'saga', max_iter = 10000).fit(X_lasso, y)

            # drop the columns that were not candidates plus those the lasso zeroed out
            zero_cols = []
            if features:
                zero_cols = list(set(X.drop(columns = ['Season']).columns) - set(features))
            zero_cols.extend(col for col, coef in zip(X_lasso.columns, lasso.coef_[0])
                             if coef == 0.0)
            nonzero_X = X.drop(columns = zero_cols)
        else:
            nonzero_X = X.copy()

        # need at least 1 feature besides Season, and skip feature counts already built
        n_selected = len(nonzero_X.columns) - 1
        if n_selected < 1 or n_selected in num_features:
            continue

        feature_frame = nonzero_X.drop(columns = ['Season'])

        # try each L2 regularization coefficient
        for c2 in ridge_reg:

            # cross validate over each season
            for season in seasons:

                # Build the masks BEFORE scaling. Rebuilding the frames from the scaled
                # arrays resets their index, so looking labels up by that index (as was
                # done before) returned the first n rows of y, not the matching rows.
                val_mask = nonzero_X['Season'] == season
                y_train = y.loc[~val_mask]
                y_val = y.loc[val_mask]
                X_train = feature_frame.loc[~val_mask]
                X_val = feature_frame.loc[val_mask]

                # scaler is fit on the training seasons only, then applied to the held-out one
                X_train = pd.DataFrame(scale.fit_transform(X_train),
                                       columns = X_train.columns, index = X_train.index)
                X_val = pd.DataFrame(scale.transform(X_val),
                                     columns = X_val.columns, index = X_val.index)

                # fit logistic regression
                log_model = LogisticRegression(penalty = 'l2', C = c2, max_iter = 10000,
                                               random_state = 0, solver = "sag").fit(X_train, y_train)

                # predict win probabilities
                predictions = log_model.predict_proba(X_val)

                # labels = classes_ keeps log_loss defined when a held-out season
                # happens to contain only one outcome class
                val_score = log_loss(y_val, predictions, labels = log_model.classes_)
                scores.setdefault(int(season), []).append(val_score)

            # retrain model on full dataset for coefficients
            X_full = pd.DataFrame(scale.fit_transform(feature_frame),
                                  columns = feature_frame.columns, index = feature_frame.index)
            log_model = LogisticRegression(penalty = 'l2', C = c2, max_iter = 10000,
                                           random_state = 0, solver = "sag").fit(X_full, y)

            # store model details
            feature_list.append(feature_frame.columns)
            num_features.append(len(feature_frame.columns))
            models.append(log_model)
            model_coefs.append({'lasso_coef': c, 'ridge_coef': c2})

    # seasons missing from the chosen dataframe (2005-2009 for the stats data) get NaN
    # scores; NaN rather than None keeps the score columns numeric
    for year in score_seasons:
        if year not in scores:
            scores[year] = [np.nan] * len(models)

    # return dataframe of results
    results = {'Type': ['log'] * len(models),
               'Num_Features': num_features,
               'Features': feature_list,
               'Model': models,
               'Model_Coef': model_coefs,
               'Stats': [stats] * len(models)}
    for year in score_seasons:
        results[str(year) + '_Score'] = scores[year]
    return pd.DataFrame(results)

### K-Nearest Neighbors

The KNN model is included because the high collinearity between SeedDiff and MooreRatingDiff is a problem for the logistic regression. It therefore uses only the dataframe without stats.

In [None]:
# candidate neighbour counts: every integer from 10 to 99 inclusive
ks = list(range(10, 100))
# candidate KNN weighting schemes
weights = ['uniform', 'distance']

def train_knn():
    """Randomly search 50 KNN configurations with leave-one-season-out CV.

    Uses the no-stats dataframe (``X_no_stats`` / ``y_no_stats``). Each of the 50
    configurations draws ``n_neighbors`` from ``ks`` and ``weights`` from
    ``weights`` (seeded, so the draws repeat between runs) and is scored by
    validation log loss on every held-out season.

    Returns
    -------
    pandas.DataFrame
        One row per configuration with columns Type, Num_Features, Features, Model
        (always None; models are not kept), Model_Coef, Stats (always False) and
        ``<year>_Score`` for 2005-2019 (NaN for seasons absent from the data).

    Notes
    -----
    * Every column of X_no_stats other than 'Season' is used as a feature, so that
      frame must not contain outcome columns.
    * Re-seeds Python's global ``random`` module and re-fits the module-level ``scale``.
    """

    # X / y were previously undefined names; per the section text KNN runs on the no-stats data
    X, y = X_no_stats, y_no_stats

    # seasons that get a '<year>_Score' column in the output
    score_seasons = range(2005, 2020)
    seasons = list(X['Season'].unique())
    feature_frame = X.drop(columns = ['Season'])

    # to store model info during cross validation
    scores = {}
    model_coefs = []
    models = []

    # for repeatable randomization
    random.seed(0)

    # train 50 models
    for i in range(50):

        knn_params = {'n_neighbors': ks[random.randint(0, len(ks) - 1)],
                      'weights': weights[random.randint(0, len(weights) - 1)]}

        # for each season
        for season in seasons:

            # The CV body used to sit inside `if season not in scores`, so only the
            # first model was ever scored; it now runs for every model and season.
            # Masks are built before scaling so labels stay aligned with their rows.
            val_mask = X['Season'] == season
            y_train = y.loc[~val_mask]
            y_val = y.loc[val_mask]
            X_train = feature_frame.loc[~val_mask]
            X_val = feature_frame.loc[val_mask]

            # scaler is fit on the training seasons only, then applied to the held-out one
            X_train = pd.DataFrame(scale.fit_transform(X_train),
                                   columns = X_train.columns, index = X_train.index)
            X_val = pd.DataFrame(scale.transform(X_val),
                                 columns = X_val.columns, index = X_val.index)

            # fit knn
            knn_model = KNeighborsClassifier(n_neighbors = knn_params['n_neighbors'],
                                             weights = knn_params['weights']).fit(X_train, y_train)

            # predict win probabilities
            predictions = knn_model.predict_proba(X_val)

            # labels = classes_ keeps log_loss defined when a held-out season
            # happens to contain only one outcome class
            val_score = log_loss(y_val, predictions, labels = knn_model.classes_)
            scores.setdefault(int(season), []).append(val_score)

        # store model details, no need to store KNN model yet
        models.append(None)
        model_coefs.append({'k': knn_params['n_neighbors'], 'weights': knn_params['weights']})

    # any season absent from the data gets NaN scores instead of raising a KeyError
    for year in score_seasons:
        if year not in scores:
            scores[year] = [np.nan] * len(models)

    # return dataframe of results
    results = {'Type': ['knn'] * len(models),
               'Num_Features': 'All',
               'Features': 'All',
               'Model': models,
               'Model_Coef': model_coefs,
               # `stats` was an undefined name here; KNN always uses the no-stats data
               'Stats': [False] * len(models)}
    for year in score_seasons:
        results[str(year) + '_Score'] = scores[year]
    return pd.DataFrame(results)

### Results

In [None]:
# Fit every model family, then stack the per-model result rows into one table.
log_results1 = train_logistic_regression(stats = True)    # stats dataframe
log_results2 = train_logistic_regression(stats = False)   # no-stats dataframe
knn_results = train_knn()

result_frames = [log_results1, log_results2, knn_results]
final_results = pd.concat(result_frames, ignore_index = True)

In [None]:
# collect the per-season validation score columns (names ending in "Score")
score_cols = [col for col in final_results.columns if col.endswith("Score")]

In [None]:
# Average each model's validation log loss over the seasons it was scored on.
# Seasons without a score are missing values and are skipped by mean(); the cast to
# float turns any None placeholders into NaN so the columns are numeric.
# NOTE: models trained on the stats data have no 2005-2009 scores, so their average
# covers fewer seasons than the no-stats models.
final_results['Avg_Score'] = final_results[score_cols].astype(float).mean(axis = 1)

In [None]:
# function to show feature importances of given model by results row index
def features(row_index):
    """Return the coefficients of the logistic model stored in a row of ``final_results``.

    Parameters
    ----------
    row_index
        Index label of the row in ``final_results`` to inspect.

    Returns
    -------
    pandas.DataFrame or None
        For rows whose Type is 'log': a frame with columns Feature, Sign and
        Coefficient (rounded to 3 decimals), ordered by decreasing absolute
        coefficient. Sign is '+' for positive values and '-' otherwise.
        For any other Type a message is printed and None is returned.
    """

    # KNN doesn't really have a feature importance
    if final_results.loc[row_index, 'Type'] != 'log':
        print('All features have equal weight in KNN')
        return None

    # the fitted model lives in the row itself; `model` used to be an undefined name
    model = final_results.loc[row_index, 'Model']
    model_features = list(final_results.loc[row_index, 'Features'])

    # coef_[0] is read positionally, in the same order as the stored feature names
    coef_list = [round(model.coef_[0][i], 3) for i in range(len(model_features))]
    sign_list = ['+' if coef > 0 else '-' for coef in coef_list]

    return_df = pd.DataFrame({'Feature': model_features,
                              'Sign': sign_list,
                              'Coefficient': coef_list})

    # largest-magnitude coefficients first
    order = return_df['Coefficient'].abs().sort_values(ascending = False).index
    return return_df.reindex(order)

In [None]:
# show every model, best (lowest) average validation log loss first
pd.set_option('display.max_rows', None)
summary_cols = ['Type', 'Model_Coef', 'Avg_Score']
final_results[summary_cols].sort_values(by = ['Avg_Score'])