In [None]:
import random

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the per-game table written out by W1_DataCleaning.ipynb and preview it
game_data = pd.read_csv(filepath_or_buffer='mydata/womens/game_data.csv')
game_data.head(n=5)

In [None]:
# Modelling table for the win-probability logistic regression.
# Every "Diff" column is team x minus team y, using game_data as it stands here.
win_prob_df = pd.DataFrame({
    # Response variable: 1 when team x outscored team y, otherwise 0
    'Win': game_data['Score_x'].gt(game_data['Score_y']).astype('int64'),
    # Predictor: difference in NCAA tournament seeds
    'SeedDiff': game_data['Seed_x'].sub(game_data['Seed_y']),
    # Predictor: difference in Moore ratings
    'MooreRatingDiff': game_data['MooreRating_x'].sub(game_data['MooreRating_y']),
    # Season is carried along so games can later be grouped by season
    'Season': game_data['Season'],
})

win_prob_df.head()

In [None]:
# To make team x be the team with the higher rating and team y be the team with the lower rating
def switch_teams(row, paired_columns=('Score', 'TeamID', 'MooreRating', 'Seed')):
    """Return the game oriented so that team x is the higher-rated team.

    Parameters
    ----------
    row : pandas.Series (or any mapping with a ``copy`` method)
        One game. Must contain ``MooreRating_x`` and ``MooreRating_y``.
    paired_columns : iterable of str, optional
        Base names of the per-team columns. For each base name ``B`` the
        values of ``B_x`` and ``B_y`` are exchanged when the teams are
        switched. Pairs that are not present in ``row`` are skipped.

    Returns
    -------
    The input ``row`` itself when team x already has the higher (or an equal,
    or a non-comparable/NaN) rating; otherwise a copy with every paired
    column swapped. The input is never modified.
    """
    # Only swap when team x is strictly the lower-rated team
    if not row['MooreRating_x'] < row['MooreRating_y']:
        return row

    # Work on a copy: the object handed over by DataFrame.apply should not be mutated
    swapped = row.copy()
    for base in paired_columns:
        x_col, y_col = base + '_x', base + '_y'
        # Swap every per-team attribute (including the seed) so that derived
        # differences such as Seed_x - Seed_y stay consistent with the ratings
        if x_col in row and y_col in row:
            swapped[x_col], swapped[y_col] = row[y_col], row[x_col]
    return swapped

In [None]:
# Run switch_teams over every game (one row at a time), then preview the first 60 rows
game_data = game_data.apply(switch_teams, axis='columns')
game_data.head(n=60)

In [None]:
# create a df for a knn classifier model to predict upset probability
#
# At this point game_data has been passed through switch_teams, so team x is
# the higher-rated team (the favorite) and team y the lower-rated team.
upset_prob_df = pd.DataFrame({
    # Response variable: an upset is the lower-rated team (team y) outscoring
    # the higher-rated team (team x)
    'Upset': (game_data['Score_x'] < game_data['Score_y']).astype('int64'),
    # Predictor: difference in NCAA tournament seeds (team x minus team y)
    'SeedDiff': game_data['Seed_x'] - game_data['Seed_y'],
    # Predictor: difference in Moore ratings (team x minus team y)
    'MooreRatingDiff': game_data['MooreRating_x'] - game_data['MooreRating_y'],
    # Season is carried along so games can later be grouped by season
    'Season': game_data['Season'],
})

upset_prob_df.head()

### Logistic Regression

In [None]:
# Candidate values passed to LogisticRegression as C. Note that C is the INVERSE
# of the L2 (ridge) penalty strength: smaller values regularize more strongly.
ridge_coef = [1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1e0, 3e0, 1e1, 3e1, 1e2, 3e2]

def train_logistic_regression():
    """Cross-validate L2-penalized logistic regressions on win_prob_df.

    For every value in ``ridge_coef`` the model is evaluated with
    leave-one-season-out cross validation: each season is held out in turn,
    the predictors are standardized using the training seasons only, and the
    log loss on the held-out season is recorded. The model is then refit on
    all seasons so its coefficients can be inspected.

    Returns
    -------
    pandas.DataFrame
        One row per value in ``ridge_coef`` with the columns ``Type``
        (always ``'log'``), ``Model`` (the model refit on all data),
        ``Model_Coef`` (``{'ridge_coef': C}``) and one ``<season>_Score``
        column per season present in the data, in ascending season order.
    """
    predictors = win_prob_df.drop(columns = ['Win', 'Season'])
    target = win_prob_df['Win']
    season_of_game = win_prob_df['Season']
    seasons = sorted(season_of_game.unique())

    # to store model info during cross validation
    scores = {season: [] for season in seasons}
    model_coefs = []
    models = []

    # for each ridge regularization coefficient
    for c in ridge_coef:

        # every season is held out once for EVERY coefficient, so each score
        # list ends up with exactly one entry per model
        for season in seasons:
            held_out = season_of_game == season

            # Select labels with the same boolean mask as the predictors so
            # rows and labels stay aligned
            y_train = target[~held_out]
            y_val = target[held_out]

            # Fit the scaler on the training seasons only to avoid leaking
            # validation statistics into training
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(predictors[~held_out]),
                                   columns = predictors.columns, index = y_train.index)
            X_val = pd.DataFrame(scaler.transform(predictors[held_out]),
                                 columns = predictors.columns, index = y_val.index)

            # fit logistic regression
            log_model = LogisticRegression(penalty = 'l2', C = c, max_iter = 10000, random_state = 0, solver = "sag").fit(X_train, y_train)

            # predict win probabilities
            predictions = log_model.predict_proba(X_val)

            # Passing labels keeps log_loss well defined even when a held-out
            # season happens to contain a single class
            scores[season].append(log_loss(y_val, predictions, labels = log_model.classes_))

        # retrain model on full dataset for coefficients
        X_all = pd.DataFrame(StandardScaler().fit_transform(predictors),
                             columns = predictors.columns, index = predictors.index)
        log_model = LogisticRegression(penalty = 'l2', C = c, max_iter = 10000, random_state = 0, solver = "sag").fit(X_all, target)

        # store model details
        models.append(log_model)
        model_coefs.append({'ridge_coef': c})

    # return dataframe of results, one score column per season found in the data
    results = {'Type': ['log'] * len(models),
               'Model': models,
               'Model_Coef': model_coefs}
    for season in seasons:
        results['{}_Score'.format(int(season))] = scores[season]
    return pd.DataFrame(results)

### K-Nearest Neighbors

In [None]:
# Search space for the KNN hyper-parameters: k in 10..108 and both weighting schemes
ks = [10 + i for i in range(99)]
weights = ['uniform', 'distance']

def train_knn(n_models = 50):
    """Randomly search KNN hyper-parameters on upset_prob_df.

    ``n_models`` (k, weights) combinations are drawn at random from ``ks``
    and ``weights`` (seeded for repeatability; the same combination may be
    drawn more than once). Each is evaluated with leave-one-season-out cross
    validation: every season is held out in turn, the predictors are
    standardized using the training seasons only, and the log loss on the
    held-out season is recorded.

    Parameters
    ----------
    n_models : int, optional
        Number of random hyper-parameter draws to evaluate (default 50).

    Returns
    -------
    pandas.DataFrame
        One row per draw with the columns ``Type`` (always ``'knn'``),
        ``Model`` (always ``None``; fitted KNN models are not kept),
        ``Model_Coef`` (``{'k': ..., 'weights': ...}``) and one
        ``<season>_Score`` column per season present in the data, in
        ascending season order.
    """
    predictors = upset_prob_df.drop(columns = ['Upset', 'Season'])
    target = upset_prob_df['Upset']
    season_of_game = upset_prob_df['Season']
    seasons = sorted(season_of_game.unique())

    # to store model info during cross validation
    scores = {season: [] for season in seasons}
    model_coefs = []
    models = []

    # for repeatable randomization
    random.seed(0)

    for _ in range(n_models):

        # random.randint is inclusive on both ends, hence the "- 1"
        knn_params = {'n_neighbors': ks[random.randint(0, len(ks) - 1)],
                      'weights': weights[random.randint(0, len(weights) - 1)]}

        # every season is held out once for EVERY draw, so each score list
        # ends up with exactly one entry per model
        for season in seasons:
            held_out = season_of_game == season

            # Select labels with the same boolean mask as the predictors so
            # rows and labels stay aligned
            y_train = target[~held_out]
            y_val = target[held_out]

            # Fit the scaler on the training seasons only to avoid leaking
            # validation statistics into training
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(predictors[~held_out]),
                                   columns = predictors.columns, index = y_train.index)
            X_val = pd.DataFrame(scaler.transform(predictors[held_out]),
                                 columns = predictors.columns, index = y_val.index)

            # fit knn
            knn_model = KNeighborsClassifier(n_neighbors = knn_params['n_neighbors'],
                                             weights = knn_params['weights']).fit(X_train, y_train)

            # predict upset probabilities
            predictions = knn_model.predict_proba(X_val)

            # Passing labels keeps log_loss well defined even when a held-out
            # season happens to contain a single class
            scores[season].append(log_loss(y_val, predictions, labels = knn_model.classes_))

        # store model details, no need to store KNN model yet
        models.append(None)
        model_coefs.append({'k': knn_params['n_neighbors'], 'weights': knn_params['weights']})

    # return dataframe of results, one score column per season found in the data
    results = {'Type': ['knn'] * len(models),
               'Model': models,
               'Model_Coef': model_coefs}
    for season in seasons:
        results['{}_Score'.format(int(season))] = scores[season]
    return pd.DataFrame(results)

### Results

In [None]:
# Fit both model families, then stack their result tables under one fresh 0..n-1 index
log_results = train_logistic_regression()
knn_results = train_knn()
final_results = pd.concat((log_results, knn_results), axis=0, ignore_index=True)

In [None]:
# calculate percentiles of scores in relation to other models
#
# Only per-season columns named "<digits>_Score" are used. Matching on a bare
# "Score" suffix would also pick up the derived 'Avg_Score' column whenever
# this cell is re-run after the averages have been added.
SCORE_SUFFIX = '_Score'
score_cols = [col for col in final_results.columns
              if col.endswith(SCORE_SUFFIX) and col[:-len(SCORE_SUFFIX)].isdigit()]
percentile_cols = [col[:-len(SCORE_SUFFIX)] + '_Percentile' for col in score_cols]

for score_col, percentile_col in zip(score_cols, percentile_cols):
    # ascending = False ranks the largest log loss first, so the model with
    # the lowest (best) log loss receives the highest percentile
    final_results[percentile_col] = final_results[score_col].rank(pct = True, ascending = False)

In [None]:
# Per-model (row-wise) mean of the season scores and of the season percentiles
final_results['Avg_Score'] = final_results.loc[:, score_cols].mean(axis='columns')
final_results['Avg_Percentile'] = final_results.loc[:, percentile_cols].mean(axis='columns')

In [None]:
# function to show feature importances of given model by results row index
def features(row_index):
    """Print the feature coefficients of the model stored in a results row.

    Parameters
    ----------
    row_index
        Index label of the row in ``final_results`` to inspect.

    For a logistic regression row (``Type == 'log'``) one line is printed per
    feature with its fitted coefficient. Feature names are taken from a
    ``Features`` column of ``final_results`` when one exists, otherwise from
    the model's ``feature_names_in_`` attribute, and otherwise generic
    ``x0, x1, ...`` names are used. For any other row a fixed message is
    printed, since KNN has no per-feature coefficients. Returns ``None``.
    """
    # KNN doesn't really have a feature importance
    if final_results.loc[row_index, 'Type'] != 'log':
        print('All features have equal weight in KNN')
        return

    model = final_results.loc[row_index, 'Model']
    coefficients = model.coef_[0]

    # Resolve feature names without assuming a 'Features' column exists
    if 'Features' in final_results.columns:
        model_features = list(final_results.loc[row_index, 'Features'])
    elif hasattr(model, 'feature_names_in_'):
        model_features = list(model.feature_names_in_)
    else:
        model_features = ['x{}'.format(i) for i in range(len(coefficients))]

    result_string = 'Feature Coefficients: \n\n'
    for feature_name, coefficient in zip(model_features, coefficients):
        # format() converts the numeric coefficient; str + float would raise TypeError
        result_string += '{}: {}\n'.format(feature_name, coefficient)
    print(result_string)

In [None]:
# Show every row (no truncation) of the summary columns, best average log loss first
pd.set_option('display.max_rows', None)
final_results.loc[:, ['Type', 'Model_Coef', 'Avg_Score', 'Avg_Percentile']].sort_values(by='Avg_Score')