In [29]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the two CSV files into their own dataframes.
cbb = pd.read_csv("cbb.csv")
cbb2020 = pd.read_csv("cbb20.csv")


# Rough first look: ordinary least squares of wins on the team stats.
# NOTE(review): the model is fit on the raw, unscaled columns, so each
# coefficient's magnitude reflects the stat's units as well as its association
# with wins -- compare magnitudes across stats with care.
Y = cbb[['W']]  # dependent variable: wins (kept as a one-column frame)
# independent variables: the team stats
X = cbb[[
    'ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D', 'TOR', 'TORD', 'ORB', 'DRB',
    'FTR', 'FTRD', '2P_O', '2P_D', '3P_O', '3P_D', 'ADJ_T', 'WAB',
]]
x_vars = X.columns.tolist()  # names of the independent variables

# Fit the regression (fit() returns the estimator itself).
cbb_reg = LinearRegression().fit(X, Y)

# Put variable names and fitted coefficients side by side.
# coef_ has one row per target, so transpose it into a single column (named 0).
variable_col = pd.DataFrame({'Variable': X.columns})
coefficient_col = pd.DataFrame(cbb_reg.coef_.T)
coefficients = pd.concat([variable_col, coefficient_col], axis=1)
coef_sum = coefficients.rename(columns={0: 'Coefficient'})  # readable column name
coef_sum

Unnamed: 0,Variable,Coefficient
0,ADJOE,-0.184692
1,ADJDE,0.349388
2,BARTHAG,-2.871174
3,EFG_O,1.107144
4,EFG_D,-0.99567
5,TOR,-0.488813
6,TORD,0.760188
7,ORB,0.212232
8,DRB,-0.422526
9,FTR,0.02399


In [30]:
# function for predicting team wins in each conference

def total_win_predict(past_data, pred_data):
    """Predict each team's win total, one model per conference.

    For every conference found in ``pred_data['CONF']`` a ``LogisticRegression``
    is trained on that conference's rows of ``past_data`` (win totals are
    treated as class labels) and used to predict ``W`` for the conference's
    rows of ``pred_data``. A per-conference comparison table and accuracy are
    printed, followed by an overall accuracy table.

    Parameters
    ----------
    past_data : pandas.DataFrame
        Training rows. Must contain ``CONF``, ``W`` and the stat columns
        listed in ``feature_cols`` below.
    pred_data : pandas.DataFrame
        Rows to predict. Must contain ``TEAM``, ``CONF``, ``W`` and the same
        stat columns.

    Returns
    -------
    pandas.DataFrame
        One row per conference with columns ``CONF`` and
        ``Prediction Accuracy``. The accuracy is NaN for a conference that
        could not be modelled (no rows, or fewer than two distinct win totals
        in ``past_data``).

    Notes
    -----
    NOTE(review): accuracy here is the share of teams whose win total was
    predicted *exactly*, and a classifier can only output win totals it saw
    during training. A regression model scored with an error metric may suit
    this task better -- confirm the intent before relying on these numbers.
    """
    # Stat columns used as model inputs (defined once for both train and test).
    feature_cols = ['ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D', 'TOR', 'TORD',
                    'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D', '3P_O', '3P_D',
                    'ADJ_T', 'WAB']

    confs = pred_data.CONF.unique()  # conference names to loop through
    accuracy_scores = []  # one entry per conference, same order as confs
    for conf in confs:
        # rows belonging to this conference
        conf_past = past_data.loc[past_data['CONF'] == conf]
        conf_pred = pred_data.loc[pred_data['CONF'] == conf]

        # past seasons are the training set, the prediction season is the test set
        features_train = conf_past[feature_cols]
        # 1-D Series target: a one-column DataFrame triggers sklearn's
        # "column-vector y was passed" DataConversionWarning on every fit
        target_train = conf_past['W']
        features_test = conf_pred[feature_cols]
        target_test = conf_pred['W']

        # LogisticRegression needs at least two distinct classes to fit and at
        # least one row to predict; skip the conference instead of aborting.
        if conf_pred.empty or target_train.nunique() < 2:
            print(conf)
            print("Skipped: not enough data to fit and evaluate a model")
            accuracy_scores.append(np.nan)
            continue

        # train the classifier and predict win totals for the test season
        log_reg = LogisticRegression().fit(features_train, target_train)
        wins_pred = log_reg.predict(features_test)
        # fraction of teams whose predicted win total equals the actual one
        accuracy = accuracy_score(target_test, wins_pred)
        accuracy_scores.append(accuracy)

        # side-by-side comparison of predicted and actual wins
        conf_summary = pd.DataFrame({
            'TEAM': list(conf_pred.TEAM),
            'Predicted Wins': wins_pred,
            'Actual Wins': list(conf_pred.W),
        })

        print(conf)
        print(conf_summary)
        print("Accuracy: \n", accuracy)

    # accuracy score for each conference
    accuracy_summary = pd.DataFrame({'CONF': confs,
                                     'Prediction Accuracy': accuracy_scores})
    print(accuracy_summary)
    # returned as well as printed so callers can keep working with the scores
    return accuracy_summary
        
    

In [31]:
# Run the per-conference prediction: cbb is passed as past_data (training rows)
# and cbb2020 as pred_data (rows to predict and score).
total_win_predict(cbb, cbb2020)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


B12
            TEAM  Predicted Wins  Actual Wins
0         Kansas              31           28
1         Baylor              24           26
2  West Virginia              27           21
3     Texas Tech              23           18
4       Oklahoma              20           19
5   Oklahoma St.              17           18
6          Texas              24           19
7            TCU              24           16
8       Iowa St.              24           12
9     Kansas St.              11           11
Accuracy: 
 0.1
WCC
               TEAM  Predicted Wins  Actual Wins
0           Gonzaga              22           31
1               BYU              17           24
2      Saint Mary's              17           25
3     San Francisco              22           22
4           Pacific              20           23
5        Pepperdine              17           16
6       Santa Clara              19           19
7  Loyola Marymount              22           11
8         San Diego          

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


ASun
                 TEAM  Predicted Wins  Actual Wins
0             Liberty              17           30
1       North Florida              19           21
2        Jacksonville              10           14
3            Lipscomb              14           16
4             Stetson              17           16
5                NJIT              10            9
6       North Alabama              10           13
7  Florida Gulf Coast              14           10
8        Kennesaw St.               5            1
Accuracy: 
 0.0
SB
                   TEAM  Predicted Wins  Actual Wins
0             Texas St.              17           21
1           Georgia St.              20           19
2          UT Arlington              16           14
3           Little Rock              22           21
4      Georgia Southern              21           20
5         South Alabama              22           20
6      Coastal Carolina              20           16
7       Appalachian St.              21   

NEC
                   TEAM  Predicted Wins  Actual Wins
0        St. Francis PA              18           22
1         Robert Morris              18           20
2          Sacred Heart              16           20
3             Merrimack              13           20
4                Bryant              16           15
5          LIU Brooklyn              18           15
6   Fairleigh Dickinson              18           11
7      Mount St. Mary's              15           11
8        St. Francis NY              13           13
9                Wagner              13            8
10  Central Connecticut              13            4
Accuracy: 
 0.09090909090909091
SWAC
                     TEAM  Predicted Wins  Actual Wins
0                Southern              18           17
1        Prairie View A&M              22           19
2             Jackson St.              13           14
3          Texas Southern              13           16
4           Grambling St.              24       

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [32]:
# function predicting wins for only the power 6 conferences

def total_win_predict_POWER(past_data, pred_data, power_confs=None):
    """Predict each team's win total for the power conferences only.

    For every conference in ``power_confs`` a ``LogisticRegression`` is trained
    on that conference's rows of ``past_data`` (win totals are treated as class
    labels) and used to predict ``W`` for the conference's rows of
    ``pred_data``. A per-conference comparison table and accuracy are printed,
    followed by an overall accuracy table.

    Parameters
    ----------
    past_data : pandas.DataFrame
        Training rows. Must contain ``CONF``, ``W`` and the stat columns
        listed in ``feature_cols`` below.
    pred_data : pandas.DataFrame
        Rows to predict. Must contain ``TEAM``, ``CONF``, ``W`` and the same
        stat columns.
    power_confs : sequence of str, optional
        Conference codes to evaluate. Defaults to the six codes
        ``'B12', 'B10', 'ACC', 'BE', 'P12', 'SEC'``.

    Returns
    -------
    pandas.DataFrame
        One row per conference with columns ``CONF`` and
        ``Prediction Accuracy``. The accuracy is NaN for a conference that
        could not be modelled (absent from ``pred_data``, or fewer than two
        distinct win totals in ``past_data``).

    Notes
    -----
    NOTE(review): accuracy here is the share of teams whose win total was
    predicted *exactly*, and a classifier can only output win totals it saw
    during training. A regression model scored with an error metric may suit
    this task better -- confirm the intent before relying on these numbers.
    """
    # Stat columns used as model inputs (defined once for both train and test).
    feature_cols = ['ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D', 'TOR', 'TORD',
                    'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D', '3P_O', '3P_D',
                    'ADJ_T', 'WAB']

    if power_confs is None:
        # power conference names only to loop through
        power_confs = np.array(['B12', 'B10', 'ACC', 'BE', 'P12', 'SEC'], dtype=object)

    accuracy_scores = []  # one entry per conference, same order as power_confs
    for conf in power_confs:
        # rows belonging to this conference
        conf_past = past_data.loc[past_data['CONF'] == conf]
        conf_pred = pred_data.loc[pred_data['CONF'] == conf]

        # past seasons are the training set, the prediction season is the test set
        features_train = conf_past[feature_cols]
        # 1-D Series target: a one-column DataFrame triggers sklearn's
        # "column-vector y was passed" DataConversionWarning on every fit
        target_train = conf_past['W']
        features_test = conf_pred[feature_cols]
        target_test = conf_pred['W']

        # The conference list is not derived from the data, so a code may be
        # missing from either frame. LogisticRegression also needs at least two
        # distinct classes; skip the conference instead of aborting the run.
        if conf_pred.empty or target_train.nunique() < 2:
            print(conf)
            print("Skipped: not enough data to fit and evaluate a model")
            accuracy_scores.append(np.nan)
            continue

        # train the classifier and predict win totals for the test season
        log_reg = LogisticRegression().fit(features_train, target_train)
        wins_pred = log_reg.predict(features_test)
        # fraction of teams whose predicted win total equals the actual one
        accuracy = accuracy_score(target_test, wins_pred)
        accuracy_scores.append(accuracy)

        # side-by-side comparison of predicted and actual wins
        conf_summary = pd.DataFrame({
            'TEAM': list(conf_pred.TEAM),
            'Predicted Wins': wins_pred,
            'Actual Wins': list(conf_pred.W),
        })

        print(conf)
        print(conf_summary)
        print("Accuracy: \n", accuracy)

    # accuracy score for each power conference
    accuracy_summary = pd.DataFrame({'CONF': power_confs,
                                     'Prediction Accuracy': accuracy_scores})
    print(accuracy_summary)
    # returned as well as printed: callers assign this function's result
    return accuracy_summary

In [33]:
# Run the power-conference prediction: cbb is passed as past_data (training
# rows) and cbb2020 as pred_data (rows to predict and score).
power_pred = total_win_predict_POWER(cbb,cbb2020)

B12
            TEAM  Predicted Wins  Actual Wins
0         Kansas              31           28
1         Baylor              24           26
2  West Virginia              27           21
3     Texas Tech              23           18
4       Oklahoma              20           19
5   Oklahoma St.              17           18
6          Texas              24           19
7            TCU              24           16
8       Iowa St.              24           12
9     Kansas St.              11           11
Accuracy: 
 0.1
B10
            TEAM  Predicted Wins  Actual Wins
0   Michigan St.              26           22
1       Ohio St.              27           21
2       Michigan              26           19
3       Penn St.              24           21
4      Wisconsin              15           21
5         Purdue              15           16
6       Maryland              19           24
7      Minnesota              26           15
8       Illinois              19           21
9        R

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
