In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss

## Men's Data

In [2]:
# Load data into Pandas dataframes
mRegDetail = pd.read_csv('data/men data/MRegularSeasonDetailedResults.csv')
mTournCompact = pd.read_csv('data/men data/MNCAATourneyCompactResults.csv')
mNames = pd.read_csv('data/men data/MTeamSpellings.csv')

In [3]:
display(mRegDetail.columns.values)

array(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore',
       'WLoc', 'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA',
       'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA',
       'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO',
       'LStl', 'LBlk', 'LPF'], dtype=object)

In [4]:
# Split regular season detailed results into dataframes focused on outcome for one team
mRegWinners = pd.DataFrame()
mRegLossers = pd.DataFrame()

# Establish new columns for that includes stats for one team
columns = ['Season', 'TeamID', 'Score', 'OppScore',
       'NumOT', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
       'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'OppFGM', 'OppFGA',
       'OppFGM3', 'OppFGA3', 'OppFTM', 'OppFTA', 'OppOR', 'OppDR', 'OppAst', 'OppTO',
       'OppStl', 'OppBlk', 'OppPF']

In [5]:
# Split winners from regular season
mRegWinners[columns] = mRegDetail[['Season', 'WTeamID', 'WScore', 'LScore',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA',
       'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA',
       'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO',
       'LStl', 'LBlk', 'LPF']]

# Add wins and losses columns
mRegWinners['Wins'] = 1
mRegWinners['Losses'] = 0

In [6]:
# Split lossers from regular season
mRegLossers[columns] = mRegDetail[['Season', 'LTeamID', 'LScore', 'WScore',
       'NumOT', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA',
       'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 'WFGM', 'WFGA',
       'WFGM3', 'WFGA3', 'WFTM', 'WFTA','WOR', 'WDR', 'WAst', 'WTO',
       'WStl', 'WBlk', 'WPF']]

# Add wins and losses columns
mRegLossers['Wins'] = 0
mRegLossers['Losses'] = 1

In [7]:
# Combine all games into one dataframe
mAllRegDetail = pd.concat([mRegWinners, mRegLossers])

In [8]:
# Sum stats for each season for each team for regular season
mRegSeasonDetail = mAllRegDetail.groupby(['Season', 'TeamID']).sum(numeric_only=True)
mRegSeasonDetail['NumGames'] = mRegSeasonDetail['Wins'] + mRegSeasonDetail['Losses']

In [9]:
display(mRegSeasonDetail.columns.values)

array(['Score', 'OppScore', 'NumOT', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM',
       'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'OppFGM',
       'OppFGA', 'OppFGM3', 'OppFGA3', 'OppFTM', 'OppFTA', 'OppOR',
       'OppDR', 'OppAst', 'OppTO', 'OppStl', 'OppBlk', 'OppPF', 'Wins',
       'Losses', 'NumGames'], dtype=object)

In [10]:
mRegSeasonDetail

Unnamed: 0_level_0,Unnamed: 1_level_0,Score,OppScore,NumOT,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,...,OppOR,OppDR,OppAst,OppTO,OppStl,OppBlk,OppPF,Wins,Losses,NumGames
Season,TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2003,1102,1603,1596,0,536,1114,219,583,312,479,117,...,269,564,256,363,152,44,514,12,16,28
2003,1103,2127,2110,8,733,1508,147,434,514,698,264,...,325,595,418,414,173,77,606,13,14,27
2003,1104,1940,1820,1,673,1601,178,556,416,586,380,...,305,634,327,388,155,89,539,17,11,28
2003,1105,1866,1993,4,634,1602,197,540,401,568,351,...,343,686,411,489,244,109,496,7,19,26
2003,1106,1781,1785,1,656,1548,171,494,298,461,344,...,317,626,330,422,246,89,452,13,15,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025,1476,1964,2056,0,676,1561,250,701,362,499,204,...,258,644,299,289,196,111,457,13,16,29
2025,1477,1995,2321,1,713,1714,260,828,309,480,246,...,311,728,491,404,286,119,476,5,26,31
2025,1478,2091,2356,7,718,1609,221,669,434,605,216,...,302,671,464,318,215,84,512,7,22,29
2025,1479,1842,2009,3,635,1507,196,549,376,466,174,...,233,662,389,351,161,79,472,12,16,28


In [11]:
mRegSeasonDetail.describe()

Unnamed: 0,Score,OppScore,NumOT,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,...,OppOR,OppDR,OppAst,OppTO,OppStl,OppBlk,OppPF,Wins,Losses,NumGames
count,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0,...,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0,7981.0
mean,2074.908407,2074.908407,2.038592,730.481894,1670.609698,201.372886,586.539782,412.571733,589.485904,308.813557,...,308.813557,701.262123,387.635509,392.439669,192.658564,98.668964,540.972685,14.841373,14.841373,29.682747
std,283.860997,229.252643,1.76708,101.760242,193.137376,46.954447,120.412234,78.573769,107.677541,70.248796,...,58.974359,85.703983,57.324465,67.874634,31.737869,20.763284,74.427939,6.139575,5.296653,2.806203
min,292.0,339.0,0.0,99.0,229.0,36.0,104.0,58.0,85.0,25.0,...,37.0,111.0,80.0,37.0,31.0,11.0,75.0,0.0,0.0,4.0
25%,1896.0,1947.0,1.0,666.0,1561.0,168.0,502.0,360.0,520.0,263.0,...,271.0,650.0,352.0,349.0,172.0,85.0,496.0,10.0,11.0,28.0
50%,2081.0,2086.0,2.0,731.0,1681.0,198.0,580.0,411.0,589.0,311.0,...,312.0,705.0,388.0,391.0,192.0,98.0,543.0,15.0,15.0,30.0
75%,2263.0,2223.0,3.0,796.0,1796.0,232.0,666.0,464.0,658.0,356.0,...,349.0,756.0,425.0,435.0,213.0,111.0,590.0,19.0,19.0,32.0
max,3016.0,2874.0,12.0,1113.0,2282.0,389.0,1209.0,709.0,1020.0,555.0,...,523.0,1040.0,645.0,704.0,334.0,181.0,795.0,34.0,31.0,36.0


In [12]:
# Replace 0s in Losses with a small number to avoid dividing by 0
mRegSeasonDetail['Losses'] = mRegSeasonDetail['Losses'].replace(0, 1e-5)

In [13]:
# Create ___ per game stat for each boxscore stat
mRegSeasonFeatures = pd.DataFrame()

stats = ['Score', 'OppScore', 'NumOT', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
       'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'OppFGM', 'OppFGA',
       'OppFGM3', 'OppFGA3', 'OppFTM', 'OppFTA', 'OppOR', 'OppDR', 'OppAst', 'OppTO',
       'OppStl', 'OppBlk', 'OppPF']

for col in stats:
    mRegSeasonFeatures[col + '_PerGame'] = mRegSeasonDetail[col] / mRegSeasonDetail['NumGames']

In [14]:
# Create additional features
mRegSeasonFeatures['PointRatio'] = mRegSeasonDetail['Score'] / mRegSeasonDetail['OppScore'] # Points ratio
mRegSeasonFeatures['W/L'] = mRegSeasonDetail['Wins'] / mRegSeasonDetail['Losses'] # Win/Loss ratio
mRegSeasonFeatures['MOV'] = (mRegSeasonDetail['Score'] - mRegSeasonDetail['OppScore']) / mRegSeasonDetail['NumGames'] # Margin of victory
mRegSeasonFeatures['TORatio'] = mRegSeasonFeatures['TO_PerGame'] / mRegSeasonFeatures['OppTO_PerGame'] # Turnover ratio
mRegSeasonFeatures['FGM%'] = mRegSeasonDetail['FGM'] / mRegSeasonDetail['FGA'] # Scoring efficiency 
mRegSeasonFeatures['FG3%M'] = mRegSeasonDetail['FGM3'] / mRegSeasonDetail['FGA3'] # 3-Point efficiency
mRegSeasonFeatures['FGA3%'] = mRegSeasonDetail['FGA3'] / mRegSeasonDetail['FGA'] # 3-Point attempt rate
mRegSeasonFeatures['FTM%'] = mRegSeasonDetail['FTM'] / mRegSeasonDetail['FTA'] # Free throw makes %
mRegSeasonFeatures['FTA%'] = mRegSeasonDetail['FTA'] / mRegSeasonDetail['FGA'] # Free throw attempt rate
mRegSeasonFeatures['OppFTA%'] = mRegSeasonDetail['OppFTA'] / mRegSeasonDetail['OppFGA'] # Opponent free throw attempt rate
mRegSeasonFeatures['OR%'] = mRegSeasonDetail['OR'] / (mRegSeasonDetail['OR'] + mRegSeasonDetail['OppDR']) # Offensive rebound %
mRegSeasonFeatures['DR%'] = mRegSeasonDetail['DR'] / (mRegSeasonDetail['DR'] + mRegSeasonDetail['OppOR']) # Defensive rebound %

In [15]:
mRegSeasonFeatures

Unnamed: 0_level_0,Unnamed: 1_level_0,Score_PerGame,OppScore_PerGame,NumOT_PerGame,FGM_PerGame,FGA_PerGame,FGM3_PerGame,FGA3_PerGame,FTM_PerGame,FTA_PerGame,OR_PerGame,...,MOV,TORatio,FGM%,FG3%M,FGA3%,FTM%,FTA%,OppFTA%,OR%,DR%
Season,TeamID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2003,1102,57.250000,57.000000,0.000000,19.142857,39.785714,7.821429,20.821429,11.142857,17.107143,4.178571,...,0.250000,0.881543,0.481149,0.375643,0.523339,0.651357,0.429982,0.453704,0.171806,0.636486
2003,1103,78.777778,78.148148,0.296296,27.148148,55.851852,5.444444,16.074074,19.037037,25.851852,9.777778,...,0.629630,0.823671,0.486074,0.338710,0.287798,0.736390,0.462865,0.388564,0.307334,0.623407
2003,1104,69.285714,65.000000,0.035714,24.035714,57.178571,6.357143,19.857143,14.857143,20.928571,13.571429,...,4.285714,0.958763,0.420362,0.320144,0.347283,0.709898,0.366021,0.308880,0.374753,0.687179
2003,1105,71.769231,76.653846,0.153846,24.384615,61.615385,7.576923,20.769231,15.423077,21.846154,13.500000,...,-4.884615,0.991820,0.395755,0.364815,0.337079,0.705986,0.354557,0.415525,0.338476,0.636653
2003,1106,63.607143,63.750000,0.035714,23.428571,55.285714,6.107143,17.642857,10.642857,16.464286,12.285714,...,-0.142857,1.130332,0.423773,0.346154,0.319121,0.646421,0.297804,0.411371,0.354639,0.678173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025,1476,67.724138,70.896552,0.000000,23.310345,53.827586,8.620690,24.172414,12.482759,17.206897,7.034483,...,-3.172414,1.124567,0.433056,0.356633,0.449071,0.725451,0.319667,0.330079,0.240566,0.715859
2025,1477,64.354839,74.870968,0.032258,23.000000,55.290323,8.387097,26.709677,9.967742,15.483871,7.935484,...,-10.516129,1.121287,0.415986,0.314010,0.483081,0.643750,0.280047,0.364834,0.252567,0.654444
2025,1478,72.103448,81.241379,0.241379,24.758621,55.482759,7.620690,23.068966,14.965517,20.862069,7.448276,...,-9.137931,1.185535,0.446240,0.330344,0.415786,0.717355,0.376010,0.371396,0.243517,0.676313
2025,1479,65.785714,71.750000,0.107143,22.678571,53.821429,7.000000,19.607143,13.428571,16.642857,6.214286,...,-5.964286,0.777778,0.421367,0.357013,0.364300,0.806867,0.309224,0.382775,0.208134,0.686406


In [16]:
mTournCompact

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0
...,...,...,...,...,...,...,...,...
2513,2024,146,1301,76,1181,64,N,0
2514,2024,146,1345,72,1397,66,N,0
2515,2024,152,1163,86,1104,72,N,0
2516,2024,152,1345,63,1301,50,N,0


In [17]:
# Split compact tournament results into individual rows of winners and losers to introduce target feature "Result"
mWTourney = pd.DataFrame()
mWTourney[['Season', 'Team1', 'Team2']] = mTournCompact[['Season', 'WTeamID', 'LTeamID']]
mWTourney['Result'] = 1

mLTourney = pd.DataFrame()
mLTourney[['Season', 'Team1', 'Team2']] = mTournCompact[['Season', 'LTeamID', 'WTeamID']]
mLTourney['Result'] = 0

# Join individual together and drop games earlier than 03 to match up with regular season data
mTourneyInput = pd.concat([mWTourney, mLTourney])
mTourneyInput = mTourneyInput[mTourneyInput['Season'] >= 2003].reset_index(drop=True)

In [18]:
mTourneyInput

Unnamed: 0,Season,Team1,Team2,Result
0,2003,1421,1411,1
1,2003,1112,1436,1
2,2003,1113,1272,1
3,2003,1141,1166,1
4,2003,1143,1301,1
...,...,...,...,...
2759,2024,1181,1301,0
2760,2024,1397,1345,0
2761,2024,1104,1163,0
2762,2024,1301,1345,0


In [19]:
# Merge two team stats from mRegSeasonFeatures
mTourneyFinal = mTourneyInput.merge(mRegSeasonFeatures, left_on=['Season', 'Team1'], right_index=True, suffixes=('', '_T1'))
mTourneyFinal = mTourneyFinal.merge(mRegSeasonFeatures, left_on=['Season', 'Team2'], right_index=True, suffixes=('_T1', '_T2'))

# Drop columns that are not needed
mTourneyFinal.drop(columns=['Season', 'Team1', 'Team2'], inplace=True)

# Calculate the differences (Team1 - Team2) for the features for input to logistic regression
featureCols = [col for col in mRegSeasonFeatures if col not in ['Season', 'TeamID']]
for col in featureCols:
       mTourneyFinal[col + '_Diff'] = mTourneyFinal[col + '_T1'] - mTourneyFinal[col + '_T2']
       
# Drop all _T1 and _T2, keep only _Diff and Result
mTourneyFinal = mTourneyFinal[[col + '_Diff' for col in featureCols] + ['Result']]

In [20]:
mTourneyFinal

Unnamed: 0,Score_PerGame_Diff,OppScore_PerGame_Diff,NumOT_PerGame_Diff,FGM_PerGame_Diff,FGA_PerGame_Diff,FGM3_PerGame_Diff,FGA3_PerGame_Diff,FTM_PerGame_Diff,FTA_PerGame_Diff,OR_PerGame_Diff,...,TORatio_Diff,FGM%_Diff,FG3%M_Diff,FGA3%_Diff,FTM%_Diff,FTA%_Diff,OppFTA%_Diff,OR%_Diff,DR%_Diff,Result
0,-1.593103,7.614943,0.139080,-0.354023,1.526437,0.549425,-0.500000,-1.434483,-7.135632,-0.890805,...,0.200650,-0.018262,0.039433,-0.017801,0.142815,-0.139292,0.059755,-0.015697,-0.047099,1
1,17.421182,7.112069,0.002463,5.493842,9.852217,1.759852,4.588670,4.673645,5.448276,2.213054,...,-0.205109,0.016969,0.009777,0.028274,0.043580,0.030435,-0.012280,0.022337,-0.050163,1
2,1.448276,3.344828,0.034483,0.931034,-3.103448,-3.000000,-7.482759,2.586207,3.310345,-0.379310,...,-0.013110,0.040251,-0.030989,-0.113271,0.016122,0.078997,0.010485,0.026172,0.001629,1
3,0.102403,8.908046,-0.030303,-2.076280,-4.764890,-1.142111,-2.553814,5.397074,5.142111,-0.292581,...,0.351889,0.005763,-0.008284,-0.016226,0.072864,0.129120,0.077414,0.021658,-0.021651,1
4,2.082759,1.758621,0.070115,3.011494,5.390805,-1.552874,-5.465517,-2.387356,-0.949425,1.508046,...,-0.021196,0.009399,0.022444,-0.131799,-0.084846,-0.051395,-0.102222,0.015999,0.009181,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759,3.482639,-5.256944,-0.083333,1.142361,-1.753472,1.288194,1.736111,-0.090278,0.194444,-0.055556,...,0.061522,0.032591,0.031070,0.039218,-0.011645,0.012902,-0.043363,0.024170,0.024492,0
2760,-3.925189,-2.276515,-0.090909,-1.077652,3.356061,0.385417,5.075758,-2.155303,-3.812500,-0.280303,...,-0.355366,-0.043992,-0.066100,0.063189,0.028050,-0.085009,0.127730,-0.050729,-0.036588,0
2761,9.279412,16.650735,0.062500,1.818015,6.102941,2.268382,6.310662,3.375000,3.283088,0.943015,...,0.143301,-0.018671,-0.001546,0.058938,0.041092,0.019318,0.071454,-0.018376,-0.046929,0
2762,-7.032828,2.542929,-0.007576,-1.126263,2.578283,-1.277778,-0.035354,-3.502525,-5.194444,-1.974747,...,-0.354789,-0.039121,-0.061963,-0.015370,0.012308,-0.103297,0.089519,-0.098329,-0.031517,0


In [21]:
mTourneyFinal.describe()

Unnamed: 0,Score_PerGame_Diff,OppScore_PerGame_Diff,NumOT_PerGame_Diff,FGM_PerGame_Diff,FGA_PerGame_Diff,FGM3_PerGame_Diff,FGA3_PerGame_Diff,FTM_PerGame_Diff,FTA_PerGame_Diff,OR_PerGame_Diff,...,TORatio_Diff,FGM%_Diff,FG3%M_Diff,FGA3%_Diff,FTM%_Diff,FTA%_Diff,OppFTA%_Diff,OR%_Diff,DR%_Diff,Result
count,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,...,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0,2764.0
mean,0.0,0.0,0.0,0.0,8.226255e-17,0.0,1.0282820000000001e-17,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.6066899999999998e-19,0.0,3.213381e-19,0.0,0.0,-1.6066899999999998e-19,0.5
std,7.312093,6.355166,0.072274,2.914355,4.695264,1.80667,4.223095,2.702947,3.648958,2.494943,...,0.195575,0.032604,0.037261,0.07191683,0.050658,0.06419949,0.079799,0.05649,0.04051447,0.50009
min,-22.892857,-25.035714,-0.293169,-9.357143,-22.07143,-6.0625,-14.27849,-10.205882,-13.094118,-9.214286,...,-0.606506,-0.135937,-0.13976,-0.2727839,-0.172779,-0.2451213,-0.289826,-0.191864,-0.1449883,0.0
25%,-4.857429,-4.330079,-0.035104,-1.987506,-3.081258,-1.211338,-2.766433,-1.836522,-2.40554,-1.675716,...,-0.13651,-0.021822,-0.023834,-0.04953287,-0.034681,-0.04227682,-0.055724,-0.037664,-0.02744192,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
75%,4.857429,4.330079,0.035104,1.987506,3.081258,1.211338,2.766433,1.836522,2.40554,1.675716,...,0.13651,0.021822,0.023834,0.04953287,0.034681,0.04227682,0.055724,0.037664,0.02744192,1.0
max,22.892857,25.035714,0.293169,9.357143,22.07143,6.0625,14.27849,10.205882,13.094118,9.214286,...,0.606506,0.135937,0.13976,0.2727839,0.172779,0.2451213,0.289826,0.191864,0.1449883,1.0


## Logistic Regression

In [22]:
# Setup scaler
scaler = StandardScaler()

# Setup input and target
X = mTourneyFinal.drop(columns=['Result'])
y = mTourneyFinal['Result']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit scaler on training data and transform both train and test sets
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)  # Use transform (not fit_transform) on test set to prevent data leakage

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Get model predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
print(y_pred_proba[:5]) # Sanity check

# Calculate logg loss
logLoss = log_loss(y_test, y_pred_proba)
print(f'Log Loss: {logLoss}')

# Calculate brier score
y_pred_proba_class1 = y_pred_proba[:, 1]
brier_score = brier_score_loss(y_test, y_pred_proba_class1)
print(f'Brier Score: {brier_score}')

[[0.8324654  0.1675346 ]
 [0.46510955 0.53489045]
 [0.09471392 0.90528608]
 [0.56849522 0.43150478]
 [0.33720608 0.66279392]]
Log Loss: 0.6013579939175102
Brier Score: 0.20769966982731866


## Hyperparameter Tuning

In [23]:
# # Perform grid search
# grid = {
#     'C': [0.001, 0.1, 1, 10, 100],
#     'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
#     'max_iter': [100, 250, 500, 750, 1000],
#     'tol': [1e-4, 1e-5]
# }
# 
# gridLog = LogisticRegression()
# gridSearch = GridSearchCV(gridLog, grid, cv=5, scoring=['precision', 'recall', 'f1'], refit='f1')
# gridSearch.fit(X_train, y_train)
# print(f'Best Params: {gridSearch.best_params_}')

In [24]:
# Train with new parameters
newModel = LogisticRegression(
    C=1,
    max_iter=100,
    solver='saga',
    tol=0.0001
)
newModel.fit(X_train, y_train)

# Get model predictions
y_pred = newModel.predict(X_test)
y_pred_proba = newModel.predict_proba(X_test)

# Compute Metrics log loss and brier score
logLoss = log_loss(y_test, y_pred_proba)
print(f'Log Loss: {logLoss}')
y_pred_proba_class1 = y_pred_proba[:, 1]
brier_score = brier_score_loss(y_test, y_pred_proba_class1)
print(f'Brier Score: {brier_score}')

Log Loss: 0.6013059888683515
Brier Score: 0.20771566236729694


