In [1]:
import numpy as np
import pandas as pd
import os
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from scipy.interpolate import UnivariateSpline
import statsmodels.api as sm
import matplotlib.pyplot as plt
import collections

In [2]:
tourney_results = pd.read_csv('NCAATourneyDetailedResults.csv')
seeds = pd.read_csv('NCAATourneySeeds.csv')
regular_results = pd.read_csv('RegularSeasonDetailedResults.csv')

def prepare_data(df):
    dfswap = df[['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT', 
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]

    dfswap.loc[df['WLoc'] == 'H', 'WLoc'] = 'A'
    dfswap.loc[df['WLoc'] == 'A', 'WLoc'] = 'H'
    df.columns.values[6] = 'location'
    dfswap.columns.values[6] = 'location'    
      
    df.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(df.columns)]
    dfswap.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(dfswap.columns)]

    output = pd.concat([df, dfswap]).reset_index(drop=True)
    output.loc[output.location=='N','location'] = '0'
    output.loc[output.location=='H','location'] = '1'
    output.loc[output.location=='A','location'] = '-1'
    output.location = output.location.astype(int)
    
    output['PointDiff'] = output['T1_Score'] - output['T2_Score']
    
    return output

In [3]:
regular_data = prepare_data(regular_results)
tourney_data = prepare_data(tourney_results)

In [4]:
regular_data.shape

(164082, 35)

In [5]:
tourney_data.shape

(2096, 35)

In [6]:
boxscore_cols = [
        'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_OR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_PF', 
        'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_OR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk',  
        'PointDiff']

funcs = [np.mean]

In [7]:
season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs)
season_statistics.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,T1_PF,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_Blk,PointDiff
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Season,T1_TeamID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
2003,1102,19.142857,39.785714,7.821429,20.821429,4.178571,13.0,11.428571,5.964286,18.75,19.285714,42.428571,4.75,12.428571,9.607143,9.142857,12.964286,5.428571,1.571429,0.25
2003,1103,27.148148,55.851852,5.444444,16.074074,9.777778,15.222222,12.62963,7.259259,19.851852,27.777778,57.0,6.666667,18.37037,12.037037,15.481481,15.333333,6.407407,2.851852,0.62963
2003,1104,24.035714,57.178571,6.357143,19.857143,13.571429,12.107143,13.285714,6.607143,18.035714,23.25,55.5,6.357143,19.142857,10.892857,11.678571,13.857143,5.535714,3.178571,4.285714
2003,1105,24.384615,61.615385,7.576923,20.769231,13.5,14.538462,18.653846,9.307692,20.230769,27.0,58.961538,6.269231,17.538462,13.192308,15.807692,18.807692,9.384615,4.192308,-4.884615
2003,1106,23.428571,55.285714,6.107143,17.642857,12.285714,11.678571,17.035714,8.357143,18.178571,21.714286,53.392857,4.785714,15.214286,11.321429,11.785714,15.071429,8.785714,3.178571,-0.142857


In [8]:
season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs).reset_index()
season_statistics.head()

Unnamed: 0_level_0,Season,T1_TeamID,T1_FGM,T1_FGA,T1_FGM3,T1_FGA3,T1_OR,T1_Ast,T1_TO,T1_Stl,...,T2_FGM,T2_FGA,T2_FGM3,T2_FGA3,T2_OR,T2_Ast,T2_TO,T2_Stl,T2_Blk,PointDiff
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,mean,mean,mean,mean,mean,mean,mean,...,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
0,2003,1102,19.142857,39.785714,7.821429,20.821429,4.178571,13.0,11.428571,5.964286,...,19.285714,42.428571,4.75,12.428571,9.607143,9.142857,12.964286,5.428571,1.571429,0.25
1,2003,1103,27.148148,55.851852,5.444444,16.074074,9.777778,15.222222,12.62963,7.259259,...,27.777778,57.0,6.666667,18.37037,12.037037,15.481481,15.333333,6.407407,2.851852,0.62963
2,2003,1104,24.035714,57.178571,6.357143,19.857143,13.571429,12.107143,13.285714,6.607143,...,23.25,55.5,6.357143,19.142857,10.892857,11.678571,13.857143,5.535714,3.178571,4.285714
3,2003,1105,24.384615,61.615385,7.576923,20.769231,13.5,14.538462,18.653846,9.307692,...,27.0,58.961538,6.269231,17.538462,13.192308,15.807692,18.807692,9.384615,4.192308,-4.884615
4,2003,1106,23.428571,55.285714,6.107143,17.642857,12.285714,11.678571,17.035714,8.357143,...,21.714286,53.392857,4.785714,15.214286,11.321429,11.785714,15.071429,8.785714,3.178571,-0.142857


In [9]:
season_statistics.columns = [''.join(col).strip() for col in season_statistics.columns.values]
season_statistics.head()

Unnamed: 0,Season,T1_TeamID,T1_FGMmean,T1_FGAmean,T1_FGM3mean,T1_FGA3mean,T1_ORmean,T1_Astmean,T1_TOmean,T1_Stlmean,...,T2_FGMmean,T2_FGAmean,T2_FGM3mean,T2_FGA3mean,T2_ORmean,T2_Astmean,T2_TOmean,T2_Stlmean,T2_Blkmean,PointDiffmean
0,2003,1102,19.142857,39.785714,7.821429,20.821429,4.178571,13.0,11.428571,5.964286,...,19.285714,42.428571,4.75,12.428571,9.607143,9.142857,12.964286,5.428571,1.571429,0.25
1,2003,1103,27.148148,55.851852,5.444444,16.074074,9.777778,15.222222,12.62963,7.259259,...,27.777778,57.0,6.666667,18.37037,12.037037,15.481481,15.333333,6.407407,2.851852,0.62963
2,2003,1104,24.035714,57.178571,6.357143,19.857143,13.571429,12.107143,13.285714,6.607143,...,23.25,55.5,6.357143,19.142857,10.892857,11.678571,13.857143,5.535714,3.178571,4.285714
3,2003,1105,24.384615,61.615385,7.576923,20.769231,13.5,14.538462,18.653846,9.307692,...,27.0,58.961538,6.269231,17.538462,13.192308,15.807692,18.807692,9.384615,4.192308,-4.884615
4,2003,1106,23.428571,55.285714,6.107143,17.642857,12.285714,11.678571,17.035714,8.357143,...,21.714286,53.392857,4.785714,15.214286,11.321429,11.785714,15.071429,8.785714,3.178571,-0.142857


In [10]:
season_statistics_T1 = season_statistics.copy()
season_statistics_T2 = season_statistics.copy()

season_statistics_T1.columns = ["T1_" + x.replace("T1_","").replace("T2_","opponent_") for x in list(season_statistics_T1.columns)]
season_statistics_T2.columns = ["T2_" + x.replace("T1_","").replace("T2_","opponent_") for x in list(season_statistics_T2.columns)]
season_statistics_T1.columns.values[0] = "Season"
season_statistics_T2.columns.values[0] = "Season"

In [11]:
tourney_data = tourney_data[['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID' ,'T2_Score']]
tourney_data.head()

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score
0,2003,134,1421,92,1411,84
1,2003,136,1112,80,1436,51
2,2003,136,1113,84,1272,71
3,2003,136,1141,79,1166,73
4,2003,136,1143,76,1301,74


In [12]:
tourney_data = pd.merge(tourney_data, season_statistics_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, season_statistics_T2, on = ['Season', 'T2_TeamID'], how = 'left')

In [13]:
tourney_data.head()

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score,T1_FGMmean,T1_FGAmean,T1_FGM3mean,T1_FGA3mean,...,T2_opponent_FGMmean,T2_opponent_FGAmean,T2_opponent_FGM3mean,T2_opponent_FGA3mean,T2_opponent_ORmean,T2_opponent_Astmean,T2_opponent_TOmean,T2_opponent_Stlmean,T2_opponent_Blkmean,T2_PointDiffmean
0,2003,134,1421,92,1411,84,24.37931,56.793103,6.482759,18.0,...,25.666667,60.4,7.533333,23.166667,11.933333,13.766667,14.333333,8.0,2.6,1.966667
1,2003,136,1112,80,1436,51,30.321429,65.714286,7.035714,20.071429,...,22.758621,55.068966,7.068966,21.448276,9.586207,13.275862,13.0,7.103448,3.655172,4.655172
2,2003,136,1113,84,1272,71,27.206897,56.896552,4.0,12.586207,...,23.275862,57.862069,5.896552,18.310345,12.344828,13.310345,15.068966,7.275862,3.172414,8.689655
3,2003,136,1141,79,1166,73,26.62069,52.689655,6.827586,17.931034,...,23.878788,55.333333,4.878788,14.30303,11.060606,12.363636,17.060606,6.333333,2.575758,14.909091
4,2003,136,1143,76,1301,74,27.344828,58.724138,6.413793,17.034483,...,23.433333,53.133333,5.733333,17.0,10.533333,12.566667,14.633333,7.433333,2.833333,4.4


In [16]:
tourney_data.shape

(2096, 44)

In [17]:
last14days_stats_T1 = regular_data.loc[regular_data.DayNum>118].reset_index(drop=True)
last14days_stats_T1['win'] = np.where(last14days_stats_T1['PointDiff']>0,1,0)
last14days_stats_T1 = last14days_stats_T1.groupby(['Season','T1_TeamID'])['win'].mean().reset_index(name='T1_win_ratio_14d')

last14days_stats_T2 = regular_data.loc[regular_data.DayNum>118].reset_index(drop=True)
last14days_stats_T2['win'] = np.where(last14days_stats_T2['PointDiff']<0,1,0)
last14days_stats_T2 = last14days_stats_T2.groupby(['Season','T2_TeamID'])['win'].mean().reset_index(name='T2_win_ratio_14d')

In [18]:
tourney_data = pd.merge(tourney_data, last14days_stats_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, last14days_stats_T2, on = ['Season', 'T2_TeamID'], how = 'left')

In [19]:
regular_season_effects = regular_data[['Season','T1_TeamID','T2_TeamID','PointDiff']].copy()
regular_season_effects['T1_TeamID'] = regular_season_effects['T1_TeamID'].astype(str)
regular_season_effects['T2_TeamID'] = regular_season_effects['T2_TeamID'].astype(str)
regular_season_effects['win'] = np.where(regular_season_effects['PointDiff']>0,1,0)
march_madness = pd.merge(seeds[['Season','TeamID']],seeds[['Season','TeamID']],on='Season')
march_madness.columns = ['Season', 'T1_TeamID', 'T2_TeamID']
march_madness.T1_TeamID = march_madness.T1_TeamID.astype(str)
march_madness.T2_TeamID = march_madness.T2_TeamID.astype(str)
regular_season_effects = pd.merge(regular_season_effects, march_madness, on = ['Season','T1_TeamID','T2_TeamID'])
regular_season_effects.shape

(9316, 5)

In [20]:
march_madness.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID
0,1985,1207,1207
1,1985,1207,1210
2,1985,1207,1228
3,1985,1207,1260
4,1985,1207,1374


In [27]:
regular_season_effects.head()

Unnamed: 0,Season,T1_TeamID,T2_TeamID,PointDiff,win
0,2003,1104,1328,6,1
1,2003,1272,1393,7,1
2,2003,1323,1237,44,1
3,2003,1242,1221,24,1
4,2003,1390,1462,1,1


In [22]:
def team_quality(season):
    formula = 'win~-1+T1_TeamID+T2_TeamID'
    glm = sm.GLM.from_formula(formula=formula, 
                              data=regular_season_effects.loc[regular_season_effects.Season==season,:], 
                              family=sm.families.Binomial()).fit()
    
    quality = pd.DataFrame(glm.params).reset_index()
    quality.columns = ['TeamID','quality']
    quality['Season'] = season
    quality['quality'] = np.exp(quality['quality'])
    quality = quality.loc[quality.TeamID.str.contains('T1_')].reset_index(drop=True)
    quality['TeamID'] = quality['TeamID'].apply(lambda x: x[10:14]).astype(int)
    return quality

In [30]:
glm_quality = pd.concat([team_quality(2003),
                         team_quality(2004),
                         team_quality(2005),
                         team_quality(2006),
                         team_quality(2007),
                         team_quality(2008),
                         team_quality(2009),
                         team_quality(2010),
                         team_quality(2011),
                         team_quality(2012),
                         team_quality(2013),
                         team_quality(2014),
                         team_quality(2015),
                         team_quality(2016),
                         team_quality(2017),
                         team_quality(2018)]).reset_index(drop=True)

  n_endog_mu = self._clean((1. - endog) / (1. - mu))
  t = np.exp(-z)
  endog_mu = self._clean(endog / mu)
  n_endog_mu = self._clean((1. - endog) / (1. - mu))
  endog_mu = self._clean(endog / mu)
  # Remove the CWD from sys.path while we load stuff.


In [31]:
glm_quality_T1 = glm_quality.copy()
glm_quality_T2 = glm_quality.copy()
glm_quality_T1.columns = ['T1_TeamID','T1_quality','Season']
glm_quality_T2.columns = ['T2_TeamID','T2_quality','Season']

In [32]:
tourney_data = pd.merge(tourney_data, glm_quality_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, glm_quality_T2, on = ['Season', 'T2_TeamID'], how = 'left')

In [33]:
tourney_data.head()

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score,T1_FGMmean,T1_FGAmean,T1_FGM3mean,T1_FGA3mean,...,T2_opponent_ORmean,T2_opponent_Astmean,T2_opponent_TOmean,T2_opponent_Stlmean,T2_opponent_Blkmean,T2_PointDiffmean,T1_win_ratio_14d,T2_win_ratio_14d,T1_quality,T2_quality
0,2003,134,1421,92,1411,84,24.37931,56.793103,6.482759,18.0,...,11.933333,13.766667,14.333333,8.0,2.6,1.966667,1.0,0.833333,1.344291e-49,1.568229e-18
1,2003,136,1112,80,1436,51,30.321429,65.714286,7.035714,20.071429,...,9.586207,13.275862,13.0,7.103448,3.655172,4.655172,0.666667,1.0,4.934326,1.656707e-18
2,2003,136,1113,84,1272,71,27.206897,56.896552,4.0,12.586207,...,12.344828,13.310345,15.068966,7.275862,3.172414,8.689655,0.666667,0.75,0.3521347,1.194127
3,2003,136,1141,79,1166,73,26.62069,52.689655,6.827586,17.931034,...,11.060606,12.363636,17.060606,6.333333,2.575758,14.909091,1.0,1.0,,2.604195
4,2003,136,1143,76,1301,74,27.344828,58.724138,6.413793,17.034483,...,10.533333,12.566667,14.633333,7.433333,2.833333,4.4,0.333333,0.6,0.9548861,0.09477385


In [34]:
seeds['seed'] = seeds['Seed'].apply(lambda x: int(x[1:3]))
seeds.head()

Unnamed: 0,Season,Seed,TeamID,seed
0,1985,W01,1207,1
1,1985,W02,1210,2
2,1985,W03,1228,3
3,1985,W04,1260,4
4,1985,W05,1374,5


In [35]:
seeds_T1 = seeds[['Season','TeamID','seed']].copy()
seeds_T2 = seeds[['Season','TeamID','seed']].copy()
seeds_T1.columns = ['Season','T1_TeamID','T1_seed']
seeds_T2.columns = ['Season','T2_TeamID','T2_seed']

In [36]:
tourney_data = pd.merge(tourney_data, seeds_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, seeds_T2, on = ['Season', 'T2_TeamID'], how = 'left')

In [37]:
tourney_data["Seed_diff"] = tourney_data["T1_seed"] - tourney_data["T2_seed"]

In [38]:
tourney_data.head()

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score,T1_FGMmean,T1_FGAmean,T1_FGM3mean,T1_FGA3mean,...,T2_opponent_Stlmean,T2_opponent_Blkmean,T2_PointDiffmean,T1_win_ratio_14d,T2_win_ratio_14d,T1_quality,T2_quality,T1_seed,T2_seed,Seed_diff
0,2003,134,1421,92,1411,84,24.37931,56.793103,6.482759,18.0,...,8.0,2.6,1.966667,1.0,0.833333,1.344291e-49,1.568229e-18,16,16,0
1,2003,136,1112,80,1436,51,30.321429,65.714286,7.035714,20.071429,...,7.103448,3.655172,4.655172,0.666667,1.0,4.934326,1.656707e-18,1,16,-15
2,2003,136,1113,84,1272,71,27.206897,56.896552,4.0,12.586207,...,7.275862,3.172414,8.689655,0.666667,0.75,0.3521347,1.194127,10,7,3
3,2003,136,1141,79,1166,73,26.62069,52.689655,6.827586,17.931034,...,6.333333,2.575758,14.909091,1.0,1.0,,2.604195,11,6,5
4,2003,136,1143,76,1301,74,27.344828,58.724138,6.413793,17.034483,...,7.433333,2.833333,4.4,0.333333,0.6,0.9548861,0.09477385,8,9,-1


In [39]:
features = list(season_statistics_T1.columns[2:999]) + \
    list(season_statistics_T2.columns[2:999]) + \
    list(seeds_T1.columns[2:999]) + \
    list(seeds_T2.columns[2:999]) + \
    list(last14days_stats_T1.columns[2:999]) + \
    list(last14days_stats_T2.columns[2:999]) + \
    ["Seed_diff"] + ["T1_quality","T2_quality"]

len(features)

45

In [40]:
features

['T1_FGMmean',
 'T1_FGAmean',
 'T1_FGM3mean',
 'T1_FGA3mean',
 'T1_ORmean',
 'T1_Astmean',
 'T1_TOmean',
 'T1_Stlmean',
 'T1_PFmean',
 'T1_opponent_FGMmean',
 'T1_opponent_FGAmean',
 'T1_opponent_FGM3mean',
 'T1_opponent_FGA3mean',
 'T1_opponent_ORmean',
 'T1_opponent_Astmean',
 'T1_opponent_TOmean',
 'T1_opponent_Stlmean',
 'T1_opponent_Blkmean',
 'T1_PointDiffmean',
 'T2_FGMmean',
 'T2_FGAmean',
 'T2_FGM3mean',
 'T2_FGA3mean',
 'T2_ORmean',
 'T2_Astmean',
 'T2_TOmean',
 'T2_Stlmean',
 'T2_PFmean',
 'T2_opponent_FGMmean',
 'T2_opponent_FGAmean',
 'T2_opponent_FGM3mean',
 'T2_opponent_FGA3mean',
 'T2_opponent_ORmean',
 'T2_opponent_Astmean',
 'T2_opponent_TOmean',
 'T2_opponent_Stlmean',
 'T2_opponent_Blkmean',
 'T2_PointDiffmean',
 'T1_seed',
 'T2_seed',
 'T1_win_ratio_14d',
 'T2_win_ratio_14d',
 'Seed_diff',
 'T1_quality',
 'T2_quality']

In [None]:
param = {} 
#param['objective'] = 'reg:linear'
param['eval_metric'] =  'mae'
param['booster'] = 'gbtree'
param['eta'] = 0.02 #change to ~0.02 for final run
param['subsample'] = 0.35
param['colsample_bytree'] = 0.7
param['num_parallel_tree'] = 10 #recommend 10
param['min_child_weight'] = 40
param['gamma'] = 10
param['max_depth'] =  3
param['silent'] = 1

In [70]:
params = {
    'eval_metric':'mae',
    'booster': 'gbtree',
    'eta':0.02,
    'objective':'binary:logistic', # 多分类的问题
    'num_parallel_tree':10,
   # 'num_class': 10,               # 类别数，与 multisoftmax 并用
    'gamma': 10,                  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 3,               # 构建树的深度，越大越容易过拟合
   # 'lambda': 2,                   # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.35,              # 随机采样训练样本
    'colsample_bytree': 0.7,       # 生成树时进行的列采样
    'min_child_weight': 40,
    'silent': 1                  # 设置成1则没有运行信息输出，最好是设置为0.
   # 'eta': 0.007,                  # 如同学习率
   # 'seed': 1000,
   # 'nthread': 4,                  # cpu 线程数
}

In [41]:
df_train = tourney_data.loc[tourney_data['Season'] < 2018]
df_test =tourney_data.loc[tourney_data['Season'] >= 2018]


In [42]:
y_train= df_train['T1_Score'] - df_train['T2_Score']
X_train = df_train[features].values

In [43]:
y_test = df_test['T1_Score'] - df_test['T2_Score']
X_test = df_test[features].values

In [47]:
import xgboost as xgb
dtrain=xgb.DMatrix(X_train,label=y_train)
dtest=xgb.DMatrix(X_test)

  if getattr(data, 'base', None) is not None and \


In [48]:
watchlist = [(dtrain,'train')]

In [51]:
bst=xgb.train(params,dtrain,num_boost_round=300,evals=watchlist)

[0]	train-mae:11.3368
[1]	train-mae:11.2585
[2]	train-mae:11.1822
[3]	train-mae:11.1058
[4]	train-mae:11.0312
[5]	train-mae:10.9611
[6]	train-mae:10.8895
[7]	train-mae:10.8212
[8]	train-mae:10.7549
[9]	train-mae:10.6909
[10]	train-mae:10.6295
[11]	train-mae:10.5675
[12]	train-mae:10.5103
[13]	train-mae:10.4551
[14]	train-mae:10.4009
[15]	train-mae:10.3475
[16]	train-mae:10.2943
[17]	train-mae:10.2432
[18]	train-mae:10.1926
[19]	train-mae:10.1436
[20]	train-mae:10.0951
[21]	train-mae:10.0479
[22]	train-mae:10.0036
[23]	train-mae:9.96011
[24]	train-mae:9.91947
[25]	train-mae:9.87771
[26]	train-mae:9.83853
[27]	train-mae:9.80012
[28]	train-mae:9.76223
[29]	train-mae:9.72527
[30]	train-mae:9.68975
[31]	train-mae:9.65447
[32]	train-mae:9.62106
[33]	train-mae:9.58777
[34]	train-mae:9.55497
[35]	train-mae:9.52539
[36]	train-mae:9.49495
[37]	train-mae:9.46654
[38]	train-mae:9.44101
[39]	train-mae:9.41311
[40]	train-mae:9.38453
[41]	train-mae:9.35813
[42]	train-mae:9.33307
[43]	train-mae:9.3091

In [52]:
ypred=bst.predict(dtest)

In [55]:
yy = ypred

In [57]:
yy[yy > 0] = 1
yy[yy < 0] = 0

In [64]:
yy

array([1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0.,
       1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0.,
       0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
       1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.],
      dtype=float32)

In [59]:
yytest = y_test

In [60]:
yytest[yytest > 0] = 1
yytest[yytest < 0] = 0

In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (auc, classification_report, roc_auc_score, accuracy_score,
                             f1_score, log_loss, roc_curve, confusion_matrix, precision_score, recall_score)
from sklearn.preprocessing import StandardScaler
from math import sin, cos, sqrt, atan2, radians
import random
import statsmodels.api as sm

In [67]:
accuracy_score(yytest, yy)

0.6865671641791045

In [None]:
##bi

In [68]:
y_train_b = y_train

In [73]:
y_train_b[y_train_b>0]=1
y_train_b[y_train_b<0]=0

In [82]:
y_train= df_train['T1_Score'] - df_train['T2_Score']

In [84]:
y_train_b = y_train

In [85]:
y_train_b[y_train_b>0]=1
y_train_b[y_train_b<0]=0

In [87]:
import xgboost as xgb
dtrain=xgb.DMatrix(X_train,label=y_train_b)
dtest=xgb.DMatrix(X_test)

  if getattr(data, 'base', None) is not None and \


In [94]:
bst_b =xgb.train(params,dtrain,num_boost_round=500,evals=watchlist)

[0]	train-mae:11.413
[1]	train-mae:11.411
[2]	train-mae:11.4092
[3]	train-mae:11.4074
[4]	train-mae:11.4056
[5]	train-mae:11.4039
[6]	train-mae:11.4022
[7]	train-mae:11.4004
[8]	train-mae:11.3987
[9]	train-mae:11.397
[10]	train-mae:11.3954
[11]	train-mae:11.3938
[12]	train-mae:11.3923
[13]	train-mae:11.3909
[14]	train-mae:11.3893
[15]	train-mae:11.388
[16]	train-mae:11.3865
[17]	train-mae:11.3851
[18]	train-mae:11.3838
[19]	train-mae:11.3824
[20]	train-mae:11.3809
[21]	train-mae:11.3796
[22]	train-mae:11.3782
[23]	train-mae:11.3769
[24]	train-mae:11.3757
[25]	train-mae:11.3746
[26]	train-mae:11.3734
[27]	train-mae:11.3724
[28]	train-mae:11.3712
[29]	train-mae:11.3701
[30]	train-mae:11.3689
[31]	train-mae:11.3679
[32]	train-mae:11.3669
[33]	train-mae:11.3658
[34]	train-mae:11.3647
[35]	train-mae:11.3637
[36]	train-mae:11.3627
[37]	train-mae:11.3617
[38]	train-mae:11.3609
[39]	train-mae:11.3599
[40]	train-mae:11.359
[41]	train-mae:11.3581
[42]	train-mae:11.3572
[43]	train-mae:11.3563
[44

[348]	train-mae:11.3017
[349]	train-mae:11.3016
[350]	train-mae:11.3016
[351]	train-mae:11.3015
[352]	train-mae:11.3015
[353]	train-mae:11.3015
[354]	train-mae:11.3015
[355]	train-mae:11.3015
[356]	train-mae:11.3015
[357]	train-mae:11.3015
[358]	train-mae:11.3014
[359]	train-mae:11.3014
[360]	train-mae:11.3014
[361]	train-mae:11.3013
[362]	train-mae:11.3013
[363]	train-mae:11.3013
[364]	train-mae:11.3012
[365]	train-mae:11.3012
[366]	train-mae:11.3012
[367]	train-mae:11.3012
[368]	train-mae:11.3012
[369]	train-mae:11.3011
[370]	train-mae:11.3011
[371]	train-mae:11.3011
[372]	train-mae:11.301
[373]	train-mae:11.301
[374]	train-mae:11.3009
[375]	train-mae:11.3008
[376]	train-mae:11.3008
[377]	train-mae:11.3008
[378]	train-mae:11.3008
[379]	train-mae:11.3007
[380]	train-mae:11.3007
[381]	train-mae:11.3007
[382]	train-mae:11.3007
[383]	train-mae:11.3007
[384]	train-mae:11.3007
[385]	train-mae:11.3006
[386]	train-mae:11.3006
[387]	train-mae:11.3006
[388]	train-mae:11.3005
[389]	train-mae:11

In [95]:
ypred2=bst_b.predict(dtest)

In [96]:
ypred2

array([0.5210945 , 0.541719  , 0.43084982, 0.5125707 , 0.376961  ,
       0.25899237, 0.83568925, 0.6437013 , 0.75873977, 0.74846816,
       0.8185584 , 0.59772426, 0.42452264, 0.7733173 , 0.7292883 ,
       0.7170608 , 0.5414491 , 0.7728888 , 0.7674894 , 0.83166325,
       0.77565384, 0.38319972, 0.8315221 , 0.6053841 , 0.5188347 ,
       0.38364926, 0.16742955, 0.81392604, 0.7271405 , 0.81414944,
       0.8546394 , 0.3104234 , 0.6471178 , 0.17190696, 0.6961669 ,
       0.8163629 , 0.741322  , 0.6174454 , 0.8173826 , 0.64963096,
       0.2873306 , 0.66567534, 0.76627856, 0.8446838 , 0.40225244,
       0.21564496, 0.6580369 , 0.2432388 , 0.81542903, 0.16862576,
       0.2426734 , 0.8151444 , 0.20814016, 0.35571864, 0.32798728,
       0.80952024, 0.83294487, 0.7495166 , 0.47377715, 0.6810198 ,
       0.5115403 , 0.8098089 , 0.5214017 , 0.5735391 , 0.7780406 ,
       0.50684685, 0.56783104, 0.47897974, 0.4467671 , 0.5588544 ,
       0.48619455, 0.632006  , 0.7398139 , 0.1674447 , 0.36019

In [97]:
y_pred_value = [round(value) for value in ypred2]

In [98]:
accuracy_score(yytest, y_pred_value)

0.7164179104477612

In [99]:
log_loss(yytest, ypred2)

0.6089694502193537