In [2]:
import numpy as np
import pandas as pd
#import shap
import joblib
import time
import argparse
import shap

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import RobustScaler,Normalizer,QuantileTransformer,PowerTransformer,StandardScaler
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder
from sklearn.metrics import mean_squared_error,r2_score,mean_squared_log_error,mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score,cross_validate

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from fastai.tabular.all import *
import optuna

import warnings
warnings.filterwarnings(action="ignore")

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
np.set_printoptions(threshold=1000)
pd.options.display.max_info_columns=1000

DATA_PATH = Path('../data')

In [3]:
df_in = pd.read_csv(DATA_PATH/'df_targ2_gtr.csv')
df_in.head()

Unnamed: 0,date,date_playerId,target1,target2,target3,target4,numberOfFollowers,plateAppearances,numberOfFollowersT,ageStart,dayNight,age,atBats,totalBases,primaryPositionName,leagueRankT,pctT,saves,yearPlay,BMI,rbi,playerId,homeRuns,homeWinsT,gamesStartedPitching,runsScored,birthCountry,lastTenLossesT,runsScoredT,grassWinsT,birthStateProvince,gamesInSeries,birthCity,hits,winsPitching,sportGamesBackT,dayLossesT,awayWinPct,strikeOutsPitching,divisionIdT,lastTenWinsT,grassLossesT,homeLossesG,awayWinsG,lossesPitching,leagueGamesBackT,outsPitching,saveOpportunities,inningsPitched,divisionGamesBackT,awayLossesT,leftOnBase,baseOnBalls,lossesT,turfWinsT,homeLossesT,bAward,bTrans
0,20180329,20180330_571466,0.002557,5.806027,0.042254,1.180622,29144,3,821935,23,0.0,30,3,0,0.0,7,0.0,0,7,26.77564,0,275.0,0,0,0,0,21.0,0,0,0,18.0,0,274.0,0,0,0,0,0.0,0,5.0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0
1,20180329,20180330_518934,0.975237,4.217009,0.876319,0.801785,0,5,518432,22,0.0,33,2,0,6.0,10,0.0,0,11,26.776316,0,150.0,0,0,0,0,21.0,1,2,0,5.0,0,655.0,0,0,1,0,0.0,0,3.0,0,1,0,0,0,1,0,0,0,1,1,0,3,1,0,0,0,0
2,20180329,20180330_593974,9.8e-05,3.431903,0.002296,0.246504,74,0,821935,25,0.0,29,0,0,5.0,7,0.0,0,4,29.427276,0,396.0,0,0,0,0,9.0,0,0,0,42.0,0,533.0,0,0,0,0,0.0,0,5.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,20180329,20180330_475054,0.200101,5.063232,0.016994,0.682426,0,0,535665,26,0.0,34,0,0,5.0,7,1.0,0,8,22.528251,0,75.0,0,1,0,0,21.0,0,6,0,38.0,0,592.0,0,0,0,0,0.0,1,1.0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0
4,20180329,20180330_640461,0.014848,2.153166,0.079916,0.225745,0,2,533453,24,0.0,29,2,0,4.0,5,1.0,0,5,26.959459,0,710.0,0,1,0,0,21.0,0,6,1,54.0,0,502.0,0,0,0,0,0.0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [3]:
param_lgb = {
    'device_type':'gpu',
    'gpu_device_id':1,
    'gpu_platform_id': 0,
    #'gpu_use_dp': False,
    'n_jobs' : 2,
    'seed':42,
    
    }


In [4]:
df_in = pd.read_csv(DATA_PATH/'targets_gameV2.csv')
len(df_in.columns),len(df_in)

(139, 137251)

In [5]:
df_in2 = pd.read_csv(DATA_PATH/'targets_nogameV2.csv')
df_in2.dropna(axis=1,inplace=True)
len(df_in2.columns),len(df_in2)

(122, 579404)

In [6]:
#df_in2.isna().sum()
#df_in2.head()

In [7]:
col_key = ['engagementMetricsDate','date','gamePk']
col_label = ['target1','target2','target3','target4']
col_num = ['age','ageStart','yearPlay','BMI','numberOfFollowers',
           'divisionRank','leagueRank','leagueGamesBack','sportGamesBack',
           'divisionGamesBack','wins','losses','pct','runsAllowed','runsScoredT',
           'homeWins','homeLosses','awayWins','awayLosses','lastTenWins',
           'lastTenLosses','extraInningWins','extraInningLosses','oneRunWins',
           'oneRunLosses','dayWins','dayLosses','nightWins','nightLosses','grassWins',
           'grassLosses','turfWins','turfLosses','divWins','divLosses',
           'numberOfFollowersT','flyOuts','groundOuts','runsScored','doubles',
           'triples','homeRuns','strikeOuts','baseOnBalls','intentionalWalks',
           'hits','hitByPitch','atBats','caughtStealing','stolenBases',
           'groundIntoDoublePlay','groundIntoTriplePlay','plateAppearances','totalBases',
           'rbi','leftOnBase','sacBunts','sacFlies','catchersInterference','pickoffs',
           'flyOutsPitching','airOutsPitching','groundOutsPitching','runsPitching',
           'doublesPitching','triplesPitching','homeRunsPitching','strikeOutsPitching',
           'baseOnBallsPitching','intentionalWalksPitching','hitsPitching',
           'hitByPitchPitching','atBatsPitching','caughtStealingPitching','stolenBasesPitching',
           'inningsPitched','earnedRuns','battersFaced','outsPitching','pitchesThrown',
           'balls','strikes','hitBatsmen','balks','wildPitches','pickoffsPitching',
           'rbiPitching','gamesFinishedPitching','inheritedRunners',
           'inheritedRunnersScored','catchersInterferencePitching','sacBuntsPitching',
           'sacFliesPitching','gamesInSeries','homeWinsT','homeLossesT','homeWinPct',
           'awayWinsT','awayLossesT','awayWinPct','scheduledInnings', 'gameNumber','battingOrder'
          ]
col_cat = ['birthCity','birthStateProvince','birthCountry','primaryPositionName',
          'status','awardName','divisionId','teamId','playerId',
           'detailedGameState', 'doubleHeader', 'dayNight', 'seriesDescription'
         ]
col_flag = ['divisionChamp','divisionLeader','gamesPlayedBatting',
            'gamesPlayedPitching','gamesStartedPitching','completeGamesPitching',
            'shutoutsPitching','winsPitching','lossesPitching','saveOpportunities',
            'saves','holds','blownSaves','isTie','homeWinner','awayWinner'
           ]
len(col_key+col_label+col_num+col_cat+col_flag)
col_miss = [x for x in df_in.columns if x not in col_key+col_label+col_num+col_cat+col_flag]
col_miss

[]

In [8]:
#df_in2.head(2)

In [9]:
col_key2 = ['engagementMetricsDate','date']
col_label2 = ['target1','target2','target3','target4']
col_num2 = ['age','ageStart','yearPlay','BMI','numberOfFollowers',
           'divisionRank','leagueRank','leagueGamesBack','sportGamesBack',
           'divisionGamesBack','wins','losses','pct','runsAllowed','runsScoredT',
           'homeWins','homeLosses','awayWins','awayLosses','lastTenWins',
           'lastTenLosses','extraInningWins','extraInningLosses','oneRunWins',
           'oneRunLosses','dayWins','dayLosses','nightWins','nightLosses','grassWins',
           'grassLosses','turfWins','turfLosses','divWins','divLosses',
           'numberOfFollowersT','flyOuts','groundOuts','runsScored','doubles',
           'triples','homeRuns','strikeOuts','baseOnBalls','intentionalWalks',
           'hits','hitByPitch','atBats','caughtStealing','stolenBases',
           'groundIntoDoublePlay','groundIntoTriplePlay','plateAppearances','totalBases',
           'rbi','leftOnBase','sacBunts','sacFlies','catchersInterference','pickoffs',
           'flyOutsPitching','airOutsPitching','groundOutsPitching','runsPitching',
           'doublesPitching','triplesPitching','homeRunsPitching','strikeOutsPitching',
           'baseOnBallsPitching','intentionalWalksPitching','hitsPitching',
           'hitByPitchPitching','atBatsPitching','caughtStealingPitching','stolenBasesPitching',
           'inningsPitched','earnedRuns','battersFaced','outsPitching','pitchesThrown',
           'balls','strikes','hitBatsmen','balks','wildPitches','pickoffsPitching',
           'rbiPitching','gamesFinishedPitching','inheritedRunners',
           'inheritedRunnersScored','catchersInterferencePitching','sacBuntsPitching',
           'sacFliesPitching','battingOrder'
          ]
col_cat2 = ['birthCity','birthStateProvince','birthCountry','primaryPositionName',
          'status','awardName','divisionId','teamId','playerId',
         ]
col_flag2 = ['divisionChamp','divisionLeader','gamesPlayedBatting',
            'gamesPlayedPitching','gamesStartedPitching','completeGamesPitching',
            'shutoutsPitching','winsPitching','lossesPitching','saveOpportunities',
            'saves','holds','blownSaves'
           ]
len(col_key2+col_label2+col_num2+col_cat2+col_flag2)
col_miss2 = [x for x in df_in2.columns if x not in col_key2+col_label2+col_num2+col_cat2+col_flag2]
col_miss2

[]

In [10]:
df_in[col_flag] = df_in[col_flag].astype('str')
df_in2[col_flag2] = df_in2[col_flag2].astype('str')

In [11]:
# target1: player+team
col_t1_game = [
    'rbi','homeRuns','numberOfFollowers','numberOfFollowersT',
    'plateAppearances','totalBases','ageStart','primaryPositionName',
    'dayNight','pct','inningsPitched','grassWins','grassLosses','dayWins','age',
    'atBatsPitching','hitsPitching','gamesStartedPitching','winsPitching','leagueRank',
    'strikeOutsPitching','divWins','playerId','hits','awayLosses',
    'divisionGamesBack','BMI','saves','homeLossesT','teamId','awayWins',
    'lastTenWins','leagueGamesBack','yearPlay','runsPitching',
    'gamesInSeries','oneRunWins','airOutsPitching','wins',
    'birthCity','divisionRank','awardName','earnedRuns',
    'divLosses','awayLossesT','extraInningLosses','homeWins',
    'awayWinPct','turfWins','battingOrder'
]
#
col_t1_nogame = [
    'status','numberOfFollowers','primaryPositionName','age',
    'ageStart','yearPlay','numberOfFollowersT','playerId',
    'divLosses','runsAllowed','BMI','dayWins','pct','teamId',
    'battingOrder'
    
]

col_t1_game_cat = [x for x in col_cat+col_flag if x in col_t1_game]
col_t1_nogame_cat = [x for x in col_cat+col_flag if x in col_t1_nogame]

In [12]:
# all
#cat_encorder = OrdinalEncoder()
#df_in[col_cat+col_flag] = cat_encorder.fit_transform(df_in[col_cat+col_flag])

#cat_encorder2 = OrdinalEncoder()
#df_in2[col_cat2+col_flag2] = cat_encorder2.fit_transform(df_in2[col_cat2+col_flag2])

#target1
df_in = df_in[col_key+col_label+col_t1_game].copy(deep=True)
cat_encorder = OrdinalEncoder()
df_in[col_t1_game_cat] = cat_encorder.fit_transform(df_in[col_t1_game_cat])

df_in2 = df_in2[col_key2+col_label2+col_t1_nogame].copy(deep=True)
cat_encorder2 = OrdinalEncoder()
df_in2[col_t1_nogame_cat] = cat_encorder2.fit_transform(df_in2[col_t1_nogame_cat])


In [13]:
#df_in11 = pd.get_dummies(df_in, columns=col_flag)
#df_in22 = pd.get_dummies(df_in2, columns=col_flag2)

In [14]:
#df_in2.head(2)
#df_in.head(1)
#
#df_cat.shape,len(col_cat)
#cat_encorder.categories_
#df_cat

In [15]:
dataTrain = df_in[df_in.date<20210401].copy(deep=True)
dataTest = df_in[df_in.date>=20210401].copy(deep=True)

dataTrain2 = df_in2[df_in2.date<20210401].copy(deep=True)
dataTest2 = df_in2[df_in2.date>=20210401].copy(deep=True)

In [16]:
dataTrain.shape, dataTest.shape,dataTrain2.shape, dataTest2.shape

((126010, 57), (11241, 57), (554416, 21), (24988, 21))

In [17]:
'''
# stardar scaler
scaler = StandardScaler()
dataTrain[col_num+col_cat+col_flag] = scaler.fit_transform(dataTrain[col_num+col_cat+col_flag])
dataTest[col_num+col_cat+col_flag] = scaler.transform(dataTest[col_num+col_cat+col_flag])
#dataTrain[col_num] = scaler.fit_transform(dataTrain[col_num])
#dataTest[col_num] = scaler.transform(dataTest[col_num])

#
scaler2 = StandardScaler()
dataTrain2[col_num2+col_cat2+col_flag2] = scaler2.fit_transform(dataTrain2[col_num2+col_cat2+col_flag2])
dataTest2[col_num2+col_cat2+col_flag2] = scaler2.transform(dataTest2[col_num2+col_cat2+col_flag2])
'''

'\n# stardar scaler\nscaler = StandardScaler()\ndataTrain[col_num+col_cat+col_flag] = scaler.fit_transform(dataTrain[col_num+col_cat+col_flag])\ndataTest[col_num+col_cat+col_flag] = scaler.transform(dataTest[col_num+col_cat+col_flag])\n#dataTrain[col_num] = scaler.fit_transform(dataTrain[col_num])\n#dataTest[col_num] = scaler.transform(dataTest[col_num])\n\n#\nscaler2 = StandardScaler()\ndataTrain2[col_num2+col_cat2+col_flag2] = scaler2.fit_transform(dataTrain2[col_num2+col_cat2+col_flag2])\ndataTest2[col_num2+col_cat2+col_flag2] = scaler2.transform(dataTest2[col_num2+col_cat2+col_flag2])\n'

In [17]:
trainX1 = dataTrain.drop(columns=col_label+col_key)
trainy1 = dataTrain[col_label]
    
testX1 = dataTest.drop(columns=col_label+col_key)
testy1 = dataTest[col_label]

trainX2 = dataTrain2.drop(columns=col_label2+col_key2)
trainy2 = dataTrain2[col_label2]
    
testX2 = dataTest2.drop(columns=col_label2+col_key2)
testy2 = dataTest2[col_label2]

In [18]:
trainX1.head(1)

Unnamed: 0,rbi,homeRuns,numberOfFollowers,numberOfFollowersT,plateAppearances,totalBases,ageStart,primaryPositionName,dayNight,pct,inningsPitched,grassWins,grassLosses,dayWins,age,atBatsPitching,hitsPitching,gamesStartedPitching,winsPitching,leagueRank,strikeOutsPitching,divWins,playerId,hits,awayLosses,divisionGamesBack,BMI,saves,homeLossesT,teamId,awayWins,lastTenWins,leagueGamesBack,yearPlay,runsPitching,gamesInSeries,oneRunWins,airOutsPitching,wins,birthCity,divisionRank,awardName,earnedRuns,divLosses,awayLossesT,extraInningLosses,homeWins,awayWinPct,turfWins,battingOrder
0,0.0,0.0,8446.0,543804.0,2.0,1.0,26.0,5.0,0.0,1.0,6.0,1.0,0.0,1.0,33.0,19.0,1.0,1.0,2.0,4.0,6.0,0.0,118.0,1.0,0.0,0.0,27.703135,2.0,1.0,29.0,1.0,1.0,0.0,7.0,0.0,3.0,1.0,7.0,1.0,655.0,2.0,71.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,900.0


In [19]:
trainX2.head(2)

Unnamed: 0,status,numberOfFollowers,primaryPositionName,age,ageStart,yearPlay,numberOfFollowersT,playerId,divLosses,runsAllowed,BMI,dayWins,pct,teamId,battingOrder
0,15.0,0.0,5.0,28.0,26.0,2.0,0.0,867.0,0.0,0.0,29.681929,0.0,0.0,0.0,0.0
1,15.0,569.0,5.0,26.0,25.0,1.0,0.0,737.0,0.0,0.0,29.156057,0.0,0.0,0.0,0.0


In [20]:
def optimize(trial,x,y,testx,testy):
    
    #metric=[]
    #cv = KFold(n_splits=5,shuffle=True,random_state=42)
    
    param = {
    'num_iterations': 10000, 
    'early_stopping_round': 100,
    'seed':42,
    'n_jobs' : 2,
    #'verbose_eval' : False,
    #'verbose': 10000,
    #'gpu_use_dp': False,
    'device_type': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    #'boosting_type': 'gbdt',
    #'objective': 'mse',
    'objective': trial.suggest_categorical('objective',['mse','mae','huber']),
    'learning_rate' : trial.suggest_loguniform('learning_rate', 1e-4,1e-1),
    'n_estimators' : trial.suggest_int('n_estimators',100,5000),
    'max_depth' : trial.suggest_int('max_depth', 3, 20),
    'num_leaves' : trial.suggest_int('num_leaves', 2, 2**17),
    'max_bin' : trial.suggest_int('max_bin', 10, 250), 
    'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.1, 1),
    'bagging_fraction' : trial.suggest_uniform('bagging_fraction', 0.1, 1),    
    'bagging_freq' : trial.suggest_int('bagging_freq', 1, 100),
    'min_sum_hessian_in_leaf' : trial.suggest_int('min_sum_hessian_in_leaf', 1, 10),
    'reg_alpha' : trial.suggest_loguniform('reg_alpha', 1e-5, 1e-1),
    'reg_lambda' : trial.suggest_loguniform('reg_lambda', 1e-5, 1e-1),
}

    ml = lgb.LGBMRegressor(**param)
    ml.fit(x,y,eval_set=[(testx, testy)],verbose=10000)
    val_pred  = ml.predict(testx)

    # metric
    metric = mean_absolute_error(testy,val_pred)
    return metric

In [21]:
optimization_function = partial(optimize, x=trainX1.values, y=trainy1['target1'].values,testx=testX1.values,testy=testy1['target1'].values)
study = optuna.create_study(direction="minimize")

start = time.time()
study.optimize(optimization_function,n_trials=100)
end = time.time()

print('Time consumed (min): {:.1f}'.format((end-start)/60))
print('Best trial: ')
print(study.best_params)

Training until validation scores don't improve for 100 rounds
[10000]	valid_0's huber: 2.36054
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 2.36054


[I 2021-07-27 11:42:11,326] Finished trial#0 with value: 2.92418904870169 with parameters: {'objective': 'huber', 'learning_rate': 0.002688896053288635, 'n_estimators': 4751, 'max_depth': 10, 'num_leaves': 67782, 'max_bin': 24, 'feature_fraction': 0.7330355889442327, 'bagging_fraction': 0.6702957782261766, 'bagging_freq': 80, 'min_sum_hessian_in_leaf': 9, 'reg_alpha': 0.0028952847773464235, 'reg_lambda': 0.000884755861629172}. Best is trial#0 with value: 2.92418904870169.


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's l2: 64.7166


[I 2021-07-27 11:42:22,470] Finished trial#1 with value: 3.3954009011203388 with parameters: {'objective': 'mse', 'learning_rate': 0.06299550682052982, 'n_estimators': 2853, 'max_depth': 16, 'num_leaves': 2375, 'max_bin': 130, 'feature_fraction': 0.8855771380715233, 'bagging_fraction': 0.6671070746766784, 'bagging_freq': 68, 'min_sum_hessian_in_leaf': 5, 'reg_alpha': 3.9165472903890386e-05, 'reg_lambda': 0.00020852906489794254}. Best is trial#0 with value: 2.92418904870169.


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[772]	valid_0's l1: 2.79904


[I 2021-07-27 11:46:15,249] Finished trial#2 with value: 2.7990400774222923 with parameters: {'objective': 'mae', 'learning_rate': 0.015495476597067143, 'n_estimators': 2995, 'max_depth': 17, 'num_leaves': 27902, 'max_bin': 214, 'feature_fraction': 0.8378470332294045, 'bagging_fraction': 0.732016804022755, 'bagging_freq': 26, 'min_sum_hessian_in_leaf': 1, 'reg_alpha': 1.8651162979119317e-05, 'reg_lambda': 5.8653439715796546e-05}. Best is trial#2 with value: 2.7990400774222923.


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[554]	valid_0's l1: 2.89102


[I 2021-07-27 11:46:18,384] Finished trial#3 with value: 2.8910222048946617 with parameters: {'objective': 'mae', 'learning_rate': 0.04721453679549863, 'n_estimators': 3865, 'max_depth': 3, 'num_leaves': 10767, 'max_bin': 236, 'feature_fraction': 0.5399649792561161, 'bagging_fraction': 0.10203017908811625, 'bagging_freq': 78, 'min_sum_hessian_in_leaf': 10, 'reg_alpha': 4.5628245201707806e-05, 'reg_lambda': 3.713812242195577e-05}. Best is trial#2 with value: 2.7990400774222923.


Training until validation scores don't improve for 100 rounds
[10000]	valid_0's huber: 2.34009
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 2.34009


[I 2021-07-27 11:48:36,302] Finished trial#4 with value: 2.8988641100606825 with parameters: {'objective': 'huber', 'learning_rate': 0.005441474001862024, 'n_estimators': 2790, 'max_depth': 7, 'num_leaves': 26522, 'max_bin': 13, 'feature_fraction': 0.9378989416375815, 'bagging_fraction': 0.5412601972832596, 'bagging_freq': 12, 'min_sum_hessian_in_leaf': 8, 'reg_alpha': 0.00014479767982276853, 'reg_lambda': 5.08340098520591e-05}. Best is trial#2 with value: 2.7990400774222923.


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4704]	valid_0's huber: 2.36427


[I 2021-07-27 12:07:21,896] Finished trial#5 with value: 2.940889213137328 with parameters: {'objective': 'huber', 'learning_rate': 0.006817215805053559, 'n_estimators': 1770, 'max_depth': 17, 'num_leaves': 66543, 'max_bin': 116, 'feature_fraction': 0.9778260581790854, 'bagging_fraction': 0.3093591159405502, 'bagging_freq': 42, 'min_sum_hessian_in_leaf': 9, 'reg_alpha': 1.5411863519888612e-05, 'reg_lambda': 0.00041000431007555367}. Best is trial#2 with value: 2.7990400774222923.


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[514]	valid_0's l2: 64.2473


[I 2021-07-27 12:07:41,002] Finished trial#6 with value: 3.257810139754814 with parameters: {'objective': 'mse', 'learning_rate': 0.010511245577614086, 'n_estimators': 354, 'max_depth': 9, 'num_leaves': 52641, 'max_bin': 191, 'feature_fraction': 0.9910519699527515, 'bagging_fraction': 0.3569612357168005, 'bagging_freq': 32, 'min_sum_hessian_in_leaf': 8, 'reg_alpha': 0.022989628737517854, 'reg_lambda': 0.023010209042198027}. Best is trial#2 with value: 2.7990400774222923.


Training until validation scores don't improve for 100 rounds
[10000]	valid_0's huber: 2.57515
Did not meet early stopping. Best iteration is:
[10000]	valid_0's huber: 2.57515


[I 2021-07-27 12:16:06,453] Finished trial#7 with value: 3.2241860005542167 with parameters: {'objective': 'huber', 'learning_rate': 0.0005631370650803901, 'n_estimators': 2085, 'max_depth': 10, 'num_leaves': 122497, 'max_bin': 196, 'feature_fraction': 0.25866449650255463, 'bagging_fraction': 0.603663994609523, 'bagging_freq': 84, 'min_sum_hessian_in_leaf': 3, 'reg_alpha': 1.0625609899674857e-05, 'reg_lambda': 9.149978835991903e-05}. Best is trial#2 with value: 2.7990400774222923.


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4278]	valid_0's l2: 65.1082


[I 2021-07-27 12:16:23,121] Finished trial#8 with value: 3.2951201400904444 with parameters: {'objective': 'mse', 'learning_rate': 0.002049091358328922, 'n_estimators': 4745, 'max_depth': 3, 'num_leaves': 130324, 'max_bin': 24, 'feature_fraction': 0.49816197303645227, 'bagging_fraction': 0.31289789739099294, 'bagging_freq': 93, 'min_sum_hessian_in_leaf': 2, 'reg_alpha': 0.00016996933582342995, 'reg_lambda': 0.004785300779877857}. Best is trial#2 with value: 2.7990400774222923.


Training until validation scores don't improve for 100 rounds
[10000]	valid_0's l1: 2.93935
Did not meet early stopping. Best iteration is:
[10000]	valid_0's l1: 2.93935


[I 2021-07-27 12:17:27,194] Finished trial#9 with value: 2.939351014236521 with parameters: {'objective': 'mae', 'learning_rate': 0.001178726679104098, 'n_estimators': 260, 'max_depth': 3, 'num_leaves': 5014, 'max_bin': 67, 'feature_fraction': 0.17482568807325616, 'bagging_fraction': 0.5880015619440387, 'bagging_freq': 51, 'min_sum_hessian_in_leaf': 3, 'reg_alpha': 0.0004620642901005423, 'reg_lambda': 0.0006861186767326661}. Best is trial#2 with value: 2.7990400774222923.


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[993]	valid_0's l1: 2.79482


[I 2021-07-27 12:32:37,866] Finished trial#10 with value: 2.794818001402892 with parameters: {'objective': 'mae', 'learning_rate': 0.013841374498520211, 'n_estimators': 3568, 'max_depth': 20, 'num_leaves': 98164, 'max_bin': 239, 'feature_fraction': 0.7283295368122418, 'bagging_fraction': 0.9136141514902671, 'bagging_freq': 2, 'min_sum_hessian_in_leaf': 1, 'reg_alpha': 0.002257673034451712, 'reg_lambda': 1.0126890719988943e-05}. Best is trial#10 with value: 2.794818001402892.


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[678]	valid_0's l1: 2.80353


[I 2021-07-27 12:44:10,178] Finished trial#11 with value: 2.803525841535144 with parameters: {'objective': 'mae', 'learning_rate': 0.02111778723447647, 'n_estimators': 3560, 'max_depth': 20, 'num_leaves': 103616, 'max_bin': 250, 'feature_fraction': 0.7350996933679534, 'bagging_fraction': 0.994721284282139, 'bagging_freq': 1, 'min_sum_hessian_in_leaf': 1, 'reg_alpha': 0.003027878572142937, 'reg_lambda': 1.0464837794233616e-05}. Best is trial#10 with value: 2.794818001402892.


Training until validation scores don't improve for 100 rounds


In [None]:
optimization_function = partial(optimize, x=trainX2.values, y=trainy2['target1'].values,testx=testX2.values,testy=testy2['target1'].values)
study = optuna.create_study(direction="minimize")

start = time.time()
study.optimize(optimization_function,n_trials=100)
end = time.time()

print('Time consumed (min): {:.1f}'.format((end-start)/60))
print('Best trial: ')
print(study.best_params)

In [None]:
#model1 = lgb.LGBMRegressor(**params)
#model1.fit(trainX1.values,trainy1['target1'].values,eval_set=[(testX1.values, testy1['target1'].values)],verbose=100)
#joblib.dump(model1,'target1.pkl')
#model_score = cross_validate(model,trainX.values,trainy.values,scoring=scoring,cv=5)
#print('test abs:', model_score['test_mn_abs'], '\ntest sq', model_score['test_mn_sq'])

#model2 = lgb.LGBMRegressor(**params)
#model2.fit(trainX1.values,trainy1['target2'].values,eval_set=[(testX1.values, testy1['target2'].values)],verbose=100)
#joblib.dump(model2,'target2.pkl')

#model3 = lgb.LGBMRegressor(**params)
#model3.fit(trainX1.values,trainy1['target3'].values,eval_set=[(testX1.values, testy1['target3'].values)],verbose=100)
#joblib.dump(model3,'target3.pkl')

#model4 = lgb.LGBMRegressor(**params)
#model4.fit(trainX1.values,trainy1['target4'].values,eval_set=[(testX1.values, testy1['target4'].values)],verbose=100)
#joblib.dump(model4,'target4.pkl')

In [None]:
#pred1 = model1.predict(testX1.values)
#pred2 = model2.predict(testX1.values)
#pred3 = model3.predict(testX1.values)
#pred4 = model4.predict(testX1.values)

#score_lgb1 = mean_absolute_error(pred1, testy1['target1'].values)
#score_lgb2 = mean_absolute_error(pred2, testy1['target2'].values)
#score_lgb3 = mean_absolute_error(pred3, testy1['target3'].values)
#score_lgb4 = mean_absolute_error(pred4, testy1['target4'].values)
#score1 = (score_lgb1+score_lgb2+score_lgb3+score_lgb4)/4
#print('mae:', score_lgb1,score_lgb2,score_lgb3,score_lgb4)
#print('score1:', score1)
#print(score_lgb1)

In [None]:
#explainer = shap.Explainer(model4)
#shap_values = explainer(testX1)
#shap.plots.waterfall(shap_values[0])
#shap.plots.bar(shap_values,max_display=50)

In [None]:
#shap_values = explainer(trainX1)
#shap.plots.waterfall(shap_values[0])
#shap.plots.bar(shap_values,max_display=50)

In [None]:
# nogame

#model1 = lgb.LGBMRegressor(**params1)
#model1.fit(trainX2.values,trainy2['target1'].values,eval_set=[(testX2.values, testy2['target1'].values)],verbose=100)
#joblib.dump(model1,'target1_nogame.pkl')
#model_score = cross_validate(model,trainX.values,trainy.values,scoring=scoring,cv=5)
#print('test abs:', model_score['test_mn_abs'], '\ntest sq', model_score['test_mn_sq'])

#model2 = lgb.LGBMRegressor(**params)
#model2.fit(trainX2.values,trainy2['target2'].values,eval_set=[(testX2.values, testy2['target2'].values)],verbose=100)
#joblib.dump(model2,'target2_nogame.pkl')

#model3 = lgb.LGBMRegressor(**params)
#model3.fit(trainX2.values,trainy2['target3'].values,eval_set=[(testX2.values, testy2['target3'].values)],verbose=100)
#joblib.dump(model3,'target3_nogame.pkl')

#model4 = lgb.LGBMRegressor(**params)
#model4.fit(trainX2.values,trainy2['target4'].values,eval_set=[(testX2.values, testy2['target4'].values)],verbose=100)
#joblib.dump(model4,'target4_nogame.pkl')

In [None]:
#pred1 = model1.predict(testX2.values)
#pred2 = model2.predict(testX2.values)
#pred3 = model3.predict(testX2.values)
#pred4 = model4.predict(testX2.values)

In [None]:
#score_lgb1 = mean_absolute_error(pred1, testy2['target1'].values)
#score_lgb2 = mean_absolute_error(pred2, testy2['target2'].values)
#score_lgb3 = mean_absolute_error(pred3, testy2['target3'].values)
#score_lgb4 = mean_absolute_error(pred4, testy2['target4'].values)
#score2 = (score_lgb1+score_lgb2+score_lgb3+score_lgb4)/4
#print('mae:', score_lgb1,score_lgb2,score_lgb3,score_lgb4)
#print('score2:', score2)
#print(score_lgb1)

In [None]:
#explainer2 = shap.Explainer(model4)
#shap_values2 = explainer2(testX2)
#shap.plots.waterfall(shap_values[0])
#shap.plots.bar(shap_values2,max_display=50)

In [None]:
#shap_values2 = explainer2(trainX2)
#shap.plots.waterfall(shap_values[0])
#shap.plots.bar(shap_values2,max_display=50)

In [None]:
#(score1+score2)/2

In [None]:
'''
trainX = df_in.drop(columns=['Quantity_norm'])
trainy = df_in.Quantit_norm

model = xgb.XGBRegressor(**param_xgb)
model.fit(trainX.values,trainy.values)

''';

In [None]:
'''
trainX, testX, trainy, testy = train_test_split(
    df_in.drop(columns=['Quantity_norm']), df_in.Quantity_norm, 
    test_size=0.3, random_state=42,
    shuffle=True,stratify=df_in.CreationMonth.values
    )

model = CatBoostRegressor(**param_cb)
model.fit(trainX.values,trainy.values, eval_set=(testX.values,testy.values),use_best_model=True)

''';