In [29]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm
import gc
import warnings
from time import time
import xgboost as xgb
xgb.__version__

'1.6.0-dev'

In [4]:
# Load the exported results of the Optuna search and preview the first rows.
optuna_xgb_output_100 = pd.read_csv(filepath_or_buffer='optuna_xgb_output_100.csv')
optuna_xgb_output_100.head(n=5)

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_num_class,params_objective,params_subsample,params_tree_method,state
0,0,0.945909,0.0013,0.522037,mlogloss,3.997992,0.004254,17,184,6,multi:softprob,0.555999,gpu_hist,COMPLETE
1,1,0.961847,0.001564,0.87807,mlogloss,0.001703,0.042972,13,6,6,multi:softprob,0.492398,gpu_hist,COMPLETE
2,2,0.913161,0.006048,0.777071,mlogloss,0.006837,0.006029,3,228,6,multi:softprob,0.983047,gpu_hist,COMPLETE
3,3,0.961159,0.009042,0.510839,mlogloss,0.211926,0.090567,20,8,6,multi:softprob,0.949864,gpu_hist,COMPLETE
4,4,0.961205,0.048,0.506861,mlogloss,0.030254,0.085369,9,187,6,multi:softprob,0.690307,gpu_hist,COMPLETE


In [9]:
# Show the single trial with the highest objective `value`.
optuna_xgb_output_100.sort_values('value', ascending=False).iloc[:1]

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_num_class,params_objective,params_subsample,params_tree_method,state
64,64,0.962117,0.003147,0.787834,mlogloss,5.12532,0.026111,17,30,6,multi:softprob,0.728339,gpu_hist,COMPLETE


In [56]:
# XGBoost parameters used by every cross-validation run below.
# NOTE(review): the best Optuna trial displayed above (number 64) lists
# params_subsample=0.728339 and params_eval_metric=mlogloss, while this dict
# uses subsample=0.8486386194504626 and eval_metric='merror' -- confirm that
# the difference is intentional.
params = {
    # task definition
    'objective': 'multi:softprob',
    'num_class': 6,
    'eval_metric': 'merror',
    # training backend
    'tree_method': 'gpu_hist',
    # regularisation
    'lambda': 5.12532,
    'alpha': 0.003147,
    # sampling
    'colsample_bytree': 0.787834,
    'subsample': 0.8486386194504626,
    # tree growth
    'learning_rate': 0.026111,
    'max_depth': 17,
    'min_child_weight': 30,
}

In [10]:
%%time
# Read the competition train and test tables from the local input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../TPS_2021/input/tabular-playground-series-dec-2021/train.csv',
        '../../TPS_2021/input/tabular-playground-series-dec-2021/test.csv',
    )
)

CPU times: user 6.31 s, sys: 1.04 s, total: 7.35 s
Wall time: 7.77 s


In [11]:
# Remove Soil_Type7 and Soil_Type15 from both frames.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
dropped_soil_columns = ['Soil_Type7', 'Soil_Type15']
train = train.drop(columns=dropped_soil_columns, errors='ignore')
test = test.drop(columns=dropped_soil_columns, errors='ignore')

In [13]:
# Preview the first rows of the test frame.
test.head(n=5)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,4000000,2763,78,20,377,88,3104,218,213,195,...,0,0,0,0,0,0,0,0,0,0
1,4000001,2826,153,11,264,39,295,219,238,148,...,0,0,0,0,0,0,0,0,0,0
2,4000002,2948,57,19,56,44,852,202,217,163,...,0,0,1,0,0,0,0,0,0,0
3,4000003,2926,119,6,158,134,2136,234,240,142,...,0,0,0,0,0,0,0,0,0,0
4,4000004,2690,10,4,38,108,3589,213,221,229,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Preview the first rows of the training frame.
train.head(n=5)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,0,3189,40,8,30,13,3270,206,234,193,...,0,0,0,0,0,0,0,0,0,1
1,1,3026,182,5,280,29,3270,233,240,106,...,0,0,0,0,0,0,0,0,0,2
2,2,3106,13,7,351,37,2914,208,234,137,...,0,0,0,0,0,0,0,0,0,1
3,3,3022,276,13,192,16,3034,207,238,156,...,0,0,0,0,0,0,0,0,0,2
4,4,2906,186,13,266,22,2916,231,231,154,...,0,0,0,0,0,0,0,0,0,2


In [14]:
# Feature columns = every test column except the row identifier.
# Selecting by name rather than position (columns[1:]) keeps the result
# correct even when this cell is executed after 'Id' has already been dropped
# from `test`; the positional slice would then silently discard the first
# real feature instead.
features = test.columns.drop('Id', errors='ignore')

In [16]:
# Raw Cover_Type labels as a NumPy array (reassigned further down, after the
# labels have been remapped).
target = train['Cover_Type'].to_numpy()

In [17]:
# Label permutation applied to Cover_Type (original label -> new label) and
# its inverse (new label -> original label), used to map predictions back.
perm_dict = dict([(1, 2), (2, 1), (3, 3), (4, 6), (7, 4), (6, 5), (5, 7)])
inv_perm = dict((new_label, old_label) for old_label, new_label in perm_dict.items())
inv_perm

{2: 1, 1: 2, 3: 3, 6: 4, 4: 7, 5: 6, 7: 5}

In [18]:
# Apply the label permutation to the training labels.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form operates on an intermediate
# Series and does not update `train` under pandas Copy-on-Write.
train['Cover_Type'] = train['Cover_Type'].replace(perm_dict)

In [19]:
# Keep only the rows whose (remapped) Cover_Type is not 7.
train = train.loc[train['Cover_Type'] != 7]

In [20]:
# Zero-based class labels as a 1-D vector. Selecting the column with a single
# bracket avoids the (n, 1) array produced by train[['Cover_Type']], which the
# splitter, DMatrix labels and accuracy_score otherwise have to flatten.
target = train['Cover_Type'].values - 1

# Remove the label and identifier columns so only feature columns remain.
# Reassignment is used instead of inplace=True because `train` may be the
# result of a boolean filter, where in-place mutation is fragile.
train = train.drop(columns=['Cover_Type', 'Id'])
test = test.drop(columns=['Id'])

In [22]:
# Sanity check: list the distinct class ids present in the target.
np.unique(ar=target)

array([0, 1, 2, 3, 4, 5])

In [27]:
%%time
# Fit a single RobustScaler on train and test stacked row-wise, then overwrite
# the feature columns of both frames with their scaled values. The stacked
# frame is only a temporary; it is released as soon as the fit returns.
scaler = RobustScaler().fit(pd.concat([train, test], axis=0))
gc.collect()
gc.collect()
train[features] = scaler.transform(train)
test[features] = scaler.transform(test)
del scaler
gc.collect()
gc.collect()

CPU times: user 9.95 s, sys: 6.68 s, total: 16.6 s
Wall time: 16.7 s


0

In [35]:
# Wrap the scaled test features in a DMatrix once; it is reused for the test
# predictions of every fold.
xgtest = xgb.DMatrix(data=test[features].to_numpy())

In [58]:
# Accumulators for the cross-validation run: one row of 6 class probabilities
# per training sample, and a running sum for the fold-averaged test predictions.
preds_total = 0
y_oof = np.zeros(shape=(len(train), 6))

In [59]:
%%time
# Run 1: 10-fold stratified cross-validation with the base `params`.
# Each fold trains for 1000 boosting rounds, stores its out-of-fold class
# probabilities in `y_oof`, and adds 1/NFOLDS of its test-set probabilities to
# `preds_total`.
NFOLDS = 10
start_time = time()

# Reset the accumulators in this cell so that re-running it on its own does
# not add a second set of fold predictions on top of an earlier run.
y_oof = np.zeros((train.shape[0], 6))
preds_total = 0

# Materialise the feature matrix once; the original rebuilt it twice per fold.
X_all = train[features].values

kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=137)
for i, (train_index, test_index) in enumerate(kf.split(train, target)):
    print(i)
    y_train, y_val = target[train_index], target[test_index]
    xgtrain = xgb.DMatrix(X_all[train_index], label=y_train)
    xgval = xgb.DMatrix(X_all[test_index], label=y_val)
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]

    clf = xgb.train(params, xgtrain, num_boost_round=1000, evals=watchlist,
                    verbose_eval=100)

    preds = clf.predict(xgtest)   # class probabilities for the test set
    ypred = clf.predict(xgval)    # class probabilities for the held-out fold
    y_oof[test_index] = ypred
    preds_total += preds / NFOLDS
    # Optional per-fold persistence, kept disabled:
    # np.save("preds_hist_"+str(i), preds)
    # np.save("oof_preds_hist_"+str(i), ypred)
    # clf.save_model('xgb_gpu_hist_1812_2_fold_'+str(i)+'.model')
    print('Time elapsed:', time() - start_time)
    del clf
    gc.collect()

    print('Fold Accuracy:', accuracy_score(y_val, ypred.argmax(axis=1)))

del X_all
        

0
[0]	train-merror:0.06849	eval-merror:0.06971
[100]	train-merror:0.04141	eval-merror:0.04482
[200]	train-merror:0.03628	eval-merror:0.04167
[300]	train-merror:0.03248	eval-merror:0.03995
[400]	train-merror:0.03010	eval-merror:0.03918
[500]	train-merror:0.02827	eval-merror:0.03893
[600]	train-merror:0.02665	eval-merror:0.03871
[700]	train-merror:0.02505	eval-merror:0.03870
[800]	train-merror:0.02352	eval-merror:0.03871
[900]	train-merror:0.02205	eval-merror:0.03861
[999]	train-merror:0.02068	eval-merror:0.03865
Time elapsed: 436.9231073856354
Fold Accuracy: 0.9613525
1
[0]	train-merror:0.06851	eval-merror:0.06924
[100]	train-merror:0.04152	eval-merror:0.04442
[200]	train-merror:0.03635	eval-merror:0.04102
[300]	train-merror:0.03249	eval-merror:0.03932
[400]	train-merror:0.03016	eval-merror:0.03866
[500]	train-merror:0.02830	eval-merror:0.03842
[600]	train-merror:0.02670	eval-merror:0.03828
[700]	train-merror:0.02517	eval-merror:0.03826
[800]	train-merror:0.02357	eval-merror:0.03817
[90

In [61]:
# Shape of the out-of-fold prediction matrix.
np.shape(y_oof)

(3999999, 6)

In [42]:
# Shape of the last fold's validation predictions.
np.shape(ypred)

(400000, 6)

In [63]:
# Shape of the fold-averaged test predictions.
np.shape(preds_total)

(1000000, 6)

In [80]:
# Persist the out-of-fold and fold-averaged test probabilities of run 1.
np.save(file='../metafeatures/train/y_oof_xgb_1', arr=y_oof)
np.save(file='../metafeatures/test/preds_total_xgb_1', arr=preds_total)

In [68]:
# Load the sample submission file; its Cover_Type column is overwritten below.
submission = pd.read_csv(
    filepath_or_buffer='../../TPS_2021/input/tabular-playground-series-dec-2021/sample_submission.csv'
)

In [76]:
# Turn the averaged probabilities into labels: argmax gives a zero-based class
# id, +1 restores the permuted label, and inv_perm maps it back to the
# original Cover_Type value.
# The replace result is assigned back rather than applied with inplace=True on
# the column selection, which does not update `submission` under pandas
# Copy-on-Write.
submission['Cover_Type'] = preds_total.argmax(axis=1) + 1
submission['Cover_Type'] = submission['Cover_Type'].replace(inv_perm)
submission.head(10)

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2
5,4000005,3
6,4000006,2
7,4000007,1
8,4000008,2
9,4000009,3


In [78]:
# Sanity check: distinct labels present in the submission.
np.unique(submission['Cover_Type'].to_numpy())

array([1, 2, 3, 4, 6, 7])

In [79]:
# Write the run-1 submission file without the DataFrame index.
submission.to_csv(path_or_buf='../submissions/submission_xgb_best_optuna_1.csv', index=False)

In [82]:
%%time
# Run 2: same 10-fold stratified CV as run 1, but with random_state=224 and
# 900 boosting rounds per fold. Saves the out-of-fold / test probabilities and
# writes the corresponding submission file.
y_oof = np.zeros((train.shape[0], 6))
preds_total = 0
NFOLDS = 10
start_time = time()

# Work on a copy of the shared parameters: setting the seed on the global
# `params` dict (as the original did inside the fold loop) leaks into every
# other cell that is run afterwards.
run_params = {**params, 'random_state': 224}

# Materialise the feature matrix once; the original rebuilt it twice per fold.
X_all = train[features].values

kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=137)
for i, (train_index, test_index) in enumerate(kf.split(train, target)):
    print(i)
    y_train, y_val = target[train_index], target[test_index]
    xgtrain = xgb.DMatrix(X_all[train_index], label=y_train)
    xgval = xgb.DMatrix(X_all[test_index], label=y_val)
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]

    clf = xgb.train(run_params, xgtrain, num_boost_round=900, evals=watchlist,
                    verbose_eval=100)

    preds = clf.predict(xgtest)   # class probabilities for the test set
    ypred = clf.predict(xgval)    # class probabilities for the held-out fold
    y_oof[test_index] = ypred
    preds_total += preds / NFOLDS
    # Optional per-fold persistence, kept disabled:
    # np.save("preds_hist_"+str(i), preds)
    # np.save("oof_preds_hist_"+str(i), ypred)
    # clf.save_model('xgb_gpu_hist_1812_2_fold_'+str(i)+'.model')
    print('Time elapsed:', time() - start_time)
    del clf
    gc.collect()

    print('Fold Accuracy:', accuracy_score(y_val, ypred.argmax(axis=1)))

del X_all

np.save('../metafeatures/train/y_oof_xgb_2', y_oof)
np.save('../metafeatures/test/preds_total_xgb_2', preds_total)

# argmax -> zero-based class id, +1 -> permuted label, inv_perm -> original
# Cover_Type. The replace result is assigned back because the chained
# inplace=True form does not update `submission` under pandas Copy-on-Write.
submission['Cover_Type'] = preds_total.argmax(axis=1) + 1
submission['Cover_Type'] = submission['Cover_Type'].replace(inv_perm)

submission.to_csv('../submissions/submission_xgb_best_optuna_2.csv', index=False)

0
[0]	train-merror:0.06314	eval-merror:0.06440
[100]	train-merror:0.04134	eval-merror:0.04469
[200]	train-merror:0.03625	eval-merror:0.04155
[300]	train-merror:0.03252	eval-merror:0.04005
[400]	train-merror:0.03008	eval-merror:0.03921
[500]	train-merror:0.02824	eval-merror:0.03898
[600]	train-merror:0.02658	eval-merror:0.03876
[700]	train-merror:0.02500	eval-merror:0.03866
[800]	train-merror:0.02344	eval-merror:0.03873
[899]	train-merror:0.02205	eval-merror:0.03873
Time elapsed: 398.19673252105713
Fold Accuracy: 0.961265
1
[0]	train-merror:0.06332	eval-merror:0.06409
[100]	train-merror:0.04144	eval-merror:0.04424
[200]	train-merror:0.03632	eval-merror:0.04102
[300]	train-merror:0.03254	eval-merror:0.03919
[400]	train-merror:0.03014	eval-merror:0.03861
[500]	train-merror:0.02832	eval-merror:0.03833
[600]	train-merror:0.02661	eval-merror:0.03820
[700]	train-merror:0.02510	eval-merror:0.03817
[800]	train-merror:0.02352	eval-merror:0.03803
[899]	train-merror:0.02206	eval-merror:0.03807
Tim

In [85]:
%%time
# Run 3: 10-fold stratified CV with random_state=227, a lower learning rate
# (0.013) and 1500 boosting rounds per fold. Saves the out-of-fold / test
# probabilities and writes the corresponding submission file.
y_oof = np.zeros((train.shape[0], 6))
preds_total = 0
NFOLDS = 10
start_time = time()

# Work on a copy of the shared parameters: the original overwrote
# params['learning_rate'] and params['random_state'] inside the fold loop,
# which permanently changed the settings seen by earlier cells if they were
# re-run afterwards.
run_params = {**params, 'random_state': 227, 'learning_rate': 0.013}

# Materialise the feature matrix once; the original rebuilt it twice per fold.
X_all = train[features].values

kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=137)
for i, (train_index, test_index) in enumerate(kf.split(train, target)):
    print(i)
    y_train, y_val = target[train_index], target[test_index]
    xgtrain = xgb.DMatrix(X_all[train_index], label=y_train)
    xgval = xgb.DMatrix(X_all[test_index], label=y_val)
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]

    clf = xgb.train(run_params, xgtrain, num_boost_round=1500, evals=watchlist,
                    verbose_eval=300)

    preds = clf.predict(xgtest)   # class probabilities for the test set
    ypred = clf.predict(xgval)    # class probabilities for the held-out fold
    y_oof[test_index] = ypred
    preds_total += preds / NFOLDS
    # Optional per-fold persistence, kept disabled:
    # np.save("preds_hist_"+str(i), preds)
    # np.save("oof_preds_hist_"+str(i), ypred)
    # clf.save_model('xgb_gpu_hist_1812_2_fold_'+str(i)+'.model')
    print('Time elapsed:', time() - start_time)
    del clf
    gc.collect()

    print('Fold Accuracy:', accuracy_score(y_val, ypred.argmax(axis=1)))

del X_all

np.save('../metafeatures/train/y_oof_xgb_3', y_oof)
np.save('../metafeatures/test/preds_total_xgb_3', preds_total)

# argmax -> zero-based class id, +1 -> permuted label, inv_perm -> original
# Cover_Type. The replace result is assigned back because the chained
# inplace=True form does not update `submission` under pandas Copy-on-Write.
submission['Cover_Type'] = preds_total.argmax(axis=1) + 1
submission['Cover_Type'] = submission['Cover_Type'].replace(inv_perm)

submission.to_csv('../submissions/submission_xgb_best_optuna_3.csv', index=False)

0
[0]	train-merror:0.06060	eval-merror:0.06245
[300]	train-merror:0.03869	eval-merror:0.04308
[600]	train-merror:0.03251	eval-merror:0.04006
[900]	train-merror:0.02916	eval-merror:0.03891
[1200]	train-merror:0.02667	eval-merror:0.03891
[1499]	train-merror:0.02436	eval-merror:0.03884
Time elapsed: 689.9151842594147
Fold Accuracy: 0.961165
1
[0]	train-merror:0.06064	eval-merror:0.06180
[300]	train-merror:0.03872	eval-merror:0.04239
[600]	train-merror:0.03255	eval-merror:0.03929
[900]	train-merror:0.02921	eval-merror:0.03846
[1200]	train-merror:0.02673	eval-merror:0.03823
[1499]	train-merror:0.02443	eval-merror:0.03812
Time elapsed: 1375.2375202178955
Fold Accuracy: 0.961875
2
[0]	train-merror:0.06088	eval-merror:0.06259
[300]	train-merror:0.03868	eval-merror:0.04297
[600]	train-merror:0.03245	eval-merror:0.03988
[900]	train-merror:0.02917	eval-merror:0.03904
[1200]	train-merror:0.02672	eval-merror:0.03886
[1499]	train-merror:0.02433	eval-merror:0.03875
Time elapsed: 2062.909682750702
Fol