In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm
import gc
import warnings
from time import time
import xgboost as xgb
xgb.__version__

'1.6.0-dev'

In [3]:
# Load the saved hyper-parameter search results (one row per trial: the trial
# score is in `value`, the sampled settings are in the `params_*` columns)
# and preview the first rows.
optuna_xgb_output_100 = pd.read_csv('optuna_xgb_fe_output.csv')
optuna_xgb_output_100.head()

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_num_class,params_objective,params_subsample,params_tree_method,state
0,0,0.956893,0.154753,0.938749,mlogloss,0.779656,0.001448,20,73,6,multi:softprob,0.483028,gpu_hist,COMPLETE
1,1,0.953298,0.030681,0.40686,mlogloss,8.650397,0.050611,3,59,6,multi:softprob,0.738625,gpu_hist,COMPLETE
2,2,0.92948,0.001916,0.868243,mlogloss,0.010653,0.001334,5,32,6,multi:softprob,0.459071,gpu_hist,COMPLETE
3,3,0.913466,0.072575,0.301956,mlogloss,0.30736,0.006653,3,245,6,multi:softprob,0.963456,gpu_hist,COMPLETE
4,4,0.962164,1.413603,0.904432,mlogloss,0.756485,0.037433,20,190,6,multi:softprob,0.564118,gpu_hist,COMPLETE


In [4]:
# Show the single trial with the highest `value`; its settings are the ones
# typed into the `params` dict in the following cell.
optuna_xgb_output_100.sort_values(by='value', ascending=False).head(1)

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_num_class,params_objective,params_subsample,params_tree_method,state
143,143,0.962738,0.005631,0.643151,mlogloss,3.121097,0.022424,11,17,6,multi:softprob,0.82267,gpu_hist,COMPLETE


In [5]:
# Settings of the top-scoring search trial (number 143), entered by hand.
# The search reported `mlogloss`; here the watchlist reports the multiclass
# error rate (`merror`) instead.
params = {
    # task / backend
    "objective": "multi:softprob",
    "tree_method": "gpu_hist",
    # regularisation
    "lambda": 3.121097,
    "alpha": 0.005631,
    # column / row sampling
    "colsample_bytree": 0.643151,
    "subsample": 0.82267,
    # boosting step and tree shape
    "learning_rate": 0.022424,
    "max_depth": 11,
    "min_child_weight": 17,
    # six classes remain after one Cover_Type label is removed from train
    "num_class": 6,
    "eval_metric": "merror",
}

In [10]:
%%time
# Read the competition train / test CSVs. The paths are relative to the
# notebook's working directory, so the TPS_2021 input folder must sit two
# levels up.
train = pd.read_csv('../../TPS_2021/input/tabular-playground-series-dec-2021/train.csv')
test = pd.read_csv('../../TPS_2021/input/tabular-playground-series-dec-2021/test.csv')

CPU times: user 5.78 s, sys: 461 ms, total: 6.24 s
Wall time: 6.24 s


In [11]:
# Remove the two soil indicator columns and the row identifier from both
# frames, modifying them in place.
for frame in (train, test):
    frame.drop(columns=['Soil_Type7', 'Soil_Type15', 'Id'], inplace=True)

In [8]:
# Preview the test frame.
# NOTE(review): this cell's execution count (8) is lower than that of the
# column-drop cell above (11), so the saved output still shows `Id`; on a
# fresh top-to-bottom run that column has already been removed here.
test.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,4000000,2763,78,20,377,88,3104,218,213,195,...,0,0,0,0,0,0,0,0,0,0
1,4000001,2826,153,11,264,39,295,219,238,148,...,0,0,0,0,0,0,0,0,0,0
2,4000002,2948,57,19,56,44,852,202,217,163,...,0,0,1,0,0,0,0,0,0,0
3,4000003,2926,119,6,158,134,2136,234,240,142,...,0,0,0,0,0,0,0,0,0,0
4,4000004,2690,10,4,38,108,3589,213,221,229,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Preview the train frame.
# NOTE(review): executed (count 9) before the column-drop cell above
# (count 11), so the saved output still lists `Id`; a fresh run will not.
train.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,0,3189,40,8,30,13,3270,206,234,193,...,0,0,0,0,0,0,0,0,0,1
1,1,3026,182,5,280,29,3270,233,240,106,...,0,0,0,0,0,0,0,0,0,2
2,2,3106,13,7,351,37,2914,208,234,137,...,0,0,0,0,0,0,0,0,0,1
3,3,3022,276,13,192,16,3034,207,238,156,...,0,0,0,0,0,0,0,0,0,2
4,4,2906,186,13,266,22,2916,231,231,154,...,0,0,0,0,0,0,0,0,0,2


In [12]:
# Shorter aliases for the long distance column names.
new_names = {
    "Horizontal_Distance_To_Hydrology": "x_dist_hydrlgy",
    "Vertical_Distance_To_Hydrology": "y_dist_hydrlgy",
    "Horizontal_Distance_To_Roadways": "x_dist_rdwys",
    "Horizontal_Distance_To_Fire_Points": "x_dist_firepts"
}

for frame in (train, test):
    frame.rename(columns=new_names, inplace=True)

    # Wrap Aspect towards the 0..359 range: add 360 to negative values, then
    # subtract 360 from values above 359 (one turn each way, so values more
    # than a full turn out of range are still not fully normalised).
    #
    # `.loc[mask, col]` writes into the frame itself. The previous chained
    # form `frame["Aspect"][mask] += 360` assigned through a temporary Series,
    # which raises SettingWithCopyWarning and does nothing at all once pandas
    # Copy-on-Write is active.
    frame.loc[frame["Aspect"] < 0, "Aspect"] += 360
    frame.loc[frame["Aspect"] > 359, "Aspect"] -= 360

# The three hillshade readings; also reused by addFeature further down.
features_Hillshade = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']

for frame in (train, test):
    hydro_dx = frame["x_dist_hydrlgy"]
    hydro_dy = frame["y_dist_hydrlgy"]

    # Manhattan distance to Hydrology
    frame["mnhttn_dist_hydrlgy"] = hydro_dx.abs() + hydro_dy.abs()

    # Euclidean distance to Hydrology
    frame["ecldn_dist_hydrlgy"] = (hydro_dx**2 + hydro_dy**2)**0.5

    # Clamp every hillshade column into [0, 255]: values below 0 become 0,
    # values above 255 become 255.
    frame[features_Hillshade] = frame[features_Hillshade].clip(lower=0, upper=255)

# One-hot indicator groups, looked up by column-name prefix on the train frame.
soil_features = [name for name in train.columns if name.startswith("Soil_Type")]
wilderness_features = [name for name in train.columns if name.startswith("Wilderness_Area")]

def addFeature(X, soil_cols=None, wilderness_cols=None, hillshade_cols=None):
    """Add row-wise summary columns to ``X`` in place.

    New columns written to ``X``:

    * ``Soil_Count`` - row sum over the soil indicator columns
    * ``Wilderness_Area_Count`` - row sum over the wilderness indicator columns
    * ``Hillshade_mean`` - row mean of the hillshade columns
    * ``amp_Hillshade`` - row max minus row min of the hillshade columns

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to extend; it is modified in place.
    soil_cols, wilderness_cols, hillshade_cols : list of str, optional
        Column names for each group. When omitted they fall back to the
        notebook-level ``soil_features``, ``wilderness_features`` and
        ``features_Hillshade`` lists, so ``addFeature(X)`` keeps working.

    Returns
    -------
    None
    """
    if soil_cols is None:
        soil_cols = soil_features
    if wilderness_cols is None:
        wilderness_cols = wilderness_features
    if hillshade_cols is None:
        hillshade_cols = features_Hillshade

    # Vectorised row sums. The previous `apply(sum, axis=1)` called Python's
    # built-in sum once per row, which is very slow on millions of rows.
    # (Unlike built-in sum, DataFrame.sum skips NaN; the indicator columns
    # are expected to be complete.)
    # Thanks @mpwolke : https://www.kaggle.com/mpwolke/tooezy-where-are-you-no-camping-here
    X["Soil_Count"] = X[list(soil_cols)].sum(axis=1)

    # Thanks @yannbarthelemy : https://www.kaggle.com/yannbarthelemy/tps-december-first-simple-feature-engineering
    X["Wilderness_Area_Count"] = X[list(wilderness_cols)].sum(axis=1)

    hillshade = X[list(hillshade_cols)]
    X["Hillshade_mean"] = hillshade.mean(axis=1)
    X['amp_Hillshade'] = hillshade.max(axis=1) - hillshade.min(axis=1)
    
# Add the count / hillshade summary columns to both frames (in place).
addFeature(train)
addFeature(test)

# Columns passed through the scaler; the raw one-hot indicator columns are
# left as they are.
cols = [
    "Elevation",
    "Aspect",
    "mnhttn_dist_hydrlgy",
    "ecldn_dist_hydrlgy",
    "Slope",
    "x_dist_hydrlgy",
    "y_dist_hydrlgy",
    "x_dist_rdwys",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "x_dist_firepts",

    "Soil_Count","Wilderness_Area_Count","Hillshade_mean","amp_Hillshade"
]

# Fit the scaler on train only, then apply the same transform to test.
scaler = RobustScaler()
train[cols] = scaler.fit_transform(train[cols])
test[cols] = scaler.transform(test[cols])

# Model inputs are every column of the test frame (train additionally
# carries Cover_Type).
features = test.columns

# Relabel the classes so that the one to be discarded (original Cover_Type 5)
# becomes label 7 and the remaining labels are the contiguous range 1..6.
perm_dict = {1:2, 2:1, 3:3, 4:6, 7:4, 6:5, 5:7,}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection. That chained in-place form acts on a temporary Series; it
# emits a FutureWarning on pandas 2.x and leaves the frame unchanged once
# Copy-on-Write is enabled.
train['Cover_Type'] = train['Cover_Type'].replace(perm_dict)

# Drop the rows carrying the discarded label.
train = train[train.Cover_Type !=7]

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes.

    Integer columns are cast to the first of int8 / int16 / int32 / int64
    whose range contains the column's minimum and maximum. The bounds are
    inclusive, so a column whose maximum is exactly 127 stays int8. Float
    columns are cast to float32 when their range fits and to float64
    otherwise; the float32 cast can lose precision. Columns whose dtype is
    not in the ``numerics`` list are left untouched.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : bool, default True
        Print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                # Narrowest integer type that holds [c_min, c_max]. The
                # comparison is inclusive: the previous strict `>` / `<`
                # rejected values sitting exactly on a type limit and so
                # pushed such columns one size up. An empty column gives
                # NaN bounds, matches nothing and is left unchanged.
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    # Also reached when min/max are NaN (all-missing column).
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the ratio in case the starting footprint is reported as zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

# Downcast both frames. reduce_mem_usage modifies its argument in place and
# returns the same object, so the re-assignment only keeps the names bound.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 438.69 Mb (76.0% reduction)
Mem. usage decreased to 101.09 Mb (77.2% reduction)


In [13]:
# Pull the labels out as a 1-D array (the double-bracket selection used
# before produced an (n, 1) column vector). `copy=True` guarantees a writable
# array that does not share memory with the frame, so the later in-place
# label shift cannot touch `train` or fail on a read-only view.
target = train['Cover_Type'].to_numpy(copy=True)

# Keep only the feature columns in `train`.
train = train.drop(columns=['Cover_Type'])

In [14]:
# Shift the labels down by one so the class ids start at 0 (0..5).
# NOTE(review): in-place and not idempotent - running this cell a second
# time shifts the labels again.
target -= 1

In [15]:
# Sanity check: the distinct class ids present in the label array.
np.unique(target)

array([0, 1, 2, 3, 4, 5], dtype=int8)

In [16]:
# Wrap the test features in a DMatrix once; the fold loops below predict on it.
xgtest = xgb.DMatrix(test[features].values)

In [17]:
# Out-of-fold class probabilities (one row per training sample, six columns)
# and the running average of the per-fold test predictions.
y_oof = np.zeros((train.shape[0], 6))
preds_total = 0

In [18]:
%%time
# 10-fold stratified CV with the tuned parameters. Each fold trains for 1000
# rounds, writes its validation probabilities into `y_oof` and adds 1/NFOLDS
# of its test probabilities to `preds_total`.
NFOLDS = 10
start_time = time()

# (Re)initialise the accumulators in this cell so that running it again does
# not add a second set of fold predictions on top of an earlier run.
y_oof = np.zeros((train.shape[0], 6))
preds_total = 0

# Build the feature matrix once instead of twice per fold.
X_all = train[features].values

kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=137)
for i, (train_index, test_index) in enumerate(kf.split(train, target)):
    print(i)
    y_train, y_val = target[train_index], target[test_index]
    xgtrain = xgb.DMatrix(X_all[train_index], label=y_train)
    xgval = xgb.DMatrix(X_all[test_index], label=y_val)
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]

    clf = xgb.train(params, xgtrain, num_boost_round=1000, evals=watchlist,
                    verbose_eval=100)

    preds = clf.predict(xgtest)
    ypred = clf.predict(xgval)
    y_oof[test_index] = ypred
    preds_total += preds / NFOLDS
    print('Time elapsed:', time() - start_time)

    # Release the booster before the next fold is trained.
    del clf
    gc.collect()

    print('Fold Accuracy:', accuracy_score(y_val, ypred.argmax(axis=1)))

# The dense feature matrix is no longer needed once all folds are done.
del X_all
gc.collect()
        

0
[0]	train-merror:0.39528	eval-merror:0.39532
[100]	train-merror:0.04542	eval-merror:0.04745
[200]	train-merror:0.03908	eval-merror:0.04215
[300]	train-merror:0.03565	eval-merror:0.03959
[400]	train-merror:0.03372	eval-merror:0.03886
[500]	train-merror:0.03249	eval-merror:0.03852
[600]	train-merror:0.03146	eval-merror:0.03837
[700]	train-merror:0.03055	eval-merror:0.03829
[800]	train-merror:0.02974	eval-merror:0.03822
[900]	train-merror:0.02890	eval-merror:0.03816
[999]	train-merror:0.02811	eval-merror:0.03818
Time elapsed: 249.41647624969482
Fold Accuracy: 0.9618225
1
[0]	train-merror:0.39524	eval-merror:0.39557
[100]	train-merror:0.04551	eval-merror:0.04713
[200]	train-merror:0.03908	eval-merror:0.04172
[300]	train-merror:0.03566	eval-merror:0.03938
[400]	train-merror:0.03378	eval-merror:0.03823
[500]	train-merror:0.03246	eval-merror:0.03784
[600]	train-merror:0.03150	eval-merror:0.03756
[700]	train-merror:0.03062	eval-merror:0.03738
[800]	train-merror:0.02978	eval-merror:0.03731
[9

In [23]:
# Accuracy of the stitched out-of-fold predictions against the true labels.
# NOTE(review): execution count 23 - this cell was run after the shape-check
# and np.save cells that follow it (counts 19-22).
print('OOF Accuracy:', accuracy_score(target, y_oof.argmax(axis=1)))

OOF Accuracy: 0.9623077405769351


In [19]:
# Shape check: one row per training sample, one column per class.
y_oof.shape

(3999999, 6)

In [20]:
# Shape check: `ypred` still holds the validation predictions of the last fold.
ypred.shape

(399999, 6)

In [21]:
# Shape check: fold-averaged test probabilities, one row per test sample.
preds_total.shape

(1000000, 6)

In [22]:
# Persist the out-of-fold and fold-averaged test probabilities of run 1.
# The target directories are given relative to the working directory.
np.save('../metafeatures/train/y_oof_xgb_fe_1', y_oof)
np.save('../metafeatures/test/preds_total_xgb_fe_1', preds_total)

In [24]:
# Load the sample submission; its Cover_Type column is overwritten below with
# the model's predictions.
submission = pd.read_csv('../../TPS_2021/input/tabular-playground-series-dec-2021/sample_submission.csv')

In [26]:
# Reverse lookup of perm_dict: permuted label -> original Cover_Type code.
inv_perm = dict(zip(perm_dict.values(), perm_dict.keys()))
inv_perm

{2: 1, 1: 2, 3: 3, 6: 4, 4: 7, 5: 6, 7: 5}

In [27]:
# argmax yields 0-based class ids in the permuted label space; +1 turns them
# into the permuted labels 1..6.
submission['Cover_Type'] = preds_total.argmax(axis=1) + 1

# Map the permuted labels back to the original Cover_Type codes. The result
# is assigned back: replace(..., inplace=True) on a column selection acts on
# a temporary Series and leaves the frame unchanged under Copy-on-Write.
submission['Cover_Type'] = submission['Cover_Type'].replace(inv_perm)
submission.head(10)

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2
5,4000005,3
6,4000006,2
7,4000007,1
8,4000008,2
9,4000009,3


In [28]:
# Sanity check: the distinct Cover_Type codes present in the submission.
np.unique(submission['Cover_Type'].values)

array([1, 2, 3, 4, 6, 7])

In [30]:
# Write the run-1 submission file (without the DataFrame index).
submission.to_csv('../submissions/submission_xgb_best_optuna_fe_1.csv', index=False)

In [31]:
%%time
# Run 2: same folds and tuned settings, seed 224, 900 boosting rounds.
NFOLDS = 10
start_time = time()

y_oof = np.zeros((train.shape[0], 6))
preds_total = 0

# Per-run copy of the settings: the seed is set once (it was rewritten on
# every fold) and the shared `params` dict is no longer modified, so other
# cells are unaffected by which run happened to execute last.
run_params = {**params, 'random_state': 224}

# Build the feature matrix once instead of twice per fold.
X_all = train[features].values

kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=137)
for i, (train_index, test_index) in enumerate(kf.split(train, target)):
    print(i)
    y_train, y_val = target[train_index], target[test_index]
    xgtrain = xgb.DMatrix(X_all[train_index], label=y_train)
    xgval = xgb.DMatrix(X_all[test_index], label=y_val)
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]

    clf = xgb.train(run_params, xgtrain, num_boost_round=900, evals=watchlist,
                    verbose_eval=100)

    preds = clf.predict(xgtest)
    ypred = clf.predict(xgval)
    y_oof[test_index] = ypred
    preds_total += preds / NFOLDS
    print('Time elapsed:', time() - start_time)

    # Release the booster before the next fold is trained.
    del clf
    gc.collect()

    print('Fold Accuracy:', accuracy_score(y_val, ypred.argmax(axis=1)))

del X_all
gc.collect()

# Persist the out-of-fold and fold-averaged test probabilities of this run.
np.save('../metafeatures/train/y_oof_xgb_fe_2', y_oof)
np.save('../metafeatures/test/preds_total_xgb_fe_2', preds_total)

# argmax + 1 gives the permuted labels 1..6; map them back to the original
# Cover_Type codes. Assigned back rather than replace(..., inplace=True) on a
# column selection, which is a no-op under pandas Copy-on-Write.
submission['Cover_Type'] = preds_total.argmax(axis=1) + 1
submission['Cover_Type'] = submission['Cover_Type'].replace(inv_perm)

submission.to_csv('../submissions/submission_xgb_best_optuna_fe_2.csv', index=False)

0
[0]	train-merror:0.08476	eval-merror:0.08587
[100]	train-merror:0.04469	eval-merror:0.04673
[200]	train-merror:0.03890	eval-merror:0.04198
[300]	train-merror:0.03549	eval-merror:0.03961
[400]	train-merror:0.03368	eval-merror:0.03886
[500]	train-merror:0.03250	eval-merror:0.03861
[600]	train-merror:0.03156	eval-merror:0.03848
[700]	train-merror:0.03065	eval-merror:0.03839
[800]	train-merror:0.02976	eval-merror:0.03818
[899]	train-merror:0.02898	eval-merror:0.03817
Time elapsed: 224.8425693511963
Fold Accuracy: 0.9618325
1
[0]	train-merror:0.08463	eval-merror:0.08579
[100]	train-merror:0.04471	eval-merror:0.04643
[200]	train-merror:0.03894	eval-merror:0.04177
[300]	train-merror:0.03551	eval-merror:0.03931
[400]	train-merror:0.03372	eval-merror:0.03823
[500]	train-merror:0.03253	eval-merror:0.03778
[600]	train-merror:0.03155	eval-merror:0.03757
[700]	train-merror:0.03065	eval-merror:0.03745
[800]	train-merror:0.02982	eval-merror:0.03740
[899]	train-merror:0.02907	eval-merror:0.03738
Tim

In [32]:
# Out-of-fold accuracy of run 2.
print('OOF Accuracy:', accuracy_score(target, y_oof.argmax(axis=1)))

OOF Accuracy: 0.9622002405500601


In [33]:
%%time
# Run 3: same folds and tuned settings, seed 227, 1100 boosting rounds.
NFOLDS = 10
start_time = time()

y_oof = np.zeros((train.shape[0], 6))
preds_total = 0

# Per-run copy of the settings: the seed is set once (it was rewritten on
# every fold) and the shared `params` dict is no longer modified, so other
# cells are unaffected by which run happened to execute last.
run_params = {**params, 'random_state': 227}

# Build the feature matrix once instead of twice per fold.
X_all = train[features].values

kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=137)
for i, (train_index, test_index) in enumerate(kf.split(train, target)):
    print(i)
    y_train, y_val = target[train_index], target[test_index]
    xgtrain = xgb.DMatrix(X_all[train_index], label=y_train)
    xgval = xgb.DMatrix(X_all[test_index], label=y_val)
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]

    clf = xgb.train(run_params, xgtrain, num_boost_round=1100, evals=watchlist,
                    verbose_eval=300)

    preds = clf.predict(xgtest)
    ypred = clf.predict(xgval)
    y_oof[test_index] = ypred
    preds_total += preds / NFOLDS
    print('Time elapsed:', time() - start_time)

    # Release the booster before the next fold is trained.
    del clf
    gc.collect()

    print('Fold Accuracy:', accuracy_score(y_val, ypred.argmax(axis=1)))

del X_all
gc.collect()

# Persist the out-of-fold and fold-averaged test probabilities of this run.
np.save('../metafeatures/train/y_oof_xgb_fe_3', y_oof)
np.save('../metafeatures/test/preds_total_xgb_fe_3', preds_total)

# argmax + 1 gives the permuted labels 1..6; map them back to the original
# Cover_Type codes. Assigned back rather than replace(..., inplace=True) on a
# column selection, which is a no-op under pandas Copy-on-Write.
submission['Cover_Type'] = preds_total.argmax(axis=1) + 1
submission['Cover_Type'] = submission['Cover_Type'].replace(inv_perm)

submission.to_csv('../submissions/submission_xgb_best_optuna_fe_3.csv', index=False)

0
[0]	train-merror:0.05879	eval-merror:0.06003
[300]	train-merror:0.03550	eval-merror:0.03963
[600]	train-merror:0.03152	eval-merror:0.03845
[900]	train-merror:0.02896	eval-merror:0.03819
[1099]	train-merror:0.02735	eval-merror:0.03808
Time elapsed: 264.17012095451355
Fold Accuracy: 0.9619225
1
[0]	train-merror:0.05902	eval-merror:0.05998
[300]	train-merror:0.03554	eval-merror:0.03933
[600]	train-merror:0.03158	eval-merror:0.03754
[900]	train-merror:0.02903	eval-merror:0.03722
[1099]	train-merror:0.02744	eval-merror:0.03721
Time elapsed: 528.1693861484528
Fold Accuracy: 0.9627925
2
[0]	train-merror:0.05905	eval-merror:0.06015
[300]	train-merror:0.03544	eval-merror:0.03991
[600]	train-merror:0.03150	eval-merror:0.03830
[900]	train-merror:0.02893	eval-merror:0.03819
[1099]	train-merror:0.02735	eval-merror:0.03810
Time elapsed: 792.3533139228821
Fold Accuracy: 0.9619025
3
[0]	train-merror:0.05906	eval-merror:0.05995
[300]	train-merror:0.03556	eval-merror:0.03920
[600]	train-merror:0.03157

In [34]:
# Out-of-fold accuracy of run 3.
print('OOF Accuracy:', accuracy_score(target, y_oof.argmax(axis=1)))

OOF Accuracy: 0.9623142405785602


In [35]:
%%time
# Run 4: same folds and tuned settings, seed 233, 1200 boosting rounds.
NFOLDS = 10
start_time = time()

y_oof = np.zeros((train.shape[0], 6))
preds_total = 0

# Per-run copy of the settings: the seed is set once (it was rewritten on
# every fold) and the shared `params` dict is no longer modified, so other
# cells are unaffected by which run happened to execute last.
run_params = {**params, 'random_state': 233}

# Build the feature matrix once instead of twice per fold.
X_all = train[features].values

kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=137)
for i, (train_index, test_index) in enumerate(kf.split(train, target)):
    print(i)
    y_train, y_val = target[train_index], target[test_index]
    xgtrain = xgb.DMatrix(X_all[train_index], label=y_train)
    xgval = xgb.DMatrix(X_all[test_index], label=y_val)
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]

    clf = xgb.train(run_params, xgtrain, num_boost_round=1200, evals=watchlist,
                    verbose_eval=300)

    preds = clf.predict(xgtest)
    ypred = clf.predict(xgval)
    y_oof[test_index] = ypred
    preds_total += preds / NFOLDS
    print('Time elapsed:', time() - start_time)

    # Release the booster before the next fold is trained.
    del clf
    gc.collect()

    print('Fold Accuracy:', accuracy_score(y_val, ypred.argmax(axis=1)))

del X_all
gc.collect()

# Persist the out-of-fold and fold-averaged test probabilities of this run.
np.save('../metafeatures/train/y_oof_xgb_fe_4', y_oof)
np.save('../metafeatures/test/preds_total_xgb_fe_4', preds_total)

# argmax + 1 gives the permuted labels 1..6; map them back to the original
# Cover_Type codes. Assigned back rather than replace(..., inplace=True) on a
# column selection, which is a no-op under pandas Copy-on-Write.
submission['Cover_Type'] = preds_total.argmax(axis=1) + 1
submission['Cover_Type'] = submission['Cover_Type'].replace(inv_perm)

submission.to_csv('../submissions/submission_xgb_best_optuna_fe_4.csv', index=False)

0
[0]	train-merror:0.09101	eval-merror:0.09243
[300]	train-merror:0.03553	eval-merror:0.03971
[600]	train-merror:0.03140	eval-merror:0.03852
[900]	train-merror:0.02884	eval-merror:0.03825
[1199]	train-merror:0.02648	eval-merror:0.03810
Time elapsed: 287.77930641174316
Fold Accuracy: 0.961895
1
[0]	train-merror:0.09137	eval-merror:0.09194
[300]	train-merror:0.03557	eval-merror:0.03949
[600]	train-merror:0.03143	eval-merror:0.03761
[900]	train-merror:0.02895	eval-merror:0.03719
[1199]	train-merror:0.02652	eval-merror:0.03711
Time elapsed: 574.811526298523
Fold Accuracy: 0.9628875
2
[0]	train-merror:0.09079	eval-merror:0.09165
[300]	train-merror:0.03553	eval-merror:0.04002
[600]	train-merror:0.03134	eval-merror:0.03832
[900]	train-merror:0.02874	eval-merror:0.03820
[1199]	train-merror:0.02634	eval-merror:0.03817
Time elapsed: 863.5085253715515
Fold Accuracy: 0.9618275
3
[0]	train-merror:0.09092	eval-merror:0.09226
[300]	train-merror:0.03560	eval-merror:0.03928
[600]	train-merror:0.03145	e

In [36]:
# Out-of-fold accuracy of run 4.
print('OOF Accuracy:', accuracy_score(target, y_oof.argmax(axis=1)))

OOF Accuracy: 0.9623114905778727


In [37]:
%%time
# Run 5: same folds and tuned settings, seed 237, 1300 boosting rounds.
NFOLDS = 10
start_time = time()

y_oof = np.zeros((train.shape[0], 6))
preds_total = 0

# Per-run copy of the settings: the seed is set once (it was rewritten on
# every fold) and the shared `params` dict is no longer modified, so other
# cells are unaffected by which run happened to execute last.
run_params = {**params, 'random_state': 237}

# Build the feature matrix once instead of twice per fold.
X_all = train[features].values

kf = StratifiedKFold(NFOLDS, shuffle=True, random_state=137)
for i, (train_index, test_index) in enumerate(kf.split(train, target)):
    print(i)
    y_train, y_val = target[train_index], target[test_index]
    xgtrain = xgb.DMatrix(X_all[train_index], label=y_train)
    xgval = xgb.DMatrix(X_all[test_index], label=y_val)
    watchlist = [(xgtrain, 'train'), (xgval, 'eval')]

    clf = xgb.train(run_params, xgtrain, num_boost_round=1300, evals=watchlist,
                    verbose_eval=300)

    preds = clf.predict(xgtest)
    ypred = clf.predict(xgval)
    y_oof[test_index] = ypred
    preds_total += preds / NFOLDS
    print('Time elapsed:', time() - start_time)

    # Release the booster before the next fold is trained.
    del clf
    gc.collect()

    print('Fold Accuracy:', accuracy_score(y_val, ypred.argmax(axis=1)))

del X_all
gc.collect()

# Persist the out-of-fold and fold-averaged test probabilities of this run.
np.save('../metafeatures/train/y_oof_xgb_fe_5', y_oof)
np.save('../metafeatures/test/preds_total_xgb_fe_5', preds_total)

# argmax + 1 gives the permuted labels 1..6; map them back to the original
# Cover_Type codes. Assigned back rather than replace(..., inplace=True) on a
# column selection, which is a no-op under pandas Copy-on-Write.
submission['Cover_Type'] = preds_total.argmax(axis=1) + 1
submission['Cover_Type'] = submission['Cover_Type'].replace(inv_perm)

submission.to_csv('../submissions/submission_xgb_best_optuna_fe_5.csv', index=False)

0
[0]	train-merror:0.08192	eval-merror:0.08249
[300]	train-merror:0.03559	eval-merror:0.03965
[600]	train-merror:0.03154	eval-merror:0.03850
[900]	train-merror:0.02894	eval-merror:0.03816
[1200]	train-merror:0.02652	eval-merror:0.03809
[1299]	train-merror:0.02574	eval-merror:0.03808
Time elapsed: 307.6677746772766
Fold Accuracy: 0.9619225
1
[0]	train-merror:0.08170	eval-merror:0.08251
[300]	train-merror:0.03559	eval-merror:0.03946
[600]	train-merror:0.03159	eval-merror:0.03765
[900]	train-merror:0.02902	eval-merror:0.03728
[1200]	train-merror:0.02658	eval-merror:0.03723
[1299]	train-merror:0.02584	eval-merror:0.03727
Time elapsed: 615.7596542835236
Fold Accuracy: 0.96273
2
[0]	train-merror:0.08166	eval-merror:0.08239
[300]	train-merror:0.03553	eval-merror:0.04000
[600]	train-merror:0.03147	eval-merror:0.03832
[900]	train-merror:0.02888	eval-merror:0.03814
[1200]	train-merror:0.02648	eval-merror:0.03804
[1299]	train-merror:0.02573	eval-merror:0.03807
Time elapsed: 923.8387544155121
Fold

In [38]:
# Out-of-fold accuracy of run 5.
print('OOF Accuracy:', accuracy_score(target, y_oof.argmax(axis=1)))

OOF Accuracy: 0.9623184905796226
