In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt

# Let pandas render up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library diagnostics (deprecations,
# pandas chained-assignment warnings, ...) are hidden as well.
warnings.filterwarnings('ignore')

import xgboost as xgb

In [2]:
# Directory holding the HDF5 feature tables. It can be overridden through the
# HY_DATA_DIR environment variable so the notebook also runs on other machines;
# the default is the original author's local directory.
DATA_DIR = os.environ.get('HY_DATA_DIR', 'C:/Users/f3107/Desktop/hy_data')

# Training table (contains the `type` label column) and the table to predict on.
train_label = pd.read_hdf(os.path.join(DATA_DIR, 'train_204.h5'))
test_label = pd.read_hdf(os.path.join(DATA_DIR, 'test_204.h5'))

In [3]:
# Label-encode the target column.
# Every distinct value of `type` gets an integer code in order of first
# appearance. The number of codes is taken from the data instead of a
# hard-coded 3, so an unexpected extra label is encoded rather than silently
# dropped from the mapping (which would turn it into NaN in `.map`).
#
# NOTE: run this cell once per freshly loaded `train_label`. Re-running it after
# the column has been encoded rebuilds both maps from the integer codes and the
# original label names are lost.
type_names = train_label['type'].unique()
type_map = {name: code for code, name in enumerate(type_names)}
# Inverse lookup (code -> original label), used to decode predictions.
type_map_rev = {code: name for name, code in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)

In [4]:
# Name of the label column to be predicted.
target = 'type'

# Columns of `train_label` that are not used as model inputs (this list also
# contains the label column itself).
_non_feature_columns = (
    'ship', 'x', 'y', 'v', 'd', 'datetime', 'type', 't', 'd_d', 'd_t',
    'd_x', 'v_x', 'd_y', 'v_y', 'hour', 'date', 'diff_time',
)

# Everything else, in the table's column order, is a model feature.
features = []
for column in train_label.columns:
    if column in _non_feature_columns:
        continue
    features.append(column)

In [5]:
# Settings handed to xgboost for the 3-class tree model.
params = dict(
    booster='gbtree',            # tree-based booster
    objective='multi:softmax',   # multiclass objective; predict() yields class ids
    num_class=3,                 # number of target classes
    max_depth=24,                # maximum depth of each tree
    # NOTE(review): xgb.train() takes `early_stopping_rounds` as a keyword
    # argument. As a key of the booster params it has no early-stopping effect
    # unless the training code extracts it and forwards it explicitly.
    early_stopping_rounds=100,
)

In [19]:
# 5-fold stratified cross-validation.
#
# Produces:
#   models - one trained Booster per fold
#   oof    - 1-D array with the out-of-fold predicted class id of every training row
#   pred   - (n_test, n_classes) array of class probabilities averaged over the folds
#
# Class probabilities (not hard class ids) are accumulated for the test set:
# averaging class ids across folds is meaningless (the mean of class 0 and
# class 2 would be "class 1"), and a 2-D array is what a later
# `argmax(axis=1)` needs.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_folds = fold.get_n_splits()

X = train_label[features].copy()
y = train_label[target]

# `early_stopping_rounds` is an argument of xgb.train(), not a booster
# parameter, so it is taken out of the booster params and forwarded explicitly
# (None, i.e. disabled, if the key is absent).
# NOTE(review): num_boost_round is left at xgb.train's default of 10, so a
# patience larger than that can never trigger; raise num_boost_round for early
# stopping to have any effect.
early_stopping_rounds = params.get('early_stopping_rounds')
train_params = {k: v for k, v in params.items() if k != 'early_stopping_rounds'}
# 'multi:softprob' optimises the same loss as 'multi:softmax' but predict()
# returns per-class probabilities; their argmax is the class id softmax returns.
train_params['objective'] = 'multi:softprob'
n_classes = train_params['num_class']

# The test matrix does not depend on the fold, so build it once.
xgb_test = xgb.DMatrix(test_label[features])

models = []
pred = np.zeros((len(test_label), n_classes))
oof = np.zeros(len(X))
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    xgb_train = xgb.DMatrix(X_train, y_train)
    xgb_eval = xgb.DMatrix(X_val, y_val)

    model = xgb.train(
        train_params,
        xgb_train,
        evals=[(xgb_train, 'train'), (xgb_eval, 'val')],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=100,
    )
    models.append(model)

    # Out-of-fold prediction: probabilities -> most likely class id.
    val_prob = model.predict(xgb_eval).reshape(-1, n_classes)
    val_pred = val_prob.argmax(axis=1)
    oof[val_idx] = val_pred

    print(index, 'val f1', metrics.f1_score(y_val, val_pred, average='macro'))
    # Scores left in the notebook by the author (presumably from earlier runs):
    # 0.8695539641133697
    # 0.8866211724839532

    # Each fold contributes an equal share to the averaged test probabilities.
    test_prob = model.predict(xgb_test).reshape(-1, n_classes)
    pred += test_prob / n_folds


[0]	train-merror:0.017149	val-merror:0.127675
[9]	train-merror:0.000179	val-merror:0.091298
0 val f1 0.8841290509765215
[0]	train-merror:0.016786	val-merror:0.125714
[9]	train-merror:0.000357	val-merror:0.099286
1 val f1 0.8715066882527039
[0]	train-merror:0.014107	val-merror:0.113571
[9]	train-merror:0.000536	val-merror:0.089286
2 val f1 0.8877463716869617
[0]	train-merror:0.013748	val-merror:0.120086
[9]	train-merror:0.000536	val-merror:0.097927
3 val f1 0.8783086322216415
[0]	train-merror:0.019818	val-merror:0.127949
[9]	train-merror:0.000536	val-merror:0.096497
4 val f1 0.8769181572037802


In [16]:
# Macro-F1 over all out-of-fold predictions.
# `oof` holds one predicted class id per training row, so no argmax is needed
# here. f1_score expects (y_true, y_pred); the arguments are passed in that
# order to match the per-fold call in the training loop. Macro-F1 is symmetric
# in the two arguments, so the reported value is unchanged.
print('oof f1', metrics.f1_score(y, oof, average='macro'))

oof f1 0.8797209719560467


In [None]:
# Turn the fold-averaged test predictions into one class id per test row.
# The result goes into a new name so `pred` stays intact and the cell can be
# re-run safely.
pred_values = np.asarray(pred)
if pred_values.ndim == 2:
    # (n_rows, n_classes) scores/probabilities -> most likely class.
    pred_label = pred_values.argmax(axis=1)
else:
    # 1-D input already holds class ids. If they were averaged across folds they
    # may be fractional, so round to the nearest id.
    # NOTE(review): rounding averaged ids is a lossy fallback; accumulating
    # class probabilities is the sound approach.
    pred_label = np.rint(pred_values).astype(int)

# Copy the slice so that adding a column does not write into a view of
# `test_label`.
sub = test_label[['ship']].copy()
sub['pred'] = pred_label

# Share of each predicted class id (relative frequencies).
print(sub['pred'].value_counts(normalize=True))

# Decode class ids back to the original label values.
sub['pred'] = sub['pred'].map(type_map_rev)
#sub.to_csv('result.csv', index=None, header=None)

In [None]:
# Collect per-fold feature importances into one long table
# (columns: name, score, fold).
#
# xgboost's Booster has no `feature_name()` / `feature_importance()` methods
# (those belong to LightGBM). The XGBoost equivalents are the `feature_names`
# attribute and `get_score()`. importance_type='weight' counts how many times a
# feature is used to split.
ret = []
for index, model in enumerate(models):
    scores = model.get_score(importance_type='weight')
    # get_score() omits features that were never used in a split; list every
    # known feature and give the unused ones a score of 0.
    names = list(model.feature_names) if model.feature_names else list(scores)
    fold_df = pd.DataFrame({
        'name': names,
        'score': [scores.get(name, 0) for name in names],
    })
    fold_df['fold'] = index
    ret.append(fold_df)

df = pd.concat(ret)

In [None]:
# Average each feature's score over the folds, then order the table from the
# highest mean score to the lowest.
df = (
    df.groupby('name', as_index=False)['score']
      .mean()
      .sort_values(['score'], ascending=False)
)

In [None]:
# Display `df` (after the preceding cell: mean score per feature name, sorted
# in descending order).
df