In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 100)
warnings.filterwarnings('ignore')

import xgboost as xgb

In [2]:
train_label = pd.read_hdf('C:/Users/f3107/Desktop/hy_data/train_204.h5')
test_label = pd.read_hdf('C:/Users/f3107/Desktop/hy_data/test_204.h5')

In [3]:
#LabelEncoder
type_map = dict(zip(train_label['type'].unique(), np.arange(3)))
type_map_rev = {v:k for k,v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)

In [4]:
features = [x for x in train_label.columns if x not in ['ship','x','y','v','d','datetime','type','t','d_d','d_t',
                                                        'd_x','v_x','d_y','v_y','hour','date','diff_time']]
target = 'type'

In [5]:
params = {
    
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 3,
    'max_depth': 40,  
    'early_stopping_rounds': 100,    
    'eta': 0.42563552598003573,
     'n_estimators': 1999
}

In [6]:
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = train_label[features].copy()
y = train_label[target]
models = []
pred = np.zeros(len(test_label))
oof = np.zeros(len(X))
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    xgb_train = xgb.DMatrix(X_train, y_train)
    xgb_eval = xgb.DMatrix(X_val, y_val)
    
    
    model = xgb.train(params, xgb_train, evals=[(xgb_train, 'train'), (xgb_eval, 'val')], verbose_eval=100)
    models.append(model)
    
    val_pred = model.predict(xgb.DMatrix(X_val))

    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]

    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))
    # 0.8695539641133697
    # 0.8866211724839532


[0]	train-merror:0.017149	val-merror:0.127675
[9]	train-merror:0.000179	val-merror:0.090585
0 val f1 0.8854876839946949
[0]	train-merror:0.016786	val-merror:0.125714
[9]	train-merror:0.000179	val-merror:0.087857
1 val f1 0.8845132924914049
[0]	train-merror:0.014107	val-merror:0.113571
[9]	train-merror:0.000357	val-merror:0.088571
2 val f1 0.8888072259272142
[0]	train-merror:0.013748	val-merror:0.120086
[9]	train-merror:0.000179	val-merror:0.08935
3 val f1 0.8900435557220497
[0]	train-merror:0.019818	val-merror:0.127949
[9]	train-merror:0	val-merror:0.090779
4 val f1 0.8860713347125856


In [8]:
#全部得分
print('oof f1', metrics.f1_score(oof, y, average='macro'))

oof f1 0.8869472478453891


#### 7000训练集的模型 + 预测

In [12]:
xgb_train = xgb.DMatrix(X, y)

model = xgb.train(params, xgb_train, evals=[(xgb_train, 'train')], verbose_eval=100)

test_pred = model.predict(xgb.DMatrix(test_label[features]))

[0]	train-merror:0.014571
[9]	train-merror:0.000143


In [13]:
sub = test_label[['ship']]
sub['pred'] = test_pred

print(sub['pred'].value_counts(1))
sub['pred'] = sub['pred'].map(type_map_rev)
#sub.to_csv('result.csv', index=None, header=None)

0.0    0.6500
1.0    0.2255
2.0    0.1245
Name: pred, dtype: float64


In [17]:
sub.head()

Unnamed: 0,ship,pred
0,7000,围网
1,7001,拖网
2,7002,围网
3,7003,拖网
4,7004,围网


In [15]:
sub.to_csv('C:/Users/f3107/Desktop/hy_data/2020年2月6日/result_0206_xgb.csv', index=None, header=None)