In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/LightGBM warnings that may point at real problems.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the train/test HDF5 tables. It defaults to the location the
# notebook was originally written against; set the HY_DATA_DIR environment
# variable to run the notebook on another machine without editing the code.
DATA_DIR = os.environ.get('HY_DATA_DIR', 'C:/Users/f3107/Desktop/hy_data')

# Training table (contains the `type` label column) and the table to predict on.
train_label = pd.read_hdf(f'{DATA_DIR}/train_203.h5')
test_label = pd.read_hdf(f'{DATA_DIR}/test_203.h5')

In [3]:
# Relative frequency of each target class in the training table.
train_label['type'].value_counts(normalize=True)

拖网    0.623000
围网    0.231571
刺网    0.145429
Name: type, dtype: float64

In [4]:
# Label-encode the target: each distinct `type` value is paired with an integer
# code 0..2, in the order returned by `unique()`. `type_map_rev` is the inverse
# mapping (code -> original label) used to decode predictions later.
# NOTE(review): not idempotent - re-running this cell after the column has been
# encoded rebuilds both maps from the integer codes instead of the label names.
type_map = {label: code for label, code in zip(train_label['type'].unique(), np.arange(3))}
type_map_rev = dict(zip(type_map.values(), type_map.keys()))
train_label['type'] = train_label['type'].map(type_map)

In [5]:
# Name of the label column.
target = 'type'

# Model inputs: every column of train_label that is not in the exclusion tuple
# below (presumably the raw/intermediate per-point columns plus the id and the
# label - TODO confirm against the feature-engineering notebook).
features = [
    col for col in train_label.columns
    if col not in ('ship', 'x', 'y', 'v', 'd', 'datetime', 'type', 't', 'd_d', 'd_t',
                   'd_x', 'v_x', 'd_y', 'v_y', 'hour', 'date', 'diff_time')
]

In [6]:
# Report how many features were selected, followed by their comma-joined names.
print(f"{len(features)} {','.join(features)}")

56 x_max,x_min,x_mean,x_std,x_skew,x_sum,y_max,y_min,y_mean,y_std,y_skew,y_sum,v_max,v_min,v_mean,v_std,v_skew,v_sum,d_max,d_min,d_mean,d_std,d_skew,d_sum,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,slope_1,slope_2,area,mode_hour,v_cut_0.5_count_0.5,v_cut_0.5_count_1.0,v_cut_0.5_count_1.5,v_cut_0.5_count_2.0,v_cut_0.5_count_2.5,v_cut_0.5_count_3.0,v_cut_0.5_count_3.5,v_cut_0.5_count_4.0,v_cut_0.5_count_4.5,v_cut_0.5_count_5.0,v_cut_0.5_count_5.5,v_cut_0.5_count_6.0,v_cut_0.5_count_6.5,v_cut_0.5_count_7.0,v_cut_0.5_count_7.5,v_cut_0.5_count_8.0,v_cut_0.5_count_8.5,v_cut_0.5_count_9.0,v_cut_0.5_count_9.5,v_cut_0.5_count_10.0,v_cut_0.5_count_20.0,v_cut_tuo_count_0.7,v_cut_tuo_count_6.5,v_cut_tuo_count_20.0


In [7]:
# LightGBM settings: 3-class gradient-boosted trees with up to 5000 boosting
# rounds, stopping early once the validation score has not improved for 100 rounds.
params = dict(
    n_estimators=5000,
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,
    early_stopping_rounds=100,
)

In [8]:
# Stratified, shuffled 5-fold cross-validation with a fixed seed so the splits
# are the same on every run.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = train_label[features].copy()
y = train_label[target]
models = []  # one trained booster per fold, reused later for feature importance

# Probability accumulators. Their width follows params['num_class'] instead of a
# second hard-coded 3, so the class count is defined in exactly one place.
pred = np.zeros((len(test_label), params['num_class']))  # fold-averaged test probabilities
oof = np.zeros((len(X), params['num_class']))            # out-of-fold training probabilities
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    # Both sets are monitored; the log labels them "training" and "valid_1".
    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=100)
    models.append(model)

    # Store the held-out probabilities, then score this fold on hard labels.
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))
    # NOTE(review): the two values below look like scores noted from earlier runs.
    # 0.8695539641133697
    # 0.8866211724839532

    # Average the test probabilities over the folds. Dividing by the splitter's
    # own fold count keeps the average correct if n_splits is changed above.
    test_pred = model.predict(test_label[features])
    pred += test_pred / fold.get_n_splits()

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0557745	valid_1's multi_logloss: 0.222861
[200]	training's multi_logloss: 0.0109572	valid_1's multi_logloss: 0.215319
Early stopping, best iteration is:
[174]	training's multi_logloss: 0.01637	valid_1's multi_logloss: 0.213861
0 val f1 0.8931978790799494
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0544786	valid_1's multi_logloss: 0.23562
[200]	training's multi_logloss: 0.0106743	valid_1's multi_logloss: 0.233171
Early stopping, best iteration is:
[150]	training's multi_logloss: 0.0237817	valid_1's multi_logloss: 0.229729
1 val f1 0.88477610659949
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0558005	valid_1's multi_logloss: 0.244109
[200]	training's multi_logloss: 0.0112834	valid_1's multi_logloss: 0.234056
Early stopping, best iteration is:
[166]	training's multi_logloss: 0.0191639	valid_1'

In [9]:
# Collapse the out-of-fold class probabilities to predicted labels. The ndim
# guard makes the cell safe to re-run: after the first run `oof` already holds
# 1-D labels, and a second argmax over axis=1 would raise.
if oof.ndim == 2:
    oof = np.argmax(oof, axis=1)

# Arguments follow scikit-learn's f1_score(y_true, y_pred) order, as in the
# per-fold score printed during training.
print('oof f1', metrics.f1_score(y, oof, average='macro'))

# Experiment log: OOF macro-F1 per feature-set variant. The line marked `*`
# carries the same value as the score printed by this cell.
# 0.8746314842676215 - corr
# 0.8854320175756322 - v_count_1
# 0.8865305319543356 - v_count_0.5
# 0.8810270166098931 - v_count_2
# 0.8786693038363592 - v_count_tuo
# 0.8847267258256025 - v_count_tuo, v_count_wei
# 0.8883157927542881 - v_count_0.5, v_count_tuo [0, 0.7, 6.5, 20]             *
# 0.8865905490568441 - v_count_0.5, v_count_tuo [0.5, 2, 3, 6, 20]
# 0.8875954742743063 - v_count_0.5, v_count_tuo(without_20)
# 0.8833058963771583 - v_count_0.5, v_count_tuo, v_count_wei
# 0.8635629318155464 - vd_corr
# 0.8871980889018065 - v_count,corr

oof f1 0.8883157927542881


In [10]:
# Collapse the fold-averaged class probabilities to predicted class codes. The
# ndim guard makes the cell safe to re-run: once `pred` is 1-D, a second argmax
# over axis=1 would raise.
if pred.ndim == 2:
    pred = np.argmax(pred, axis=1)

# Take an explicit copy of the id column so that adding `pred` writes to an
# independent frame rather than to a selection of test_label (which triggers
# pandas' SettingWithCopyWarning, currently hidden by the global warning filter).
sub = test_label[['ship']].copy()
sub['pred'] = pred

# Share of each predicted class code, for comparison with the training distribution.
print(sub['pred'].value_counts(normalize=True))

# Decode the integer codes back to the original label names.
sub['pred'] = sub['pred'].map(type_map_rev)
# The submission file itself is written by the final sub.to_csv(...) cell.

0    0.6285
1    0.2415
2    0.1300
Name: pred, dtype: float64


In [11]:
# Collect one (name, score, fold) importance table per trained fold model and
# stack them into a single long frame.
ret = []
for index, model in enumerate(models):
    ret.append(pd.DataFrame({
        'name': model.feature_name(),
        'score': model.feature_importance(),
        'fold': index,
    }))

df = pd.concat(ret)

In [12]:
# Average each feature's importance score over the folds and order the result
# from the highest mean score to the lowest.
df = (
    df.groupby('name', as_index=False)['score']
    .mean()
    .sort_values(['score'], ascending=False)
)

In [13]:
# Display the fold-averaged feature importances, highest score first.
df

Unnamed: 0,name,score
48,y_max,691.4
9,slope_2,617.2
44,x_min,589.4
49,y_max_x_min,548.4
42,x_max_y_min,447.6
11,v_cut_0.5_count_1.0,428.8
52,y_min,412.2
19,v_cut_0.5_count_4.0,401.2
40,x_max,398.4
43,x_mean,366.2


In [15]:
# Write the submission (ship id, predicted label) as a CSV with no header row and
# no index column. The output directory defaults to the original location and can
# be overridden with the HY_DATA_DIR environment variable.
out_dir = os.environ.get('HY_DATA_DIR', 'C:/Users/f3107/Desktop/hy_data')
# to_csv documents `index` and `header` as booleans; False states the intent
# explicitly instead of relying on None being falsy.
sub.to_csv(f'{out_dir}/result_0204.csv', index=False, header=False)