In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt

# Let pandas display up to 100 columns before truncating wide frames.
pd.set_option('display.max_columns', 100)
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/copy warnings raised by the libraries
# used below — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the prepared HDF5 tables. Set the HY_DATA_DIR environment
# variable to point somewhere else; the default is the path this notebook was
# originally run with, so existing setups keep working unchanged.
DATA_DIR = os.environ.get('HY_DATA_DIR', 'C:/Users/f3107/Desktop/hy_data')

# train_label carries the `type` target column used below; test_label is the
# table the final predictions are made for.
train_label = pd.read_hdf(os.path.join(DATA_DIR, 'train_202.h5'))
test_label = pd.read_hdf(os.path.join(DATA_DIR, 'test_202.h5'))

In [3]:
# Class balance: fraction of training rows carrying each `type` label.
train_label['type'].value_counts(normalize=True)

拖网    0.623000
围网    0.231571
刺网    0.145429
Name: type, dtype: float64

In [4]:
# Label-encode the target: each distinct `type` value gets an integer id in order
# of first appearance, and the reverse map is kept for decoding predictions later.
#
# The guard keeps this cell safe to re-run. Once `type` is numeric, encoding it
# again would rebuild both maps from the integer ids and lose the original names.
if not pd.api.types.is_numeric_dtype(train_label['type']):
    # enumerate() covers however many classes are present; zipping against a fixed
    # np.arange(3) would silently drop any extra class and leave its rows as NaN.
    type_map = {label: code for code, label in enumerate(train_label['type'].unique())}
    type_map_rev = {code: label for label, code in type_map.items()}
    train_label['type'] = train_label['type'].map(type_map)

In [5]:
# Name of the column the model learns to predict.
target = 'type'

# Columns kept out of the model inputs: the ship identifier, the target itself,
# and the listed non-aggregated columns (presumably raw per-point track fields
# and their helpers — TODO confirm against the feature-building notebook).
excluded_columns = {
    'ship', 'x', 'y', 'v', 'd', 'datetime', 'type', 't', 'd_d', 'd_t',
    'd_x', 'v_x', 'd_y', 'v_y', 'hour', 'date', 'diff_time',
}

# Everything else is a model feature, in the frame's own column order.
features = [col for col in train_label.columns if col not in excluded_columns]

In [6]:
# Sanity check: how many features were selected, followed by their names.
print(f"{len(features)} {','.join(features)}")

56 x_max,x_min,x_mean,x_std,x_skew,x_sum,y_max,y_min,y_mean,y_std,y_skew,y_sum,v_max,v_min,v_mean,v_std,v_skew,v_sum,d_max,d_min,d_mean,d_std,d_skew,d_sum,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,slope_1,slope_2,area,mode_hour,v_cut_1_count_0.5,v_cut_1_count_1.0,v_cut_1_count_2.0,v_cut_1_count_3.0,v_cut_1_count_4.0,v_cut_1_count_5.0,v_cut_1_count_6.0,v_cut_1_count_7.0,v_cut_1_count_8.0,v_cut_1_count_9.0,v_cut_1_count_10.0,v_cut_1_count_20.0,v_cut_1_corr_0.5,v_cut_1_corr_1.0,v_cut_1_corr_2.0,v_cut_1_corr_3.0,v_cut_1_corr_4.0,v_cut_1_corr_5.0,v_cut_1_corr_6.0,v_cut_1_corr_7.0,v_cut_1_corr_8.0,v_cut_1_corr_9.0,v_cut_1_corr_10.0,v_cut_1_corr_20.0


In [7]:
# LightGBM training configuration shared by every cross-validation fold.
params = dict(
    n_estimators=5000,          # upper bound on boosting rounds; early stopping ends sooner
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,                # one output column per encoded `type` id
    early_stopping_rounds=100,  # stop after 100 rounds without validation improvement
)

In [8]:
# Stratified K-fold training. Each fold trains one LightGBM model, stores its
# out-of-fold class probabilities in `oof`, and adds an equal share of its
# test-set probabilities to `pred`, so `pred` ends up as the fold average.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Derive the averaging divisor and the probability width from their single
# sources of truth instead of repeating the literals 5 and 3.
n_splits = fold.get_n_splits()
n_classes = params['num_class']

X = train_label[features].copy()
y = train_label[target]
# The test feature slice is the same for every fold; build it once.
X_test = test_label[features]

models = []
pred = np.zeros((len(test_label), n_classes))
oof = np.zeros((len(X), n_classes))
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    # NOTE(review): `verbose_eval` was removed from lgb.train in LightGBM 4.0; on
    # newer versions pass `callbacks=[lgb.log_evaluation(100)]` instead.
    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=100)
    models.append(model)

    # Keep the probabilities for the overall OOF score, then score this fold on
    # the arg-max class.
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))
    # Scores noted from earlier runs:
    # 0.8695539641133697
    # 0.8866211724839532

    test_pred = model.predict(X_test)
    pred += test_pred / n_splits

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0575216	valid_1's multi_logloss: 0.230543
[200]	training's multi_logloss: 0.0114163	valid_1's multi_logloss: 0.225152
Early stopping, best iteration is:
[131]	training's multi_logloss: 0.034484	valid_1's multi_logloss: 0.223678
0 val f1 0.8863228843547901
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.056404	valid_1's multi_logloss: 0.245081
[200]	training's multi_logloss: 0.0114626	valid_1's multi_logloss: 0.249975
Early stopping, best iteration is:
[121]	training's multi_logloss: 0.0399258	valid_1's multi_logloss: 0.242012
1 val f1 0.877197640912732
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0574493	valid_1's multi_logloss: 0.238162
[200]	training's multi_logloss: 0.011713	valid_1's multi_logloss: 0.235855
Early stopping, best iteration is:
[166]	training's multi_logloss: 0.0198217	valid_1

In [9]:
# Reduce the out-of-fold class probabilities to hard class ids. The ndim guard
# keeps the cell re-runnable: a second argmax over axis=1 on the already 1-D
# array would raise instead of reproducing the score.
if oof.ndim == 2:
    oof = np.argmax(oof, axis=1)
# f1_score expects (y_true, y_pred) in that order.
print('oof f1', metrics.f1_score(y, oof, average='macro'))

# 0.8780909477829691 - data 201

oof f1 0.884484448514497


In [10]:
# Turn the fold-averaged class probabilities into hard class ids. The ndim guard
# keeps the cell re-runnable once `pred` has already been reduced to 1-D.
if pred.ndim == 2:
    pred = np.argmax(pred, axis=1)
# Take an explicit copy so adding a column never writes into a slice of test_label.
sub = test_label[['ship']].copy()
sub['pred'] = pred

# Predicted class shares, for comparison with the training class balance.
print(sub['pred'].value_counts(normalize=True))
# Decode the integer ids back to the original `type` labels.
sub['pred'] = sub['pred'].map(type_map_rev)
#sub.to_csv('result.csv', index=None, header=None)

0    0.6340
1    0.2385
2    0.1275
Name: pred, dtype: float64


In [11]:
# Build one (name, score, fold) table per trained booster, then stack them into
# a single long frame.
ret = [
    pd.DataFrame({
        'name': booster.feature_name(),
        'score': booster.feature_importance(),
        'fold': fold_no,
    })
    for fold_no, booster in enumerate(models)
]

df = pd.concat(ret)

In [12]:
# Average each feature's importance score over the folds, then rank the features
# from highest to lowest mean score.
df = (
    df.groupby('name', as_index=False)['score']
      .mean()
      .sort_values(by='score', ascending=False)
)

In [13]:
# Display the feature-importance table held in `df`.
df

Unnamed: 0,name,score
48,y_max,666.0
9,slope_2,580.6
44,x_min,534.6
49,y_max_x_min,517.2
52,y_min,426.4
42,x_max_y_min,422.4
28,v_cut_1_count_4.0,414.0
40,x_max,383.0
23,v_cut_1_count_1.0,363.4
51,y_mean,335.2


In [14]:
#sub.to_csv('C:/Users/f3107/Desktop/hy_data/result_del_ship.csv', index=None, header=None)