In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 100)
warnings.filterwarnings('ignore')

In [2]:
train_label = pd.read_hdf('C:/Users/f3107/Desktop/hy_data/train_204.h5')
test_label = pd.read_hdf('C:/Users/f3107/Desktop/hy_data/test_204.h5')

In [3]:
train_label['type'].value_counts(1)

拖网    0.623000
围网    0.231571
刺网    0.145429
Name: type, dtype: float64

In [4]:
#LabelEncoder
type_map = dict(zip(train_label['type'].unique(), np.arange(3)))
type_map_rev = {v:k for k,v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)

In [5]:
features = [x for x in train_label.columns if x not in ['ship','x','y','v','d','datetime','type','t','d_d','d_t',
                                                        'd_x','v_x','d_y','v_y','hour','date','diff_time']]
target = 'type'

In [6]:
print(len(features), ','.join(features))

59 x_max,x_min,x_mean,x_std,x_skew,x_sum,x_median,y_max,y_min,y_mean,y_std,y_skew,y_sum,y_median,v_max,v_min,v_mean,v_std,v_skew,v_sum,v_median,d_max,d_min,d_mean,d_std,d_skew,d_sum,d_median,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,slope_1,slope_2,area,mode_hour,v_cut_0.5_count_0.5,v_cut_0.5_count_1.0,v_cut_0.5_count_1.5,v_cut_0.5_count_2.0,v_cut_0.5_count_2.5,v_cut_0.5_count_3.0,v_cut_0.5_count_3.5,v_cut_0.5_count_4.0,v_cut_0.5_count_4.5,v_cut_0.5_count_5.0,v_cut_0.5_count_5.5,v_cut_0.5_count_6.0,v_cut_0.5_count_6.5,v_cut_0.5_count_7.0,v_cut_0.5_count_7.5,v_cut_0.5_count_8.0,v_cut_0.5_count_8.5,v_cut_0.5_count_9.0,v_cut_0.5_count_9.5,v_cut_0.5_count_10.0,v_cut_0.5_count_20.0,v_cut_tuo_count_6,v_cut_tuo_count_20


In [7]:
params = {
    'n_estimators': 5000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'early_stopping_rounds': 100,
    'feature_fraction': 0.6785634376053006,
    'learning_rate': 0.019383601030181666,
    'max_depth': 24,
    'min_data_in_leaf': 5,
    'min_gain_to_split': 0.05071488519451617,
    'min_sum_hessian_in_leaf': 0.005378853072855871,
    'num_leaves': 44
}

In [8]:
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = train_label[features].copy()
y = train_label[target]
models = []
pred = np.zeros((len(test_label),3))
oof = np.zeros((len(X), 3))
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=100)
    models.append(model)
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))
    # 0.8695539641133697
    # 0.8866211724839532

    #test_pred = model.predict(test_label[features])
    #pred += test_pred/5

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.25138	valid_1's multi_logloss: 0.330457
[200]	training's multi_logloss: 0.121082	valid_1's multi_logloss: 0.241429
[300]	training's multi_logloss: 0.0696927	valid_1's multi_logloss: 0.216025
[400]	training's multi_logloss: 0.0426045	valid_1's multi_logloss: 0.205273
[500]	training's multi_logloss: 0.0267378	valid_1's multi_logloss: 0.200491
[600]	training's multi_logloss: 0.017499	valid_1's multi_logloss: 0.198659
[700]	training's multi_logloss: 0.0145448	valid_1's multi_logloss: 0.198192
Early stopping, best iteration is:
[654]	training's multi_logloss: 0.0150141	valid_1's multi_logloss: 0.198054
0 val f1 0.9023346953805086
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.249464	valid_1's multi_logloss: 0.339196
[200]	training's multi_logloss: 0.121022	valid_1's multi_logloss: 0.254569
[300]	training's multi_logloss: 0.0692292	valid_1's multi_l

In [9]:
oof = np.argmax(oof, axis=1)
print('oof f1', metrics.f1_score(oof, y, average='macro'))

# 0.8866228588851547 - corr, v_count_0.5, v_count_tuo [0.7, 6.5, 20] , v_corr_tuo      
# 0.8874926343083475 - v_count_0.5, v_count_tuo [0, 2, 6, 20]
# 0.8884029972976388 - v_count_0.5, v_count_tuo [0, 2, 6, 20], median
# 0.8939494737221377 - v_count_0.5, v_count_tuo [2, 6, 20], median                   *
# 0.890917528266248  - v_count_0.5, v_count_tuo [2, 6], median

oof f1 0.8989392378137055


In [11]:
train_set = lgb.Dataset(X, y)
model = lgb.train(params, train_set, valid_sets=[train_set], verbose_eval=100)
pred = model.predict(test_label[features])

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.261626
[200]	training's multi_logloss: 0.134967
[300]	training's multi_logloss: 0.0825461
[400]	training's multi_logloss: 0.0533497
[500]	training's multi_logloss: 0.0358524
[600]	training's multi_logloss: 0.0246185
[700]	training's multi_logloss: 0.0172716
[800]	training's multi_logloss: 0.014622
[900]	training's multi_logloss: 0.014622
[1000]	training's multi_logloss: 0.014622
[1100]	training's multi_logloss: 0.014622
[1200]	training's multi_logloss: 0.014622
[1300]	training's multi_logloss: 0.014622
[1400]	training's multi_logloss: 0.014622
[1500]	training's multi_logloss: 0.014622
[1600]	training's multi_logloss: 0.014622
[1700]	training's multi_logloss: 0.014622
[1800]	training's multi_logloss: 0.014622
[1900]	training's multi_logloss: 0.014622
[2000]	training's multi_logloss: 0.014622
[2100]	training's multi_logloss: 0.014622
[2200]	training's multi_logloss: 0.014622
[2300]	training's 

In [15]:
pred

array([[1.04361683e-04, 9.99852066e-01, 4.35719598e-05],
       [9.99805881e-01, 1.66374758e-04, 2.77442906e-05],
       [1.42367674e-01, 7.99469083e-01, 5.81632438e-02],
       ...,
       [3.33133362e-03, 8.89131726e-01, 1.07536941e-01],
       [9.99413588e-01, 3.27985150e-04, 2.58427192e-04],
       [1.72842773e-02, 6.84825971e-01, 2.97889752e-01]])

In [16]:
pred = np.argmax(pred, axis=1)
sub = test_label[['ship']]
sub['pred'] = pred

print(sub['pred'].value_counts(1))
sub['pred'] = sub['pred'].map(type_map_rev)
#sub.to_csv('result.csv', index=None, header=None)

0    0.6305
1    0.2365
2    0.1330
Name: pred, dtype: float64


In [17]:
sub.head()

Unnamed: 0,ship,pred
0,7000,围网
1,7001,拖网
2,7002,围网
3,7003,拖网
4,7004,围网


In [18]:
sub.to_csv('C:/Users/f3107/Desktop/hy_data/2020年2月6日/result_0206_lgb.csv', index=None, header=None)