In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt

# Notebook-wide settings: allow wide feature tables to show up to 100 columns
# and silence every warning category for the rest of the session.
pd.options.display.max_columns = 100
warnings.simplefilter('ignore')

In [2]:
def group_feature(df, key, target, aggs):
    """Aggregate one column per group and return the result as a flat frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the ``key`` and ``target`` columns.
    key : str or list of str
        Column(s) to group by.
    target : str
        Column to aggregate.
    aggs : iterable of str
        Aggregation names understood by pandas (e.g. ``'max'``, ``'std'``).

    Returns
    -------
    pandas.DataFrame
        The key column(s) followed by one ``f'{target}_{agg}'`` column per
        entry of ``aggs``, in the order given.

    Notes
    -----
    Prints the output-name -> aggregation mapping as a side effect.
    """
    agg_dict = {f'{target}_{ag}': ag for ag in aggs}
    print(agg_dict)
    # Named aggregation (pandas >= 0.25). Passing a renaming dict straight to
    # SeriesGroupBy.agg raises SpecificationError on pandas >= 1.0.
    t = df.groupby(key)[target].agg(**agg_dict).reset_index()
    return t

def extract_feature(df, train):
    """Build per-ship features from trajectory rows and join them onto ``train``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trajectory rows with columns ``ship``, ``x``, ``y``, ``v``, ``d``,
        ``hour``, ``date`` and a datetime ``time`` column.
    train : pandas.DataFrame
        Frame with a ``ship`` column; every feature is left-merged or mapped
        onto it by ship.

    Returns
    -------
    pandas.DataFrame
        ``train`` with the aggregate, geometric and time-span feature columns
        appended.

    Notes
    -----
    ``group_feature`` prints its aggregation mapping on every call.
    """
    stats = ['max', 'min', 'mean', 'std', 'skew', 'sum']
    # (column, aggregations) listed in the order the feature columns are appended.
    plan = [('x', stats), ('x', ['count']), ('y', stats), ('v', stats), ('d', stats)]
    for column, aggs in plan:
        t = group_feature(df, 'ship', column, aggs)
        train = pd.merge(train, t, on='ship', how='left')

    # Bounding-box style features derived from the coordinate extremes.
    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['x_max_y_min'] = train['x_max'] - train['y_min']
    # A zero x extent is replaced by 0.001 to avoid dividing by zero.
    train['slope'] = train['y_max_y_min'] / np.where(train['x_max_x_min']==0, 0.001, train['x_max_x_min'])
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']

    # Most frequent hour per ship (first entry of value_counts).
    mode_hour = df.groupby('ship')['hour'].agg(lambda x: x.value_counts().index[0]).to_dict()
    train['mode_hour'] = train['ship'].map(mode_hour)

    t = group_feature(df, 'ship', 'hour', ['max', 'min'])
    train = pd.merge(train, t, on='ship', how='left')

    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train['hour_nunique'] = train['ship'].map(hour_nunique)
    train['date_nunique'] = train['ship'].map(date_nunique)

    # Time span covered by each ship's rows. Computed as max - min on the
    # grouped column, because a renaming dict passed to SeriesGroupBy.agg
    # raises SpecificationError on pandas >= 1.0.
    time_by_ship = df.groupby('ship')['time']
    t = (time_by_ship.max() - time_by_ship.min()).rename('diff_time').reset_index()
    t['diff_day'] = t['diff_time'].dt.days
    # .dt.seconds is the seconds component left after whole days, not the total.
    t['diff_second'] = t['diff_time'].dt.seconds
    train = pd.merge(train, t, on='ship', how='left')
    return train

def extract_dt(df):
    """Parse the ``time`` column and add ``date``, ``hour`` and ``weekday``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``time`` column holds strings in ``'%m%d %H:%M:%S'`` form.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``time`` converted to datetime and the three derived
        columns added. The input frame is left unmodified, so the function is
        safe to call on a filtered slice of another frame.
    """
    # The format carries no year, so pandas parses every timestamp into year
    # 1900; 'date' and 'weekday' therefore follow the 1900 calendar.
    parsed = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    return df.assign(
        time=parsed,
        date=parsed.dt.date,
        hour=parsed.dt.hour,
        weekday=parsed.dt.weekday,
    )

In [3]:
# Raw train/test tables are read from HDF5 files. The directory defaults to the
# location used when the notebook was written and can be overridden with the
# HY_DATA_DIR environment variable so the notebook runs on other machines.
DATA_DIR = os.environ.get('HY_DATA_DIR', 'C:/Users/f3107/Desktop/hy_data')
train = pd.read_hdf(f'{DATA_DIR}/train.h5')
test = pd.read_hdf(f'{DATA_DIR}/test.h5')

In [4]:
def del_ship(train):
    """Return the IDs of ships whose track barely moves for their gear type.

    A ship is listed when both its x extent and its y extent (max - min over
    all of its rows) are strictly below the limit for its type: 500 for '拖网',
    100 for '刺网' and 100 for '围网'. The type is taken from the ship's first
    row in ``train``.

    Returns a list ordered by type ('拖网', '刺网', '围网') and, within a type,
    by the ship's first appearance in ``train``.
    """
    per_ship = train.groupby('ship')
    x_extent = per_ship['x'].max() - per_ship['x'].min()
    y_extent = per_ship['y'].max() - per_ship['y'].min()

    # One row per ship, in order of first appearance.
    first_rows = train.drop_duplicates('ship')
    ships = first_rows['ship']
    kinds = first_rows['type']
    x_span = ships.map(x_extent)
    y_span = ships.map(y_extent)

    limits = (('拖网', 500), ('刺网', 100), ('围网', 100))
    del_ship_ID = []
    for gear, limit in limits:
        nearly_static = (kinds == gear) & (x_span < limit) & (y_span < limit)
        del_ship_ID += list(ships[nearly_static])

    return del_ship_ID

In [5]:
# Drop every trajectory row of the ships flagged by del_ship. The explicit
# .copy() makes the result an independent frame, so later column assignments
# (extract_dt writes new columns) do not target a slice of the loaded table.
del_ship_ID = del_ship(train)
train = train[~train['ship'].isin(del_ship_ID)].copy()

In [6]:
# Parse timestamps and add the date/hour/weekday columns to both tables.
train, test = map(extract_dt, (train, test))

In [7]:
# One row per ship (its first trajectory row) serves as the base table that
# per-ship features are merged onto. .copy() detaches it from `train`/`test`,
# so assigning columns to it later does not write to a slice of the source.
train_label = train.drop_duplicates('ship').copy()
test_label = test.drop_duplicates('ship').copy()

In [8]:
# Class balance: share of ships per gear type.
train_label['type'].value_counts(normalize=True)

拖网    0.597421
围网    0.250354
刺网    0.152225
Name: type, dtype: float64

In [9]:
# Encode the gear-type names as integer codes (order of first appearance) and
# keep the inverse mapping for turning predicted codes back into names.
type_map = {name: code for name, code in zip(train_label['type'].unique(), np.arange(3))}
type_map_rev = dict(zip(type_map.values(), type_map.keys()))
train_label['type'] = train_label['type'].map(type_map)


In [10]:
# Attach the per-ship aggregate features to the training label table
# (each group_feature call prints its aggregation mapping).
train_label = extract_feature(df=train, train=train_label)

{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum'}
{'x_count': 'count'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum'}
{'d_max': 'max', 'd_min': 'min', 'd_mean': 'mean', 'd_std': 'std', 'd_skew': 'skew', 'd_sum': 'sum'}
{'hour_max': 'max', 'hour_min': 'min'}


In [11]:
# Same feature extraction for the test ships.
test_label = extract_feature(df=test, train=test_label)

{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum'}
{'x_count': 'count'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum'}
{'d_max': 'max', 'd_min': 'min', 'd_mean': 'mean', 'd_std': 'std', 'd_skew': 'skew', 'd_sum': 'sum'}
{'hour_max': 'max', 'hour_min': 'min'}


In [12]:

# Model inputs: every column except the ship id, the label and the
# datetime-like columns.
target = 'type'
features = [
    col for col in train_label.columns
    if col not in {'ship', 'type', 'time', 'diff_time', 'date'}
]

In [13]:
# Feature count followed by the comma-separated feature names.
print(f"{len(features)} {','.join(features)}")

44 x,y,v,d,hour,weekday,x_max,x_min,x_mean,x_std,x_skew,x_sum,x_count,y_max,y_min,y_mean,y_std,y_skew,y_sum,v_max,v_min,v_mean,v_std,v_skew,v_sum,d_max,d_min,d_mean,d_std,d_skew,d_sum,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,slope,area,mode_hour,hour_max,hour_min,hour_nunique,date_nunique,diff_day,diff_second


In [14]:
# Preview the first rows only; rendering the whole feature table bloats the
# notebook without adding information.
train_label.head(10)

Unnamed: 0,ship,x,y,v,d,time,type,date,hour,weekday,x_max,x_min,x_mean,x_std,x_skew,x_sum,x_count,y_max,y_min,y_mean,y_std,y_skew,y_sum,v_max,v_min,v_mean,v_std,v_skew,v_sum,d_max,d_min,d_mean,d_std,d_skew,d_sum,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,slope,area,mode_hour,hour_max,hour_min,hour_nunique,date_nunique,diff_time,diff_day,diff_second
0,0,6.152038e+06,5.124873e+06,2.59,102,1900-11-10 11:58:19,0,1900-11-10,11,5,6.152038e+06,6.118352e+06,6.119351e+06,5037.320747,5.255558,2.533411e+09,414,5.130781e+06,5.124873e+06,5.130494e+06,850.264541,-4.762308,2.124025e+09,9.39,0.00,0.265966,1.321248,5.520205,110.11,129,0,4.613527,21.247770,4.483093,1910,33686.667453,5907.975523,-9.875704e+05,1.027165e+06,0.175380,1.990200e+08,15,23,0,24,4,2 days 23:48:51,2,85731
1,1,6.076254e+06,5.061743e+06,3.99,278,1900-11-10 11:40:21,0,1900-11-10,11,5,6.102450e+06,6.049472e+06,6.091460e+06,16543.394419,-1.058454,2.345212e+09,385,5.112874e+06,5.042857e+06,5.094050e+06,26764.042729,-0.802446,1.961209e+09,10.47,0.00,1.607922,2.412688,1.590284,619.05,336,0,56.153247,91.449382,1.418867,21619,52978.013345,70016.655842,-9.365979e+05,1.059593e+06,1.321617,3.709343e+09,19,23,0,24,4,2 days 23:39:47,2,85187
2,10,6.321032e+06,5.242805e+06,4.48,213,1900-11-10 11:49:36,0,1900-11-10,11,5,6.346913e+06,6.246119e+06,6.262484e+06,32280.567149,1.623040,2.486206e+09,397,5.265810e+06,5.229867e+06,5.242458e+06,5975.460236,2.198003,2.081256e+09,10.09,0.00,1.313854,2.442825,2.145410,521.60,359,0,108.758186,112.515081,0.727645,43177,100794.674835,35942.703641,-9.803087e+05,1.117046e+06,0.356593,3.622833e+09,23,23,0,24,4,2 days 23:33:53,2,84833
3,100,6.102751e+06,5.112534e+06,0.00,0,1900-10-30 23:50:05,0,1900-10-30,23,1,6.151439e+06,6.102326e+06,6.123711e+06,14451.941954,0.021860,2.516845e+09,411,5.112752e+06,5.069616e+06,5.085480e+06,14020.260117,1.055676,2.090132e+09,8.69,0.00,2.965864,1.647069,-0.215287,1218.97,353,0,161.727494,115.409256,-0.020073,66470,49113.022232,43135.705758,-9.895740e+05,1.081823e+06,0.878295,2.118525e+09,11,23,0,24,3,2 days 23:48:47,2,85727
4,1000,6.843713e+06,5.480538e+06,2.00,216,1900-11-06 23:42:30,1,1900-11-06,23,1,6.844414e+06,6.748890e+06,6.807536e+06,26263.537565,-0.770190,2.566441e+09,377,5.540087e+06,5.440815e+06,5.464764e+06,30135.645906,1.412544,2.060216e+09,8.90,0.00,2.085570,2.649306,1.110173,786.26,358,0,159.143236,101.832626,0.217397,59997,95524.035775,99271.486171,-1.208803e+06,1.403598e+06,1.039230,9.482813e+09,0,23,0,24,3,2 days 23:37:11,2,85031
5,1001,6.246424e+06,5.241153e+06,0.32,279,1900-11-17 11:53:50,0,1900-11-17,11,5,6.275264e+06,6.246229e+06,6.253311e+06,8617.169771,1.087114,2.613884e+09,418,5.241921e+06,5.184475e+06,5.225260e+06,18805.782327,-0.702069,2.184159e+09,10.09,0.00,1.899330,2.384395,1.470137,793.92,359,0,144.112440,114.361807,0.105224,60239,29034.306937,57446.227369,-1.004308e+06,1.090789e+06,1.978564,1.667911e+09,11,23,0,24,4,2 days 23:47:05,2,85625
6,1002,6.403154e+06,5.383851e+06,2.70,359,1900-11-17 11:58:35,1,1900-11-17,11,5,6.443405e+06,6.359137e+06,6.399824e+06,29969.229193,-0.336262,2.598329e+09,406,5.409630e+06,5.343279e+06,5.387243e+06,19622.630015,-0.339169,2.187221e+09,10.09,0.00,2.218916,1.988109,1.233653,900.88,360,0,148.389163,110.159934,0.119688,60246,84268.859893,66350.996279,-9.495065e+05,1.100126e+06,0.787373,5.591323e+09,23,23,0,24,4,2 days 23:55:24,2,86124
7,1003,7.059754e+06,6.104156e+06,0.00,342,1900-11-20 23:59:54,1,1900-11-20,23,1,7.060459e+06,7.013211e+06,7.030812e+06,17434.793430,0.324345,2.784202e+09,396,6.104412e+06,6.091781e+06,6.096992e+06,2711.110998,0.839234,2.414409e+09,10.09,0.00,0.892854,1.919625,3.074983,353.57,360,0,171.393939,108.029234,0.023722,67872,47248.466261,12631.226052,-9.087986e+05,9.686783e+05,0.267336,5.968061e+08,23,23,0,24,3,2 days 23:57:17,2,86237
8,1004,6.596109e+06,6.049142e+06,3.02,30,1900-11-03 11:54:52,0,1900-11-03,11,5,6.655167e+06,6.356943e+06,6.485758e+06,119171.171149,0.138814,2.393245e+09,369,6.049142e+06,5.408091e+06,5.590895e+06,207608.619216,0.801664,2.063040e+09,10.09,0.00,4.981057,4.016841,-0.139894,1838.01,360,0,115.409214,130.230870,0.900436,42586,298223.785513,641050.629684,-3.078009e+05,1.247075e+06,2.149562,1.911765e+11,0,23,0,24,4,2 days 23:35:28,2,84928
9,1005,6.233508e+06,5.251609e+06,0.22,0,1900-11-06 23:51:36,0,1900-11-06,23,1,6.272278e+06,6.233407e+06,6.234547e+06,5204.527464,5.365072,2.406535e+09,386,5.251720e+06,5.214722e+06,5.250717e+06,4638.847739,-5.834974,2.026777e+09,10.09,0.00,0.589145,1.951383,4.151146,227.41,357,0,106.575130,125.615723,0.696546,41138,38871.293661,36998.364483,-9.816866e+05,1.057556e+06,0.951817,1.438174e+09,23,23,0,24,3,2 days 23:44:37,2,85477


In [15]:
# LightGBM settings for the 3-class gear-type model: up to 5000 boosting
# rounds, stopping once the validation score has not improved for 100 rounds.
params = dict(
    n_estimators=5000,
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,
    early_stopping_rounds=100,
)

In [16]:
# 5-fold stratified cross-validation: train one LightGBM model per fold,
# collect out-of-fold class probabilities in `oof`, and average the per-fold
# test probabilities into `pred`.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = train_label[features].copy()
y = train_label[target]
n_classes = params['num_class']
n_folds = fold.get_n_splits()

# lgb.train no longer accepts `verbose_eval` from LightGBM 4.0 on; the
# log_evaluation callback (available from 3.3) replaces it. Older versions
# without the callback keep using the keyword.
if hasattr(lgb, 'log_evaluation'):
    log_kwargs = {'callbacks': [lgb.log_evaluation(period=100)]}
else:
    log_kwargs = {'verbose_eval': 100}

models = []
pred = np.zeros((len(test_label), n_classes))
oof = np.zeros((len(X), n_classes))
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], **log_kwargs)
    models.append(model)

    # Keep the probabilities for the OOF matrix, score the fold on hard labels.
    val_proba = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_proba
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_proba, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))
    # Scores noted in the original notebook (presumably from earlier runs):
    # 0.8695539641133697
    # 0.8866211724839532

    # Each fold contributes an equal share to the averaged test probabilities.
    test_pred = model.predict(test_label[features])
    pred += test_pred / n_folds

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0641463	valid_1's multi_logloss: 0.298907
[200]	training's multi_logloss: 0.0130596	valid_1's multi_logloss: 0.307655
Early stopping, best iteration is:
[121]	training's multi_logloss: 0.0455076	valid_1's multi_logloss: 0.296024
0 val f1 0.8421034041086811
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0661602	valid_1's multi_logloss: 0.281326
[200]	training's multi_logloss: 0.0135592	valid_1's multi_logloss: 0.281825
Early stopping, best iteration is:
[175]	training's multi_logloss: 0.0196858	valid_1's multi_logloss: 0.277111
1 val f1 0.8710605483147259
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.0649986	valid_1's multi_logloss: 0.273265
[200]	training's multi_logloss: 0.0137479	valid_1's multi_logloss: 0.282326
Early stopping, best iteration is:
[120]	training's multi_logloss: 0.046989	vali

In [17]:
# Collapse the out-of-fold class probabilities to labels and score them.
# The ndim guard keeps the cell re-runnable once `oof` already holds labels.
if oof.ndim == 2:
    oof = np.argmax(oof, axis=1)
# f1_score takes (y_true, y_pred), matching the per-fold call in the CV loop.
print('oof f1', metrics.f1_score(y, oof, average='macro'))
# 0.8701544575329372

oof f1 0.8628356854598821


In [18]:
# Turn the averaged test probabilities into labels and write the submission.
# The ndim guard keeps the cell re-runnable once `pred` already holds labels.
if pred.ndim == 2:
    pred = np.argmax(pred, axis=1)
# .copy() so the new column is added to an independent frame rather than to a
# slice of test_label.
sub = test_label[['ship']].copy()
sub['pred'] = pred

# Predicted class shares, printed while the labels are still integer codes.
print(sub['pred'].value_counts(normalize=True))
sub['pred'] = sub['pred'].map(type_map_rev)
# No index and no header row: the file holds just ship id and type name.
sub.to_csv('result.csv', index=False, header=False)

0    0.5595
1    0.2450
2    0.1955
Name: pred, dtype: float64


In [19]:
# One (name, score, fold) table per fold model, stacked into a single frame.
ret = [
    pd.DataFrame({
        'name': booster.feature_name(),
        'score': booster.feature_importance(),
        'fold': fold_no,
    })
    for fold_no, booster in enumerate(models)
]

df = pd.concat(ret)

In [20]:
# Average each feature's importance over the folds, most important first.
df = (
    df.groupby('name', as_index=False)['score']
    .mean()
    .sort_values('score', ascending=False)
)

In [21]:
# Show only the ten highest-ranked features rather than the whole table.
df.head(10)

Unnamed: 0,name,score
37,y_max_x_min,670.0
31,x_min,598.6
36,y_max,593.4
22,v_std,544.6
29,x_max_y_min,490.4
16,slope,452.0
35,y,442.8
21,v_skew,432.2
25,x,429.6
41,y_skew,421.8


In [22]:
# Second copy of the submission, written next to the input data. The directory
# defaults to the location used when the notebook was written and can be
# overridden with the HY_DATA_DIR environment variable.
result_dir = os.environ.get('HY_DATA_DIR', 'C:/Users/f3107/Desktop/hy_data')
sub.to_csv(f'{result_dir}/result_del_ship.csv', index=False, header=False)