In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import os
from tqdm import tqdm

import warnings

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime as dt

# Suppress every warning for the rest of the session. Note that this also hides
# pandas deprecation notices and SettingWithCopyWarning raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw train / test tables from their HDF5 files.
train = pd.read_hdf('C:/Users/f3107/Desktop/hy_data/train_002.h5')
test = pd.read_hdf('C:/Users/f3107/Desktop/hy_data/test_002.h5')

# Replace the timedelta-like `t` column with whole seconds in both frames.
for frame in (train, test):
    frame['t'] = frame['t'].dt.total_seconds().astype('int')

In [3]:
# One row per ship: (ship, type) for train; only the ship id (a Series) for test.
train_label = train.drop_duplicates('ship')[['ship', 'type']]
test_label = test.drop_duplicates('ship')['ship']

### 异常值

#### 单行异常值剔除

##### v>=20

In [4]:
# Keep only the rows whose reported speed is strictly below 20.
train = train.loc[train['v'].lt(20)]
test = test.loc[test['v'].lt(20)]

##### xy异常

In [5]:
def _delta_from_first(series):
    """Row-to-row difference of `series`, with the undefined first entry set to 0."""
    delta = series.diff()
    if len(delta):
        # Positional access: the first row is not necessarily labelled 0.
        delta.iloc[0] = 0
    return delta


def del_xy(df, max_jump=200000, max_axis_speed=15, drift_speed=10, slow_v=3):
    """Drop rows whose position change is implausible.

    Adds the columns ``d_t``, ``d_x``, ``v_x``, ``d_y``, ``v_y`` (row-to-row
    deltas of ``t``/``x``/``y`` and the per-axis speeds ``d_x/d_t``,
    ``d_y/d_t``) and keeps a row only if all of the following hold:

    * ``|d_x|`` and ``|d_y|`` are below ``max_jump``;
    * ``|v_x|`` and ``|v_y|`` are below ``max_axis_speed``;
    * it is NOT the case that the reported speed ``v`` is below ``slow_v``
      while ``|v_x|`` or ``|v_y|`` exceeds ``drift_speed``.

    Rows whose derived speed is NaN/inf (e.g. ``d_t == 0`` after the first
    row) fail the comparisons and are dropped, as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``t``, ``x``, ``y`` and ``v``.
    max_jump, max_axis_speed, drift_speed, slow_v : numbers, optional
        Thresholds described above; the defaults are the previously
        hard-coded values.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df`` with the five helper columns added. The
        input frame itself is left untouched.

    Notes
    -----
    NOTE(review): deltas are taken over consecutive rows of the whole frame,
    not per ship, so the first row of every ship is compared with the last
    row of the previous one - confirm this is intended.
    """
    # Work on a copy so a filtered slice passed by the caller is never
    # written to (avoids SettingWithCopy problems and hidden mutation).
    df = df.copy()

    # Time interval between consecutive rows.
    df['d_t'] = _delta_from_first(df['t']).astype('int')

    # Per-axis displacement and speed.
    for axis in ('x', 'y'):
        delta = _delta_from_first(df[axis])
        speed = delta / df['d_t']
        if len(speed):
            # 0/0 on the first row; define its speed as 0.
            speed.iloc[0] = 0
        df[f'd_{axis}'] = delta
        df[f'v_{axis}'] = speed

    abs_vx = df['v_x'].abs()
    abs_vy = df['v_y'].abs()

    jump_ok = (df['d_x'].abs() < max_jump) & (df['d_y'].abs() < max_jump)
    speed_ok = (abs_vx < max_axis_speed) & (abs_vy < max_axis_speed)

    # Fix: the original `~(A) | (B)` only negated the x-condition and OR-ed
    # the y-condition back in; both conditions must be negated together.
    slow = df['v'] < slow_v
    drifting = slow & ((abs_vx > drift_speed) | (abs_vy > drift_speed))

    return df[jump_ok & speed_ok & ~drifting]

In [6]:
# Run the position filter twice per frame: the second pass recomputes the
# deltas on the rows that survived the first pass.
train = del_xy(del_xy(train))
test = del_xy(del_xy(test))

#### 整体异常值/停泊状态优化

##### 整体v，d归零

In [7]:
def xy_range(train):
    """Return the ids of ships whose x-extent and y-extent are both below 100.

    The extent of a ship is ``max - min`` of the coordinate over all of its
    rows. The result is a list of ``ship`` values in ascending group order.
    """
    by_ship = train.groupby('ship')

    x_extent = by_ship['x'].max() - by_ship['x'].min()
    y_extent = by_ship['y'].max() - by_ship['y'].min()

    barely_moves = (x_extent < 100) & (y_extent < 100)

    return list(barely_moves.index[barely_moves])

In [8]:
# Ships that xy_range() reports as staying within a 100 x 100 box get their
# speed `v` and direction `d` forced to 0.
# A single .loc assignment replaces the chained `frame['v'][mask] = 0`, which
# writes through a temporary object and is not guaranteed to reach the frame
# (it is a no-op under pandas Copy-on-Write).
ship_ID = xy_range(train)
train.loc[train['ship'].isin(ship_ID), ['v', 'd']] = 0

ship_ID = xy_range(test)
test.loc[test['ship'].isin(ship_ID), ['v', 'd']] = 0

### 统计特征

#### 时间特征

In [9]:
def extract_dt(df):
    """Add `day`, `weekday` and `hour` columns taken from df['datetime'].

    The frame is modified in place and also returned.
    """
    stamp = df['datetime'].dt
    for part in ('day', 'weekday', 'hour'):
        df[part] = getattr(stamp, part)

    return df

In [10]:
# Add the calendar columns to both frames (train first, then test).
train, test = extract_dt(train), extract_dt(test)

#### max, min, mean, std, skew, sum, max-min, slope, area

In [11]:
def group_feature(df, key, target, aggs):
    """Aggregate one column per group and name the results `<target>_<agg>`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    key : str or list of str
        Column(s) to group by.
    target : str
        Column to aggregate.
    aggs : iterable of str
        Aggregation names understood by pandas (e.g. 'max', 'mean', 'skew').

    Returns
    -------
    pandas.DataFrame
        One row per group with the key column(s) followed by one column per
        aggregation, in the order given by `aggs`.
    """
    # Named aggregation (output_name=func). The previous dict-renaming form
    # `agg({'x_max': 'max'})` on a SeriesGroupBy raises SpecificationError
    # on pandas >= 1.0.
    named_aggs = {f'{target}_{ag}': ag for ag in aggs}
    return df.groupby(key)[target].agg(**named_aggs).reset_index()

In [12]:
def extract_feature(df, train):
    """Attach per-ship summary features computed from `df` to `train`.

    `df` holds the row-level records, `train` the one-row-per-ship table.
    For each of x, y, v and d the max/min/mean/std/skew/sum per ship are
    left-merged onto `train`, followed by range, slope and area features and
    the most frequent `hour` of each ship. A new frame is returned.
    """
    stat_names = ['max', 'min', 'mean', 'std', 'skew', 'sum']

    for column in ('x', 'y', 'v', 'd'):
        stats = group_feature(df, 'ship', column, stat_names)
        train = pd.merge(train, stats, on='ship', how='left')

    # The same aggregates for t, d_d, d_t, d_x, v_x, d_y and v_y were
    # disabled in the original notebook and are intentionally not computed.

    # Extents and cross differences of the coordinate extremes.
    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['x_max_y_min'] = train['x_max'] - train['y_min']

    # Ratios; a zero denominator is replaced by 0.001 to avoid dividing by 0.
    x_extent = train['x_max_x_min']
    x_total = train['x_sum']
    train['slope_1'] = train['y_max_y_min'] / np.where(x_extent == 0, 0.001, x_extent)
    train['slope_2'] = train['y_sum'] / np.where(x_total == 0, 0.001, x_total)

    train['area'] = train['x_max_x_min'] * train['y_max_y_min']

    # Most frequent hour per ship (first entry of value_counts).
    def _top_value(hours):
        return hours.value_counts().index[0]

    hour_by_ship = df.groupby('ship')['hour'].agg(_top_value).to_dict()
    train['mode_hour'] = train['ship'].map(hour_by_ship)

    return train

In [13]:
# Build the per-ship feature tables (train first, then test).
train_label, test_label = (
    extract_feature(train, train_label),
    extract_feature(test, test_label),
)

### 相关性

#### 切片

In [14]:
def v_cut(train):
    """Bin the speed column `v` into the categorical column `v_cut_1`.

    Each bin is labelled with its upper edge; the lowest edge (0) is
    included in the first bin. The frame is modified in place and returned.
    """
    edges = [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20]
    upper_edges = edges[1:]

    train['v_cut_1'] = pd.cut(
        train['v'],
        bins=edges,
        labels=upper_edges,
        include_lowest=True,
    )
    return train

In [15]:
# Add the speed-bin column to both frames (train first, then test).
train, test = (v_cut(frame) for frame in (train, test))

#### 频数统计

In [16]:
def v_count(train):
    """Count, per ship, how many rows fall into each `v_cut_1` speed bin.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain `ship` and the categorical column `v_cut_1`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `ship`, with one column `v_cut_1_count_<bin>` for EVERY
        category of `v_cut_1` (bins that never occur are filled with 0), so
        the column set does not depend on the data.
    """
    # observed=False is stated explicitly: the default for categorical
    # groupers changes in newer pandas, which would drop empty bins.
    per_bin = train['v_cut_1'].groupby(
        [train['ship'], train['v_cut_1']], observed=False
    ).count()
    table = per_bin.unstack()

    prefix = table.columns.name
    categories = list(train['v_cut_1'].cat.categories)

    # Flatten the categorical column index, then make sure every bin has a
    # column before renaming (avoids a length mismatch if a bin is absent).
    table.columns = list(table.columns)
    table = table.reindex(columns=categories)
    table.columns = [prefix + '_count_' + str(c) for c in categories]

    return table.fillna(0)

In [17]:
# Join the per-bin row counts onto the per-ship tables (inner join on ship).
train_label = train_label.merge(v_count(train), on='ship')
test_label = test_label.merge(v_count(test), on='ship')

#### 切片相关性

In [18]:
def vd_corr(train):
    """Correlation between `v` and `d` per ship and per `v_cut_1` speed bin.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain `ship`, `v`, `d` and the categorical column `v_cut_1`.

    Returns
    -------
    pandas.DataFrame
        Indexed by `ship`, with one column `v_cut_1_corr_<bin>` for EVERY
        category of `v_cut_1`. Undefined correlations (missing bin, too few
        rows, constant values) are encoded as -99.
    """
    per_group = train[['v', 'd']].groupby(
        [train['ship'], train['v_cut_1']], observed=False
    ).corr()

    # Each group yields a 2x2 matrix; move its row labels into the columns
    # and keep only the (v, d) entry -> Series indexed by (ship, bin).
    pair = per_group.unstack()[('v', 'd')]
    table = pair.unstack()

    prefix = table.columns.name
    categories = list(train['v_cut_1'].cat.categories)

    # Only bins that actually occur may be present after unstacking, so add
    # the missing ones before renaming; naming from `.categories` alone
    # raised a length mismatch whenever a bin was absent from the data.
    table.columns = list(table.columns)
    table = table.reindex(columns=categories)
    table.columns = [prefix + '_corr_' + str(c) for c in categories]

    return table.fillna(-99)

In [19]:
# Join the per-bin v/d correlations onto the per-ship tables (inner join on ship).
train_label = train_label.merge(vd_corr(train), on='ship')
test_label = test_label.merge(vd_corr(test), on='ship')

### 导出数据集

In [20]:
# Inspect the column names of the final training feature table.
train_label.columns

Index(['ship', 'type', 'x_max', 'x_min', 'x_mean', 'x_std', 'x_skew', 'x_sum',
       'y_max', 'y_min', 'y_mean', 'y_std', 'y_skew', 'y_sum', 'v_max',
       'v_min', 'v_mean', 'v_std', 'v_skew', 'v_sum', 'd_max', 'd_min',
       'd_mean', 'd_std', 'd_skew', 'd_sum', 'x_max_x_min', 'y_max_y_min',
       'y_max_x_min', 'x_max_y_min', 'slope_1', 'slope_2', 'area', 'mode_hour',
       'v_cut_1_count_0.5', 'v_cut_1_count_1.0', 'v_cut_1_count_2.0',
       'v_cut_1_count_3.0', 'v_cut_1_count_4.0', 'v_cut_1_count_5.0',
       'v_cut_1_count_6.0', 'v_cut_1_count_7.0', 'v_cut_1_count_8.0',
       'v_cut_1_count_9.0', 'v_cut_1_count_10.0', 'v_cut_1_count_20.0',
       'v_cut_1_corr_0.5', 'v_cut_1_corr_1.0', 'v_cut_1_corr_2.0',
       'v_cut_1_corr_3.0', 'v_cut_1_corr_4.0', 'v_cut_1_corr_5.0',
       'v_cut_1_corr_6.0', 'v_cut_1_corr_7.0', 'v_cut_1_corr_8.0',
       'v_cut_1_corr_9.0', 'v_cut_1_corr_10.0', 'v_cut_1_corr_20.0'],
      dtype='object')

In [21]:
# Inspect the column names of the final test feature table.
test_label.columns

Index(['ship', 'x_max', 'x_min', 'x_mean', 'x_std', 'x_skew', 'x_sum', 'y_max',
       'y_min', 'y_mean', 'y_std', 'y_skew', 'y_sum', 'v_max', 'v_min',
       'v_mean', 'v_std', 'v_skew', 'v_sum', 'd_max', 'd_min', 'd_mean',
       'd_std', 'd_skew', 'd_sum', 'x_max_x_min', 'y_max_y_min', 'y_max_x_min',
       'x_max_y_min', 'slope_1', 'slope_2', 'area', 'mode_hour',
       'v_cut_1_count_0.5', 'v_cut_1_count_1.0', 'v_cut_1_count_2.0',
       'v_cut_1_count_3.0', 'v_cut_1_count_4.0', 'v_cut_1_count_5.0',
       'v_cut_1_count_6.0', 'v_cut_1_count_7.0', 'v_cut_1_count_8.0',
       'v_cut_1_count_9.0', 'v_cut_1_count_10.0', 'v_cut_1_count_20.0',
       'v_cut_1_corr_0.5', 'v_cut_1_corr_1.0', 'v_cut_1_corr_2.0',
       'v_cut_1_corr_3.0', 'v_cut_1_corr_4.0', 'v_cut_1_corr_5.0',
       'v_cut_1_corr_6.0', 'v_cut_1_corr_7.0', 'v_cut_1_corr_8.0',
       'v_cut_1_corr_9.0', 'v_cut_1_corr_10.0', 'v_cut_1_corr_20.0'],
      dtype='object')

In [22]:
# Write the per-ship feature tables to HDF5 under the key 'df', overwriting
# any existing file. `key` is passed by keyword: positional use of every
# to_hdf argument other than the path is deprecated in recent pandas.
train_label.to_hdf('C:/Users/f3107/Desktop/hy_data/train_202.h5', key='df', mode='w')
test_label.to_hdf('C:/Users/f3107/Desktop/hy_data/test_202.h5', key='df', mode='w')

In [23]:
# Number of columns in the training feature table.
len(train_label.columns)

58

In [24]:
# Number of columns in the test feature table.
len(test_label.columns)

57