In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import os
from tqdm import tqdm

import warnings

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime as dt

warnings.filterwarnings('ignore')

In [22]:
# Single place to repoint the notebook at a different data folder.
DATA_DIR = 'C:/Users/f3107/Desktop/hy_data'

# Load both splits and express the timedelta column `t` as whole seconds.
train = pd.read_hdf(f'{DATA_DIR}/train_002.h5')
train['t'] = train['t'].dt.total_seconds().astype('int')

test = pd.read_hdf(f'{DATA_DIR}/test_002.h5')
test['t'] = test['t'].dt.total_seconds().astype('int')

In [45]:
# One row per ship: the first record of each vessel, reduced to its id and label.
train_label = train[['ship', 'type']].drop_duplicates(subset='ship')
test_label = test[['ship', 'type']].drop_duplicates(subset='ship')

### 异常值

#### 单行异常值剔除

##### v>=20

In [23]:
# Keep only the records whose speed `v` is strictly below 20.
train = train.loc[train['v'].lt(20)]
test = test.loc[test['v'].lt(20)]

##### xy异常

In [24]:
def del_xy(df):
    """Add row-to-row displacement/velocity columns and drop implausible jumps.

    Deltas are taken between consecutive rows of ``df`` exactly as given (they
    are not grouped by ship); the first row of the frame gets zeros.

    Columns added: ``d_t`` (int), ``d_x``, ``v_x``, ``d_y``, ``v_y``.

    A row is removed when any of the following holds:
      * ``|d_x|`` or ``|d_y|`` is not below 200000,
      * ``|v_x|`` or ``|v_y|`` is not below 15,
      * ``|v_x|`` or ``|v_y|`` exceeds 10 while the reported speed ``v`` is below 3.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``t``, ``x``, ``y`` and ``v``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy carrying the new columns; the caller's frame is not modified.
    """
    # Work on a copy so the new columns never land on the caller's object
    # (or on a slice of some other frame).
    df = df.copy()

    def _zero_first(series):
        # The first row has no predecessor. Set it by position rather than by
        # label 0: after upstream filtering label 0 may be gone, and a
        # label-based write would append a spurious row instead.
        if len(series):
            series.iloc[0] = 0
        return series

    # Time step between consecutive rows.
    df['d_t'] = _zero_first(df['t'].diff()).astype('int')

    # Displacement and implied velocity along each axis.
    df['d_x'] = _zero_first(df['x'].diff())
    df['v_x'] = _zero_first(df['d_x'] / df['d_t'])

    df['d_y'] = _zero_first(df['y'].diff())
    df['v_y'] = _zero_first(df['d_y'] / df['d_t'])

    small_jump = (df['d_x'].abs() < 200000) & (df['d_y'].abs() < 200000)
    sane_speed = (df['v_x'].abs() < 15) & (df['v_y'].abs() < 15)
    # Implied velocity is high on either axis while the reported speed is low.
    # The negation must cover BOTH axes; negating only the x-axis term lets
    # the y-axis outliers through.
    inconsistent = ((df['v_x'].abs() > 10) | (df['v_y'].abs() > 10)) & (df['v'] < 3)

    return df[small_jump & sane_speed & ~inconsistent]

In [25]:
# Two cleaning passes per split: the second pass recomputes the row-to-row
# deltas on the rows that survived the first one.
train = del_xy(del_xy(train))
test = del_xy(del_xy(test))

#### 整体异常值/停泊状态优化

##### 整体v，d归零

In [26]:
def xy_range(train):
    """Return the ids of ships whose whole track fits in a small bounding box.

    For every ship the spread (max - min) of ``x`` and of ``y`` is computed;
    a ship qualifies when both spreads are strictly below 100.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``ship``, ``x`` and ``y``.

    Returns
    -------
    list
        Qualifying ship ids, in ascending group order.
    """
    per_ship = train.groupby('ship')

    x_spread = per_ship['x'].max() - per_ship['x'].min()
    y_spread = per_ship['y'].max() - per_ship['y'].min()

    confined = (x_spread < 100) & (y_spread < 100)
    return list(confined.index[confined])

In [27]:
# Ships whose whole track stays inside a box narrower than 100 in both x and y
# (as reported by xy_range) are treated as moored: their `v` and `d` are zeroed.
# .loc is used instead of chained indexing (`train['v'][mask] = 0`): the chained
# form writes to a temporary under pandas Copy-on-Write and leaves the frame
# unchanged, and its warning is hidden by the global warnings filter.
ship_ID = xy_range(train)
train.loc[train['ship'].isin(ship_ID), ['v', 'd']] = 0

ship_ID = xy_range(test)
test.loc[test['ship'].isin(ship_ID), ['v', 'd']] = 0

### 统计特征

#### 时间特征

In [32]:
def extract_dt(df):
    """Add ``day``, ``weekday`` and ``hour`` columns derived from ``df['datetime']``.

    The columns are written onto ``df`` itself, which is also returned.
    """
    stamps = df['datetime'].dt
    for part in ('day', 'weekday', 'hour'):
        df[part] = getattr(stamps, part)
    return df

In [33]:
# Attach the day / weekday / hour columns to both splits.
train = extract_dt(df=train)
test = extract_dt(df=test)

#### max, min, mean, std, skew, sum, max-min, slope, area

In [48]:
def group_feature(df, key, target, aggs):
    """Aggregate ``df[target]`` per ``key`` and name each result ``{target}_{agg}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    key : str
        Column to group by.
    target : str
        Column to aggregate.
    aggs : iterable
        Aggregations to apply (e.g. ``['max', 'min', 'mean']``).

    Returns
    -------
    pandas.DataFrame
        One row per group: the ``key`` column followed by one column per
        aggregation, in the order given.
    """
    aggs = list(aggs)
    # A list of aggregations is applied and the columns are renamed afterwards.
    # Passing a {new_name: agg} dict to a single-column groupby is the
    # "nested renamer" form, which raises SpecificationError on pandas >= 1.0.
    stats = df.groupby(key)[target].agg(aggs)
    stats.columns = [f'{target}_{ag}' for ag in aggs]
    return stats.reset_index()

In [49]:
def extract_feature(df, train):
    """Append per-ship track statistics computed from ``df`` to ``train``.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level records; must contain ``ship``, ``hour`` and every column
        aggregated below.
    train : pandas.DataFrame
        One row per ship (must contain ``ship``); features are left-joined onto it.

    Returns
    -------
    pandas.DataFrame
        ``train`` with the added columns:
        ``{col}_{max,min,mean,std,skew,sum}`` for each aggregated column, the
        four max/min differences, ``slope_1``, ``slope_2``, ``area`` and
        ``mode_hour``.
    """
    stat_names = ['max', 'min', 'mean', 'std', 'skew', 'sum']
    tracked = ('x', 'y', 'v', 'd', 't', 'd_d', 'd_t', 'd_x', 'v_x', 'd_y', 'v_y')

    # One left join per aggregated column, in a fixed order so the resulting
    # column layout is stable.
    for column in tracked:
        per_ship = group_feature(df, 'ship', column, stat_names)
        train = train.merge(per_ship, on='ship', how='left')

    # Differences between coordinate extremes (including the two cross terms).
    x_hi, x_lo = train['x_max'], train['x_min']
    y_hi, y_lo = train['y_max'], train['y_min']
    train['x_max_x_min'] = x_hi - x_lo
    train['y_max_y_min'] = y_hi - y_lo
    train['y_max_x_min'] = y_hi - x_lo
    train['x_max_y_min'] = x_hi - y_lo

    # Ratios; a zero denominator is replaced by 0.001 to avoid dividing by zero.
    x_extent = train['x_max_x_min']
    train['slope_1'] = train['y_max_y_min'] / np.where(x_extent == 0, 0.001, x_extent)
    x_total = train['x_sum']
    train['slope_2'] = train['y_sum'] / np.where(x_total == 0, 0.001, x_total)

    # Bounding-box area of the track.
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']

    def _top_hour(hours):
        # First entry of value_counts() is the most frequent hour.
        return hours.value_counts().index[0]

    mode_hour = df.groupby('ship')['hour'].agg(_top_hour).to_dict()
    train['mode_hour'] = train['ship'].map(mode_hour)

    return train

In [50]:
# Build the per-ship feature tables from the row-level frames.
train_label = extract_feature(df=train, train=train_label)
test_label = extract_feature(df=test, train=test_label)

{'x_max': 'max', 'x_min': 'min', 'x_mean': 'mean', 'x_std': 'std', 'x_skew': 'skew', 'x_sum': 'sum'}
{'y_max': 'max', 'y_min': 'min', 'y_mean': 'mean', 'y_std': 'std', 'y_skew': 'skew', 'y_sum': 'sum'}
{'v_max': 'max', 'v_min': 'min', 'v_mean': 'mean', 'v_std': 'std', 'v_skew': 'skew', 'v_sum': 'sum'}
{'d_max': 'max', 'd_min': 'min', 'd_mean': 'mean', 'd_std': 'std', 'd_skew': 'skew', 'd_sum': 'sum'}
{'t_max': 'max', 't_min': 'min', 't_mean': 'mean', 't_std': 'std', 't_skew': 'skew', 't_sum': 'sum'}
{'d_d_max': 'max', 'd_d_min': 'min', 'd_d_mean': 'mean', 'd_d_std': 'std', 'd_d_skew': 'skew', 'd_d_sum': 'sum'}
{'d_t_max': 'max', 'd_t_min': 'min', 'd_t_mean': 'mean', 'd_t_std': 'std', 'd_t_skew': 'skew', 'd_t_sum': 'sum'}
{'d_x_max': 'max', 'd_x_min': 'min', 'd_x_mean': 'mean', 'd_x_std': 'std', 'd_x_skew': 'skew', 'd_x_sum': 'sum'}
{'v_x_max': 'max', 'v_x_min': 'min', 'v_x_mean': 'mean', 'v_x_std': 'std', 'v_x_skew': 'skew', 'v_x_sum': 'sum'}
{'d_y_max': 'max', 'd_y_min': 'min', 'd_y_m

### 相关性

#### 切片

In [56]:
# Preview the first two rows of `train` to check which columns are available.
train.head(2)

Unnamed: 0,ship,x,y,v,d,datetime,type,t,d_d,d_t,d_x,v_x,d_y,v_y,weekday,date,hour,day
0,0,6118352.0,5130672.0,0.0,0,1900-11-07 12:09:28,拖网,0,0.0,0,0.0,0.0,0.0,0.0,2,1900-11-07,12,7
1,0,6118352.0,5130672.0,0.0,0,1900-11-07 12:18:30,拖网,542,0.0,542,0.0,0.0,0.0,0.0,2,1900-11-07,12,7


In [95]:
# Half-unit bin edges from 0 to 10, plus one wide final bin up to 20.
bins = [step / 2 for step in range(21)] + [20]
# Every bin is labelled by its right edge.
labels = bins[1:]
train['v_cut_0.5'] = pd.cut(train['v'], bins=bins, labels=labels, include_lowest=True)

In [96]:
# Show only the first rows: displaying the whole frame bloats the notebook output.
train.head(10)

Unnamed: 0,ship,x,y,v,d,datetime,type,t,d_d,d_t,d_x,v_x,d_y,v_y,weekday,date,hour,day,v_cut_0.5
0,0,6.118352e+06,5.130672e+06,0.00,0,1900-11-07 12:09:28,拖网,0,0.0,0,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,12,7,0.5
1,0,6.118352e+06,5.130672e+06,0.00,0,1900-11-07 12:18:30,拖网,542,0.0,542,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,12,7,0.5
2,0,6.118352e+06,5.130672e+06,0.00,0,1900-11-07 12:28:32,拖网,1144,0.0,602,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,12,7,0.5
3,0,6.118352e+06,5.130672e+06,0.00,0,1900-11-07 12:38:32,拖网,1744,0.0,600,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,12,7,0.5
4,0,6.118352e+06,5.130672e+06,0.00,0,1900-11-07 12:48:30,拖网,2342,0.0,598,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,12,7,0.5
5,0,6.118352e+06,5.130672e+06,0.00,0,1900-11-07 12:58:30,拖网,2942,0.0,600,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,12,7,0.5
6,0,6.118352e+06,5.130672e+06,0.00,0,1900-11-07 13:08:30,拖网,3542,0.0,600,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,13,7,0.5
7,0,6.118352e+06,5.130672e+06,0.00,0,1900-11-07 13:18:32,拖网,4144,0.0,602,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,13,7,0.5
8,0,6.118352e+06,5.130672e+06,0.00,0,1900-11-07 13:40:00,拖网,5432,0.0,1288,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,13,7,0.5
9,0,6.118352e+06,5.130672e+06,0.05,0,1900-11-07 13:48:30,拖网,5942,0.0,510,0.000000,0.000000,0.00000,0.000000,2,1900-11-07,13,7,0.5


#### 频数统计

#### 切片相关性

### 导出数据集

In [28]:
# Persist the per-ship feature tables. `key` is passed by name: positional
# arguments after the path are deprecated in pandas 2.x and keyword-only in 3.0.
train_label.to_hdf('C:/Users/f3107/Desktop/hy_data/train_201.h5', key='df', mode='w')
test_label.to_hdf('C:/Users/f3107/Desktop/hy_data/test_201.h5', key='df', mode='w')