In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings; consider narrowing it.
warnings.filterwarnings('ignore')

# Root folder holding the competition data. It defaults to the original local
# location and can be redirected on another machine by setting HY_DATA_DIR.
data_dir = os.environ.get('HY_DATA_DIR', 'C:/Users/f3107/Desktop/hy_data').rstrip('/\\')

# Folders containing one CSV per file for the train / test splits.
train_path = f'{data_dir}/hy_round1_train_20200102'
test_path = f'{data_dir}/hy_round1_testA_20200102'

### 文件夹基本信息

In [2]:
# os.listdir returns entries in arbitrary order, so sort the names to make the
# row order of every table built from these lists reproducible across machines.
train_files = sorted(os.listdir(train_path))
test_files = sorted(os.listdir(test_path))
# Number of files found in each split.
print(len(train_files), len(test_files))

7000 2000


In [3]:
# Show the first three training file names.
train_files[:3]

['0.csv', '1.csv', '10.csv']

In [4]:
# Show the first three test file names.
test_files[:3]

['7000.csv', '7001.csv', '7002.csv']

### 单个文件内容

In [5]:
# Load a single training file (0.csv) to inspect its columns.
df = pd.read_csv(f'{train_path}/0.csv')

In [6]:
# Display the first rows of the file loaded above.
df.head()

Unnamed: 0,渔船ID,x,y,速度,方向,time,type
0,0,6152038.0,5124873.0,2.59,102,1110 11:58:19,拖网
1,0,6151230.0,5125218.0,2.7,113,1110 11:48:19,拖网
2,0,6150421.0,5125563.0,2.7,116,1110 11:38:19,拖网
3,0,6149612.0,5125907.0,3.29,95,1110 11:28:19,拖网
4,0,6148803.0,5126252.0,3.18,108,1110 11:18:19,拖网


### 整合单个文件

#### 未处理，时间倒序

##### train

In [7]:
# Read every training CSV unchanged (rows keep the files' original order, which
# the section heading describes as reverse-chronological) and stack them.
ret = [pd.read_csv(f'{train_path}/{file}') for file in tqdm(train_files)]
df = pd.concat(ret)
# Replace the source headers with short ASCII column names.
df.columns = ['ship','x','y','v','d','time','type']

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:39<00:00, 179.44it/s]


In [8]:
# Persist the combined frame; `key` is passed by keyword because newer pandas
# releases deprecate passing it positionally.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/train_0.h5', key='df', mode='w')

##### test

In [9]:
# Read every test CSV unchanged and stack them into a single frame.
ret = [pd.read_csv(f'{test_path}/{file}') for file in tqdm(test_files)]
df = pd.concat(ret)
# Same short names as the training table, minus the label column.
df.columns = ['ship','x','y','v','d','time']

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:11<00:00, 179.52it/s]


In [10]:
# Persist the combined frame; `key` is passed by keyword because newer pandas
# releases deprecate passing it positionally.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/test_0.h5', key='df', mode='w')

#### 时间顺序

##### train

In [11]:
# Load each training CSV and reverse its row order (descending index) so the
# records run forward in time, then stack all files.
ret = [
    pd.read_csv(f'{train_path}/{file}').sort_index(ascending=False)
    for file in tqdm(train_files)
]
df = pd.concat(ret)
# Descriptive ASCII column names for the combined table.
df.columns = ['ship_ID','x','y','speed_v','direction','datetime','type']

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:16<00:00, 436.33it/s]


In [12]:
# Persist the combined frame; `key` is passed by keyword because newer pandas
# releases deprecate passing it positionally.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/train_1.h5', key='df', mode='w')

##### test

In [13]:
# Load each test CSV and reverse its row order (descending index) so the
# records run forward in time, then stack all files.
ret = [
    pd.read_csv(f'{test_path}/{file}').sort_index(ascending=False)
    for file in tqdm(test_files)
]
df = pd.concat(ret)
# Descriptive ASCII column names; the test split has no label column.
df.columns = ['ship_ID','x','y','speed_v','direction','datetime']

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 451.35it/s]


In [14]:
# Persist the combined frame; `key` is passed by keyword because newer pandas
# releases deprecate passing it positionally.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/test_1.h5', key='df', mode='w')

#### 时间顺序，时间间隔

##### train

In [15]:
# Build the training table in chronological order with a per-file elapsed-time column.
ret = []
for file in tqdm(train_files):
    part = pd.read_csv(f'{train_path}/{file}')

    # Timestamps carry no year ("MMDD HH:MM:SS"); parse them with an explicit format.
    part['time'] = pd.to_datetime(part['time'], format='%m%d %H:%M:%S')

    # Reverse the row order so the records run forward in time.
    part = part.sort_index(ascending=False)

    # Whole seconds elapsed since the earliest timestamp in this file.
    earliest = min(part['time'])
    elapsed = part['time'] - earliest
    part['delta_time'] = elapsed.dt.total_seconds().astype('int')

    ret.append(part)

df = pd.concat(ret)
df.columns = ['ship_ID','x','y','speed_v','direction','datetime','type','delta_time']

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:38<00:00, 180.62it/s]


In [16]:
# Persist the combined frame; `key` is passed by keyword because newer pandas
# releases deprecate passing it positionally.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/train_2.h5', key='df', mode='w')

##### test

In [17]:
# Build the test table in chronological order with a per-file elapsed-time column.
ret = []
for file in tqdm(test_files):
    part = pd.read_csv(f'{test_path}/{file}')

    # Timestamps carry no year ("MMDD HH:MM:SS"); parse them with an explicit format.
    part['time'] = pd.to_datetime(part['time'], format='%m%d %H:%M:%S')

    # Reverse the row order so the records run forward in time.
    part = part.sort_index(ascending=False)

    # Whole seconds elapsed since the earliest timestamp in this file.
    earliest = min(part['time'])
    elapsed = part['time'] - earliest
    part['delta_time'] = elapsed.dt.total_seconds().astype('int')

    ret.append(part)

df = pd.concat(ret)
df.columns = ['ship_ID','x','y','speed_v','direction','datetime','delta_time']

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:10<00:00, 183.68it/s]


In [18]:
# Persist the combined frame; `key` is passed by keyword because newer pandas
# releases deprecate passing it positionally.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/test_2.h5', key='df', mode='w')

#### 角度变化量，（时间顺序，时间间隔）

##### train

In [33]:
# Build the training table in chronological order with elapsed seconds ('time')
# and the signed change in heading between consecutive records ('d_d').
# NOTE(review): the test cell of this section names the same features
# 'delta_time' / 'delta_direction' and keeps long column names; confirm that
# downstream code expects the two schemas to differ.
ret = []
for file in tqdm(train_files):
    df = pd.read_csv(f'{train_path}/{file}')

    # Timestamps carry no year ("MMDD HH:MM:SS"); parse them with an explicit format.
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df = df.rename(columns={'time':'datetime'})

    # Reverse the rows so diff() below runs forward in time, and renumber from 0.
    df = df.sort_index(ascending=False).reset_index(drop=True)

    # Whole seconds elapsed since the earliest timestamp in this file.
    df['time'] = (df['datetime'] - df['datetime'].min()).dt.total_seconds().astype('int')

    # Heading change versus the previous record; the first record has no
    # predecessor, so its change is 0. Assigning the result (rather than calling
    # fillna(inplace=True) on a column selection) also works under Copy-on-Write.
    heading_step = df['方向'].diff().fillna(0)
    # Steps beyond half a turn are folded back by a full turn (e.g. 350 -> -10).
    # A step of exactly +/-180 is kept as is; the previous formula turned it into 0.
    df['d_d'] = heading_step.where(
        heading_step.abs() <= 180,
        heading_step - np.sign(heading_step) * 360,
    )

    ret.append(df)

df = pd.concat(ret)
df.columns = ['ship','x','y','v','d','datetime','type','time','d_d']

100%|██████████████████████████████████████████████████████████████████████████████| 7000/7000 [01:21<00:00, 85.70it/s]


In [27]:
# Persist the combined frame; `key` is passed by keyword because newer pandas
# releases deprecate passing it positionally.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/train_3.h5', key='df', mode='w')

##### test

In [28]:
# Build the test table in chronological order with elapsed seconds and the
# signed change in heading between consecutive records.
# NOTE(review): the train cell of this section uses short column names
# ('ship', 'v', 'd', 'time', 'd_d') and resets the index; confirm that
# downstream code expects the two schemas to differ.
ret = []
for file in tqdm(test_files):
    df = pd.read_csv(f'{test_path}/{file}')

    # Timestamps carry no year ("MMDD HH:MM:SS"); parse them with an explicit format.
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')

    # Reverse the rows so diff() below runs forward in time.
    df = df.sort_index(ascending=False)

    # Whole seconds elapsed since the earliest timestamp in this file.
    df['delta_time'] = (df['time'] - df['time'].min()).dt.total_seconds().astype('int')

    # Heading change versus the previous record; the first record has no
    # predecessor, so its change is 0. Assigning the result (rather than calling
    # fillna(inplace=True) on a column selection) also works under Copy-on-Write.
    heading_step = df['方向'].diff().fillna(0)
    # Steps beyond half a turn are folded back by a full turn (e.g. 350 -> -10).
    # A step of exactly +/-180 is kept as is; the previous formula turned it into 0.
    df['delta_direction'] = heading_step.where(
        heading_step.abs() <= 180,
        heading_step - np.sign(heading_step) * 360,
    )

    ret.append(df)

df = pd.concat(ret)
df.columns = ['ship_ID','x','y','speed_v','direction','datetime','delta_time','delta_direction']

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:13<00:00, 149.31it/s]


In [29]:
# Persist the combined frame; `key` is passed by keyword because newer pandas
# releases deprecate passing it positionally.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/test_3.h5', key='df', mode='w')

#### 剔除异常值，（角度变化量，时间顺序，时间间隔）

##### train

In [None]:
# Build the training table in chronological order with elapsed seconds and
# heading change, then discard records whose speed is above the cut-off.
speed_limit = 20  # records with speed strictly above this value are treated as outliers

ret = []
for file in tqdm(train_files):
    df = pd.read_csv(f'{train_path}/{file}')

    # Timestamps carry no year ("MMDD HH:MM:SS"); parse them with an explicit format.
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')

    # Reverse the rows so diff() below runs forward in time.
    df = df.sort_index(ascending=False)

    # Whole seconds elapsed since the earliest timestamp in this file.
    df['delta_time'] = (df['time'] - df['time'].min()).dt.total_seconds().astype('int')

    # Heading change versus the previous record; the first record has no
    # predecessor, so its change is 0. Assigning the result (rather than calling
    # fillna(inplace=True) on a column selection) also works under Copy-on-Write.
    heading_step = df['方向'].diff().fillna(0)
    # Steps beyond half a turn are folded back by a full turn (e.g. 350 -> -10).
    # A step of exactly +/-180 is kept as is; the previous formula turned it into 0.
    df['delta_direction'] = heading_step.where(
        heading_step.abs() <= 180,
        heading_step - np.sign(heading_step) * 360,
    )

    # Drop the outlier ROWS. The previous line used square brackets on df.drop
    # (a syntax error) together with axis=1, which would have addressed columns.
    # Rows with a missing speed are kept, as they do not compare greater than the limit.
    # NOTE(review): deltas are computed before this filter, so the record after a
    # removed one keeps a delta measured against the removed record.
    df = df.loc[~(df['速度'] > speed_limit)]

    ret.append(df)

df = pd.concat(ret)
df.columns = ['ship_ID','x','y','speed_v','direction','datetime','type','delta_time','delta_direction']