In [5]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Folders scanned below for the per-file CSV inputs (train / test, judging by
# the folder names).
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
train_path = 'C:/Users/f3107/Desktop/hy_data/hy_round1_train_20200102'
test_path = 'C:/Users/f3107/Desktop/hy_data/hy_round1_testA_20200102'

### 文件夹基本信息

In [7]:
# Collect the CSV file names in each input folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the row order of the concatenated frames reproducible across machines.
# Non-CSV entries (checkpoint folders, OS metadata files, ...) are skipped so
# that the read_csv loops further down cannot fail on them.
train_files = sorted(
    name for name in os.listdir(train_path) if name.lower().endswith('.csv')
)
test_files = sorted(
    name for name in os.listdir(test_path) if name.lower().endswith('.csv')
)
print(len(train_files), len(test_files))

7000 2000


In [3]:
# Peek at the first three training file names.
train_files[:3]

['0.csv', '1.csv', '10.csv']

In [4]:
# Peek at the first three test file names.
test_files[:3]

['7000.csv', '7001.csv', '7002.csv']

### 单个文件内容

In [2]:
# Load a single training file (0.csv) to inspect the per-file layout.
df = pd.read_csv(f'{train_path}/0.csv')

In [3]:
# Show the first rows of `df`.
df.head()

Unnamed: 0,渔船ID,x,y,速度,方向,time,type
0,0,6152038.0,5124873.0,2.59,102,1110 11:58:19,拖网
1,0,6151230.0,5125218.0,2.7,113,1110 11:48:19,拖网
2,0,6150421.0,5125563.0,2.7,116,1110 11:38:19,拖网
3,0,6149612.0,5125907.0,3.29,95,1110 11:28:19,拖网
4,0,6148803.0,5126252.0,3.18,108,1110 11:18:19,拖网


### 整合单个文件

#### train

In [8]:
# Unprocessed: every training file is read and stacked as-is, so rows keep
# their on-disk order (newest record first in the 0.csv preview).
ret = [pd.read_csv(f'{train_path}/{name}') for name in tqdm(train_files)]
df = pd.concat(ret)
# Replace the source headers positionally with English column names.
df.columns = ['ship_ID','x','y','speed_v','direction','datetime','type']

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [01:04<00:00, 108.68it/s]


In [8]:
# Persist the current `df` (the raw training frame when cells run top to
# bottom). mode='w' overwrites any existing file. `key` is passed by keyword
# because pandas 2.1+ deprecates positional arguments after the path and
# pandas 3.0 rejects them.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/train.h5', key='df', mode='w')

In [9]:
# Processed: each file's rows are flipped (index sorted descending) before
# stacking, which turns the newest-first file layout into chronological order.
ret = [
    pd.read_csv(f'{train_path}/{name}').sort_index(ascending=False)
    for name in tqdm(train_files)
]
df = pd.concat(ret)
# Replace the source headers positionally with English column names.
df.columns = ['ship_ID','x','y','speed_v','direction','datetime','type']

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:17<00:00, 391.47it/s]


In [10]:
# Persist the current `df`. mode='w' overwrites any existing file.
# NOTE(review): despite "descending" in the file name, the cell that builds
# this frame reverses each file's row order, so rows are stored oldest-first.
# `key` is passed by keyword because pandas 2.1+ deprecates positional
# arguments after the path and pandas 3.0 rejects them.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/train_descending.h5', key='df', mode='w')

In [12]:
# Chronological order plus elapsed time.
# Each file's rows are flipped with sort_index(ascending=False), the same
# reversal used by the chronological variant, so the stacked frame is
# oldest-first (not reverse-time).
ret = []
for file in tqdm(train_files):
    part = pd.read_csv(f'{train_path}/{file}')
    # The format has no year field, so only differences between the parsed
    # timestamps are meaningful.
    part['time'] = pd.to_datetime(part['time'], format='%m%d %H:%M:%S')
    # Elapsed time since the earliest timestamp in this file. Series.min() is
    # vectorized, unlike the builtin min(), which iterates element by element.
    part['delta_time'] = part['time'] - part['time'].min()
    ret.append(part.sort_index(ascending=False))
df = pd.concat(ret)
# Replace the source headers positionally with English column names.
df.columns = ['ship_ID','x','y','speed_v','direction','datetime','type','delta_time']

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:37<00:00, 185.13it/s]


In [17]:
# Persist the current `df`. mode='w' overwrites any existing file.
# NOTE(review): despite "descending" in the file name, the cell that builds
# this frame reverses each file's row order, so rows are stored oldest-first.
# `key` is passed by keyword because pandas 2.1+ deprecates positional
# arguments after the path and pandas 3.0 rejects them.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/train_descending_deltatime.h5', key='df', mode='w')

#### test

In [11]:
# Unprocessed: every test file is read and stacked as-is, so rows keep their
# on-disk order (presumably newest-first like the training files; verify).
ret = [pd.read_csv(f'{test_path}/{name}') for name in tqdm(test_files)]
df = pd.concat(ret)
# Replace the source headers positionally with English column names
# (six names here: no 'type' column is assigned for the test frame).
df.columns = ['ship_ID','x','y','speed_v','direction','datetime']

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 481.40it/s]


In [12]:
# Persist the current `df` (the raw test frame when cells run top to bottom).
# mode='w' overwrites any existing file. `key` is passed by keyword because
# pandas 2.1+ deprecates positional arguments after the path and pandas 3.0
# rejects them.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/test.h5', key='df', mode='w')

In [13]:
# Processed: each file's rows are flipped (index sorted descending) before
# stacking, so a newest-first file ends up in chronological order.
ret = [
    pd.read_csv(f'{test_path}/{name}').sort_index(ascending=False)
    for name in tqdm(test_files)
]
df = pd.concat(ret)
# Replace the source headers positionally with English column names
# (six names here: no 'type' column is assigned for the test frame).
df.columns = ['ship_ID','x','y','speed_v','direction','datetime']

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 406.03it/s]


In [14]:
# Persist the current `df`. mode='w' overwrites any existing file.
# NOTE(review): despite "descending" in the file name, the cell that builds
# this frame reverses each file's row order (oldest-first if the test files
# are stored newest-first like the training files).
# `key` is passed by keyword because pandas 2.1+ deprecates positional
# arguments after the path and pandas 3.0 rejects them.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/test_descending.h5', key='df', mode='w')

In [18]:
# Chronological order plus elapsed time.
# Each file's rows are flipped with sort_index(ascending=False), the same
# reversal used by the chronological variant, so a newest-first file ends up
# oldest-first (not reverse-time).
ret = []
for file in tqdm(test_files):
    part = pd.read_csv(f'{test_path}/{file}')
    # The format has no year field, so only differences between the parsed
    # timestamps are meaningful.
    part['time'] = pd.to_datetime(part['time'], format='%m%d %H:%M:%S')
    # Elapsed time since the earliest timestamp in this file. Series.min() is
    # vectorized, unlike the builtin min(), which iterates element by element.
    part['delta_time'] = part['time'] - part['time'].min()
    ret.append(part.sort_index(ascending=False))
df = pd.concat(ret)
# Replace the source headers positionally with English column names
# (no 'type' column is assigned for the test frame).
df.columns = ['ship_ID','x','y','speed_v','direction','datetime','delta_time']

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:19<00:00, 102.99it/s]


In [19]:
# Persist the current `df`. mode='w' overwrites any existing file.
# NOTE(review): despite "descending" in the file name, the cell that builds
# this frame reverses each file's row order (oldest-first if the test files
# are stored newest-first like the training files).
# `key` is passed by keyword because pandas 2.1+ deprecates positional
# arguments after the path and pandas 3.0 rejects them.
df.to_hdf('C:/Users/f3107/Desktop/hy_data/test_descending_deltatime.h5', key='df', mode='w')