In [1]:
'''
author: 梁泽涛，叶文涛
create time: 2020-6-27
update time: 2020-7-24
'''

In [1]:
import pandas as pd
import numpy as np
import os,gc,time,re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm

In [34]:
def reduce_mem_usage(data,to_feather=False):
    '''
    通过判断数据范围的上下限来选择最小能存储数据的类型
    @params
        data:待内存优化的pandas.DataFrame
        to_feather: 后续是否需要存储为feather格式，若需要则不优化为float16；feather不支持float16位类型
    @return:
        返回内存优化后的dataframe
        
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = data.memory_usage().sum() / 1024**2    
    for col in data.columns:
        col_type = data[col].dtypes
        if col_type in numerics:
            c_min = data[col].min()
            c_max = data[col].max()
            # 在int范围内找区间
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    data[col] = data[col].astype(np.int64)  
            # 在float范围内找区间
            else:
                # float降低精度导致许多经纬度重合
                continue
                '''
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max and not to_feather:
                    data[col] = data[col].astype(np.float16)
                    pass
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)
                '''
                    
    end_mem = data.memory_usage().sum() / 1024**2
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return data


# haversine 经纬度球面距离
def Distance(LonA,LatA,LonB,LatB):
    EARTH_RADIUS = 6378.137 # 千米
    radLatA = np.radians(LatA)
    radLatB = np.radians(LatB)
    a = radLatA-radLatB
    b = np.radians(LonA)-np.radians(LonB)
    s= 2 * np.arcsin(np.sqrt(np.power(np.sin(a / 2),2)+ np.cos(radLatA) * np.cos(radLatB)*np.power(np.sin(b / 2),2))) * EARTH_RADIUS
    return s


In [5]:
# 读入内存
TRAIN_DATA_PATH = './data/train0711.csv'
col_names = ['loadingOrder','carrierName','timestamp','longitude','latitude','vesselMMSI','speed','direction','vesselNextport','vesselNextportETA','vesselStatus','vesselDatasource','TRANSPORT_TRACE']
%time train_data = pd.read_csv(TRAIN_DATA_PATH,names=col_names)

# 去除空路由，保留标准化港口
train_data = train_data[~(train_data.TRANSPORT_TRACE.isnull())]
train_data = train_data[train_data.TRANSPORT_TRACE.str.match('[A-Z-]+-[A-Z-]+')]

# 这三列无用，删除
%time train_data.drop(columns=['vesselNextport','vesselNextportETA','vesselDatasource'],inplace=True)

# 减少内存占用
%time train_data = reduce_mem_usage(train_data,to_feather=False)

# 转换时间格式
%time train_data['timestamp'] = pd.to_datetime(train_data['timestamp'],infer_datetime_format=True)

# 去重
%time train_data.drop_duplicates(subset=['loadingOrder','carrierName','timestamp','longitude','latitude','vesselMMSI'],inplace=True)

# 排序
%time train_data = train_data.sort_values(by=['loadingOrder','timestamp']).reset_index(drop=True)

train_data.info()    
gc.collect()





Wall time: 3min 2s
Wall time: 6.35 s
Mem. usage decreased to 6993.29 Mb (11.4% reduction)
Wall time: 3.24 s
Wall time: 54 s
Wall time: 1min 14s
Wall time: 1min 11s
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74474456 entries, 0 to 74474455
Data columns (total 10 columns):
 #   Column           Dtype              
---  ------           -----              
 0   loadingOrder     object             
 1   carrierName      object             
 2   timestamp        datetime64[ns, UTC]
 3   longitude        float64            
 4   latitude         float64            
 5   vesselMMSI       object             
 6   speed            int16              
 7   direction        int32              
 8   vesselStatus     object             
 9   TRANSPORT_TRACE  object             
dtypes: datetime64[ns, UTC](1), float64(2), int16(1), int32(1), object(5)
memory usage: 4.9+ GB


15

In [6]:
# 36000方向等价于0方向
train_data.loc[train_data.direction==36000,'direction'] = 0
# 去除包括速度过大，方向不在合理区间内的不合理数据
train_data = train_data[~((train_data.speed>48.0)|(train_data.direction<0)|(train_data.direction>36000))].reset_index(drop=True)
print(train_data.shape)


(74174453, 10)


In [32]:
# 找出所有应重复而需要被删除的运单（存在一船多单现象）
# duplicated, keep = false将返回所有有重复的部分
%time dup_loc_df = train_data[train_data.duplicated(subset=['timestamp','longitude','latitude','vesselMMSI'],keep=False)].reset_index(drop=True)
all_dup_orders = dup_loc_df.loadingOrder.unique()
# 在初步所有有重复的部分基础上去重，保留首位
%time dup_loc_df = dup_loc_df.drop_duplicates(subset=['timestamp','longitude','latitude','vesselMMSI']).reset_index(drop=True)
# 把被删除的运单找出
del_dup_orders = pd.Series(all_dup_orders)[~pd.Series(all_dup_orders).isin(dup_loc_df.loadingOrder.unique())].values

# 最后，在原数据上处理
clean_data1 = train_data[~train_data.loadingOrder.isin(del_dup_orders)].reset_index(drop=True)




Wall time: 53.8 s
Wall time: 31 s


In [35]:
# gps数据时间、距离间隔
clean_data1['diff_sec'] = clean_data1.groupby(by=['loadingOrder'])['timestamp'].apply(lambda x:x.diff(1).dt.total_seconds())
clean_data1['diff_dist'] = clean_data1.groupby(by=['loadingOrder']).apply(lambda x:Distance(x['longitude'],x['latitude'],x['longitude'].shift(1),x['latitude'].shift(1))).reset_index(drop=True)
# 两点间速度
clean_data1['speed_v2'] = clean_data1['diff_dist']/(clean_data1['diff_sec']/3600)
# 删除时间间隔为0的数据
clean_data1 = clean_data1[clean_data1.diff_sec!=0].reset_index(drop=True)




(29327091, 13)
(29254615, 13)


In [54]:
# 清理出来的运单的单号
# 用分位数去掉尾部
'''
清除最大时间差过大的运单
清除最大距离差过大的运单
清除最大距离/最大时间的长尾影响，船只正常的轨迹应该是距离差大，时间差也大
'''

# 获得上下界，公式取自于箱型图的上下界
def get_bound(ls):
    upper = ls.quantile(0.75)+1.5*(ls.quantile(0.75)-ls.quantile(0.25))
    lower = ls.quantile(0.25)-1.5*(ls.quantile(0.75)-ls.quantile(0.25))
    q9 = ls.quantile(0.9)
    return [upper,lower,q9]

# 取出最大时间间隔、距离间隔、速度
order_max_diff_sec = clean_data1.groupby(by='loadingOrder')['diff_sec'].agg('max')
order_max_diff_dist = clean_data1.groupby(by='loadingOrder')['diff_dist'].agg('max')
order_max_speed_v2 = clean_data1.groupby(by='loadingOrder')['speed_v2'].agg('max')
# order_length = clean_data1.loadingOrder.value_counts(sort=False).sort_index()


fit_orders = clean_data1.loadingOrder.unique()[(order_max_speed_v2<=get_bound(order_max_speed_v2)[0])&\
                                               (order_max_diff_dist<=get_bound(order_max_diff_dist)[0])&\
                                               (order_max_diff_sec<=get_bound(order_max_diff_sec)[0])]

%time clean_data1 = clean_data1[clean_data1.loadingOrder.isin(fit_orders)].reset_index(drop=True)

Wall time: 3.67 s


In [55]:
clean_data1

Unnamed: 0,loadingOrder,carrierName,timestamp,longitude,latitude,vesselMMSI,speed,direction,vesselStatus,TRANSPORT_TRACE,diff_sec,diff_dist,speed_v2
0,AA191175561416,OIEQNT,2019-01-28 16:12:59+00:00,114.260392,22.571047,Y7540547327,0,12670,moored,CNYTN-MXZLO,,,
1,AA191175561416,OIEQNT,2019-01-28 16:22:38+00:00,114.260438,22.571125,Y7540547327,0,14790,moored,CNYTN-MXZLO,579.0,0.009887,0.061473
2,AA191175561416,OIEQNT,2019-01-28 16:30:55+00:00,114.260693,22.571567,Y7540547327,0,21510,moored,CNYTN-MXZLO,497.0,0.055750,0.403821
3,AA191175561416,OIEQNT,2019-01-28 16:37:35+00:00,114.260392,22.571463,Y7540547327,0,19900,moored,CNYTN-MXZLO,400.0,0.033036,0.297320
4,AA191175561416,OIEQNT,2019-01-28 16:45:56+00:00,114.260647,22.571510,Y7540547327,0,21360,moored,CNYTN-MXZLO,501.0,0.026729,0.192066
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17564213,ZZ824778274922,HMKTVZ,2020-03-04 05:14:21+00:00,136.794000,35.026833,F1868908205,5,28670,under way using engine,CNSHK-JPNGO,101.0,0.161833,5.768308
17564214,ZZ824778274922,HMKTVZ,2020-03-04 05:16:31+00:00,136.792333,35.027167,F1868908205,2,27440,under way using engine,CNSHK-JPNGO,130.0,0.156442,4.332241
17564215,ZZ824778274922,HMKTVZ,2020-03-04 05:18:31+00:00,136.791667,35.027000,F1868908205,2,22580,under way using engine,CNSHK-JPNGO,120.0,0.063493,1.904801
17564216,ZZ824778274922,HMKTVZ,2020-03-04 05:20:41+00:00,136.791000,35.026833,F1868908205,0,25740,under way using engine,CNSHK-JPNGO,130.0,0.063581,1.760695


In [4]:
loadingOrderEvent_df = pd.read_csv('data/loadingOrderEvent.csv',nrows=158341)
port_df = pd.read_csv('data/port.csv')
test_data = pd.read_csv('data/testData 0626.csv')

————————中途存储/读取区域————————

In [56]:
# 存储/读取
# %time clean_data1 = pd.read_pickle('data/clean_data1.pkl') #读取
%time clean_data1.to_pickle('data/clean_data1.pkl') # 存储

Wall time: 5.31 s


In [8]:
port_df = pd.read_csv('data/port.csv')
# 保留5位标准港口代码的港口
clean_port_data1 = port_df[(port_df.TRANS_NODE_NAME.str.match('[A-Z]{5}')&(port_df.TRANS_NODE_NAME.str.len()==5))].reset_index(drop=True)
clean_port_data1.drop(columns=['COUNTRY','STATE','CITY','REGION','ADDRESS','PORT_CODE','TRANSPORT_NODE_ID'],inplace=True)
# 训练gps中unique的经纬度坐标
clean_data1_geo = clean_data1[['longitude','latitude']].drop_duplicates().reset_index(drop=True)
print(clean_data1_geo.shape)

# 近邻搜索
from sklearn.neighbors import NearestNeighbors,DistanceMetric
#radius=25其实用不上, n_jobs=-1表示用上所有处理器，但此处不使用反而更快
neigh = NearestNeighbors(n_neighbors=1, radius=25,metric="haversine")
# ['LATITUDE','LONGITUDE'] form
%time neigh.fit(np.radians(clean_port_data1[['LATITUDE','LONGITUDE']].values))

(25994497, 2)
Wall time: 2 ms


NearestNeighbors(metric='haversine', n_neighbors=1, radius=25)

In [9]:
# ['LATITUDE','LONGITUDE'] form
# 对8300万数据需要30分钟
EARTH_RADIUS = 6378.137
# 对每条gps数据，搜索最近的港口以及距离，之所以neighbor设为1，是因为所有港口自成一类，先返回距离，再返回最近点
%time nearest_port_dist,nearest_port = neigh.kneighbors(np.radians(clean_data1_geo[['latitude','longitude']].values), 1)
# 利用squeeze（）函数将表示向量的数组转换为秩为1的数组
# 例1：array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])  转换为 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# 例2：array([[[0, 1, 2, 3, 4],[5, 6, 7, 8, 9]]])  转换为array([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
nearest_port_dist = nearest_port_dist.squeeze()
nearest_port_dist *= EARTH_RADIUS
nearest_port = nearest_port.squeeze()

clean_data1_geo['nearest_port_dist'] = nearest_port_dist
# near_port返回的是经纬度，所以查找名称
clean_data1_geo['nearest_port'] = clean_port_data1['TRANS_NODE_NAME'].values[nearest_port]


# 对8300万数据需要12分钟
%time clean_data1 = clean_data1.merge(clean_data1_geo,on=['longitude','latitude'])   # 对每行数据，merge到原表格中
%time clean_data1.sort_values(by=['loadingOrder','timestamp'],inplace=True)

Wall time: 29min 38s
Wall time: 11min 16s
Wall time: 1min 5s


In [10]:
print(clean_data1_geo.shape,clean_data1.shape)

(25994497, 4) (83747018, 17)


In [13]:
clean_data1['nearest_port_dist'].quantile(0.1)

7.365017139473755

In [14]:

# 按条件判定是否到港（包括中转港）
clean_data1['is_at_port'] = 1

clean_data1.loc[(clean_data1.nearest_port_dist>7),'is_at_port'] = 0

In [15]:
# 判断起始点和终点是否在港
def judge_is_at_port(df):
    # 判断首个和最后的一个数据是否是在港内
    if df[df['is_at_port']==0].index[-1] == df.index[-1] and df[df['is_at_port']==0].index[0] == df.index[0]:
        return -1
    else:
        return 1

# idxs的结构：每个运单号对应一个终点index
%time idxs = clean_data1.groupby(by='loadingOrder').apply(judge_is_at_port)

# idxs[(end_idxs==-1)].index 取出所有终点index为-1的运单号（即异常情况）此句即取出正常运单号
clean_data2 = clean_data1[~clean_data1.loadingOrder.isin(idxs[(idxs==-1)].index)].reset_index(drop=True)

Wall time: 53.5 s


In [18]:
clean_data2

Unnamed: 0,loadingOrder,carrierName,timestamp,longitude,latitude,vesselMMSI,speed,direction,vesselNextport,vesselNextportETA,vesselStatus,vesselDatasource,TRANSPORT_TRACE,diff_dist,diff_sec,nearest_port_dist,nearest_port,is_at_port
0,AA191175561416,OIEQNT,2019-01-28 16:12:59+00:00,114.260392,22.571047,Y7540547327,0,12670,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,,,1.706335,CNYTN,1
1,AA191175561416,OIEQNT,2019-01-28 16:22:38+00:00,114.260438,22.571125,Y7540547327,0,14790,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.009887,579.0,1.698316,CNYTN,1
2,AA191175561416,OIEQNT,2019-01-28 16:30:55+00:00,114.260693,22.571567,Y7540547327,0,21510,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.055750,497.0,1.653788,CNYTN,1
3,AA191175561416,OIEQNT,2019-01-28 16:37:35+00:00,114.260392,22.571463,Y7540547327,0,19900,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.033036,400.0,1.686749,CNYTN,1
4,AA191175561416,OIEQNT,2019-01-28 16:45:56+00:00,114.260647,22.571510,Y7540547327,0,21360,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.026729,501.0,1.660718,CNYTN,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47226150,ZZ964257538884,RWHZVZ,2019-11-28 20:14:15+00:00,31.021978,-29.876915,Y9380685386,4,25140,MUPLU - ZADUR,2019-11-28T11:30:00.000Z,under way using engine,Coastal AIS,CNSHK-MYTPP-MUPLU-ZADUR,0.486119,251.0,2.877043,ZADUR,1
47226151,ZZ964257538884,RWHZVZ,2019-11-28 20:17:55+00:00,31.020602,-29.877332,Y9380685386,2,25190,MUPLU - ZADUR,2019-11-28T11:30:00.000Z,under way using engine,Coastal AIS,CNSHK-MYTPP-MUPLU-ZADUR,0.140696,220.0,3.017739,ZADUR,1
47226152,ZZ964257538884,RWHZVZ,2019-11-28 20:20:05+00:00,31.019675,-29.877572,Y9380685386,2,25240,MUPLU - ZADUR,2019-11-28T11:30:00.000Z,under way using engine,Coastal AIS,CNSHK-MYTPP-MUPLU-ZADUR,0.093381,130.0,3.111010,ZADUR,1
47226153,ZZ964257538884,RWHZVZ,2019-11-28 20:23:55+00:00,31.018670,-29.878203,Y9380685386,1,22290,MUPLU - ZADUR,2019-11-28T11:30:00.000Z,under way using engine,Coastal AIS,CNSHK-MYTPP-MUPLU-ZADUR,0.119768,230.0,3.225998,ZADUR,1


————————中途存储/读取区域————————

In [3]:
# 存储/读取
%time clean_data2 = pd.read_pickle('data/clean_data2.pkl') #读取
# %time clean_data2.to_pickle('data/clean_data2.pkl') # 存储

Wall time: 23.9 s


In [21]:
# 获取始末港口信息
test_data = pd.read_csv('data/testData 0626.csv')

s = set(test_data['TRANSPORT_TRACE'])

# 集合自动去重
start_set = set()
end_set = set()
for route in s:
    start_set.add(route.split('-')[0])
    end_set.add(route.split('-')[-1])


In [40]:
start_set

{'CNHKG', 'CNNSA', 'CNSHK', 'CNYTN', 'HKHKG', 'HONGKONG'}

In [22]:
# 删除开始港口和终点港口不在测试集中的运单号
def del_is_not_at_port(df):
    # 判断第一个数据和最后一个数据
    if df.iloc[0]['nearest_port'] in start_set and df.iloc[-1]['nearest_port'] in end_set:
        return 1
    else:
        return -1
%time idxs = clean_data2.groupby(by='loadingOrder').apply(del_is_not_at_port)

clean_data3 = clean_data2[~clean_data2.loadingOrder.isin(idxs[(idxs==-1)].index)].reset_index(drop=True)



Wall time: 18.1 s


In [26]:
clean_data3

Unnamed: 0,loadingOrder,carrierName,timestamp,longitude,latitude,vesselMMSI,speed,direction,vesselNextport,vesselNextportETA,vesselStatus,vesselDatasource,TRANSPORT_TRACE,diff_dist,diff_sec,nearest_port_dist,nearest_port,is_at_port
0,AA191175561416,OIEQNT,2019-01-28 16:12:59+00:00,114.260392,22.571047,Y7540547327,0,12670,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,,,1.706335,CNYTN,1
1,AA191175561416,OIEQNT,2019-01-28 16:22:38+00:00,114.260438,22.571125,Y7540547327,0,14790,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.009887,579.0,1.698316,CNYTN,1
2,AA191175561416,OIEQNT,2019-01-28 16:30:55+00:00,114.260693,22.571567,Y7540547327,0,21510,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.055750,497.0,1.653788,CNYTN,1
3,AA191175561416,OIEQNT,2019-01-28 16:37:35+00:00,114.260392,22.571463,Y7540547327,0,19900,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.033036,400.0,1.686749,CNYTN,1
4,AA191175561416,OIEQNT,2019-01-28 16:45:56+00:00,114.260647,22.571510,Y7540547327,0,21360,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.026729,501.0,1.660718,CNYTN,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2927203,ZY989876486292,OIEQNT,2019-04-26 19:05:20+00:00,-104.322813,19.066443,D7828192902,11,11170,,,,Satellite,CNYTN-MXZLO,0.663915,210.0,2.830415,MXZLO,1
2927204,ZY989876486292,OIEQNT,2019-04-26 19:11:00+00:00,-104.311943,19.061608,D7828192902,15,12500,,,,Coastal AIS,CNYTN-MXZLO,1.263997,340.0,2.792622,MXZLO,1
2927205,ZY989876486292,OIEQNT,2019-04-26 19:12:49+00:00,-104.308012,19.060088,D7828192902,14,9650,MANZANILLO,2019-04-26T06:00:00.000Z,under way using engine,Satellite,CNYTN-MXZLO,0.446877,109.0,2.891596,MXZLO,1
2927206,ZY989876486292,OIEQNT,2019-04-26 19:36:40+00:00,-104.296138,19.058382,D7828192902,1,20150,,,,Coastal AIS,CNYTN-MXZLO,1.263700,1431.0,3.226503,MXZLO,1


In [28]:
fill_speed = []

# haversine 经纬度球面距离
def Distance2(LonA,LatA,LonB,LatB):
    EARTH_RADIUS = 6378.137 # 千米
    radLatA = np.radians(LatA)
    radLatB = np.radians(LatB)
    a = radLatA-radLatB
    b = np.radians(LonA)-np.radians(LonB)
    s= 2 * np.arcsin(np.sqrt(np.power(np.sin(a / 2),2)+ np.cos(radLatA) * np.cos(radLatB)*np.power(np.sin(b / 2),2))) * EARTH_RADIUS
    return s

# 使用上下两点的距离/时间来填充速度
# 行驶的标准：没到港但速度为0
for idx,(speed,is_at_port) in tqdm(enumerate(zip(clean_data3.speed.values, clean_data3.is_at_port.values))):
    if is_at_port==0 and speed==0:
        # 因为之前已经确定始末两点一定在港，所以可直接运算
        d = Distance2(clean_data3.iloc[idx-1]['longitude'],clean_data3.iloc[idx-1]['latitude'],clean_data3.iloc[idx+1]['longitude'], clean_data3.iloc[idx+1]['latitude'])
        speed = d/(clean_data3.iloc[idx-1]['timestamp']-clean_data3.iloc[idx+1]['timestamp']).total_seconds()
        fill_speed.append([idx,speed])


2927208it [04:34, 10656.89it/s]


In [36]:
fill_speed = np.array(fill_speed)
fill_speed.shape

(358073, 2)

In [37]:
clean_data3.loc[fill_speed[:,0],'speed'] = fill_speed[:,1]

In [38]:
clean_data3

Unnamed: 0,loadingOrder,carrierName,timestamp,longitude,latitude,vesselMMSI,speed,direction,vesselNextport,vesselNextportETA,vesselStatus,vesselDatasource,TRANSPORT_TRACE,diff_dist,diff_sec,nearest_port_dist,nearest_port,is_at_port
0,AA191175561416,OIEQNT,2019-01-28 16:12:59+00:00,114.260392,22.571047,Y7540547327,0.0,12670,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,,,1.706335,CNYTN,1
1,AA191175561416,OIEQNT,2019-01-28 16:22:38+00:00,114.260438,22.571125,Y7540547327,0.0,14790,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.009887,579.0,1.698316,CNYTN,1
2,AA191175561416,OIEQNT,2019-01-28 16:30:55+00:00,114.260693,22.571567,Y7540547327,0.0,21510,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.055750,497.0,1.653788,CNYTN,1
3,AA191175561416,OIEQNT,2019-01-28 16:37:35+00:00,114.260392,22.571463,Y7540547327,0.0,19900,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.033036,400.0,1.686749,CNYTN,1
4,AA191175561416,OIEQNT,2019-01-28 16:45:56+00:00,114.260647,22.571510,Y7540547327,0.0,21360,HK HKG >CN YTN,2019-01-28T05:30:00.000Z,moored,Coastal AIS,CNYTN-MXZLO,0.026729,501.0,1.660718,CNYTN,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2927203,ZY989876486292,OIEQNT,2019-04-26 19:05:20+00:00,-104.322813,19.066443,D7828192902,11.0,11170,,,,Satellite,CNYTN-MXZLO,0.663915,210.0,2.830415,MXZLO,1
2927204,ZY989876486292,OIEQNT,2019-04-26 19:11:00+00:00,-104.311943,19.061608,D7828192902,15.0,12500,,,,Coastal AIS,CNYTN-MXZLO,1.263997,340.0,2.792622,MXZLO,1
2927205,ZY989876486292,OIEQNT,2019-04-26 19:12:49+00:00,-104.308012,19.060088,D7828192902,14.0,9650,MANZANILLO,2019-04-26T06:00:00.000Z,under way using engine,Satellite,CNYTN-MXZLO,0.446877,109.0,2.891596,MXZLO,1
2927206,ZY989876486292,OIEQNT,2019-04-26 19:36:40+00:00,-104.296138,19.058382,D7828192902,1.0,20150,,,,Coastal AIS,CNYTN-MXZLO,1.263700,1431.0,3.226503,MXZLO,1


————————中途存储/读取区域————————

In [39]:
%time clean_data3.to_pickle('data/clean_data3.pkl')

Wall time: 1.48 s
