In [None]:
'''
author: 叶文涛
create time: 2020-6-16
update time: 2020-7-24
'''

In [None]:
import pandas as pd
import numpy as np
import os,gc,time,re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm

In [None]:
# Load the cleaned training data from a local pickle file; %time reports how long the read takes.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files produced locally.
# Earlier cache variants, kept for reference:
# %time train_data.to_pickle('data/train_data_deal.pkl')
# %time train_data = pd.read_pickle('data/train_data_deal.pkl') # load
%time train_data = pd.read_pickle('data/clean_data1.pkl') # load

In [None]:
# Load the loading-order events, the port information, and the test data.
# NOTE(review): nrows=158341 caps the event file at a hard-coded row count; the reason is not
# visible in this notebook -- confirm it is still the intended limit.
loadingOrderEvent_df = pd.read_csv('data/loadingOrderEvent.csv',nrows=158341)
port_df = pd.read_csv('data/port.csv')
test_data = pd.read_csv('data/R2 ATest 0711.csv')

In [None]:
# Show the training frame as this cell's output.
train_data

In [None]:
# Plot sample loading-order tracks on a map (originally 50 random orders; currently one fixed order).
'''
从选取的样例中可以看到几种数据比较异常的情况：
1 - gps记录的结尾不在港口 （但是途中经过了其他港口） NP465391960396 GG443276479484
2 - 数据间隔极大 EN567191288956 HA533353890403
3 - 数据漂移 CQ194759551382 下图虽然坐标漂移很大但是时间上似乎问题不大 HA533353890403
4 - 行驶过程中速度或方向缺失(0或-1) P683337605409
5 - 超短运单 LY549146723865
6 - 起点不在港口 （也许不算脏？） YS759678876105
'''
# English summary of the author's note above -- anomalies seen in the sampled orders:
#   1. GPS record does not end at a port (but passes other ports on the way): NP465391960396, GG443276479484
#   2. Very large gaps between records: EN567191288956, HA533353890403
#   3. Position drift: CQ194759551382 (large coordinate drift, timing looks acceptable), HA533353890403
#   4. Speed or direction missing while sailing (0 or -1): P683337605409
#   5. Extremely short order: LY549146723865
#   6. Start point not at a port (possibly not dirty data): YS759678876105
# line_map_data = train_data[train_data.loadingOrder.isin(np.random.permutation(train_data.loadingOrder.unique())[:50])].reset_index(drop=True)
# Filter first, then reset the index of the filtered frame. Resetting the index of the boolean
# mask instead (as before) misaligns it with train_data unless train_data has a default
# RangeIndex, and leaves the result with its old index.
line_map_data = train_data[train_data.loadingOrder.isin(['HR904399891733'])].reset_index(drop=True)
fig = px.line_mapbox(line_map_data, lat='latitude',lon='longitude',
                     hover_data=['loadingOrder','timestamp','speed','direction', 'TRANSPORT_TRACE', 'vesselStatus'],line_group='loadingOrder',color='loadingOrder')
# NOTE(review): recent plotly releases may require a Stadia Maps token for "stamen-*" styles -- confirm the map still renders.
fig.update_layout(mapbox_style="stamen-terrain")
fig.show()



In [None]:
# Randomly pick 50 loading orders among those that have a route (non-null TRANSPORT_TRACE) and map them.
'''
有路由的运单的异常数据比例并没有显著变好
'''
# English summary of the author's note above: orders that have a route do not show a
# noticeably lower share of anomalous data.
has_route = train_data.TRANSPORT_TRACE.notnull()
routed_orders = train_data[has_route].loadingOrder.unique()
sampled_orders = np.random.permutation(routed_orders)[:50]
line_map_data = train_data[train_data.loadingOrder.isin(sampled_orders)].reset_index(drop=True)
fig = px.line_mapbox(
    line_map_data,
    lat='latitude',
    lon='longitude',
    hover_data=['loadingOrder','timestamp','speed','direction'],
    line_group='loadingOrder',
    color='loadingOrder',
)
fig.update_layout(mapbox_style="stamen-terrain")
fig.show()



In [None]:
# Precise distance between two points
def Distance(Lat_A,Lng_A,Lat_B,Lng_B):
    """Distance in kilometres between two latitude/longitude points on a flattened Earth.

    The points are first mapped to reduced latitudes (using the polar/equatorial radius
    ratio), the central angle between them is taken on the sphere, and a first-order
    flattening correction ``dr`` is added before scaling by the equatorial radius.

    Parameters
    ----------
    Lat_A, Lng_A : float or array-like
        Latitude and longitude of the first point, in degrees.
    Lat_B, Lng_B : float or array-like
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float or array-like
        Distance in kilometres, broadcast over the inputs. Coincident points give 0
        (previously NaN, from a 0/0 in the correction term).
    """
    ra = 6378.140  # equatorial radius (km)
    rb = 6356.755  # polar radius (km)
    flatten = (ra - rb) / ra  # flattening of the Earth
    rad_lat_A = np.radians(Lat_A)
    rad_lng_A = np.radians(Lng_A)
    rad_lat_B = np.radians(Lat_B)
    rad_lng_B = np.radians(Lng_B)
    # Reduced latitudes.
    pA = np.arctan(rb / ra * np.tan(rad_lat_A))
    pB = np.arctan(rb / ra * np.tan(rad_lat_B))
    cos_xx = np.sin(pA) * np.sin(pB) + np.cos(pA) * np.cos(pB) * np.cos(rad_lng_A - rad_lng_B)
    # Rounding can push the cosine slightly outside [-1, 1], which would make arccos return NaN.
    xx = np.arccos(np.minimum(1.0, np.maximum(-1.0, cos_xx)))
    c1 = (np.sin(xx) - xx) * (np.sin(pA) + np.sin(pB)) ** 2 / np.cos(xx / 2) ** 2
    # For coincident points xx == 0, so both numerator and denominator of c2 are 0.
    # Substitute a denominator of 1 there so the term evaluates to 0 instead of NaN.
    sin_half_sq = np.sin(xx / 2) ** 2
    c2 = (np.sin(xx) + xx) * (np.sin(pA) - np.sin(pB)) ** 2 / np.where(sin_half_sq == 0, 1.0, sin_half_sq)
    dr = flatten / 8 * (c1 - c2)
    return ra * (xx + dr)

# Great-circle distance between two points on a spherical Earth model (arc length, not a straight chord).
def distance(LatA,LatB,LonA,LonB):
    """Haversine distance in metres between two points given in degrees.

    Note the argument order: both latitudes come first, then both longitudes.

    Parameters
    ----------
    LatA, LatB : float or array-like
        Latitudes of the first and second point, in degrees.
    LonA, LonB : float or array-like
        Longitudes of the first and second point, in degrees.

    Returns
    -------
    float or array-like
        Distance in metres. The value is rounded to two decimals while still in
        kilometres, so the result has a 10 m resolution.
    """
    EARTH_RADIUS = 6378.137  # kilometres

    # Degrees to radians.
    lat_a = LatA * np.pi / 180.0
    lat_b = LatB * np.pi / 180.0
    lon_a = LonA * np.pi / 180.0
    lon_b = LonB * np.pi / 180.0

    delta_lat = lat_a - lat_b
    delta_lon = lon_a - lon_b
    haversine = np.sin(delta_lat / 2) ** 2 + np.cos(lat_a) * np.cos(lat_b) * np.sin(delta_lon / 2) ** 2
    arc_km = 2 * np.arcsin(np.sqrt(haversine)) * EARTH_RADIUS
    # Keep two decimals in kilometres, then convert to metres.
    return np.round(arc_km * 100) / 100 * 1000

# Build group_df: train_data plus, per loading order, the time difference (seconds) and the
# distance difference (metres, from distance()) between each record and the previous one.
# Columns cannot be assigned onto a GroupBy object, so the per-order shifts are computed from
# the GroupBy and attached to a new frame. Both come back aligned to train_data's index.
# NOTE(review): the differences follow the existing row order inside each order; this assumes
# train_data is already sorted by timestamp within loadingOrder -- confirm.
orders = train_data.groupby('loadingOrder')
prev_timestamp = orders['timestamp'].shift(1)
prev_latitude = orders['latitude'].shift(1)
prev_longitude = orders['longitude'].shift(1)
group_df = train_data.assign(
    diff_sec=(train_data['timestamp'] - prev_timestamp).dt.total_seconds(),
    # distance() takes both latitudes first, then both longitudes; keywords make that explicit.
    diff_dist=distance(
        LatA=train_data['latitude'],
        LatB=prev_latitude,
        LonA=train_data['longitude'],
        LonB=prev_longitude,
    ),
)
# The following cells read these columns from `tmp_data`, which is not defined anywhere else
# in the visible notebook; bind it here so the notebook runs top to bottom.
tmp_data = group_df

In [None]:
# Density (KDE) of the number of records per loading order.
loadingOrder_counts = tmp_data['loadingOrder'].value_counts(sort=False).sort_index()
loadingOrder_counts.plot.kde()

In [None]:
# Density (KDE) of the largest time gap per loading order.
# Only values below the 0.9 quantile are plotted, to drop the extremes.
loadingOrder_max_diff_sec = tmp_data.groupby('loadingOrder')['diff_sec'].max()
loadingOrder_max_diff_sec.loc[lambda gaps: gaps < gaps.quantile(0.9)].plot.kde()

In [None]:
# Density (KDE) of the largest distance step per loading order.
loadingOrder_max_diff_dist = tmp_data.groupby('loadingOrder')['diff_dist'].max()
loadingOrder_max_diff_dist.plot.kde()

In [None]:
# Density (KDE) of the maximum recorded speed per loading order.
loadingOrder_max_speed = tmp_data.groupby('loadingOrder')['speed'].max()
loadingOrder_max_speed.plot.kde()

In [None]:
# Density (KDE) of (largest distance step) / (largest time gap in hours) per loading order,
# i.e. a distance-per-hour ratio built from the two per-order maxima.
loadingOrder_max_diff_dist.div(loadingOrder_max_diff_sec.div(3600)).plot.kde()

In [None]:
# Scatter of each order's largest time gap (x) against its largest distance step (y).
fig = px.scatter(x=loadingOrder_max_diff_sec, y=loadingOrder_max_diff_dist)
fig.show()

In [None]:
# Scatter of (max speed * largest time gap in hours) on x against the largest distance step on y.
# NOTE(review): x is in speed-units * hours while y comes from distance() (metres) -- confirm
# the units of `speed` before comparing the two axes directly.
fig = px.scatter(x=(loadingOrder_max_speed*loadingOrder_max_diff_sec/3600), y=loadingOrder_max_diff_dist)
fig.show()

In [None]:
# Scatter of each order's max recorded speed (x) against the distance-per-hour ratio
# (largest distance step / largest time gap, scaled from per-second to per-hour) on y.
# NOTE(review): y is in metres per hour if diff_dist is in metres -- confirm the units of `speed`.
fig = px.scatter(x=loadingOrder_max_speed, y=loadingOrder_max_diff_dist/loadingOrder_max_diff_sec*3600)
fig.show()

In [None]:
# Select loading orders whose largest time gap is below the 0.9 quantile, whose largest
# distance step is below the 0.1 quantile, and whose record count is above the 0.05 quantile.
sec_cutoff = loadingOrder_max_diff_sec.quantile(0.9)
dist_cutoff = loadingOrder_max_diff_dist.quantile(0.1)
count_cutoff = loadingOrder_counts.quantile(0.05)
keep_mask = (
    (loadingOrder_max_diff_sec < sec_cutoff)
    & (loadingOrder_max_diff_dist < dist_cutoff)
    & (loadingOrder_counts > count_cutoff)
)
fit_loadingOrders = loadingOrder_max_diff_dist.index[keep_mask]
print(fit_loadingOrders.shape)
print(sec_cutoff, dist_cutoff, count_cutoff)

In [None]:
# Draw up to 50 random orders from the filtered set and collect their rows for plotting.
sampled_fit_orders = np.random.permutation(fit_loadingOrders)[:50]
line_map_data = train_data[train_data.loadingOrder.isin(sampled_fit_orders)].reset_index(drop=True)

In [None]:

# Port locations on the map: start from the full port table.
scatter_map_data = port_df
# Missing-value check per column.
'''
港口数据中只有(TRANS_NODE_NAME,LONGITUDE,LATITUDE,COUNTRY)是没有缺失值的,对于小于国家的地区的描述,
缺失情况(REGION<STATE<CITY),PORT_CODE是几乎全部缺失,TRANSPORT_NODE_ID是个int值,没什么用.
'''
# English summary of the author's note above: only TRANS_NODE_NAME, LONGITUDE, LATITUDE and
# COUNTRY have no missing values; for the sub-country fields the amount of missing data ranks
# REGION < STATE < CITY; PORT_CODE is almost entirely missing; TRANSPORT_NODE_ID is an integer
# id that the author considers not useful.
port_df.isna().sum()

In [None]:
# Port scatter map. The active selection is the full port table; the commented-out lines are
# alternative selections (rows with a non-null CITY and a 5-capital-letter node name, specific
# node names, or a single country).
scatter_map_data = port_df
# scatter_map_data = port_df[(~port_df.CITY.isnull())&(port_df.TRANS_NODE_NAME.str.match('[A-Z]{5}')&(port_df.TRANS_NODE_NAME.str.len()==5))]
# scatter_map_data = port_df[port_df['TRANS_NODE_NAME'].isin(['MYTPP', 'CNYTN', 'CNSHK'])]
# scatter_map_data = port_df[port_df['COUNTRY'] == 'Hungary']
# scatter_map_data = port_df[port_df['TRANS_NODE_NAME'].isin(['SIKOP'])]
# Author's observation from the map: many ports sit very close to one another.
gc.collect()
port_hover_columns = ['TRANS_NODE_NAME','COUNTRY','CITY','LATITUDE','LONGITUDE']
fig = px.scatter_mapbox(
    scatter_map_data,
    lat='LATITUDE',
    lon='LONGITUDE',
    hover_name='TRANS_NODE_NAME',
    hover_data=port_hover_columns,
)
fig.update_layout(mapbox_style="stamen-terrain")
fig.show()
print(scatter_map_data.shape)

In [None]:
# Map of 10 randomly chosen loading orders from the test data (the author notes it looks fairly clean).
test_order_ids = test_data.loadingOrder.unique()
sampled_test_orders = np.random.permutation(test_order_ids)[:10]
line_map_data = test_data[test_data.loadingOrder.isin(sampled_test_orders)].reset_index(drop=True)
gc.collect()
fig = px.line_mapbox(
    line_map_data,
    lat='latitude',
    lon='longitude',
    hover_data=['loadingOrder','timestamp','speed','direction','TRANSPORT_TRACE'],
    line_group='loadingOrder',
    color='loadingOrder',
)
fig.update_layout(mapbox_style="stamen-terrain")
fig.show()
print(line_map_data.shape)

In [None]:
# Distinct TRANSPORT_TRACE values that occur in the test data, shown as this cell's output.
set(test_data['TRANSPORT_TRACE'])