# 导入基本库

In [118]:
# 导入需要的库
# 1.基本分析工具
import pandas as pd
import numpy as np
# 2.时间处理
import time
from utm import *
from tqdm import tqdm, tqdm_pandas
# 3.地理坐标处理
from osgeo import osr
# 4.系统文件处理
import os


In [119]:
# Set the analysis time window & load the raw data

# Analysis window, written as wall-clock time in the data's time zone.
time1 = "2016 11 01 08:00:00"
time2 = "2016 11 01 10:00:00"

# The window is converted to Unix seconds with an explicit time zone so the
# result does not depend on the locale of the machine running the notebook.
# NOTE(review): the data is treated as UTC+8 (Asia/Shanghai) — confirm.
TIME_FORMAT = "%Y %m %d %H:%M:%S"
TIME_ZONE = "Asia/Shanghai"

stamp1 = pd.to_datetime(time1, format = TIME_FORMAT).tz_localize(TIME_ZONE).timestamp()
stamp2 = pd.to_datetime(time2, format = TIME_FORMAT).tz_localize(TIME_ZONE).timestamp()

# Load the raw GPS records (the file has no header row).
# Raw string: the backslashes of the Windows path must not be read as escapes.
df = pd.read_csv(r'D:\jupyter\gps_20161101.csv', header = None)
df.columns = ['driver_ID', 'order_ID', 'timestamp', 'lon', 'lat']






# 空间坐标系转换

In [120]:
print([stamp1, stamp2])
# Coordinate conversion
# Keep only the records whose Unix timestamp lies inside [stamp1, stamp2].
df = df[(df['timestamp'] >= stamp1) & (df['timestamp'] <= stamp2)].reset_index(drop = True)

# Define the coordinate transformation
# 1. Source: WGS-84 geographic coordinates
wgs84 = osr.SpatialReference()
wgs84.ImportFromEPSG(4326)
# 2. Target: Pseudo-Mercator projected coordinates
inp = osr.SpatialReference()
inp.ImportFromEPSG(3857)
# GDAL >= 3 follows the authority axis order (latitude first for EPSG:4326).
# The points below are passed as (lon, lat), so request the traditional
# x/y order where the bindings support it; older GDAL already behaves so.
if hasattr(osr, 'OAMS_TRADITIONAL_GIS_ORDER'):
    wgs84.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)
    inp.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)
# 3. Mapping from the source to the target reference system
transformation = osr.CoordinateTransformation(wgs84, inp)
# 4. Project every point. TransformPoint returns (x, y, z); only (x, y) is kept.
#    The result is an object Series of tuples sharing df's index.
xy = pd.Series(
    [transformation.TransformPoint(lon, lat)[:2] for lon, lat in zip(df['lon'], df['lat'])],
    index = df.index,
    dtype = object,
)



[1477958400.0, 1477965600.0]


In [121]:
# xy is a pandas Series (object dtype), not a list; each element is an
# (x, y) tuple holding the projected coordinates of one GPS record
xy

0           (11591636.237388736, 3588561.948596014)
1           (11591588.370007694, 3588537.360956222)
2           (11591543.842211375, 3588514.067447319)
3          (11591498.201220153, 3588494.6562230354)
4          (11591454.786618741, 3588479.1272652997)
                             ...                   
3726185     (11585336.667404745, 3587926.567733475)
3726186     (11585317.74309131, 3587905.8634010805)
3726187      (11585313.29031168, 3587907.157420853)
3726188     (11585273.21529499, 3587896.8052664325)
3726189     (11585260.970151003, 3587876.100983296)
Length: 3726190, dtype: object

In [122]:
# Store the projected coordinates in the frame: element 0 of every tuple
# overwrites 'lon', element 1 overwrites 'lat'.
for axis, column in enumerate(('lon', 'lat')):
    df[column] = [point[axis] for point in xy]
# The two columns now hold projected values, so relabel them as x / y.
df.columns = ['driver_ID', 'order_ID', 'timestamp', 'x', 'y']

# 时空单元划分

## 所谓‘划分’，就是先定好数据间距，再根据所定间距将时间和空间数据网格化（以网格索引代表所在位置）。
- 时间数据：1.确定时间窗长度 2.生成时间窗索引
- 空间数据：1.确定左边界和上边界 2.确定网格的长和宽（此处设为长宽相同）3.生成横向和纵向索引


In [123]:
# Time-window partition
time_inteval = 600  # window length, in the same unit as the timestamps (seconds)
# Window index = number of whole windows elapsed since the start of the period.
# Vectorised floor division replaces a per-row lambda and gives the same floats.
df['time_ID'] = (df['timestamp'] - stamp1) // time_inteval

# Spatial grid partition
# 1. Grid origin: left edge (min x) and top edge (max y); x runs left-right, y up-down
left = df['x'].min()
up = df['y'].max()
# 2. Side length of a (square) grid cell, in projected coordinate units
space_inteval = 70
# 3. Row index counts cells downward from the top edge,
#    column index counts cells rightward from the left edge
df['row_id'] = (up - df['y']) // space_inteval
df['col_id'] = (df['x'] - left) // space_inteval

In [124]:
# Display the frame, which now carries the time_ID / row_id / col_id columns
df

Unnamed: 0,driver_ID,order_ID,timestamp,x,y,time_ID,row_id,col_id
0,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960620,1.159164e+07,3.588562e+06,3.0,126.0,138.0
1,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960623,1.159159e+07,3.588537e+06,3.0,127.0,138.0
2,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960626,1.159154e+07,3.588514e+06,3.0,127.0,137.0
3,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960629,1.159150e+07,3.588495e+06,3.0,127.0,136.0
4,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960632,1.159145e+07,3.588479e+06,3.0,128.0,136.0
...,...,...,...,...,...,...,...,...
3726185,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963493,1.158534e+07,3.587927e+06,8.0,135.0,48.0
3726186,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963496,1.158532e+07,3.587906e+06,8.0,136.0,48.0
3726187,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963499,1.158531e+07,3.587907e+06,8.0,136.0,48.0
3726188,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963502,1.158527e+07,3.587897e+06,8.0,136.0,47.0


# 时空特征提取 extract feature
## 即根据原始数据合理计算出更有意义的二级数据
## 先计算**个体特征**，再对这些特征做出网格交通流（整体）的**统计分析**
我们关注的个体特征就是速度和加速度，而可以统计分析的有网格平均车速，网格平均加速度，网格速度标准差，网格浮动车流量，网格平均停车次数
-------


## 下面是提取一次特征的一种标准步骤：
### 1. 按一定顺序排序 df.sort_values(by = []).reset_index(drop = True)
### 2. 用shift创建新列： `df['<column>'].shift(<offset>)`
### 3. 删除‘交界点’所在的行，用boolean实现：df = df[df['column1'] == df['column2']]
### 4. 用向量化的函数批量计算出所要的个体特征，存在另一个新列中 df['new_column'] = df['column1'] + df['column2']
### 5. 将当前dataframe赋值给新表，并在新表中删除不需要的列 df.drop(columns = ['column1','column2'])
### 6. 有些数据可能有限制范围（比如时间），也用boolean提取 df = df[(df.time >= t1) & (df.time <= t2)]

In [125]:
# 1. Instantaneous speed
# Sort by driver, then order, then timestamp, so that consecutive rows of the
# same order are consecutive track points in time.
df = df.sort_values(by = ['driver_ID', 'order_ID', 'timestamp']).reset_index(drop = True)

# Shift the order id down one row to tell whether a row and its predecessor
# belong to the same order.
df['orderFlag'] = df['order_ID'].shift(1)
# Boolean marker; the boundary rows are dropped only after all shifts are done.
df['identi'] = (df['orderFlag'] == df['order_ID'])

# Shift coordinates and timestamp down one row so every row carries the
# previous track point next to its own.
df['x1'] = df['x'].shift(1)
df['y1'] = df['y'].shift(1)
df['timestamp1'] = df['timestamp'].shift(1)

# Drop rows whose predecessor belongs to a different order (this also drops
# the very first row). Copy so the column assignment below writes to an
# independent frame instead of a slice of the old one.
df = df[df['identi']].copy()

# Distance and elapsed time between consecutive track points
dist = np.sqrt(np.square(df['x'].values - df['x1'].values) + np.square(df['y'].values - df['y1'].values))
# Named `dt`, not `time`, so the imported `time` module is not shadowed.
dt = df['timestamp'].values - df['timestamp1'].values

# Speed = distance / elapsed time
df['speed'] = dist / dt

# Remove the helper columns
df = df.drop(columns = ['x1', 'y1', 'orderFlag', 'timestamp1', 'identi'])



In [126]:
# 2. Instantaneous acceleration
# Bring the previous row's speed, timestamp and order id next to each row.
df['speed1'] = df['speed'].shift(1)
df['timestamp1'] = df['timestamp'].shift(1)
df['identi'] = df['order_ID'].shift(1)

# Keep only rows whose predecessor belongs to the same order. Copy so the
# assignment below writes to an independent frame rather than a slice.
df = df[df.identi == df.order_ID].copy()

# Acceleration = change in speed / elapsed time between the two points
df['acc'] = (df.speed - df.speed1)/(df.timestamp - df.timestamp1)

# Remove the helper columns
df = df.drop(columns = ['speed1', 'timestamp1', 'identi'])

In [127]:
# Renumber the rows from 0 (the filters above left gaps in the index) and
# discard the old index, then display the frame with speed and acc columns
df = df.reset_index(drop = True)
df

Unnamed: 0,driver_ID,order_ID,timestamp,x,y,time_ID,row_id,col_id,speed,acc
0,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961134,1.158771e+07,3.595125e+06,4.0,33.0,82.0,5.353831,0.634832
1,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961137,1.158773e+07,3.595116e+06,4.0,33.0,83.0,8.012669,0.886279
2,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961140,1.158776e+07,3.595105e+06,4.0,33.0,83.0,8.185162,0.057498
3,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961143,1.158778e+07,3.595096e+06,4.0,33.0,83.0,7.330652,-0.284837
4,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961146,1.158780e+07,3.595085e+06,4.0,33.0,83.0,8.049442,0.239597
...,...,...,...,...,...,...,...,...,...,...
3684451,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963740,1.158738e+07,3.590010e+06,8.0,106.0,77.0,11.892815,0.087028
3684452,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963743,1.158736e+07,3.590020e+06,8.0,106.0,77.0,5.931354,-1.987154
3684453,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963746,1.158735e+07,3.590030e+06,8.0,105.0,77.0,5.691153,-0.080067
3684454,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963749,1.158735e+07,3.590030e+06,8.0,105.0,77.0,0.000000,-1.897051


In [128]:
# Aggregate the individual features into per-grid-cell statistics

# 1. Grid mean speed: average each order's speed inside a cell first, then
#    average those per-order means, so every order weighs equally.
cell_keys = ['row_id', 'col_id', 'time_ID']

# Group by space-time cell plus trajectory (order) id
orderGrouped = df.groupby(cell_keys + ['order_ID'])

# Mean speed of every order within every cell, back as a flat table
per_order_speed = orderGrouped['speed'].mean().reset_index()

# Regroup by cell only and average the per-order means
grouped_speed = per_order_speed.groupby(cell_keys)
raw_grid_speed = grouped_speed['speed'].mean()

# Limit extreme values by clipping to the 5 % - 95 % quantile range
lower_bound = raw_grid_speed.quantile(0.05)
upper_bound = raw_grid_speed.quantile(0.95)
grid_speed = raw_grid_speed.clip(lower_bound, upper_bound)

In [129]:
# 2. Grid mean acceleration
# Group by space-time cell only. `orderGrouped` (the per-order grouping) is
# deliberately left untouched: rebinding it here would make the later volume
# computation see one row per cell and report a count of 1 everywhere.
gridGrouped = df.groupby(['row_id', 'col_id', 'time_ID'])
grid_acc = gridGrouped.acc.mean()

In [130]:
# 3. Floating-car volume per grid cell
# Volume = number of distinct orders seen in a (row, col, time window) cell.
# The per-order grouping is built here rather than taken from `orderGrouped`,
# so the count does not depend on what other cells bound to that name.
grid_keys = ['row_id', 'col_id', 'time_ID']
# One row per (cell, order); the speed value is only a placeholder column
grouped_volume = df.groupby(grid_keys + ['order_ID']).speed.last().reset_index()
# Rows per cell == number of orders in that cell
grouped_volume = grouped_volume.groupby(grid_keys)
grid_volume = grouped_volume['speed'].size()
# Limit extreme values by clipping to the 5 % - 95 % quantile range
grid_volume = grid_volume.clip(grid_volume.quantile(0.05), grid_volume.quantile(0.95))

In [132]:
# Display the per-cell volume, indexed by (row_id, col_id, time_ID)
grid_volume

row_id  col_id  time_ID
0.0     7.0     6.0        1
                7.0        1
                9.0        1
                10.0       1
        8.0     0.0        1
                          ..
138.0   16.0    7.0        1
                8.0        1
                9.0        1
                10.0       1
                11.0       1
Name: speed, Length: 80345, dtype: int64

In [133]:
# 4. Standard deviation of speed per grid cell
# Computed over all track points of a cell; pandas uses the sample standard
# deviation, so a cell holding a single point yields NaN.
grid_v_std = gridGrouped['speed'].std()

In [134]:
# Display the per-cell speed standard deviation
grid_v_std

row_id  col_id  time_ID
0.0     7.0     6.0             NaN
                7.0             NaN
                9.0        0.305236
                10.0            NaN
        8.0     0.0        5.101212
                             ...   
138.0   16.0    7.0        3.141760
                8.0             NaN
                9.0        1.772664
                10.0            NaN
                11.0       5.007004
Name: speed, Length: 80345, dtype: float64

In [None]:
# 5. Average number of stops per grid cell
# Count the track points with zero speed in every (row, col, time window) cell.
stopNum = gridGrouped.speed.agg(lambda x: (x==0).sum())
# Normalise by the floating-car volume of the same cell; both Series are
# indexed by (row_id, col_id, time_ID), so the division aligns cell by cell.
# NOTE(review): "average stops" is taken to mean stops per floating car — confirm.
grid_stop = (stopNum / grid_volume).rename('stopNum')