In [1]:
'''
此代码目前仅可分析某一天（单文件）的某个时间段（预定的时间区间）的订单和位置数据
'''

import pandas as pd
import numpy as np
import time
import os
from utm import *
from tqdm import tqdm, tqdm_pandas
from osgeo import osr

In [29]:
# File locations: the raw GPS trace CSV that is read, and the feature CSV that
# is written at the end of the notebook.
feature_file_name = '1101 8-10am 21-10-30'
csv_path = 'F:/大学/第40期PRP/交通订单数据/traffic_data/gps_20161101.csv'
feature_dst_path = ''.join(('F:/大学/第40期PRP/特征提取/', feature_file_name, '.csv'))

In [3]:
# Discretisation parameters.
# time_interval: length of one time window, in seconds.
# space_interval: side length of one square grid cell. It is applied to the
#   projected x/y columns further down, so it is expressed in the units of that
#   projection rather than in WGS84 degrees.
time_interval = 10 * 60
space_interval = 70

In [4]:
# Choose the analysis window and load the raw GPS records.
# Restricting to a time range keeps the amount of data handled per run small.
time1 = '2016 11 01 08:00:00'
time2 = '2016 11 01 10:00:00'

# Convert the wall-clock bounds to Unix timestamps (float seconds) in an
# explicit time zone, so the selected window does not depend on the time zone
# of the machine running the notebook.
# NOTE(review): the zone is assumed to be UTC+8; this matches the time_ID
# values in the saved outputs, but confirm against the data source.
DATA_TZ = 'Asia/Shanghai'
_window_fmt = '%Y %m %d %H:%M:%S'
stamp1 = pd.to_datetime(time1, format=_window_fmt).tz_localize(DATA_TZ).timestamp()
stamp2 = pd.to_datetime(time2, format=_window_fmt).tz_localize(DATA_TZ).timestamp()

# Read the raw trace file. It has no header row, so column names are supplied
# here. (The original author notes the path points at an external drive.)
df = pd.read_csv(
    csv_path,
    header=None,
    names=['driver_ID', 'order_ID', 'timestamp', 'lon', 'lat'],
)

In [5]:
# Keep only records inside the analysis window (both bounds inclusive), then
# project lon/lat to planar coordinates.
in_window = df['timestamp'].between(stamp1, stamp2)
df = df[in_window].reset_index(drop=True)

# Coordinate reference systems.
# 1. Source: WGS-84 geographic coordinates.
wgs84 = osr.SpatialReference()
wgs84.ImportFromEPSG(4326)
# 2. Target: Pseudo-Mercator.
inp = osr.SpatialReference()
inp.ImportFromEPSG(3857)
# GDAL >= 3 follows the authority axis order (latitude first for EPSG:4326)
# unless told otherwise. Force the traditional x=lon, y=lat order so that
# TransformPoint(lon, lat) means the same thing on every GDAL version; older
# releases do not have this attribute and already use lon/lat order.
if hasattr(osr, 'OAMS_TRADITIONAL_GIS_ORDER'):
    wgs84.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)
    inp.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)
# 3. The mapping between the two systems.
transformation = osr.CoordinateTransformation(wgs84, inp)
# 4. Project every record. TransformPoint returns (x, y, z); only (x, y) is
#    kept. The result is a Series of (x, y) tuples aligned with df's index.
#    Iterating the two columns directly avoids a slow row-wise DataFrame.apply
#    and positional indexing into a label-indexed row.
xy = pd.Series(
    [
        transformation.TransformPoint(lon, lat)[:2]
        for lon, lat in zip(df['lon'], df['lat'])
    ],
    index=df.index,
    dtype=object,
)


In [6]:
# Overwrite the geographic columns with the projected coordinates, rename them
# to x / y, and display the resulting frame.
df['lon'] = [px for px, _ in xy]
df['lat'] = [py for _, py in xy]
df = df.rename(columns={'lon': 'x', 'lat': 'y'})
df

Unnamed: 0,driver_ID,order_ID,timestamp,x,y
0,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960620,1.159164e+07,3.588562e+06
1,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960623,1.159159e+07,3.588537e+06
2,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960626,1.159154e+07,3.588514e+06
3,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960629,1.159150e+07,3.588495e+06
4,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960632,1.159145e+07,3.588479e+06
...,...,...,...,...,...
3726185,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963493,1.158534e+07,3.587927e+06
3726186,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963496,1.158532e+07,3.587906e+06
3726187,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963499,1.158531e+07,3.587907e+06
3726188,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963502,1.158527e+07,3.587897e+06


In [7]:
# Space-time cell partitioning
# "Partitioning" means fixing a spacing first and then discretising the time
# and space values onto a grid with that spacing (a record's position is then
# represented by its grid indices).

# Time:  1. choose the time-window length  2. generate the time-window index
# Space: 1. find the left and upper boundaries
#        2. choose the cell width and height (set equal here)
#        3. generate the horizontal and vertical indices


In [8]:
# Assign every record to a time window and to a spatial grid cell.
# All three indices are computed with vectorised floor division; they come out
# as floats (e.g. 3.0) because the operands are floats.

# Time window index: number of whole windows elapsed since the window start.
df['time_ID'] = (df['timestamp'] - stamp1) // time_interval

# Spatial grid, anchored at the top-left corner of the data's bounding box.
# 1. Left and upper boundaries (left/right follows x, up/down follows y).
left = df['x'].min()
up = df['y'].max()
# 2. Row index grows downwards from the upper boundary; column index grows
#    rightwards from the left boundary.
df['row_id'] = (up - df['y']) // space_interval
df['col_id'] = (df['x'] - left) // space_interval

In [9]:
# Display the frame with the time_ID / row_id / col_id columns added above.
df

Unnamed: 0,driver_ID,order_ID,timestamp,x,y,time_ID,row_id,col_id
0,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960620,1.159164e+07,3.588562e+06,3.0,126.0,138.0
1,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960623,1.159159e+07,3.588537e+06,3.0,127.0,138.0
2,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960626,1.159154e+07,3.588514e+06,3.0,127.0,137.0
3,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960629,1.159150e+07,3.588495e+06,3.0,127.0,136.0
4,potvwmdihbvxqfamCqlksxljdb69tixp,lpqmkhaf9aHtooanvwsjzsjn8l95Eltk,1477960632,1.159145e+07,3.588479e+06,3.0,128.0,136.0
...,...,...,...,...,...,...,...,...
3726185,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963493,1.158534e+07,3.587927e+06,8.0,135.0,48.0
3726186,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963496,1.158532e+07,3.587906e+06,8.0,136.0,48.0
3726187,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963499,1.158531e+07,3.587907e+06,8.0,136.0,48.0
3726188,lhwAvh8ne2Csyd_qyrtoulmlbm6aCizd,sbyupnkfc@DtxabqqBleruldjnh1swnb,1477963502,1.158527e+07,3.587897e+06,8.0,136.0,47.0


In [10]:
# Spatio-temporal feature extraction starts here.

# 1. Instantaneous speed between consecutive trajectory points of one order.
# NOTE(review): x / y come from the Pseudo-Mercator projection set up earlier,
# where planar distances are stretched by roughly 1/cos(latitude); confirm
# whether a local metric projection was intended for ground speeds.

# Sort by driver, then order, then time, so that neighbouring rows are
# neighbouring points of the same trajectory.
df = df.sort_values(by=['driver_ID', 'order_ID', 'timestamp']).reset_index(drop=True)

# A row can be paired with its predecessor only if both belong to the same
# order. The first row of each order has no valid predecessor and is dropped.
same_order = df['order_ID'].shift(1) == df['order_ID']

# Differences to the previous row, computed on the full sorted frame so that
# they stay aligned with it; rows crossing an order boundary are masked out.
step_x = df['x'] - df['x'].shift(1)
step_y = df['y'] - df['y'].shift(1)
step_t = df['timestamp'] - df['timestamp'].shift(1)

# .copy() gives an independent frame, so adding the speed column below does
# not write into a slice of the sorted frame (no SettingWithCopyWarning).
df = df[same_order].copy()

# Euclidean distance between neighbouring points and the time between them.
dist = np.sqrt(np.square(step_x[same_order].values) + np.square(step_y[same_order].values))
# Named `elapsed` rather than `time` so the imported `time` module is not
# overwritten and earlier cells that use it can still be re-run.
elapsed = step_t[same_order].values

# Speed = distance / elapsed time.
# NOTE(review): two points of one order sharing a timestamp give elapsed == 0
# and therefore an inf/NaN speed; confirm the data has no such duplicates.
df['speed'] = dist / elapsed

In [11]:
# Display the frame after the speed column has been added.
df

Unnamed: 0,driver_ID,order_ID,timestamp,x,y,time_ID,row_id,col_id,speed
1,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961131,1.158770e+07,3.595129e+06,4.0,33.0,82.0,3.449334
2,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961134,1.158771e+07,3.595125e+06,4.0,33.0,82.0,5.353831
3,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961137,1.158773e+07,3.595116e+06,4.0,33.0,83.0,8.012669
4,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961140,1.158776e+07,3.595105e+06,4.0,33.0,83.0,8.185162
5,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961143,1.158778e+07,3.595096e+06,4.0,33.0,83.0,7.330652
...,...,...,...,...,...,...,...,...,...
3726185,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963740,1.158738e+07,3.590010e+06,8.0,106.0,77.0,11.892815
3726186,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963743,1.158736e+07,3.590020e+06,8.0,106.0,77.0,5.931354
3726187,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963746,1.158735e+07,3.590030e+06,8.0,105.0,77.0,5.691153
3726188,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963749,1.158735e+07,3.590030e+06,8.0,105.0,77.0,0.000000


In [12]:
# 2. Instantaneous acceleration: change in speed between consecutive points of
#    the same order, divided by the time between them.

# Rows whose predecessor belongs to the same order; the first remaining row of
# each order has no previous speed and is dropped.
same_order = df['order_ID'].shift(1) == df['order_ID']

# Differences to the previous row, computed before filtering so they stay
# aligned with the frame's index.
speed_step = df['speed'] - df['speed'].shift(1)
time_step = df['timestamp'] - df['timestamp'].shift(1)

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice (this is what raised SettingWithCopyWarning).
df = df[same_order].copy()

df['acc'] = speed_step[same_order] / time_step[same_order]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
# Display the frame after the acc column has been added.
df

Unnamed: 0,driver_ID,order_ID,timestamp,x,y,time_ID,row_id,col_id,speed,acc
2,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961134,1.158771e+07,3.595125e+06,4.0,33.0,82.0,5.353831,0.634832
3,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961137,1.158773e+07,3.595116e+06,4.0,33.0,83.0,8.012669,0.886279
4,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961140,1.158776e+07,3.595105e+06,4.0,33.0,83.0,8.185162,0.057498
5,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961143,1.158778e+07,3.595096e+06,4.0,33.0,83.0,7.330652,-0.284837
6,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961146,1.158780e+07,3.595085e+06,4.0,33.0,83.0,8.049442,0.239597
...,...,...,...,...,...,...,...,...,...,...
3726185,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963740,1.158738e+07,3.590010e+06,8.0,106.0,77.0,11.892815,0.087028
3726186,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963743,1.158736e+07,3.590020e+06,8.0,106.0,77.0,5.931354,-1.987154
3726187,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963746,1.158735e+07,3.590030e+06,8.0,105.0,77.0,5.691153,-0.080067
3726188,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963749,1.158735e+07,3.590030e+06,8.0,105.0,77.0,0.000000,-1.897051


In [14]:
# Renumber the rows 0..n-1 (the earlier filtering left gaps in the index) and
# display the frame.
df = df.reset_index(drop = True)
df

Unnamed: 0,driver_ID,order_ID,timestamp,x,y,time_ID,row_id,col_id,speed,acc
0,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961134,1.158771e+07,3.595125e+06,4.0,33.0,82.0,5.353831,0.634832
1,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961137,1.158773e+07,3.595116e+06,4.0,33.0,83.0,8.012669,0.886279
2,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961140,1.158776e+07,3.595105e+06,4.0,33.0,83.0,8.185162,0.057498
3,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961143,1.158778e+07,3.595096e+06,4.0,33.0,83.0,7.330652,-0.284837
4,gaAAqamkj@sqvfantBjjpqt8bnh5xquf,kcDzyb7cf5Djvg-fDssnxAm8jiiaAnAi,1477961146,1.158780e+07,3.595085e+06,4.0,33.0,83.0,8.049442,0.239597
...,...,...,...,...,...,...,...,...,...,...
3684451,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963740,1.158738e+07,3.590010e+06,8.0,106.0,77.0,11.892815,0.087028
3684452,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963743,1.158736e+07,3.590020e+06,8.0,106.0,77.0,5.931354,-1.987154
3684453,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963746,1.158735e+07,3.590030e+06,8.0,105.0,77.0,5.691153,-0.080067
3684454,vpzwvmhge4Bxtj9eqsncDmlhif7-xsnk,vdpnuddi95wxujamtzknxrrmheebzwrp,1477963749,1.158735e+07,3.590030e+06,8.0,105.0,77.0,0.000000,-1.897051


In [16]:
# Collective (per grid cell) features start here.

# 1. Mean speed per space-time cell: first average each order's speed inside
#    the cell, then average those per-order means over the cell.
# Group by space-time cell and order id.
orderGrouped = df.groupby(['row_id', 'col_id', 'time_ID', 'order_ID'])
per_order_speed = orderGrouped['speed'].mean().reset_index()
# Mean speed of each cell in each time window.
grouped_speed = per_order_speed.groupby(['row_id', 'col_id', 'time_ID'])
grid_speed = grouped_speed['speed'].mean()
# Limit outliers by clamping to the 5th-95th percentile range.
speed_floor = grid_speed.quantile(0.05)
speed_cap = grid_speed.quantile(0.95)
grid_speed = grid_speed.clip(speed_floor, speed_cap)

In [20]:
# 2. Mean acceleration per space-time cell, averaged over all points in it.
cell_keys = ['row_id', 'col_id', 'time_ID']
gridGrouped = df.groupby(cell_keys)
grid_acc = gridGrouped['acc'].mean()

In [21]:
# 3. Floating-car volume per space-time cell.
# .last() keeps exactly one row per (cell, order), so the group size below is
# the number of orders seen in the cell; the speed value itself is not used.
one_row_per_order = orderGrouped['speed'].last().reset_index()
grouped_volume = one_row_per_order.groupby(['row_id', 'col_id', 'time_ID'])
grid_volume = grouped_volume['speed'].size()
# Clamp to the 5th-95th percentile range.
volume_floor = grid_volume.quantile(0.05)
volume_cap = grid_volume.quantile(0.95)
grid_volume = grid_volume.clip(volume_floor, volume_cap)

In [22]:
# 4. Standard deviation of point speeds per space-time cell.
# Cells with a single point come out as NaN, as seen in the feature table.
grid_v_std = gridGrouped.speed.std()
# Limit outliers by clamping to the 5th-95th percentile range. Both bounds are
# taken from grid_v_std itself.
grid_v_std = grid_v_std.clip(grid_v_std.quantile(0.05), grid_v_std.quantile(0.95))

In [23]:
# 5. Mean number of stops per vehicle in each space-time cell.
# Count the points with zero speed in every cell.
stopNum = gridGrouped.speed.agg(lambda x: (x==0).sum())
# Divide by the cell's vehicle count. The division is aligned on the
# (row_id, col_id, time_ID) index instead of relying on both Series happening
# to share the same row order.
# NOTE(review): grid_volume has already been clamped to its 5th-95th
# percentile range at this point, so busy cells are divided by the capped
# count; confirm that this is intended.
grid_stop = (stopNum / grid_volume).rename('stopNum')
# Clamp to [0, 95th percentile].
grid_stop = grid_stop.clip(0, grid_stop.quantile(0.95))

In [25]:
# Assemble the per-cell features into a single table: one row per
# (row_id, col_id, time window), one column per feature.
feature_parts = [grid_speed, grid_acc, grid_volume, grid_v_std, grid_stop]
feature_columns = [
    'row_id', 'col_id', 'time_id',
    'aveSpeed', 'gridAcc', 'volume', 'speedStd', 'stopNum',
]
feature = pd.concat(feature_parts, axis=1)
feature = feature.reset_index()
feature.columns = feature_columns
feature

Unnamed: 0,row_id,col_id,time_id,aveSpeed,gridAcc,volume,speedStd,stopNum
0,0.0,7.0,6.0,3.489376,-0.015243,1.0,,0.0
1,0.0,7.0,7.0,3.489376,-0.014831,1.0,,0.0
2,0.0,7.0,9.0,3.489376,0.000000,1.0,0.305236,1.0
3,0.0,7.0,10.0,7.391224,0.126333,1.0,,0.0
4,0.0,8.0,0.0,13.363363,-0.106795,17.0,5.101212,0.0
...,...,...,...,...,...,...,...,...
80340,138.0,16.0,7.0,4.073749,0.035849,4.0,3.141760,0.0
80341,138.0,16.0,8.0,7.288781,-0.430011,1.0,,0.0
80342,138.0,16.0,9.0,3.489376,-0.084439,5.0,1.772664,3.4
80343,138.0,16.0,10.0,5.802151,-0.803677,1.0,,0.0


In [30]:
# Export the feature table to CSV without writing the row index.
# index=False is the documented way to omit the index.
feature.to_csv(feature_dst_path, index=False)