# 生成轨迹特征

1. 将每个用户的所有plt文件都合并为1个csv文件。
2. 生成停留点和移动点。
3. 删除异常值。
   1. 删除超出设定城市区域范围的轨迹点。
   2. 删除停留时间超过一周的停留点，因为数据都是按天提供的。
4. 生成指定数据格式的轨迹数据。
   1. 生成时序数据。
   2. 生成矩阵数据。

In [1]:
import os
import pandas as pd
import transbigdata as tbd

from Utils import CalcGrid, OperJson

# Notebook-wide settings are read from the JSON configuration file.
gParameters = OperJson.JSONConfig('./Parameters.json')

# Gridding parameters for the study area (bounds order: lon1, lat1, lon2, lat2).
gGeoParameters = tbd.area_to_params(
    location=gParameters.get('gBoundsBeijing'),
    accuracy=gParameters.get('gAccuracy'),
    method=gParameters.get('gMethod'),
)

In [2]:
# Custom grid mapper built from the four study-area bounds
# (cell_size_m=1000, presumably metres -- confirm in CalcGrid).
CG_Ellipsoid = CalcGrid.GridMapperEllipsoid(
    *(gParameters.get('gBoundsBeijing')[i] for i in range(4)),
    cell_size_m=1000,
)

# Smoke test: map a single lon/lat pair to its grid ID and display it.
testgrid = CG_Ellipsoid.lonlat_to_grid(117.52, 41.05)
testgrid

34203

In [16]:
# Names of the immediate sub-directories of the trajectory folder (first item
# yielded by os.walk); each name is used below as a user ID.
gUsersList = next(os.walk(gParameters.get('gTrajectoryFolderPath')))[1]
# gUsersList


def GenerateStayMove(userID:str) -> bool:
    """Build and save the stay table and the move table of one user.

    Reads every ``.plt`` file under
    ``<gTrajectoryFolderPath>/<userID>/Trajectory``, merges them, removes
    points outside ``gBoundsBeijing``, derives stays and moves with
    ``tbd.traj_stay_move``, drops stays longer than
    ``gStayDurationthreshold``, tags every record with its custom grid ID
    and writes both tables to the paths configured by
    ``gSingleUserStaySavePath`` and ``gSingleUserMoveSavePath``.

    Args:
        userID: Name of the user's folder inside the trajectory folder.

    Returns:
        ``True`` when the user was skipped because no usable data remained
        (the reason is printed), ``False`` when both CSV files were written.
    """
    userdata = os.path.join(gParameters.get('gTrajectoryFolderPath'), userID, 'Trajectory')

    # os.listdir returns entries in arbitrary order and may contain stray
    # files; keep the .plt files only and sort them for reproducibility.
    filelist = sorted(f for f in os.listdir(userdata) if f.lower().endswith('.plt'))
    if not filelist:
        print(f'{userID} has no plt files, dataframe is null.')
        return True

    # Field names of a plt record.
    names = ['lat', 'lng', 'zero', 'alt', 'days', 'date', 'time']
    # Skip the six preamble lines and treat everything after them as data.
    # (header=6 together with names= would consume the 7th line -- the first
    # GPS point of each file -- as a header row and silently drop it.)
    df_list = [
        pd.read_csv(os.path.join(userdata, f), skiprows=6, header=None,
                    names=names, index_col=False)
        for f in filelist
    ]
    # Merge all trajectory files of the user.
    df = pd.concat(df_list, ignore_index=True)
    # Combine date and time into a single timestamp column.
    df['entireTime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
    # Use descriptive coordinate names and drop the columns that are not needed.
    df = df.rename(columns={'lat': 'latitude', 'lng': 'longitude'})
    df = df.drop(columns=['zero', 'days', 'date', 'time', 'alt'])

    # Remove the points that fall outside the configured bounds.
    df = tbd.clean_outofbounds(df, bounds=gParameters.get('gBoundsBeijing'),
                               col=['longitude', 'latitude'])
    if df.empty:
        print(f'{userID} after clean out of bounds, dataframe is null.')
        return True
    # Add the userID column.
    df['userID'] = userID

    # Derive stay records and move records.
    stay, move = tbd.traj_stay_move(df, gGeoParameters,
                                    col=['userID', 'entireTime', 'longitude', 'latitude'],
                                    activitytime=gParameters.get('gActivityTime'))
    if stay.empty:
        print(f'{userID} generate null stay, dataframe is null.')
        return True

    # Geolife provides trajectories per day and some of them have gaps longer
    # than a day, which produces implausibly long stays; drop those outliers.
    stay = stay[stay['duration'] <= gParameters.get('gStayDurationthreshold')]
    if stay.empty:
        # Nothing is left, i.e. every stay was longer than the threshold.
        print(f'{userID} all stay durations exceed the threshold, dataframe is null.')
        return True

    # Tag every stay with its custom grid ID. assign() returns a new frame, so
    # the filtered slice above is never written to in place.
    stay = stay.assign(grid=[
        CG_Ellipsoid.lonlat_to_grid(lon, lat)
        for lon, lat in zip(stay['lon'], stay['lat'])
    ])
    # Keep only the columns that are used downstream.
    stay = stay[['userID', 'stime', 'etime', 'lon', 'lat', 'duration', 'grid']]
    stay.to_csv(gParameters.get('gSingleUserStaySavePath').format(userID))

    # Tag the start and end point of every move with its custom grid ID.
    move = move.assign(
        sgrid=[CG_Ellipsoid.lonlat_to_grid(lon, lat)
               for lon, lat in zip(move['slon'], move['slat'])],
        egrid=[CG_Ellipsoid.lonlat_to_grid(lon, lat)
               for lon, lat in zip(move['elon'], move['elat'])],
    )
    move = move[['userID', 'stime', 'slon', 'slat', 'sgrid', 'etime', 'elon', 'elat', 'egrid', 'duration']]
    move.to_csv(gParameters.get('gSingleUserMoveSavePath').format(userID))
    return False


# Single-user debugging call: GenerateStayMove('049')

# Process every user. A user that yields no usable data is simply skipped;
# GenerateStayMove prints the reason itself.
for userID in gUsersList:
    GenerateStayMove(userID=userID)

# A recorded run took 1m 38.4s.

049 generate null stay, dataframe is null.
118 after clean out of bounds, dataframe is null.
120 generate null stay, dataframe is null.
123 generate null stay, dataframe is null.
132 after clean out of bounds, dataframe is null.
137 generate null stay, dataframe is null.
160 after clean out of bounds, dataframe is null.
178 generate null stay, dataframe is null.
180 Duration is less than the threshold, dataframe is null.


Generation record (users skipped during the run above):

- 049 generate null stay, dataframe is null.
- 118 after clean out of bounds, dataframe is null.
- 120 generate null stay, dataframe is null.
- 123 generate null stay, dataframe is null.
- 132 after clean out of bounds, dataframe is null.
- 137 generate null stay, dataframe is null.
- 160 after clean out of bounds, dataframe is null.
- 178 generate null stay, dataframe is null.
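- 180 Duration is less than the threshold, dataframe is null. (Note: despite the wording, this means every stay of this user exceeded the duration threshold and was removed.)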

# 将每个人的轨迹都合并为一个CSV文件用于模型训练

In [None]:
# Directory that holds the per-user stay CSV files.
stays_dir = "./Data/Output/Stays/"
# os.listdir returns entries in arbitrary order and may include hidden files or
# sub-directories (e.g. ".ipynb_checkpoints"). Keep regular, non-hidden files
# only and sort them so the merged table is reproducible between runs.
filelist = sorted(
    f for f in os.listdir(stays_dir)
    if not f.startswith('.') and os.path.isfile(os.path.join(stays_dir, f))
)
# Read every CSV into a list; userID is read as object dtype instead of being
# parsed as a number.
df_list = [pd.read_csv(os.path.join(stays_dir, f), index_col=0, dtype={'userID': object}) for f in filelist]
# Merge into a single table with a fresh index.
AllStays = pd.concat(df_list, ignore_index=True)
# Display the shape.
AllStays.shape

(27919, 7)

In [39]:
# Preview the first three rows of the merged stay table.
AllStays.head(3)

Unnamed: 0,userID,stime,etime,lon,lat,duration,grid
0,0,2008-10-23 03:01:40,2008-10-23 04:08:07,116.301309,39.984345,3987.0,14428
1,0,2008-10-23 04:30:57,2008-10-23 09:44:15,116.322261,39.998151,18798.0,14808
2,0,2008-10-23 09:44:15,2008-10-23 10:14:21,116.321396,40.007122,1806.0,14809


In [52]:
# Save the merged stays of all users; the row index is written as the first column.
AllStays.to_csv('./Data/Output/AllUserTimeSeries.csv', index=True)

# 生成交互矩阵

In [51]:
# User x grid matrix: each cell counts the user's stay records (non-null
# durations) in that grid; user/grid pairs that never occur are filled with 0.
InteractionMatrix = (
    AllStays[['userID', 'grid', 'duration']]
    .pivot_table(index='userID', columns='grid', values='duration', aggfunc='count')
    .fillna(0)
)
InteractionMatrix.to_csv('./Data/Output/InteractionMatrix.csv')

# 生成类似自然语言的Grid矩阵

1. 按 8:2 的比例划分训练集和测试集。
2. 矩阵的第二个维度可调参。