In [None]:
import os
import pandas as pd
import transbigdata as tbd
from typing import Tuple, List
import random
import math
import numpy as np

from Utils import CalcGrid, OperJson

# Project-wide settings are read from a JSON file next to the notebook.
gParameters = OperJson.JSONConfig('./Parameters.json')

# Gridding parameters derived from the configured bounding box; passed to
# tbd.traj_stay_move further down.
gGeoParameters = tbd.area_to_params(
    gParameters.get('gBoundsBeijing'),
    accuracy=gParameters.get('gAccuracy'),
    method=gParameters.get('gMethod'),
)

In [None]:
# Grid mapper built from the first four entries of the configured bounding box.
# NOTE(review): cell_size_m=1000 is presumably the cell edge length in metres;
# consider moving it into Parameters.json alongside the other settings.
CG_Ellipsoid = CalcGrid.GridMapperEllipsoid(
    *gParameters.get('gBoundsBeijing')[:4],
    cell_size_m=1000,
)

# Smoke test: map one lon/lat pair to its grid id and display it.
testgrid = CG_Ellipsoid.lonlat_to_grid(117.52, 41.05)
testgrid

In [None]:
# next(os.walk(...)) yields (dirpath, dirnames, filenames) for the top folder only;
# index 1 keeps the immediate sub-directory names, each of which is used as a user ID.
gUsersList = next(os.walk(gParameters.get('gTrajectoryFolderPath')))[1]
# Uncomment to inspect the discovered user IDs: gUsersList


def _GridColumn(lons, lats) -> list:
    """Map paired longitude/latitude sequences to a list of grid ids via CG_Ellipsoid."""
    return [CG_Ellipsoid.lonlat_to_grid(lon, lat) for lon, lat in zip(lons, lats)]


def _EnsureParentFolder(path: str) -> None:
    """Create the folder that will contain ``path`` if it does not exist yet."""
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)


def GenerateStayMove(userID:str) -> bool:
    """Build and save the stay and move tables of a single user.

    Steps:
      1. Read every trajectory file under ``<gTrajectoryFolderPath>/<userID>/Trajectory/``.
      2. Drop points outside ``gBoundsBeijing`` (``tbd.clean_outofbounds``).
      3. Derive stays and moves (``tbd.traj_stay_move``) and keep only stays whose
         ``duration`` is ``<= gStayDurationthreshold``.
      4. Attach grid ids computed by ``CG_Ellipsoid.lonlat_to_grid`` and write both
         tables to the per-user CSV paths taken from the configuration.

    Parameters
    ----------
    userID : str
        Name of the user's sub-folder inside the trajectory folder.

    Returns
    -------
    bool
        ``True`` if the user was skipped because no usable data remained at some
        stage (nothing is written in that case), ``False`` if both CSVs were saved.
    """
    userdata = os.path.join(gParameters.get('gTrajectoryFolderPath'), userID, 'Trajectory')
    if not os.path.isdir(userdata):
        print(f'{userID} has no Trajectory folder, dataframe is null.')
        return True

    # os.listdir gives arbitrary order and may contain hidden files / folders.
    filelist = sorted(
        f for f in os.listdir(userdata)
        if not f.startswith('.') and os.path.isfile(os.path.join(userdata, f))
    )
    if not filelist:
        # pd.concat raises on an empty list, so bail out explicitly.
        print(f'{userID} has no trajectory files, dataframe is null.')
        return True

    names = ['lat','lng','zero','alt','days','date','time']
    # The files start with 6 preamble lines. `header=6` combined with `names`
    # would treat the first *data* row as a header and discard it, so skip the
    # preamble explicitly and declare that there is no header row.
    df = pd.concat(
        [pd.read_csv(os.path.join(userdata, f), skiprows=6, header=None,
                     names=names, index_col=False)
         for f in filelist],
        ignore_index=True,
    )
    df['entireTime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S')
    df = df.rename(columns={'lat': 'latitude', 'lng': 'longitude'})
    df = df[['latitude', 'longitude', 'entireTime']]

    df = tbd.clean_outofbounds(df, bounds = gParameters.get('gBoundsBeijing'),
                               col = ['longitude', 'latitude'])
    if df.shape[0] == 0:
        print(f'{userID} after clean out of bounds, dataframe is null.')
        return True

    df = df.assign(userID=userID)

    stay, move = tbd.traj_stay_move(df, gGeoParameters,
                                    col=['userID', 'entireTime', 'longitude', 'latitude'],
                                    activitytime=gParameters.get('gActivityTime'))
    if stay.shape[0] == 0:
        print(f'{userID} generate null stay, dataframe is null.')
        return True

    # Keep stays at or below the configured duration threshold. `.copy()` avoids
    # assigning the new column to a view of the unfiltered frame.
    stay = stay[stay['duration'] <= gParameters.get('gStayDurationthreshold')].copy()
    if stay.shape[0] == 0:
        # The filter keeps duration <= threshold, so an empty result means every
        # stay exceeded it (the previous message claimed the opposite).
        print(f'{userID} has no stay with duration within the threshold, dataframe is null.')
        return True

    # Column-wise assignment instead of DataFrame.apply(axis=1): faster, keeps the
    # column dtypes, and also works when a table has zero rows.
    stay['grid'] = _GridColumn(stay['lon'], stay['lat'])
    stay = stay[['userID', 'stime', 'etime', 'lon', 'lat', 'duration', 'grid']]

    stayPath = gParameters.get('gSingleUserStaySavePath').format(userID)
    _EnsureParentFolder(stayPath)
    stay.to_csv(stayPath)

    move = move.copy()
    move['sgrid'] = _GridColumn(move['slon'], move['slat'])
    move['egrid'] = _GridColumn(move['elon'], move['elat'])
    move = move[['userID', 'stime', 'slon', 'slat', 'sgrid', 'etime', 'elon', 'elat', 'egrid', 'duration']]

    movePath = gParameters.get('gSingleUserMoveSavePath').format(userID)
    _EnsureParentFolder(movePath)
    move.to_csv(movePath)
    return False


# Process every user. GenerateStayMove returns True when a user had to be
# skipped; the previous `if ...: continue` discarded that information, so the
# skipped IDs are now collected and summarised.
gSkippedUsers = []
for userID in gUsersList:
    if GenerateStayMove(userID=userID):
        gSkippedUsers.append(userID)

print(f'{len(gSkippedUsers)} of {len(gUsersList)} users were skipped.')


In [None]:

# Collect the per-user stay files. os.listdir returns entries in arbitrary order
# and also lists hidden entries / sub-folders (e.g. `.ipynb_checkpoints`), which
# would make pd.read_csv fail, so keep regular, non-hidden files only and sort
# them for a reproducible row order.
filelist = sorted(
    f for f in os.listdir("./Data/Output/Stays/")
    if not f.startswith('.') and os.path.isfile(os.path.join("./Data/Output/Stays/", f))
)

# userID is read as text so that IDs are kept exactly as written in the files.
df_list = [pd.read_csv("./Data/Output/Stays/" + f, index_col=0, dtype={'userID': object}) for f in filelist]
AllStays = pd.concat(df_list, ignore_index=True)

AllStays.shape

In [None]:
# Display the first three rows of the combined stay table as a quick sanity check.
AllStays.head(3)

In [None]:

# Persist the combined stay table; a later cell reads this file back with index_col=0.
AllStays.to_csv('./Data/Output/AllUserTimeSeries.csv')

In [51]:
# User x grid matrix: each cell counts the stay records of that user in that
# grid cell; user/grid pairs without any record are filled with 0.
InteractionMatrix = (
    AllStays[['userID', 'grid', 'duration']]
    .pivot_table(index='userID', columns='grid', values='duration', aggfunc='count')
    .fillna(0)
)
InteractionMatrix.to_csv('./Data/Output/InteractionMatrix.csv')

In [None]:
def list_split(input_list:List, ratio:float, shuffle=False, seed=None)->Tuple[List, List]:
    """Split a list into two sorted sub-lists according to ``ratio``.

    Parameters
    ----------
    input_list : List
        Items to split. The list itself is never modified.
    ratio : float
        Fraction of the items that goes into the first sub-list; the first
        sub-list receives ``int(len(input_list) * ratio)`` items.
    shuffle : bool, optional
        Shuffle the items before cutting, so membership is random.
    seed : optional
        When given together with ``shuffle=True``, a dedicated
        ``random.Random(seed)`` is used so the split is reproducible. When
        ``None`` the module-level ``random`` state is used.

    Returns
    -------
    Tuple[List, List]
        ``(first, second)``, each sorted. If the input is empty or the first
        part would contain no item, ``([], copy_of_input)`` is returned with the
        input order preserved.
    """
    # Work on a copy: random.shuffle reorders in place and would otherwise
    # scramble the caller's list as a side effect.
    full_list = list(input_list)
    n_total = len(full_list)
    offset = int(n_total * ratio)
    if n_total == 0 or offset < 1:
        return [], full_list
    if shuffle:
        if seed is None:
            random.shuffle(full_list)
        else:
            random.Random(seed).shuffle(full_list)
    sublist_1 = sorted(full_list[:offset])
    sublist_2 = sorted(full_list[offset:])
    return sublist_1, sublist_2

# Quick demonstration of list_split on the integers 0..19 with a 20 % first part.
data = [*range(20)]
sub_data1, sub_data2 = list_split(data, 0.2, shuffle=True)
print(sub_data1, sub_data2)


In [None]:
# Split the per-user stay files 90 % / 10 % into train and test users.
# The listing is restricted to regular, non-hidden files and sorted, and the
# random module is seeded, so that Restart & Run All yields the same split
# every time (os.listdir order is arbitrary and the shuffle was unseeded).
gSplitSeed = 42
filelist = sorted(
    f for f in os.listdir("./Data/Output/Stays/")
    if not f.startswith('.') and os.path.isfile(os.path.join("./Data/Output/Stays/", f))
)
random.seed(gSplitSeed)
trainUsers, testUsers = list_split(filelist, ratio=0.9, shuffle=True)
print(len(trainUsers), len(testUsers))

In [None]:
def FormatTrajData(UserFileName:List, stayFolder:str="./Data/Output/Stays/", chunkSize=None) -> Tuple[pd.DataFrame, List]:
    """Load per-user stay CSVs and trim each to a whole number of fixed-size chunks.

    For every file the leading ``floor(rows / chunkSize) * chunkSize`` rows are
    kept, so the concatenated result can later be reshaped into rows of
    ``chunkSize`` items without mixing two users inside one row.

    Parameters
    ----------
    UserFileName : List
        File names (not paths) of the stay CSVs to load.
    stayFolder : str, optional
        Folder that contains the files.
    chunkSize : int, optional
        Chunk length. Defaults to ``gParameters.get('gMatrixSecondDimension')``.

    Returns
    -------
    Tuple[pd.DataFrame, List]
        The concatenated, trimmed stays (an empty DataFrame when no file
        qualifies) and the list of file names that had fewer than ``chunkSize``
        rows and were therefore left out.

    Raises
    ------
    ValueError
        If ``chunkSize`` is not a positive number.
    """
    if chunkSize is None:
        chunkSize = gParameters.get('gMatrixSecondDimension')
    if chunkSize < 1:
        raise ValueError(f'chunkSize must be positive, got {chunkSize}')

    df_list = []
    delete_list = []
    for f in UserFileName:
        df = pd.read_csv(os.path.join(stayFolder, f), index_col=0, dtype={'userID': object})
        usableRows = (df.shape[0] // chunkSize) * chunkSize
        if usableRows == 0:
            delete_list.append(f)
            continue
        # Always append: the slice is a no-op when the row count is already an
        # exact multiple. Previously such users were silently dropped because
        # the append lived inside the "not a multiple" branch.
        df_list.append(df.iloc[:usableRows, :])

    if not df_list:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(), delete_list
    return pd.concat(df_list, ignore_index=True), delete_list


# Load and trim the stay files of each split; the second element of each pair
# lists the files that were left out for having too few rows.
trainData, trainDelUsers = FormatTrajData(UserFileName=trainUsers)
testData, testDelUsers = FormatTrajData(UserFileName=testUsers)

In [None]:
def data_split_twodimension(sequence, windows_length=10, step_length=1):
    """Cut a 2-D array into sliding windows and the row that follows each window.

    Starting at row 0 and advancing by ``step_length`` rows, every window covers
    ``windows_length`` consecutive rows; the row directly after the window is its
    label. Iteration stops as soon as that label row would fall outside
    ``sequence``.

    Returns a pair of numpy arrays ``(windows, labels)`` built with ``np.array``
    from the collected slices.
    """
    n_rows = len(sequence)
    windows, labels = [], []

    for k in range(math.ceil(n_rows / step_length)):
        start = k * step_length
        stop = start + windows_length
        if stop >= n_rows:
            # No row left to serve as the label of this window.
            break
        windows.append(sequence[start:stop, :])
        labels.append(sequence[stop, :])

    return np.array(windows), np.array(labels)

In [None]:
# Reshape the flat grid-id column into rows of gMatrixSecondDimension items.
seqLen = gParameters.get('gMatrixSecondDimension')
trainNp = trainData['grid'].values.reshape(-1, seqLen)  # type: ignore
testNp = testData['grid'].values.reshape(-1, seqLen)  # type: ignore
print(trainNp.shape, testNp.shape)

# Source rows are paired with the row that immediately follows them.
# NOTE(review): the matrices are built from the concatenation of all users, so
# a source/target pair can straddle two different users - confirm this is intended.
trainSrc, trainTgt = trainNp[:-1], trainNp[1:]
testSrc, testTgt = testNp[:-1], testNp[1:]

print(trainSrc.shape, trainTgt.shape)

In [9]:
# Write both matrices as integer CSVs; the file name carries the row length.
for pathTemplate, matrix in (
    ('./Data/Output/StayTrainMatrix_{}.csv', trainNp),
    ('./Data/Output/StayTestMatrix_{}.csv', testNp),
):
    np.savetxt(pathTemplate.format(gParameters.get('gMatrixSecondDimension')),
               matrix, delimiter=',', fmt='%d')

In [None]:
import pandas as pd
# Reload the combined stay table written earlier. userID is forced to text, as
# in the other read_csv calls of this notebook; without it pandas would infer a
# numeric dtype for numeric-looking IDs and drop any leading zeros.
AllStays = pd.read_csv('./Data/Output/AllUserTimeSeries.csv', index_col=0, dtype={'userID': object})
AllStays.shape

In [None]:
# Display the first three rows of the reloaded table to confirm it was read back as expected.
AllStays.head(3)