In [4]:
# CHANGE THE PATH TO THE PATH AT YOUR PC!
# The path must end with a separator: the functions below build file names by
# plain string concatenation (path + user folder + "\\Trajectory" ...).
#path = "C:\\Users\\user\\Desktop\\TUe\\Topological\\Project\\Geolife Trajectories 1.3\\Data\\"
path = "Geolife Trajectories 1.3\\Data\\"
# Bounding box used by data_per_file when cropping, in the order
# [max latitude (north), min latitude (south), min longitude (west), max longitude (east)]
# NOTE(review): presumably the Beijing city area - confirm the exact values
CITY_BORDERS = [40.19, 39.65, 115.98, 116.74]

import pandas as pd
import os
from datetime import datetime, timedelta
from math import sqrt

## Changes in the processing:

- Date converted to datetime format and to Beijing time (for day/night purposes)

- Feet altitude converted to meters, because we still have some remains of self-respect

- Plug in the path from above and one number - the number of person as given by the folders

- Creates one full.csv dataset for one person outside the redundant Trajectories folder

- Prunes it so that no trajectory datapoint is recorded if no movement was detected

- ^No such case found in the first couple files, lol. So this will not run on the dataset

- ^Perhaps we should just set a threshold there, so that movement at a speed of 1 meter per hour (if it exists) is treated as no movement

In [5]:
def fix_date_trajectories(row):
    '''Builds one local-time timestamp out of a row's separate date and time fields

    row must provide a 'date' string (YYYY-MM-DD) and an 'hour' string (HH:MM:SS).
    Both are parsed together and shifted by a fixed +8 hours (UTC -> Beijing time,
    no winter/summer time adjustment). Returns a naive datetime object.'''

    stamp = '&'.join((row['date'], row['hour']))
    parsed_utc = datetime.strptime(stamp, '%Y-%m-%d&%H:%M:%S')
    beijing_offset = timedelta(hours=8)
    return parsed_utc + beijing_offset

def feet_to_meters(text: str):
    '''Converts an altitude in feet to whole meters, because we use metric system here

    text: the altitude in feet, given as a number or as a numeric string;
          fractional values such as "491.9" are accepted.
    Returns the altitude in meters, rounded to the nearest integer (int).
    Raises ValueError if text cannot be parsed as a (finite) number.'''
    # float() instead of int(): int() rejects decimal strings and cuts off the
    # fractional feet before the conversion even happens
    return round(float(text) / 3.28084)

def data_per_file(path, prune_redundant=False, crop_to_borders=True):
    '''DataFramifies one .plt trajectory file

    path: location of the .plt file
    prune_redundant: if True, drops the rows reported by delete_redundant_rows
        (rows whose position equals the position of the row before them)
    crop_to_borders: if True, keeps only the points inside the CITY_BORDERS box
        (borders included)

    Returns a DataFrame with the columns
        latitude (NORTH), longitude (EAST), altitude (METERS),
        date (datetime, already shifted by fix_date_trajectories) and
        change (added by add_change_row).
    When cropping is active the original row labels are kept (index not reset).

    An altitude of -777 feet comes out as -237 meters.
    NOTE(review): presumably -777 is the dataset's marker for "no valid altitude",
    which is why -237 stands for a missing altitude - confirm against the data guide.'''

    # Reading the dataset and removing redundant columns.
    # skiprows=6 skips the file preamble; header=None is essential, because otherwise
    # pandas takes the first data point as the header row and that point is silently lost.
    column_names = ['latitude', 'longitude', 'worthless1', 'altitude', 'worthless2', 'date', 'hour']
    df = pd.read_csv(path, skiprows=6, header=None, names=column_names)
    df = df.drop(['worthless1', 'worthless2'], axis=1)

    # Simplifying the date format and changing to local time:
    df['full_date'] = df.apply(fix_date_trajectories, axis=1)
    df = df.drop(['date', 'hour'], axis=1)
    df = df.rename(columns={'full_date': 'date'})

    # (Possibly) pruning the useless (no movement) rows:
    if prune_redundant:
        to_be_deleted_rows = delete_redundant_rows(df)
        # The returned values are row labels, so they have to be dropped from the
        # index (dropping with axis=1 would look for columns with those names)
        df = df.drop(index=to_be_deleted_rows).reset_index(drop=True)

    # (Possibly) cropping the data to the city borders - NOT TESTED YET:
    if crop_to_borders:
        north, south, west, east = CITY_BORDERS
        inside_borders = df['latitude'].between(south, north) & df['longitude'].between(west, east)
        # .copy() so that the column assignment below works on an independent frame
        # and not on a view of the uncropped data
        df = df[inside_borders].copy()

    # Converting feet to meters
    df['altitude'] = df['altitude'].apply(feet_to_meters)

    # Adds the "change" column to the dataframe
    df = add_change_row(df)

    return df

def save_data_per_person(path: str, number_person_as_string: str):
    '''Runs data_per_file once for each .plt file one person has, concatenates the
    results and saves them as full.csv in the person's folder

    path: the data folder that contains one sub-folder per person
    number_person_as_string: name of the person's folder (e.g. "000")

    The written csv gets three extra columns:
        trajectory     - "<person>.<file name without extension>"
        user           - the person's folder name
        transportation - None, or the label filled in by add_transportation_lables
                         when the person has a labels.txt
    Returns nothing; the result is only written to disk.'''

    person_dir = os.path.join(path, number_person_as_string)
    trajectory_dir = os.path.join(person_dir, "Trajectory")

    # Collect the frames first and concatenate once: concatenating inside the loop
    # copies all previously read rows again for every file.
    frames = []
    # sorted() makes the row order of full.csv independent of the file system
    for element in sorted(os.listdir(trajectory_dir)):
        trajectory_name, extension = os.path.splitext(element)
        if extension.lower() != ".plt":
            continue  # anything else in the folder is not a trajectory file
        download_df = data_per_file(os.path.join(trajectory_dir, element))
        download_df["trajectory"] = number_person_as_string + "." + trajectory_name  # Adds the trajectory
        frames.append(download_df)

    # A person without trajectory files still gets an (empty) full.csv
    df = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()

    # Adds the user and (as much as possible) the transportation
    df["user"] = number_person_as_string
    df["transportation"] = None
    if os.path.exists(os.path.join(person_dir, "labels.txt")):
        df = add_transportation_lables(df, number_person_as_string)

    # Saved as a CSV outside the redundant Trajectories folder:
    df.to_csv(os.path.join(person_dir, "full.csv"), index=False)
    
def add_transportation_lables(df: pd.DataFrame, number_person_as_string: str):
    '''Fills the "transportation" column of df from the person's labels.txt

    The label file is looked up under the module-level path (not under a path
    handed in by the caller). It is read as tab-separated text with the first two
    columns parsed as dates, and its start/end times are shifted by +8 hours.
    Every row of df whose "date" lies strictly between the start and the end of a
    label gets that label's transportation mode; later labels overwrite earlier ones.
    df is changed in place and returned as well.'''
    label_file = path + number_person_as_string + "\\labels.txt"
    labels = pd.read_csv(label_file, sep = "\t", parse_dates = [0,1])

    shift = timedelta(hours = 8)
    labels["Start Time"] = labels["Start Time"] + shift
    labels["End Time"] = labels["End Time"] + shift

    intervals = zip(labels["Start Time"], labels["End Time"], labels["Transportation Mode"])
    for start_date, end_date, transportation in intervals:
        after_start = df["date"] > start_date
        before_end = df["date"] < end_date
        df.loc[(after_start & before_end), "transportation"] = transportation

    return df
    
def delete_redundant_rows(df):
    '''Finds the rows for which both the longitude and latitude did not change

    df: DataFrame with 'latitude' and 'longitude' columns
    Returns a list with the index labels of every row whose latitude AND longitude
    equal those of the row directly before it. Nothing is deleted here - the
    caller drops the returned labels. The first row is never reported, and an
    empty or single-row frame gives an empty list.'''

    coordinates = df[['latitude', 'longitude']]
    # Compare every row with its predecessor in one go. The first row is compared
    # with NaN and therefore never matches. The result is built once for the whole
    # frame, so matches from earlier rows are not thrown away.
    unchanged = (coordinates == coordinates.shift(1)).all(axis=1)

    return df.index[unchanged.to_numpy(dtype=bool)].tolist()

def add_change_row(df):
    '''Adds the "change" column to the dataframe, which tracks the change since the last point

    df: DataFrame with 'latitude' and 'longitude' columns
    Returns a new DataFrame with an extra "change" column as the last column; df
    itself is left untouched (no helper columns are written into the caller's frame).

    change is the straight-line (Pythagorean) size of the step from the previous
    row, computed directly on the latitude/longitude differences - so it is in
    the same unit as the coordinates, not in meters. The first row has no
    previous row and gets NaN. An empty frame simply gets an empty "change" column.'''
    latitude_change = df["latitude"].diff()
    longitude_change = df["longitude"].diff()

    # Column-wise arithmetic instead of a row-by-row apply: same formula, but it
    # also works on an empty frame and is much faster on long trajectories
    change = (latitude_change ** 2 + longitude_change ** 2) ** 0.5

    return df.assign(change=change)

### Distance shenanigans - needed to consult non-Euclidean trigonometry for that:

- a 0.001 difference in latitude equals roughly 111 meters in distance

- a 0.001 difference in longitude equals roughly 85 meters in distance

- a combined 0.001 difference can be computed from Pythagorean (neglecting curvature) to be roughly 140 meters in distance

In [6]:
# Builds the full.csv for every user folder found under path
# Warning: Takes quite a while (disabled by default - switch the condition to run it)
if False:
    user_folders = os.listdir(path)
    for user in user_folders:
        print(user)
        save_data_per_person(path, user)

000
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181


In [7]:
# Combines the full.csv of all users into packages of 10 users each
# The files are named "<first>-<last>.csv" after the positions of the users in the
# folder listing (0-9.csv, 10-19.csv, ...); the last package may hold fewer users
# Warning: Also takes quite a while (disabled by default - switch the condition to run it)
if False:
    package_size = 10
    users = os.listdir(path)

    # Stepping through the listing in slices means no package is ever empty, so
    # no bogus empty csv (e.g. "180-179.csv") is written when the number of
    # users is a multiple of the package size
    for label_counter in range(0, len(users), package_size):
        package = users[label_counter:label_counter + package_size]

        # Read all files of the package first and concatenate once
        frames = []
        for user in package:
            print(user)
            frames.append(pd.read_csv(f"{path}{user}\\full.csv"))

        full_df = pd.concat(frames)
        full_df.to_csv(f"{label_counter}-{label_counter+len(package)-1}.csv", index = False)

000
001
002
003
004
005
006
007
008
009
010
011


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181


In [8]:
# Combines all csv files of the working folder into one csv
# Warning: Also also takes quite a while (disabled by default - switch the condition to run it)
if False:
    combined_name = "0-181.csv"
    # The output file itself is skipped: on a re-run it already sits in the folder
    # and would otherwise be merged into the new result, duplicating every row
    files = [i for i in os.listdir() if i.endswith(".csv") and i != combined_name]
    df = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)
    df.to_csv(combined_name, index = False)

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [11]:
# Loads the combined dataset (the file written by the disabled merge cell above)
full_df = pd.read_csv("0-181.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
# Create and store 6 subsets of the data
# to_csv does not create missing folders, so make sure the target folder exists
os.makedirs("presets", exist_ok=True)

# Three subsets by transportation label
full_df[full_df["transportation"].isin(["walk", "run"])].to_csv("presets/walk.csv", index = False)
full_df[full_df["transportation"].isin(["bus", "train", "subway"])].to_csv("presets/PT.csv", index = False)
full_df[full_df["transportation"].isin(["car", "taxi"])].to_csv("presets/car.csv", index = False)

# Three subsets by trajectory size (number of rows per trajectory)
# The lower bounds are exclusive, so a trajectory with exactly 120 or 720 rows
# ends up in one subset only instead of in two neighbouring ones
trajectory_count = full_df["trajectory"].value_counts()
very_short_trajectories = list(trajectory_count[(trajectory_count <= 120)].keys())
short_trajectories = list(trajectory_count[(trajectory_count > 120) & (trajectory_count <= 720)].keys())
medium_trajectories = list(trajectory_count[(trajectory_count > 720) & (trajectory_count <= 2160)].keys())

full_df[full_df["trajectory"].isin(very_short_trajectories)].to_csv("presets/very_short.csv", index = False)
full_df[full_df["trajectory"].isin(short_trajectories)].to_csv("presets/short.csv", index = False)
full_df[full_df["trajectory"].isin(medium_trajectories)].to_csv("presets/medium.csv", index = False)