In [1]:
# Root of the Geolife "Data" folder (one sub-folder per user).
# CHANGE THE PATH TO THE PATH AT YOUR PC!
# Keep the trailing separator: code below builds file names by plain string
# concatenation onto this value.
# Example of an absolute path:
#path = "C:\\Users\\user\\Desktop\\TUe\\Topological\\Project\\Geolife Trajectories 1.3\\Data\\"
path = "Geolife Trajectories 1.3\\Data\\"

import pandas as pd
import os
from datetime import datetime, timedelta

## Changes in the processing:

- Date converted to datetime format and to Beijing time (for day/night purposes)

- Altitude converted from feet to meters, because we still have some remains of self-respect

- Takes the path from above and one number: the number of the person, as given by the folder names

- Creates one full.csv dataset for one person outside the redundant Trajectories folder

- Prunes it so that no trajectory datapoint is recorded if no movement was detected

- ^No such case was found in the first couple of files, lol. So this will not be run on the dataset

- ^Perhaps we should just set a threshold there, so that movement at a speed of 1 meter per hour (if it exists) is treated as no movement

In [2]:
def fix_date_trajectories(row):
    '''Turns the 'date' and 'hour' strings of one row into a single datetime.

    row: anything indexable by 'date' ("YYYY-MM-DD") and 'hour' ("HH:MM:SS").
    The parsed timestamp is shifted by a fixed +8 hours to get Beijing time
    (no winter/summer time handling).'''

    stamp = '&'.join((row['date'], row['hour']))
    parsed = datetime.strptime(stamp, '%Y-%m-%d&%H:%M:%S')
    return parsed + timedelta(hours=8)

def feet_to_meters(text: str):
    '''Converts an altitude in feet to whole meters (because we use metric system here).

    text: the altitude in feet, either a number or a numeric string;
          decimal values such as "492.5" are accepted.
    Returns the altitude in meters, rounded to the nearest integer.

    The value is parsed with float() rather than int(): int() raises on
    decimal strings and cuts off fractional feet before the conversion.'''
    return round(float(text) / 3.28084)

def data_per_file(path, prune_redundant=False):
    '''DataFramifies one .plt trajectory file.

    path: location of the .plt file.
    prune_redundant: when True, drops the rows reported by
        delete_redundant_rows (position identical to the row before).

    Returns a DataFrame with the columns
        latitude (NORTH), longitude (EAST), altitude (METERS), date (Beijing time).
    Altitude comes out as -237 when the file has none (presumably the
    dataset's -777 ft placeholder after conversion).'''

    # The first 6 lines are skipped as preamble. header=None matters: without
    # it pandas takes the first remaining line - a real data point - as the
    # header row, and that point is silently lost once the columns are renamed.
    df = pd.read_csv(
        path,
        skiprows=6,
        header=None,
        names=['latitude', 'longitude', 'worthless1', 'altitude', 'worthless2', 'date', 'hour'],
    )
    df = df.drop(['worthless1', 'worthless2'], axis=1)

    # Merging date + hour into one local-time column that is called "date" again:
    full_date = df.apply(fix_date_trajectories, axis=1)
    df = df.drop(['date', 'hour'], axis=1)
    df['date'] = full_date

    # (possibly) pruning the useless (no movement) rows:
    if prune_redundant:
        to_be_deleted_rows = delete_redundant_rows(df)
        # axis=0: the helper returns row labels, not column names.
        df = df.drop(to_be_deleted_rows, axis=0).reset_index(drop=True)

    # Converting feet to meters
    df['altitude'] = df['altitude'].apply(feet_to_meters)

    return df

def save_data_per_person(path: str, number_person_as_string: str):
    '''Builds one full.csv for one person from all of that person's trajectory files.

    path: the dataset's Data folder.
    number_person_as_string: the person's folder name inside `path`.

    Every file in the person's Trajectory folder goes through data_per_file
    and is tagged with a "trajectory" id "<person>.<file name without extension>".
    The combined frame gets a "user" column and a "transportation" column; the
    latter is filled from labels.txt when the person has one.
    Returns nothing: the result is written to <path>/<person>/full.csv.'''

    person_dir = os.path.join(path, number_person_as_string)
    trajectory_dir = os.path.join(person_dir, "Trajectory")

    # Gather the per-file frames and concatenate once; concatenating inside the
    # loop copies all rows collected so far again on every iteration.
    # sorted() keeps the row order independent of the file system.
    pieces = []
    for element in sorted(os.listdir(trajectory_dir)):
        download_df = data_per_file(os.path.join(trajectory_dir, element))
        download_df["trajectory"] = number_person_as_string + "." + os.path.splitext(element)[0]
        pieces.append(download_df)

    # pd.concat rejects an empty list, so an empty folder gives an empty frame.
    df = pd.concat(pieces, axis=0, ignore_index=True) if pieces else pd.DataFrame()

    # Adds the user and (as much as possible) the transportation mode
    df["user"] = number_person_as_string
    df["transportation"] = None
    if os.path.exists(os.path.join(person_dir, "labels.txt")):
        # NOTE: called without a folder argument, the helper finds labels.txt
        # through the module-level `path`, not through this function's `path`.
        df = add_transportation_lables(df, number_person_as_string)

    # Saved as a CSV outside the redundant Trajectories folder:
    df.to_csv(os.path.join(person_dir, "full.csv"), index=False)
    
def add_transportation_lables(df: pd.DataFrame, number_person_as_string: str, data_path=None):
    '''Adds the transportation method to the dataframe.

    df: trajectory points with a datetime "date" column; its "transportation"
        column is written in place.
    number_person_as_string: the person's folder name; labels.txt is read from it.
    data_path: the dataset's Data folder. Defaults to the module-level `path`,
        which is what this function always used before the parameter existed.

    Returns the same dataframe object.'''
    if data_path is None:
        data_path = path
    labels_file = os.path.join(data_path, number_person_as_string, "labels.txt")
    labels = pd.read_csv(labels_file, sep="\t", parse_dates=[0, 1])

    # Same +8 h shift that the trajectory timestamps get, so that labels and
    # points are compared on one clock.
    shift = timedelta(hours=8)
    starts = labels["Start Time"] + shift
    ends = labels["End Time"] + shift

    for start_date, end_date, transportation in zip(starts, ends, labels["Transportation Mode"]):
        # between() includes both ends: a point recorded exactly at the start
        # or end time of a labelled segment belongs to that segment. Segments
        # are applied in file order, so on a shared boundary the later one wins.
        in_segment = df["date"].between(start_date, end_date)
        df.loc[in_segment, "transportation"] = transportation

    return df
    
def delete_redundant_rows(df):
    '''Finds the rows for which both the latitude and the longitude did not change.

    df: DataFrame with 'latitude' and 'longitude' columns.
    Returns the list of index labels of every row whose position equals that
    of the row directly before it. Nothing is deleted here; the caller drops
    the returned labels. Frames with fewer than two rows give an empty list.'''

    # shift() lines every row up with its predecessor; the first row is
    # compared with NaN and therefore never reported.
    same_latitude = df['latitude'].eq(df['latitude'].shift())
    same_longitude = df['longitude'].eq(df['longitude'].shift())
    unchanged = (same_latitude & same_longitude).to_numpy()

    return df.index[unchanged].tolist()

In [None]:
# Creates the csv's for every user: each entry of the Data folder is handed to
# save_data_per_person as one user.
# Warning: Takes quite a while
# The `if False:` guard keeps this cell from running; flip it to True to
# regenerate the files.
if False:
    for user in os.listdir(path):
        print(user)  # shows which user is being processed
        save_data_per_person(path, user)

In [None]:
# Combines the per-user full.csv files into packages of 10 users each, written
# to the working directory as "0-9.csv", "10-19.csv", ...
# Warning: Also takes quite a while
# The `if False:` guard keeps this cell from running; flip it to True to rebuild.
if False:
    full_df = pd.DataFrame()
    counter = 0        # users collected in the current package
    label_counter = 0  # position (in listing order) of the package's first user

    for user in os.listdir(path):
        counter += 1
        print(user)
        # NOTE(review): read_csv infers column types, so ids written as text
        # ("<person>" and "<person>.<file>") come back as numbers here -
        # confirm that this is intended.
        full_df = pd.concat([full_df, pd.read_csv(f"{path}{user}\\full.csv")])

        if counter == 10:
            full_df.to_csv(f"{label_counter}-{label_counter+9}.csv", index = False)
            label_counter += 10
            counter = 0

            full_df = pd.DataFrame()

    # Leftover users that did not fill a whole package. Skipped when there are
    # none: otherwise an empty file with a reversed range in its name
    # (e.g. "180-179.csv") would be written.
    if counter:
        full_df.to_csv(f"{label_counter}-{label_counter+counter-1}.csv", index = False)

In [None]:
# Combines all users into the same csv: every csv file in the working
# directory is read and the rows are stacked into "0-181.csv".
# Warning: Also also takes quite a while
# The `if False:` guard keeps this cell from running; flip it to True to rebuild.
if False:
    # The output file itself is left out of the inputs, so running the cell a
    # second time does not fold the previous result back in and duplicate rows.
    files = [i for i in os.listdir() if i[-3:] == "csv" and i != "0-181.csv"]
    df = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)
    df.to_csv("0-181.csv", index = False)

In [3]:
# Loads the combined dataset that the rest of the notebook works on.
# NOTE(review): the combining cell writes "0-181.csv" while this reads
# "0-182.csv"; the head() output also shows an "Unnamed: 0" column, so this
# file was presumably saved with its index - confirm which file is meant.
full_df = pd.read_csv("0-182.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# Quick look at the first rows of the loaded data.
full_df.head()

Unnamed: 0,latitude,longitude,altitude,date,trajectory,user,transportation
0,39.984683,116.31845,150,2008-10-23 10:53:10,0.20081,0,
1,39.984686,116.318417,150,2008-10-23 10:53:15,0.20081,0,
2,39.984688,116.318385,150,2008-10-23 10:53:20,0.20081,0,
3,39.984655,116.318263,150,2008-10-23 10:53:25,0.20081,0,
4,39.984611,116.318026,150,2008-10-23 10:53:30,0.20081,0,


In [5]:
# Lists the distinct transportation labels; NaN marks points without a label.
full_df["transportation"].unique()

array([nan, 'train', 'taxi', 'walk', 'bus', 'subway', 'airplane', 'car',
       'bike', 'motorcycle', 'run', 'boat'], dtype=object)

In [11]:
# One preset file per transport group: on foot, public transport, car.
# to_csv does not create missing folders, so make sure "presets" exists first.
os.makedirs("presets", exist_ok=True)
full_df[full_df["transportation"].isin(["walk", "run"])].to_csv("presets/walk.csv", index = False)
full_df[full_df["transportation"].isin(["bus", "train", "subway"])].to_csv("presets/PT.csv", index = False)
full_df[full_df["transportation"].isin(["car", "taxi"])].to_csv("presets/car.csv", index = False)

In [36]:
# Groups the trajectories by their number of recorded points and saves one
# preset file per group.
trajectory_count = full_df["trajectory"].value_counts()
# The size classes are disjoint: up to 120, 121-720 and 721-2160 points.
# The lower bounds are strict so that a trajectory with exactly 120 or 720
# points is not exported into two presets.
very_short_trajectories = list(trajectory_count[(trajectory_count <= 120)].keys())
short_trajectories = list(trajectory_count[(trajectory_count > 120) & (trajectory_count <= 720)].keys())
medium_trajectories = list(trajectory_count[(trajectory_count > 720) & (trajectory_count <= 2160)].keys())

# to_csv does not create missing folders, so make sure "presets" exists first.
os.makedirs("presets", exist_ok=True)
full_df[full_df["trajectory"].isin(very_short_trajectories)].to_csv("presets/very_short.csv", index = False)
full_df[full_df["trajectory"].isin(short_trajectories)].to_csv("presets/short.csv", index = False)
full_df[full_df["trajectory"].isin(medium_trajectories)].to_csv("presets/medium.csv", index = False)