In [4]:
# CHANGE THE PATH TO THE PATH AT YOUR PC!
path = "C:\\Users\\user\\Desktop\\TUe\\Topological\\Project\\Geolife Trajectories 1.3\\Data\\"
#path = "Geolife Trajectories 1.3\\Data\\"

import pandas as pd
import os
from datetime import datetime, timedelta

## Changes in the processing:

- Date converted to datetime format and to Beijing time (for day/night purposes)

- Feet altitude converted to meters, because we have any remains of self-respect

- Plug in the path from above and one number - the number of person as given by the folders

- Creates one full.csv dataset for one person outside the redundant Trajectories folder

- Prunes it so that no trajectory datapoint is recorded if no movement was detected

- ^No such case found in the first couple files, lol. So this will not run on the dataset

- ^Perhaps we shall just set a threshold there, so that movement of speed 1 meter per hour (if exists) is trated as no movement

In [22]:
def fix_date(row):
    '''Merges the date and time, converts them into datetime objects
    And converts into Beijing time [by adding 8 hours to the UTC, idc about winter/summer time]'''
    
    text1, text2 = row['date'], row['hour']
    text1 = text1+'&'+text2
    time = datetime.strptime(text1, '%Y-%m-%d&%H:%M:%S') + timedelta(hours=8)
    return time

def feet_to_meters(text: str):
    '''because we use metric system here'''
    return round(int(text)/3.28084)

def data_per_file(path, prune_redundant=False):
    '''Altitude set to -237 if None
    DataFramifies one .plt trajectory file
    latitude NORTH, longitude EAST, altitude METERS
    prune_redundant for now did not prune a single row, which is disturbing'''
    
    #Reading the dataset and removing redundant columns:
    df = pd.read_csv(path, skiprows=6)
    df.columns = ['latitude', 'longitude', 'worthless1', 'altitude', 'worthless2', 'date', 'hour']
    df = df.drop(['worthless1', 'worthless2'], axis=1)
    
    #Simplifying the date format and chaning to local time:
    df['full_date'] = df.apply(fix_date, axis=1)
    df = df.drop(['date', 'hour'], axis=1)
    df.columns = list(df.columns[:-1]) + ["date"]
    
    #(possibly) pruning the useless (no movement) rows:
    if prune_redundant:
        to_be_deleted_rows = delete_redundant_rows(df)
        df = df.drop(to_be_deleted_rows, axis=1).reset_index(drop=True)
    
    #Converting feet to meters
    df['altitude'] = df['altitude'].apply(feet_to_meters)
    
    return df

def save_data_per_person(path: str, number_person_as_string: str):
    '''Runs the data_per_file once for each file one person has and concatenates the result'''
    
    df = pd.DataFrame()
    for element in os.listdir(path + number_person_as_string + "\\Trajectory"):
        file_path = path + number_person_as_string + "\\Trajectory\\" + element
        download_df = data_per_file(file_path)
        download_df["trajectory"] = number_person_as_string + "." + element[:-4]
        df = pd.concat([df, download_df], axis=0).reset_index(drop=True)
    
    df["user"] = number_person_as_string
    
    #Saved as a CSV outside the redundant Trajectories folder:
    df.to_csv(path+number_person_as_string+'\\full.csv', index=False)
    
def delete_redundant_rows(df):
    '''Deletes rows for which both the longitude and latitude did not change
    Seems like there are no such rows now??? I was sure I saw them!'''
    
    for i in range(df.shape[0]-1):
        to_be_deleted_rows = []
        if df.loc[i]['latitude'] == df.loc[i+1]['latitude'] and df.loc[i]['longitude'] == df.loc[i+1]['longitude']:
            to_be_deleted_rows.append(i+1)
            
    return to_be_deleted_rows

In [23]:
df = data_per_file(path=f"{path}000\\Trajectory\\20081023025304.plt")
df.head()

Unnamed: 0,latitude,longitude,altitude,date
0,39.984683,116.31845,150,2008-10-23 10:53:10
1,39.984686,116.318417,150,2008-10-23 10:53:15
2,39.984688,116.318385,150,2008-10-23 10:53:20
3,39.984655,116.318263,150,2008-10-23 10:53:25
4,39.984611,116.318026,150,2008-10-23 10:53:30


In [25]:
save_data_per_person(path, "000")

In [26]:
# Warning: Takes quite a while
if False
    for user in os.listdir(path):
        save_data_per_person(path, user)

['000',
 '001',
 '002',
 '003',
 '004',
 '005',
 '006',
 '007',
 '008',
 '009',
 '010',
 '011',
 '012',
 '013',
 '014',
 '015',
 '016',
 '017',
 '018',
 '019',
 '020',
 '021',
 '022',
 '023',
 '024',
 '025',
 '026',
 '027',
 '028',
 '029',
 '030',
 '031',
 '032',
 '033',
 '034',
 '035',
 '036',
 '037',
 '038',
 '039',
 '040',
 '041',
 '042',
 '043',
 '044',
 '045',
 '046',
 '047',
 '048',
 '049',
 '050',
 '051',
 '052',
 '053',
 '054',
 '055',
 '056',
 '057',
 '058',
 '059',
 '060',
 '061',
 '062',
 '063',
 '064',
 '065',
 '066',
 '067',
 '068',
 '069',
 '070',
 '071',
 '072',
 '073',
 '074',
 '075',
 '076',
 '077',
 '078',
 '079',
 '080',
 '081',
 '082',
 '083',
 '084',
 '085',
 '086',
 '087',
 '088',
 '089',
 '090',
 '091',
 '092',
 '093',
 '094',
 '095',
 '096',
 '097',
 '098',
 '099',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
