# Data extraction from files

In [50]:
import pandas as pd
import os
import sys
from datetime import datetime,date
import plotly.express as px

project_path = '/home/yohann/projects/geolife'
os.chdir(project_path)
data_path = os.path.join(project_path, 'data')


In [51]:
from dataclasses import dataclass
from typing import List, Dict

@dataclass
class Record:
    user_id: str
    latitude: float
    longitude: float
    altitude: float
    datetime: datetime
    timestamp: float
    label: str = None

## Extracting records from a single file (=Trajectory)

In [52]:
# filepath
user_id = '170'
user_path = os.path.join(data_path, user_id)
records_files_paths = [os.path.join(user_path, 'Trajectory', file) for file in os.listdir(os.path.join(data_path, user_id, 'Trajectory')) if file.endswith('.plt')]
file_path = records_files_paths[0]

# read file and create dataframe
plt_files_columns = ['latitude', 'longitude', 'zero', 'altitude', 'days', 'date', 'time']
df = pd.read_csv(file_path, skiprows=6, header=None, names=plt_files_columns)
df.drop(columns=['zero', 'days'], inplace=True)
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
df['timestamp'] = df['datetime'].apply(lambda x: x.timestamp())
df.drop(columns=['date', 'time'], inplace=True)
df['user_id'] = user_id
df['label'] = None

print(df.head())
print("Features of the trajectory:")
print(f"Number of records: {df.shape[0]}")
print(f"start_datetime: {df['datetime'].min()}")
print(f"end_datetime: {df['datetime'].max()}")
print(f"duration: {df['datetime'].max() - df['datetime'].min()}")

    latitude   longitude  altitude            datetime     timestamp user_id  \
0  40.071500  116.314280       492 2008-05-12 01:27:05  1.210556e+09     170   
1  40.071270  116.314442       492 2008-05-12 01:27:06  1.210556e+09     170   
2  40.071345  116.314347       492 2008-05-12 01:27:07  1.210556e+09     170   
3  40.071296  116.314351       493 2008-05-12 01:27:09  1.210556e+09     170   
4  40.071192  116.314421       493 2008-05-12 01:27:11  1.210556e+09     170   

  label  
0  None  
1  None  
2  None  
3  None  
4  None  
Features of the trajectory:
Number of records: 991
start_datetime: 2008-05-12 01:27:05
end_datetime: 2008-05-12 01:55:35
duration: 0 days 00:28:30


In [73]:
@dataclass
class Trajectory:
    user_id: str
    records: List[Record]
    label: str = None
    
    @property
    def count(self) -> int:
        return len(self.records)
    
    @property
    def start_datetime(self) -> datetime:
        return min([record.datetime for record in self.records])
    
    @property
    def end_datetime(self) -> datetime:
        return max([record.datetime for record in self.records])
    
    @property
    def duration(self) -> datetime:
        return self.end_datetime - self.start_datetime
    
    @classmethod
    def from_file(
        cls: 'Trajectory',
        file_path: str,
        user_id: str,
        label: str = None
    ) -> 'Trajectory':
        plt_files_columns = ['latitude', 'longitude', 'zero', 'altitude', 'days', 'date', 'time']
        df = pd.read_csv(file_path, skiprows=6, header=None, names=plt_files_columns)
        df.drop(columns=['zero', 'days'], inplace=True)
        df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
        df['timestamp'] = df['datetime'].apply(lambda x: x.timestamp())
        df.drop(columns=['date', 'time'], inplace=True)
        df['user_id'] = user_id
        df['label'] = label
        records = [Record(**record) for record in df.to_dict(orient='records')]
        return cls(user_id=user_id, records=records, label=label)
    
    @property
    def df(self) -> pd.DataFrame:
        return pd.DataFrame([record.__dict__ for record in self.records])
    
    @property
    def features(self) -> Dict:
        # list all property into an dict
        return {
            'user_id': self.user_id,
            'count': self.count,
            'start_datetime': self.start_datetime,
            'end_datetime': self.end_datetime,
            'duration': self.duration,
            'label': self.label,
        }

In [74]:
trajectory = Trajectory.from_file(file_path, user_id)
trajectory.features

{'user_id': '170',
 'count': 991,
 'start_datetime': Timestamp('2008-05-12 01:27:05'),
 'end_datetime': Timestamp('2008-05-12 01:55:35'),
 'duration': Timedelta('0 days 00:28:30'),
 'label': None}

In [81]:
# list the trajectories from a single user

trajectories = []
for file in records_files_paths:
    trajectory = Trajectory.from_file(file, user_id)
    trajectories.append(trajectory)
    
trajectories_df = pd.DataFrame([trajectory.features for trajectory in trajectories])
trajectories_df.sort_values(by='start_datetime', inplace=True)
trajectories_df.head()
    

Unnamed: 0,user_id,count,start_datetime,end_datetime,duration,label
2,170,954,2008-04-28 11:27:04,2008-04-28 11:50:10,0 days 00:23:06,
1,170,992,2008-04-29 01:38:05,2008-04-29 02:03:28,0 days 00:25:23,
0,170,991,2008-05-12 01:27:05,2008-05-12 01:55:35,0 days 00:28:30,
4,170,546,2008-05-14 01:56:23,2008-05-14 02:19:52,0 days 00:23:29,
3,170,894,2008-05-21 01:33:29,2008-05-21 02:02:01,0 days 00:28:32,


In [82]:
@dataclass
class Trajectories:
    trajectories: List[Trajectory]
    
    @property
    def df(self) -> pd.DataFrame:
        return pd.concat([trajectory.df for trajectory in self.trajectories])
    
    @property
    def features(self) -> pd.DataFrame:
        df = pd.DataFrame([trajectory.features for trajectory in self.trajectories])
        df.sort_values(by='start_datetime', inplace=True)
        return df
    
    @classmethod
    def from_files(
        cls: 'Trajectories',
        files_paths: List[str], 
        user_id: str
    ) -> 'Trajectories':
        trajectories = [Trajectory.from_file(file_path, user_id) for file_path in files_paths]
        return cls(trajectories=trajectories)

In [83]:
trajectories = Trajectories.from_files(records_files_paths, user_id)
trajectories.features

Unnamed: 0,user_id,count,start_datetime,end_datetime,duration,label
2,170,954,2008-04-28 11:27:04,2008-04-28 11:50:10,0 days 00:23:06,
1,170,992,2008-04-29 01:38:05,2008-04-29 02:03:28,0 days 00:25:23,
0,170,991,2008-05-12 01:27:05,2008-05-12 01:55:35,0 days 00:28:30,
4,170,546,2008-05-14 01:56:23,2008-05-14 02:19:52,0 days 00:23:29,
3,170,894,2008-05-21 01:33:29,2008-05-21 02:02:01,0 days 00:28:32,


## Extracting labels (if exists) for a user (user_id='170')

In [85]:
if 'labels.txt' in os.listdir(user_path):
    df_labels = pd.DataFrame(columns=['start_datetime', 'end_datetime', 'mode'])
    with open(user_path + '/labels.txt') as f:
        for line in f:
            if 'Time' in line:
                continue
            start_datetime, end_datetime, mode = line.split('\t')
            mode = mode.replace('\n', '')
            df_labels = pd.concat([df_labels, pd.DataFrame({'start_datetime': [start_datetime], 'end_datetime': [end_datetime], 'mode': [mode]})])
    df_labels['start_datetime'] = pd.to_datetime(df_labels['start_datetime'])
    df_labels['end_datetime'] = pd.to_datetime(df_labels['end_datetime'])
    
df_labels.head()

Unnamed: 0,start_datetime,end_datetime,mode
0,2008-04-28 11:27:42,2008-04-28 11:27:58,walk
0,2008-04-28 11:28:00,2008-04-28 11:42:54,subway
0,2008-04-28 11:42:56,2008-04-28 11:50:10,walk
0,2008-04-29 01:38:21,2008-04-29 01:41:28,walk
0,2008-04-29 01:41:30,2008-04-29 01:57:53,subway


In [None]:
@dataclass


In [20]:
def plot_trajectory(
    df: pd.DataFrame,
    ) -> px.scatter_mapbox:
    df['text'] = df['datetime'].dt.strftime('%H:%M')
    fig = px.scatter_mapbox(
        df, 
        lat="latitude", 
        lon="longitude", 
        hover_data=['altitude', 'datetime'], 
        color='altitude',
        zoom=10,
        template="plotly_dark",
        text='text',
        )
    fig.update_layout(
        margin={"r":0,"t":0,"l":0,"b":0},
        mapbox_style='carto-darkmatter',
        )
    return fig

In [9]:
plot_trajectory(df)