# 1. Load Microsoft Geolife Dataset

In [2]:
# Define main directory 
main_directory = 'C:/Users/wengk.WK/OneDrive - Nanyang Technological University/Desktop/Anomaly Dectection In Trajectories Using GPS Data/Geolife Trajectories 1.3/Data'

### 1.1 Load Labelled Dataset (labels.txt & Trajectory folders)

In [6]:
import os 
import pandas as pd

# Initialize start_end_traj DataFrame 
start_end_traj = pd.DataFrame()

# Loop through each user directory
for user_id in range(182):
    user_folder = os.path.join(main_directory, f'{user_id:03}', 'Trajectory')
    labels_file = os.path.join(main_directory, f'{user_id:03}', 'labels.txt')
    
    # Check if labels.txt exists in current user folder
    if os.path.exists(labels_file):
        # Read the labels.txt file
        user_labels = pd.read_csv(labels_file, delimiter='\t')
        
        # Remove rows with 'boat' or 'airplane' in Transportation Mode
        user_labels = user_labels[~user_labels['Transportation Mode'].isin(['boat', 'airplane'])]
        
        # Consolidate transport modes
        mode_mapping = {
            'car': 'Car', 'taxi': 'Car',
            'train': 'Train', 'subway': 'Train',
            'walk': 'Walk', 'run': 'Walk',
            'bike': 'Bike', 'motorcycle': 'Bike',
            'bus': 'Bus'
        }
        
        # Apply mode mapping to Transportation Mode column
        user_labels['Transportation Mode'] = user_labels['Transportation Mode'].map(mode_mapping)
        
        # Add column for user ID
        user_labels['User'] = f'{user_id:03}'
        
        # Append to start_end_traj DataFrame
        start_end_traj = pd.concat([start_end_traj, user_labels], ignore_index=True)

# Reorder columns 
start_end_traj = start_end_traj[['User', 'Start Time', 'End Time', 'Transportation Mode']]

# Print start_end_traj DataFrame
print(start_end_traj)

      User           Start Time             End Time Transportation Mode
0      010  2007/06/26 11:32:29  2007/06/26 11:40:29                 Bus
1      010  2008/03/28 14:52:54  2008/03/28 15:59:59               Train
2      010  2008/03/28 16:00:00  2008/03/28 22:02:00               Train
3      010  2008/03/29 01:27:50  2008/03/29 15:59:59               Train
4      010  2008/03/29 16:00:00  2008/03/30 15:59:59               Train
...    ...                  ...                  ...                 ...
14689  179  2008/11/17 06:59:58  2008/11/17 07:06:16                 Bus
14690  179  2008/11/17 07:06:16  2008/11/17 07:14:32                Walk
14691  179  2008/11/29 01:58:05  2008/11/29 02:01:39                 Bus
14692  179  2008/11/29 02:01:39  2008/11/29 02:07:57                Walk
14693  179  2008/11/29 02:07:57  2008/11/29 02:43:37               Train

[14694 rows x 4 columns]


In [7]:
import os
import pandas as pd

# Initialize empty lists to store DataFrames for each user's trajectory data and labels
all_data = []
all_users_data = []

# Loop through each user directory
for user_folder in os.listdir(main_directory):
    user_folder_path = os.path.join(main_directory, user_folder)

    # Define paths for labels.txt and Trajectory folder
    labels_file_path = os.path.join(user_folder_path, 'labels.txt')
    trajectory_folder_path = os.path.join(user_folder_path, 'Trajectory')
    
    # Check if the path is a directory and contains labels.txt
    if os.path.isdir(user_folder_path) and os.path.exists(labels_file_path):
        # Read the labels.txt file
        labels_df = pd.read_csv(labels_file_path, sep='\t')
        # Add a column for user ID
        labels_df['User'] = user_folder
        # Append the DataFrame to the list of labels data
        all_users_data.append(labels_df)
        
        # Check if the user folder contains the Trajectory folder
        if os.path.exists(trajectory_folder_path) and os.path.isdir(trajectory_folder_path):
            # Initialize an empty list to store DataFrames for each PLT file
            user_data = []
            # Iterate through each PLT file in the Trajectory folder
            for plt_file in os.listdir(trajectory_folder_path):
                plt_file_path = os.path.join(trajectory_folder_path, plt_file)
                # Read the PLT file into a DataFrame
                df = pd.read_csv(plt_file_path, skiprows=6, header=None, 
                                 names=['Latitude(deg)', 'Longitude(deg)', 'All set to 0', 'Altitude(ft)', 'No of days since 12/30/1899', 'Date', 'Time'], 
                                 usecols=[0, 1, 3, 5, 6])  # Skip Column 2 'All set to 0' & Column 4 'No of days since 12/30/1899'
                # Append the DataFrame to the list of user data
                user_data.append(df)
            # Concatenate all DataFrames for user into a single DataFrame
            user_df = pd.concat(user_data, ignore_index=True)
            # Add column for user ID
            user_df['User'] = user_folder
            # Append the DataFrame for the user to the list of all data
            all_data.append(user_df)

# Concatenate all DataFrames for each user into a single DataFrame
all_data_df = pd.concat(all_data, ignore_index=True)

# Merge Date and Time columns to create a Date-Time column
all_data_df['Date-Time'] = pd.to_datetime(all_data_df['Date'] + ' ' + all_data_df['Time'])
# Drop the Date & Time columns 
all_data_df.drop(['Date', 'Time'], axis=1, inplace=True)
# Reorder the columns
all_data_df = all_data_df[['User', 'Latitude(deg)', 'Longitude(deg)', 'Altitude(ft)', 'Date-Time']]

# Concatenate all DataFrames for different users into a single DataFrame
all_labels_df = pd.concat(all_users_data, ignore_index=True)

# Remove records for "boat" and "airplane" 
all_labels_df = all_labels_df[~all_labels_df['Transportation Mode'].isin(['boat', 'airplane'])]

# Consolidate transport modes
mode_mapping = {
    'car': 'Car', 'taxi': 'Car',
    'train': 'Train', 'subway': 'Train',
    'walk': 'Walk', 'run': 'Walk',
    'bike': 'Bike', 'motorcycle': 'Bike',
    'bus': 'Bus'
}

# Apply mode mapping to Transportation Mode column
all_labels_df['Transportation Mode'] = all_labels_df['Transportation Mode'].map(mode_mapping)

# Ensure the Date-Time columns are in datetime format
all_data_df['Date-Time'] = pd.to_datetime(all_data_df['Date-Time'])
all_labels_df['Start Time'] = pd.to_datetime(all_labels_df['Start Time'])
all_labels_df['End Time'] = pd.to_datetime(all_labels_df['End Time'])

# Initialize empty list to store filtered data
filtered_data = []

# Iterate through each user
for user in all_data_df['User'].unique():
    user_data = all_data_df[all_data_df['User'] == user]
    user_labels = all_labels_df[all_labels_df['User'] == user]
    for _, label in user_labels.iterrows():
        # Filter the user_data based on Start Time and End Time
        filtered_user_data = user_data[(user_data['Date-Time'] >= label['Start Time']) & 
                                       (user_data['Date-Time'] <= label['End Time'])]
        if not filtered_user_data.empty:
            filtered_user_data = filtered_user_data.copy()  # To avoid SettingWithCopyWarning
            filtered_user_data.loc[:, 'Transportation Mode'] = label['Transportation Mode']
            filtered_data.append(filtered_user_data)

# Concatenate all filtered data into a single DataFrame
labelled_df = pd.concat(filtered_data, ignore_index=True)

# Reorder columns
labelled_df = labelled_df[['User', 'Transportation Mode', 'Date-Time', 'Latitude(deg)', 'Longitude(deg)', 'Altitude(ft)']]

# Print labelled_df DataFrame
print(labelled_df)

        User Transportation Mode           Date-Time  Latitude(deg)  \
0        010               Train 2008-03-28 14:54:40      39.894178   
1        010               Train 2008-03-28 14:55:14      39.894505   
2        010               Train 2008-03-28 14:56:13      39.894953   
3        010               Train 2008-03-28 14:57:12      39.894600   
4        010               Train 2008-03-28 14:58:11      39.889622   
...      ...                 ...                 ...            ...   
5496246  179               Train 2008-11-29 02:29:29      40.029320   
5496247  179               Train 2008-11-29 02:29:31      40.029111   
5496248  179               Train 2008-11-29 02:29:33      40.028904   
5496249  179               Train 2008-11-29 02:29:35      40.028697   
5496250  179               Train 2008-11-29 02:43:37      39.967705   

         Longitude(deg)  Altitude(ft)  
0            116.318200        -777.0  
1            116.321132        -777.0  
2            116.326452    

### 1.2 Combine Labelled Dataset with Transport Mode & Start and End Points (labelled_trajectories.csv)

In [2]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# Initialize empty lists to store DataFrames for each user's trajectory data and labels
all_data = []
all_users_data = []

# Loop through each user directory
for user_folder in os.listdir(main_directory):
    user_folder_path = os.path.join(main_directory, user_folder)

    # Define paths for labels.txt and Trajectory folder
    labels_file_path = os.path.join(user_folder_path, 'labels.txt')
    trajectory_folder_path = os.path.join(user_folder_path, 'Trajectory')
    
    # Check if the path is a directory and contains labels.txt
    if os.path.isdir(user_folder_path) and os.path.exists(labels_file_path):
        # Read labels.txt file
        labels_df = pd.read_csv(labels_file_path, sep='\t')
        # Add column for user ID
        labels_df['User'] = user_folder
        # Append DataFrame to the list of labels data
        all_users_data.append(labels_df)
        
        # Check if user folder contains the Trajectory folder
        if os.path.exists(trajectory_folder_path) and os.path.isdir(trajectory_folder_path):
            # Initialize an empty list to store DataFrames for each PLT file
            user_data = []
            # Iterate through each PLT file in the Trajectory folder
            for plt_file in os.listdir(trajectory_folder_path):
                plt_file_path = os.path.join(trajectory_folder_path, plt_file)
                # Read PLT file into a DataFrame
                df = pd.read_csv(plt_file_path, skiprows=6, header=None, 
                                 names=['Latitude(deg)', 'Longitude(deg)', 'All set to 0', 'Altitude(ft)', 'No of days since 12/30/1899', 'Date', 'Time'], 
                                 usecols=[0, 1, 3, 5, 6])  # Skip Column 2 'All set to 0' & Column 4 'No of days since 12/30/1899'
                # Append the DataFrame to the list of user data
                user_data.append(df)
            # Concatenate all DataFrames for the user into a single DataFrame
            user_df = pd.concat(user_data, ignore_index=True)
            # Add column for user ID
            user_df['User'] = user_folder
            # Append DataFrame for user to the list of all data
            all_data.append(user_df)

# Concatenate all DataFrames for each user into a single DataFrame
all_data_df = pd.concat(all_data, ignore_index=True)

# Merge Date and Time columns to create a Date-Time column
all_data_df['Date-Time'] = pd.to_datetime(all_data_df['Date'] + ' ' + all_data_df['Time'])
# Drop Date & Time columns 
all_data_df.drop(['Date', 'Time'], axis=1, inplace=True)

# Concatenate all DataFrames for different users into a single DataFrame
all_labels_df = pd.concat(all_users_data, ignore_index=True)

# Remove records for "boat" and "airplane" 
all_labels_df = all_labels_df[~all_labels_df['Transportation Mode'].isin(['boat', 'airplane'])]

# Consolidate transport modes
mode_mapping = {
    'car': 'Car', 'taxi': 'Car',
    'train': 'Train', 'subway': 'Train',
    'walk': 'Walk', 'run': 'Walk',
    'bike': 'Bike', 'motorcycle': 'Bike',
    'bus': 'Bus'
}

# Apply mode mapping to Transportation Mode column
all_labels_df['Transportation Mode'] = all_labels_df['Transportation Mode'].map(mode_mapping)

# Ensure the Date-Time columns are in datetime format
all_data_df['Date-Time'] = pd.to_datetime(all_data_df['Date-Time'])
all_labels_df['Start Time'] = pd.to_datetime(all_labels_df['Start Time'])
all_labels_df['End Time'] = pd.to_datetime(all_labels_df['End Time'])

# Initialize an empty list to store filtered data
filtered_data = []

# Iterate through each user
for user in all_data_df['User'].unique():
    user_data = all_data_df[all_data_df['User'] == user]
    user_labels = all_labels_df[all_labels_df['User'] == user]
    for _, label in user_labels.iterrows():
        # Filter the user_data based on Start Time and End Time
        filtered_user_data = user_data[(user_data['Date-Time'] >= label['Start Time']) & 
                                       (user_data['Date-Time'] <= label['End Time'])]
        if not filtered_user_data.empty:
            filtered_user_data = filtered_user_data.copy()  # To avoid SettingWithCopyWarning
            filtered_user_data.loc[:, 'Transportation Mode'] = label['Transportation Mode']
            filtered_user_data.loc[:, 'Start Time'] = label['Start Time']
            filtered_user_data.loc[:, 'End Time'] = label['End Time']
            filtered_user_data.loc[:, 'Start Latitude(deg)'] = filtered_user_data['Latitude(deg)'].iloc[0]
            filtered_user_data.loc[:, 'Start Longitude(deg)'] = filtered_user_data['Longitude(deg)'].iloc[0]
            filtered_user_data.loc[:, 'End Latitude(deg)'] = filtered_user_data['Latitude(deg)'].iloc[-1]
            filtered_user_data.loc[:, 'End Longitude(deg)'] = filtered_user_data['Longitude(deg)'].iloc[-1]

            filtered_data.append(filtered_user_data)

# Concatenate all filtered data into a single DataFrame
labelled_df = pd.concat(filtered_data, ignore_index=True)

# Reorder columns
labelled_df = labelled_df[['User', 'Transportation Mode', 'Date-Time', 'Latitude(deg)', 'Longitude(deg)', 'Altitude(ft)', 'Start Time', 'End Time',
                           'Start Latitude(deg)', 'Start Longitude(deg)', 'End Latitude(deg)', 'End Longitude(deg)']]

# Print labelled_df DataFrame
print(labelled_df)

# Save labelled_df to labelled_trajectories.csv file
output_file = os.path.join(main_directory, 'labelled_trajectories.csv')
labelled_df.to_csv(output_file, index=False)
print(f'Trajectories with mode of transport labelled saved to {output_file}')

        User Transportation Mode           Date-Time  Latitude(deg)  \
0        010               Train 2008-03-28 14:54:40      39.894178   
1        010               Train 2008-03-28 14:55:14      39.894505   
2        010               Train 2008-03-28 14:56:13      39.894953   
3        010               Train 2008-03-28 14:57:12      39.894600   
4        010               Train 2008-03-28 14:58:11      39.889622   
...      ...                 ...                 ...            ...   
5496246  179               Train 2008-11-29 02:29:29      40.029320   
5496247  179               Train 2008-11-29 02:29:31      40.029111   
5496248  179               Train 2008-11-29 02:29:33      40.028904   
5496249  179               Train 2008-11-29 02:29:35      40.028697   
5496250  179               Train 2008-11-29 02:43:37      39.967705   

         Longitude(deg)  Altitude(ft)          Start Time            End Time  \
0            116.318200        -777.0 2008-03-28 14:52:54 2008-03-

### 1.3 Combine Labelled Dataset to form Trajectories (labelled_trajectories_geopandas.csv)

In [3]:
import os
import pandas as pd
from shapely.geometry import LineString, Point
from math import radians, cos, sin, sqrt, atan2
import geopandas as gpd

# Load the labelled DataFrame
labelled_file = os.path.join(main_directory, 'labelled_trajectories.csv')
labelled_df = pd.read_csv(labelled_file, parse_dates=['Date-Time'])

# Ensure 'Date-Time' is in datetime format
labelled_df['Date-Time'] = pd.to_datetime(labelled_df['Date-Time'])
labelled_df['Start Time'] = pd.to_datetime(labelled_df['Start Time'])
labelled_df['End Time'] = pd.to_datetime(labelled_df['End Time'])

# Sort dataframe by User and Start Time
labelled_df = labelled_df.sort_values(by=['User', 'Start Time'])

# Function to create trajectories
def create_trajectory(group):
    if len(group) > 1: 
        return LineString(zip(group['Longitude(deg)'], group['Latitude(deg)']))
    else:
        return None  

# Extract start and end points
start_points = labelled_df.groupby(['User', 'Start Time', 'End Time']).first().reset_index()
end_points = labelled_df.groupby(['User', 'Start Time', 'End Time']).last().reset_index()

# Group by User, Start Time & End Time and create trajectories
trajectories = labelled_df.groupby(['User', 'Start Time', 'End Time'], group_keys=False).apply(create_trajectory).reset_index(name='Trajectory')

# Filter out None trajectories
trajectories = trajectories[trajectories['Trajectory'].notnull()]

# Merge trajectories back to the original dataframe (keep only first occurrence of each trajectories)
labelled_df = labelled_df.drop(columns=['Longitude(deg)', 'Latitude(deg)'])
labelled_df = pd.merge(labelled_df.drop_duplicates(subset=['User', 'Start Time', 'End Time']), trajectories, on=['User', 'Start Time', 'End Time'])

# Create GeoDataFrame
gdf = gpd.GeoDataFrame(labelled_df, geometry='Trajectory')

# Define the Haversine formula
def calculate_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in km
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c
    
# Calculate Distance using Haversine formula
distances = []
for index, row in gdf.iterrows():
    distance = 0
    if isinstance(row['Trajectory'], LineString):
        coords = list(row['Trajectory'].coords)
        for i in range(len(coords) - 1):
            lon1, lat1 = coords[i]
            lon2, lat2 = coords[i + 1]
            distance += calculate_distance(lat1, lon1, lat2, lon2)
    distances.append(distance)
gdf['Distance(km)'] = distances

# Calculate Duration in hour and Speed in km/h
gdf['Duration(hr)'] = (gdf['End Time'] - gdf['Start Time']).dt.total_seconds() / 3600
gdf['Speed(km/h)'] = gdf['Distance(km)'] / gdf['Duration(hr)']

# View the trajectories 
print(gdf[['User', 'Trajectory', 'Transportation Mode', 'Start Time', 'End Time', 'Start Longitude(deg)', 
           'Start Latitude(deg)', 'End Longitude(deg)', 'End Latitude(deg)', 'Distance(km)', 'Duration(hr)', 'Speed(km/h)']])

# Save the GeoDataFrame to labelled_trajectories_geopandas.csv file
output_file_path = os.path.join(main_directory, 'labelled_trajectories_geopandas.csv')
gdf.to_csv(output_file_path, index=False)
print(f"GeoDataFrame of Labelled Dataset saved to {output_file_path}")

  trajectories = labelled_df.groupby(['User', 'Start Time', 'End Time'], group_keys=False).apply(create_trajectory).reset_index(name='Trajectory')


      User                                         Trajectory  \
0       10  LINESTRING (116.31820 39.89418, 116.32113 39.8...   
1       10  LINESTRING (116.71495 39.50293, 116.72614 39.4...   
2       10  LINESTRING (116.95947 36.66328, 116.95627 36.6...   
3       10  LINESTRING (109.61987 34.49713, 109.60540 34.4...   
4       10  LINESTRING (95.45776 41.14720, 95.44490 41.153...   
...    ...                                                ...   
9563   179  LINESTRING (116.31263 40.07015, 116.31261 40.0...   
9564   179  LINESTRING (116.29891 40.08890, 116.29895 40.0...   
9565   179  LINESTRING (116.30685 40.07311, 116.30683 40.0...   
9566   179  LINESTRING (116.31381 40.07024, 116.31375 40.0...   
9567   179  LINESTRING (116.31338 40.06952, 116.31352 40.0...   

     Transportation Mode          Start Time            End Time  \
0                  Train 2008-03-28 14:52:54 2008-03-28 15:59:59   
1                  Train 2008-03-28 16:00:00 2008-03-28 22:02:00   
2              

### 1.4 Load Unlabelled Dataset - No Transport Mode and Start & End Points (unlabelled_trajectories.csv)

In [4]:
import os
import pandas as pd

# Initialize empty list to store DataFrames from each user's Trajectory folder
dataframes = []

# # Loop through each user directory (000 to 181)
for user_folder in range(182):
    user_folder_path = os.path.join(main_directory, f'{user_folder:03d}')
    labels_file_path = os.path.join(user_folder_path, 'labels.txt')
    trajectory_folder_path = os.path.join(user_folder_path, 'Trajectory')
    
    # Check if labels.txt does not exist (unlabelled data)
    if not os.path.exists(labels_file_path):
        # Process Trajectory folder
        for trajectory_file in os.listdir(trajectory_folder_path):
            if trajectory_file.endswith('.plt'):
                trajectory_file_path = os.path.join(trajectory_folder_path, trajectory_file)
                # Read the trajectory file, skipping the first 6 lines of metadata
                df = pd.read_csv(trajectory_file_path, skiprows=6, header=None, 
                                 names=['Latitude(deg)', 'Longitude(deg)', 'Zero1', 'Altitude(ft)', 'Date Days', 'Date', 'Time'])
                df['User'] = f'{user_folder:03d}'
                df['Date-Time'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
                df = df[['User', 'Date-Time', 'Latitude(deg)', 'Longitude(deg)', 'Altitude(ft)']]
                dataframes.append(df)

# Combine all dataframes into a single dataframe
unlabelled_trajectories_df = pd.concat(dataframes, ignore_index=True)

# Print unlabelled_trajectories_df DataFrame
print(unlabelled_trajectories_df)

# Save unlabelled_trajectories_df DataFrame to unlabelled_trajectories file
output_file_path = os.path.join(main_directory, 'unlabelled_trajectories.csv')
unlabelled_trajectories_df.to_csv(output_file_path, index=False)
print(f"Unlabelled trajectories saved to {output_file_path}")

         User           Date-Time  Latitude(deg)  Longitude(deg)  Altitude(ft)
0         000 2008-10-23 02:53:04      39.984702      116.318417    492.000000
1         000 2008-10-23 02:53:10      39.984683      116.318450    492.000000
2         000 2008-10-23 02:53:15      39.984686      116.318417    492.000000
3         000 2008-10-23 02:53:20      39.984688      116.318385    492.000000
4         000 2008-10-23 02:53:25      39.984655      116.318263    492.000000
...       ...                 ...            ...             ...           ...
12359609  181 2008-03-14 03:39:56      40.914867      111.710500   3802.493438
12359610  181 2008-03-14 03:41:17      40.914267      111.710333   3795.931759
12359611  181 2008-03-14 03:43:02      40.912467      111.710667   3795.931759
12359612  181 2008-03-14 03:43:28      40.911517      111.711317   3779.527559
12359613  181 2008-03-14 03:43:40      40.910933      111.711617   3802.493438

[12359614 rows x 5 columns]
Unlabelled trajectories

# 2. Model to Predict Start & End Points

### 2.1 Train & Evaluate Model using labelled_trajectories - Random Forest Regressor (Start: 88% & End: 90%)

In [5]:
import os
import pandas as pd
import numpy as np
import datetime
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Load the labelled dataset
labelled_file = os.path.join(main_directory, 'labelled_trajectories.csv')
labelled_df = pd.read_csv(labelled_file)

# Define feature and target columns for start and end points
feature_columns = ['User', 'Latitude(deg)', 'Longitude(deg)', 'Altitude(ft)', 'Date-Time']
start_target = ['Start Latitude(deg)', 'Start Longitude(deg)', 'Start Time']
end_target = ['End Latitude(deg)', 'End Longitude(deg)', 'End Time']

# Ensure datetime columns are in the correct format
labelled_df['Start Time'] = pd.to_datetime(labelled_df['Start Time'])
labelled_df['End Time'] = pd.to_datetime(labelled_df['End Time'])
labelled_df['Date-Time'] = pd.to_datetime(labelled_df['Date-Time'])

# Convert datetime to numerical values (timestamp)
labelled_df['Start Time'] = labelled_df['Start Time'].astype('int64') / 10**9
labelled_df['End Time'] = labelled_df['End Time'].astype('int64') / 10**9
labelled_df['Date-Time'] = labelled_df['Date-Time'].astype('int64') / 10**9

# Features and targets for start points
X_start = labelled_df[feature_columns]
y_start = labelled_df[start_target]

# Splitting the data for start points (80% training & 20% testing)
X_start_train, X_start_test, y_start_train, y_start_test = train_test_split(X_start, y_start, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV for start points
start_param_grid = {
    'n_estimators': [100],
    'max_depth': [20],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'bootstrap': [True]
}
regressor_start = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_search_start = GridSearchCV(estimator=regressor_start, param_grid=start_param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search_start.fit(X_start_train, y_start_train)
best_start_regressor = grid_search_start.best_estimator_

# Save the best start regressor model
start_model_path = os.path.join(main_directory, 'start_points_regressor.pkl')
joblib.dump(best_start_regressor, start_model_path)
print(f"Start points regressor model saved to {start_model_path}")

# Features and targets for end points
X_end = labelled_df[feature_columns]
y_end = labelled_df[end_target]

# Splitting the data for end points (80% training & 20% testing)
X_end_train, X_end_test, y_end_train, y_end_test = train_test_split(X_end, y_end, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV for end points
end_param_grid = {
    'n_estimators': [100],
    'max_depth': [20],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'bootstrap': [True]
}
regressor_end = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_search_end = GridSearchCV(estimator=regressor_end, param_grid=end_param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search_end.fit(X_end_train, y_end_train)
best_end_regressor = grid_search_end.best_estimator_

# Save the best end regressor model
end_model_path = os.path.join(main_directory, 'end_points_regressor.pkl')
joblib.dump(best_end_regressor, end_model_path)
print(f"End points regressor model saved to {end_model_path}")

# Make predictions on the entire dataset for start and end points
predicted_start_points = best_start_regressor.predict(X_start)
predicted_end_points = best_end_regressor.predict(X_end)

# Add predicted columns to dataframe
labelled_df['Predicted Start Latitude(deg)'] = predicted_start_points[:, 0]
labelled_df['Predicted Start Longitude(deg)'] = predicted_start_points[:, 1]
labelled_df['Predicted Start Time'] = predicted_start_points[:, 2]
labelled_df['Predicted End Latitude(deg)'] = predicted_end_points[:, 0]
labelled_df['Predicted End Longitude(deg)'] = predicted_end_points[:, 1]
labelled_df['Predicted End Time'] = predicted_end_points[:, 2]

# Convert predicted start and end times back to datetime string format
labelled_df['Predicted Start Time'] = pd.to_datetime(labelled_df['Predicted Start Time'], unit='s')
labelled_df['Predicted End Time'] = pd.to_datetime(labelled_df['Predicted End Time'], unit='s')

# Calculate distance, duration and speed for predicted points and ensure they are valid
def calculate_distance(lat1, lon1, lat2, lon2):
    from math import radians, cos, sin, sqrt, atan2
    R = 6371  # Radius of the Earth in km
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

labelled_df['Predicted Distance(km)'] = labelled_df.apply(
    lambda row: calculate_distance(row['Predicted Start Latitude(deg)'], row['Predicted Start Longitude(deg)'],
                                   row['Predicted End Latitude(deg)'], row['Predicted End Longitude(deg)']), axis=1)

labelled_df['Predicted Duration(hr)'] = (labelled_df['Predicted End Time'] - labelled_df['Predicted Start Time']).dt.total_seconds() / 3600
labelled_df['Predicted Speed(km/h)'] = labelled_df['Predicted Distance(km)'] / labelled_df['Predicted Duration(hr)']

# Filter out invalid distances and durations
valid_predictions = (labelled_df['Predicted Distance(km)'] > 0) & (labelled_df['Predicted Duration(hr)'] > 0)
invalid_predictions_count = (~valid_predictions).sum()
print(f"Number of invalid predictions: {invalid_predictions_count}")

# Evaluate accuracy by comparing predicted and original values
labelled_df['Start Latitude Accurate'] = np.isclose(labelled_df['Start Latitude(deg)'], labelled_df['Predicted Start Latitude(deg)'], atol=0.0001)
labelled_df['Start Longitude Accurate'] = np.isclose(labelled_df['Start Longitude(deg)'], labelled_df['Predicted Start Longitude(deg)'], atol=0.0001)
labelled_df['Start Time Accurate'] = np.isclose(labelled_df['Start Time'], labelled_df['Predicted Start Time'].astype('int64') / 10**9, atol=60)  # Allow 1-minute difference
labelled_df['End Latitude Accurate'] = np.isclose(labelled_df['End Latitude(deg)'], labelled_df['Predicted End Latitude(deg)'], atol=0.0001)
labelled_df['End Longitude Accurate'] = np.isclose(labelled_df['End Longitude(deg)'], labelled_df['Predicted End Longitude(deg)'], atol=0.0001)
labelled_df['End Time Accurate'] = np.isclose(labelled_df['End Time'], labelled_df['Predicted End Time'].astype('int64') / 10**9, atol=60)  # Allow 1-minute difference

# Calculate overall accuracy for start and end points
start_accuracy = labelled_df[['Start Latitude Accurate', 'Start Longitude Accurate', 'Start Time Accurate']].mean().mean()
end_accuracy = labelled_df[['End Latitude Accurate', 'End Longitude Accurate', 'End Time Accurate']].mean().mean()

print(f"Overall Start Points Prediction Accuracy: {start_accuracy:}")
print(f"Overall End Points Prediction Accuracy: {end_accuracy:}")

# Save labelled_df DataFrame to labelled_trajectories_with_predictions.csv
output_file = os.path.join(main_directory, 'labelled_trajectories_with_predictions.csv')
labelled_df.to_csv(output_file, index=False)
print(f"Data with predictions saved to {output_file}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Start points regressor model saved to C:/Users/wengk.WK/OneDrive - Nanyang Technological University/Desktop/Anomaly Dectection In Trajectories Using GPS Data/Geolife Trajectories 1.3/Data\start_points_regressor.pkl
Fitting 3 folds for each of 1 candidates, totalling 3 fits
End points regressor model saved to C:/Users/wengk.WK/OneDrive - Nanyang Technological University/Desktop/Anomaly Dectection In Trajectories Using GPS Data/Geolife Trajectories 1.3/Data\end_points_regressor.pkl
Number of invalid predictions: 10656
Overall Start Points Prediction Accuracy: 0.8871445887994077
Overall End Points Prediction Accuracy: 0.9093220087656113
Data with predictions saved to C:/Users/wengk.WK/OneDrive - Nanyang Technological University/Desktop/Anomaly Dectection In Trajectories Using GPS Data/Geolife Trajectories 1.3/Data\labelled_trajectories_with_predictions.csv


### 2.2 Predict Start & End Points for Unlabelled Dataset to form Trajectories (use start_points_regressor.pkl & end_points_regressor.pkl)

In [3]:
import os
import pandas as pd
import numpy as np
from haversine import haversine, Unit
from shapely.geometry import LineString
import geopandas as gpd
import joblib

# Load the trained regressor models for start and end points
start_model_file = os.path.join(main_directory, 'start_points_regressor.pkl')
end_model_file = os.path.join(main_directory, 'end_points_regressor.pkl')
regressor_start = joblib.load(start_model_file)
regressor_end = joblib.load(end_model_file)

# Load the unlabelled dataset
unlabelled_file = os.path.join(main_directory, 'unlabelled_trajectories.csv')
unlabelled_df = pd.read_csv(unlabelled_file)

# Ensure datetime columns are in the correct format
unlabelled_df['Date-Time'] = pd.to_datetime(unlabelled_df['Date-Time'])
unlabelled_df['Date-Time'] = unlabelled_df['Date-Time'].astype('int64') // 10**9

# Extract features used in the trained models
feature_columns = ['User', 'Latitude(deg)', 'Longitude(deg)', 'Altitude(ft)', 'Date-Time']

# Ensure columns are in the same order as used during training
unlabelled_features = unlabelled_df[feature_columns]

# Predict start and end points for unlabelled data
y_start_pred = regressor_start.predict(unlabelled_features)
y_end_pred = regressor_end.predict(unlabelled_features)

# Update 'Predicted' columns in unlabelled_df
unlabelled_df['Predicted Start Latitude(deg)'] = y_start_pred[:, 0]
unlabelled_df['Predicted Start Longitude(deg)'] = y_start_pred[:, 1]
unlabelled_df['Predicted Start Time'] = y_start_pred[:, 2]

unlabelled_df['Predicted End Latitude(deg)'] = y_end_pred[:, 0]
unlabelled_df['Predicted End Longitude(deg)'] = y_end_pred[:, 1]
unlabelled_df['Predicted End Time'] = y_end_pred[:, 2]

# Ensure valid predictions for start and end times
unlabelled_df['Predicted End Time'] = unlabelled_df.apply(
    lambda row: max(row['Predicted End Time'], row['Predicted Start Time']), axis=1)

def calculate_distance(lat1, lon1, lat2, lon2):
    # Haversine formula to calculate distance between two points on the Earth
    from math import radians, cos, sin, sqrt, atan2
    R = 6371  # Radius of the Earth in km
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

# Calculate distance and duration for predicted points and ensure they are valid
unlabelled_df['Predicted Distance(km)'] = unlabelled_df.apply(
    lambda row: calculate_distance(row['Predicted Start Latitude(deg)'], row['Predicted Start Longitude(deg)'],
                                   row['Predicted End Latitude(deg)'], row['Predicted End Longitude(deg)']), axis=1)

unlabelled_df['Predicted Duration(hr)'] = (unlabelled_df['Predicted End Time'] - unlabelled_df['Predicted Start Time']) / 3600
unlabelled_df['Predicted Speed(km/h)'] = unlabelled_df['Predicted Distance(km)'] / unlabelled_df['Predicted Duration(hr)']

# Filter out invalid distances and durations
valid_predictions = (unlabelled_df['Predicted Distance(km)'] > 0) & (unlabelled_df['Predicted Duration(hr)'] > 0)
invalid_predictions_count = (~valid_predictions).sum()
print(f"Number of invalid predictions: {invalid_predictions_count}")

# Filter out invalid predictions
unlabelled_df = unlabelled_df[valid_predictions]

# Convert times back to datetime format
unlabelled_df['Date-Time'] = pd.to_datetime(unlabelled_df['Date-Time'], unit='s')
unlabelled_df['Predicted Start Time'] = pd.to_datetime(unlabelled_df['Predicted Start Time'], unit='s')
unlabelled_df['Predicted End Time'] = pd.to_datetime(unlabelled_df['Predicted End Time'], unit='s')

# Convert times to string format
unlabelled_df['Date-Time'] = unlabelled_df['Date-Time'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')
unlabelled_df['Predicted Start Time'] = unlabelled_df['Predicted Start Time'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')
unlabelled_df['Predicted End Time'] = unlabelled_df['Predicted End Time'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')

# Print unlabelled_df DataFrame
print(f"Predicted start and end points with metrics for unlabelled trajectories: ")
print(unlabelled_df)

# Combine Unlabelled Dataset to form Trajectories (No Transport Mode)
# Rename columns to remove "Predicted" 
unlabelled_df.rename(columns={
    'Predicted Start Time': 'Start Time',
    'Predicted Start Longitude(deg)': 'Start Longitude(deg)',
    'Predicted Start Latitude(deg)': 'Start Latitude(deg)',
    'Predicted End Time': 'End Time',
    'Predicted End Longitude(deg)': 'End Longitude(deg)',
    'Predicted End Latitude(deg)': 'End Latitude(deg)',
    'Predicted Distance(km)': 'Distance(km)',
    'Predicted Duration(hr)': 'Duration(hr)',
    'Predicted Speed(km/h)': 'Speed(km/h)'
}, inplace=True)

# Sort dataframe by User and Start Time
unlabelled_df = unlabelled_df.sort_values(by=['User', 'Start Time'])

# Function to create trajectories
def create_trajectory(group):
    if len(group) > 1:  # Ensure there are more than one point
        return LineString(zip(group['Longitude(deg)'], group['Latitude(deg)']))
    else:
        return None  # Return None if there's only one point

# Group by User, Start Time, and End Time and create trajectories
trajectories = unlabelled_df.groupby(['User', 'Start Time', 'End Time'], group_keys=False).apply(create_trajectory).reset_index(name='Trajectory')

# Filter out None trajectories
trajectories = trajectories[trajectories['Trajectory'].notnull()]

# Merge trajectories back to the original dataframe, keeping only the first occurrence of each combination of User, Start Time, and End Time
unlabelled_df = unlabelled_df.drop(columns=['Longitude(deg)', 'Latitude(deg)'])
unlabelled_df = pd.merge(unlabelled_df.drop_duplicates(subset=['User', 'Start Time', 'End Time']), trajectories, on=['User', 'Start Time', 'End Time'])

# Create GeoDataFrame
gdf = gpd.GeoDataFrame(unlabelled_df, geometry='Trajectory')

# Filter out invalid values for Distance or Duration or Speed
valid_predictions = (gdf['Distance(km)'] > 0) & (gdf['Duration(hr)'] > 0) & (gdf['Speed(km/h)'] > 0)
invalid_predictions_count = (~valid_predictions).sum()
print(f"Number of invalid predictions: {invalid_predictions_count}")

# View the trajectories with additional columns
print(f"Predicted trajectories: ")
print(gdf[['User', 'Trajectory', 'Altitude(ft)', 'Date-Time', 'Start Time', 'End Time', 'Start Longitude(deg)', 
           'Start Latitude(deg)', 'End Longitude(deg)', 'End Latitude(deg)', 'Distance(km)', 'Duration(hr)', 'Speed(km/h)']])

# Save the GeoDataFrame to unlabelled_trajectories_geopandas.csv file
output_file_path = os.path.join(main_directory, 'unlabelled_trajectories_geopandas.csv')
gdf.to_csv(output_file_path, index=False)
print(f"GeoDataFrame of Unlabelled Dataset saved to {output_file_path}")

Number of invalid predictions: 1343723
Predicted start and end points with metrics for unlabelled trajectories: 
          User                   Date-Time  Latitude(deg)  Longitude(deg)  \
0            0  2008-10-23 02:53:04.000000      39.984702      116.318417   
1            0  2008-10-23 02:53:10.000000      39.984683      116.318450   
2            0  2008-10-23 02:53:15.000000      39.984686      116.318417   
3            0  2008-10-23 02:53:20.000000      39.984688      116.318385   
4            0  2008-10-23 02:53:25.000000      39.984655      116.318263   
...        ...                         ...            ...             ...   
12359609   181  2008-03-14 03:39:56.000000      40.914867      111.710500   
12359610   181  2008-03-14 03:41:17.000000      40.914267      111.710333   
12359611   181  2008-03-14 03:43:02.000000      40.912467      111.710667   
12359612   181  2008-03-14 03:43:28.000000      40.911517      111.711317   
12359613   181  2008-03-14 03:43:40.0000

  trajectories = unlabelled_df.groupby(['User', 'Start Time', 'End Time'], group_keys=False).apply(create_trajectory).reset_index(name='Trajectory')


Number of invalid predictions: 0
Predicted trajectories: 
       User                                         Trajectory  Altitude(ft)  \
0         0  LINESTRING (116.31842 39.98470, 116.31845 39.9...    492.000000   
1         0  LINESTRING (116.31432 39.98462, 116.31411 39.9...    113.000000   
2         0  LINESTRING (116.32026 39.99128, 116.32045 39.9...     71.000000   
3         0  LINESTRING (116.32215 39.99948, 116.32455 39.9...    148.000000   
4         0  LINESTRING (116.32218 39.99961, 116.32232 39.9...    142.000000   
...     ...                                                ...           ...   
47301   181  LINESTRING (116.31097 39.98043, 116.31072 39.9...    167.322835   
47302   181  LINESTRING (116.30895 39.98547, 116.30972 39.9...    223.097113   
47303   181  LINESTRING (116.30000 39.98067, 116.29978 39.9...    754.593176   
47304   181  LINESTRING (116.31235 39.97198, 116.31270 39.9...    216.535433   
47305   181  LINESTRING (111.70922 40.91815, 111.70953 40.9...

# 3. Model to Predict Transport Mode 

### 3.1 Train & Evaluate Model using labelled data - XGBoost Classifier (99%)

In [8]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define the feature columns and target column
feature_columns = ['Altitude(ft)', 'Start Time', 'End Time', 'Start Longitude(deg)', 'Start Latitude(deg)', 
                   'End Longitude(deg)', 'End Latitude(deg)', 'Distance(km)', 'Duration(hr)', 'Speed(km/h)']

# Load the labelled data
labelled_file = os.path.join(main_directory, 'labelled_trajectories_geopandas.csv')
labelled_df = pd.read_csv(labelled_file)

# Ensure datetime columns are in the correct format
labelled_df['Start Time'] = pd.to_datetime(labelled_df['Start Time'])
labelled_df['End Time'] = pd.to_datetime(labelled_df['End Time'])

# Convert datetime to numerical values (timestamp)
labelled_df['Start Time'] = labelled_df['Start Time'].astype('int64') // 10**9
labelled_df['End Time'] = labelled_df['End Time'].astype('int64') // 10**9

# Reset index of the DataFrame
labelled_df.reset_index(drop=True, inplace=True)

# Features and target
X = labelled_df[feature_columns]
y = labelled_df['Transportation Mode']

# Encode the target variable as numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Define the XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_), random_state=42)

# Define parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [200],
    'max_depth': [7],
    'learning_rate': [0.15],
    'subsample': [0.85],
    'colsample_bytree': [0.55],
    'reg_alpha': [0.05],
    'reg_lambda': [2.0],
}

# Perform grid search
grid_search = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X, y_encoded)

# Get the best model from the search
best_xgb_model = grid_search.best_estimator_

# Make predictions on the entire dataset
y_pred_encoded = best_xgb_model.predict(X)

# Decode the predicted and true labels back to original transportation modes
y_pred_decoded = label_encoder.inverse_transform(y_pred_encoded)

# Evaluate the predictions
accuracy = accuracy_score(y, y_pred_decoded)
report = classification_report(y, y_pred_decoded)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

# Save the best model to a file
model_file = os.path.join(main_directory, 'transport_mode_classifier.pkl')
joblib.dump(best_xgb_model, model_file)
print(f"Trained model saved to {model_file}")

# Add 'Predicted Transport Mode' column
labelled_df['Predicted Transport Mode'] = y_pred_decoded

# Convert numeric times back to date-time format
labelled_df['Start Time'] = pd.to_datetime(labelled_df['Start Time'], unit='s')
labelled_df['End Time'] = pd.to_datetime(labelled_df['End Time'], unit='s')

# Convert datetime columns to the desired string format
labelled_df['Start Time'] = labelled_df['Start Time'].dt.strftime('%Y-%m-%d %H:%M:%S')
labelled_df['End Time'] = labelled_df['End Time'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Save the label encoder to a file
encoder_file = os.path.join(main_directory, 'label_encoder.pkl')
joblib.dump(label_encoder, encoder_file)
print(f"Label encoder saved to {encoder_file}")

# Save the updated dataframe with predictions to a new CSV file
output_file_with_predictions = os.path.join(main_directory, 'labelled_trajectories_with_predictions.csv')
labelled_df.to_csv(output_file_with_predictions, index=False)
print(f"Labelled trajectories with predictions saved to {output_file_with_predictions}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Accuracy: 0.9997909698996655
Classification Report:
              precision    recall  f1-score   support

        Bike       1.00      1.00      1.00      1572
         Bus       1.00      1.00      1.00      1866
         Car       1.00      1.00      1.00      1307
       Train       1.00      1.00      1.00       787
        Walk       1.00      1.00      1.00      4036

    accuracy                           1.00      9568
   macro avg       1.00      1.00      1.00      9568
weighted avg       1.00      1.00      1.00      9568

Trained model saved to C:/Users/wengk.WK/OneDrive - Nanyang Technological University/Desktop/Anomaly Dectection In Trajectories Using GPS Data/Geolife Trajectories 1.3/Data\transport_mode_classifier.pkl
Label encoder saved to C:/Users/wengk.WK/OneDrive - Nanyang Technological University/Desktop/Anomaly Dectection In Trajectories Using GPS Data/Geolife Trajectories 1.3/Data\label_encoder.pkl
Labell

In [9]:
print(test_set)

NameError: name 'test_set' is not defined

In [None]:
print(labelled_df)

### 3.2 Predict Mode of Transport for Unlabelled Data (use XGBoost transport_mode_classifier.pkl)

In [10]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the trained model
model_file = os.path.join(main_directory, 'transport_mode_classifier.pkl')
clf = joblib.load(model_file)

# Load the unlabelled trajectories dataset
unlabelled_file = os.path.join(main_directory, 'unlabelled_trajectories_geopandas.csv')
unlabelled_df = pd.read_csv(unlabelled_file)

# Load the label encoder from a file
encoder_file = os.path.join(main_directory, 'label_encoder.pkl')
label_encoder = joblib.load(encoder_file)

# Ensure datetime columns are in the correct format
unlabelled_df['Start Time'] = pd.to_datetime(unlabelled_df['Start Time'], errors='coerce')
unlabelled_df['End Time'] = pd.to_datetime(unlabelled_df['End Time'], errors='coerce')

# Check if any NaN values in 'Start Time' or 'End Time' column
nan_start = unlabelled_df['Start Time'].isna().sum()
nan_end = unlabelled_df['End Time'].isna().sum()
nan_time = nan_start + nan_end
print(f"Number of records with NaN in Start or End Time columns: {nan_time}")

# Drop rows with NaT values in 'Start Time' or 'End Time'
unlabelled_df.dropna(subset=['Start Time', 'End Time'], inplace=True)

# Convert datetime to numerical values (timestamp)
unlabelled_df['Start Time'] = unlabelled_df['Start Time'].astype('int64') / 10**9
unlabelled_df['End Time'] = unlabelled_df['End Time'].astype('int64') / 10**9

# Extract features used in the trained model
feature_columns = ['Altitude(ft)', 'Start Time', 'End Time', 'Start Longitude(deg)', 'Start Latitude(deg)', 
                   'End Longitude(deg)', 'End Latitude(deg)', 'Distance(km)', 'Duration(hr)', 'Speed(km/h)']

# Ensure columns are in the same order as used during training
unlabelled_features = unlabelled_df[feature_columns]

# Check for and handle large or infinite values in 'Speed'
max_speed = unlabelled_features[unlabelled_features['Speed(km/h)'] != np.inf]['Speed(km/h)'].max()
unlabelled_features.loc[unlabelled_features['Speed(km/h)'] == np.inf, 'Speed(km/h)'] = max_speed

# Predict transport modes for unlabelled data
predicted_modes = clf.predict(unlabelled_features)

# Decode the predicted modes from numerical labels to original mode names
predicted_modes_decoded = label_encoder.inverse_transform(predicted_modes)

# Update 'Predicted Transport Mode' column in unlabelled_df
unlabelled_df['Predicted Transportation Mode'] = predicted_modes_decoded

# Convert numerical times back to datetime format
unlabelled_df['Start Time'] = pd.to_datetime(unlabelled_df['Start Time'] * 10**9)
unlabelled_df['End Time'] = pd.to_datetime(unlabelled_df['End Time'] * 10**9)

# Print unlabelled_df DataFrame
print(unlabelled_df)

# Save unlabelled_df dataframe to predicted_trajectories_transport.csv file
output_file_path = os.path.join(main_directory, 'predicted_trajectories_transport.csv')
unlabelled_df.to_csv(output_file_path, index=False)
print(f"Predicted transport modes for unlabelled trajectories saved to {output_file_path}")

Number of records with NaN in Start or End Time columns: 0
       User            Date-Time  Altitude(ft)  Start Latitude(deg)  \
0         0  2008-10-23 02:53:04    492.000000            39.975352   
1         0  2008-10-23 02:55:25    113.000000            39.975352   
2         0  2008-10-23 04:25:57     71.000000            39.976305   
3         0  2008-10-23 04:31:42    148.000000            39.990085   
4         0  2008-10-23 04:31:47    142.000000            39.990085   
...     ...                  ...           ...                  ...   
47301   181  2008-02-16 08:40:19    167.322835            39.976168   
47302   181  2008-02-16 08:04:42    223.097113            39.976155   
47303   181  2008-02-16 08:59:27    754.593176            39.976155   
47304   181  2008-02-16 08:32:35    216.535433            39.975860   
47305   181  2008-03-14 02:57:55   3825.459318            39.977733   

       Start Longitude(deg)                    Start Time  End Latitude(deg)  \
0       

In [11]:
import pandas as pd
import numpy as np

# Count of records for each unique transportation mode
mode_counts = unlabelled_df['Predicted Transportation Mode'].value_counts()
print("Counts of records for each transportation mode:")
print(mode_counts)

# Check if any NaN values in Transportation Mode column
nan_count = unlabelled_df['Predicted Transportation Mode'].isna().sum()
print(f"Number of records with NaN in Transport Mode column: {nan_count}")

Counts of records for each transportation mode:
Predicted Transportation Mode
Walk     26900
Bus      10820
Car       4084
Train     3841
Bike      1661
Name: count, dtype: int64
Number of records with NaN in Transport Mode column: 0


# 4. Form Combined Trajectories Dataset 

### 4.1 Combine Labelled & Predicted Unlabelled Trajectories (combined_trajectories_predictions.csv)

In [13]:
import os
import pandas as pd

# Load labelled dataset
labelled_file = os.path.join(main_directory, 'labelled_trajectories_geopandas.csv')
labelled_df = pd.read_csv(labelled_file, parse_dates=['Start Time', 'End Time'])

# Load predicted unlabelled dataset
unlabelled_file = os.path.join(main_directory, 'predicted_trajectories_transport.csv')
unlabelled_predicted_df = pd.read_csv(unlabelled_file, parse_dates=['Start Time', 'End Time'])

# Rename Predicted Transportation Mode column to Transportation Mode in unlabelled_predicted_df
unlabelled_predicted_df.rename(columns={'Predicted Transportation Mode': 'Transportation Mode'}, inplace=True)

# Combine the labelled and unlabelled datasets
combined_df = pd.concat([labelled_df, unlabelled_predicted_df], ignore_index=True)

# Ensure Date-Time columns are in datetime format
datetime_columns = ['Date-Time', 'Start Time', 'End Time']
for column in datetime_columns:
    if column in combined_df.columns:
        combined_df[column] = pd.to_datetime(combined_df[column], errors='coerce')

# Convert datetime columns to YYYY-MM-DD HH:MM:SS format
for column in datetime_columns:
    if column in combined_df.columns:
        combined_df[column] = combined_df[column].dt.strftime('%Y-%m-%d %H:%M:%S')

# Print combined_df DataFrame
print(combined_df)

# Save the combined dataframe to a new CSV file
output_combined_file = os.path.join(main_directory, 'combined_trajectories_predictions.csv')
combined_df.to_csv(output_combined_file, index=False)
print(f"Combined dataset with predictions saved to {output_combined_file}")

       User Transportation Mode            Date-Time  Altitude(ft)  \
0        10               Train  2008-03-28 14:54:40   -777.000000   
1        10               Train  2008-03-28 16:00:01   -777.000000   
2        10               Train  2008-03-29 01:32:52   -777.000000   
3        10               Train  2008-03-29 16:00:48   -777.000000   
4        10               Train  2008-03-30 16:00:39   -777.000000   
...     ...                 ...                  ...           ...   
56869   181                Walk  2008-02-16 08:40:19    167.322835   
56870   181                Walk  2008-02-16 08:04:42    223.097113   
56871   181                Walk  2008-02-16 08:59:27    754.593176   
56872   181                Walk  2008-02-16 08:32:35    216.535433   
56873   181                Bike  2008-03-14 02:57:55   3825.459318   

                Start Time             End Time  Start Latitude(deg)  \
0      2008-03-28 14:52:54  2008-03-28 15:59:59            39.894178   
1      2008-03-

In [14]:
import pandas as pd
import numpy as np

# Count of records for each unique transportation mode
mode_counts = combined_df['Transportation Mode'].value_counts()
print("Counts of records for each transportation mode:")
print(mode_counts)

# Check if any NaN values in Transportation Mode column
nan_count = combined_df['Transportation Mode'].isna().sum()
print(f"Number of records with NaN in Transport Mode column: {nan_count}")

Counts of records for each transportation mode:
Transportation Mode
Walk     30936
Bus      12686
Car       5391
Train     4628
Bike      3233
Name: count, dtype: int64
Number of records with NaN in Transport Mode column: 0


### 4.2. Generate Anomalous Trajectories (5% of combined dataset)

In [15]:
import os
import pandas as pd
from shapely.geometry import LineString
import random
import numpy as np
from math import radians, sin, cos, atan2, sqrt

# Load combined_trajectories_predictions.csv
file_path = os.path.join(main_directory, 'combined_trajectories_predictions.csv')
df = pd.read_csv(file_path, parse_dates=['Date-Time'])

# Function to parse each trajectory individually
def parse_trajectories(trajectory_string):
    # Remove 'LINESTRING (' and ')'
    coords_str = trajectory_string.replace('LINESTRING (', '').replace(')', '')
        
    # Split by ', ' to get individual coordinate pairs
    coord_pairs = coords_str.split(', ')
        
    # Ensure there are at least two coordinates for a LineString
    if len(coord_pairs) < 2:
        raise ValueError(f"Insufficient coordinate pairs in LineString representation: {trajectory_string}")
        
    # Extract coordinates
    coords = []
    for coord_pair in coord_pairs:
        lon_lat = coord_pair.split()  # Split by space to get longitude and latitude
        if len(lon_lat) != 2:
            raise ValueError(f"Invalid coordinate pair format: {coord_pair}")
        lon = float(lon_lat[0])
        lat = float(lon_lat[1])
        coords.append((lon, lat))
        
    return LineString(coords)

# Convert 'Trajectory' column to LineString objects
df['Parsed_Trajectory'] = df['Trajectory'].apply(parse_trajectories)

# Drop rows where Parsed_Trajectory is None
df = df.dropna(subset=['Parsed_Trajectory'])

# Determine the number of trajectories to modify (5% of total)
anomalous_count = int(len(df) * 0.05) 

# Select random indices of trajectories to modify
anomalous_indices = random.sample(df.index.tolist(), anomalous_count)

# Function to calculate Anomaly Correlation Coefficient (ACC)
def calculate_acc(original_coords, modified_coords):
    original_array = np.array(original_coords)
    modified_array = np.array(modified_coords)
    
    # Compute correlation coefficient between original and modified trajectories
    acc = np.corrcoef(original_array[:, 0], modified_array[:, 0])[0, 1] * np.corrcoef(original_array[:, 1], modified_array[:, 1])[0, 1]
    return acc

# Function to generate anomalous trajectories based on ACC formula
def generate_anomalous_trajectory(original_trajectory, threshold=0.5):
    # Convert LineString to list of coordinates
    coordinates = list(original_trajectory.coords)
        
    # Number of deviations (5 to 10)
    num_deviations = random.randint(5, 10)
        
    # Generate random indices to insert deviations
    positions = random.sample(range(1, len(coordinates)), min(num_deviations, len(coordinates) - 1))
        
    # Apply deviations until the ACC drops below the threshold
    for pos in sorted(positions, reverse=True):
        deviation = (coordinates[pos][0] + random.uniform(-0.001, 0.001), 
                        coordinates[pos][1] + random.uniform(-0.001, 0.001))  
        modified_coords = coordinates[:pos] + [deviation] + coordinates[pos+1:]
            
        # Calculate ACC
        acc = calculate_acc(coordinates, modified_coords)
        if acc < threshold:
            coordinates = modified_coords
        else:
            break
        
    return LineString(coordinates)

# Define Haversine formula
def calculate_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in km
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

# Calculate Distance using Haversine formula
def calculate_trajectory_distance(trajectory):
    if isinstance(trajectory, LineString):
        coords = list(trajectory.coords)
        distance = 0
        for i in range(len(coords) - 1):
            lon1, lat1 = coords[i]
            lon2, lat2 = coords[i + 1]
            distance += calculate_distance(lat1, lon1, lat2, lon2)
        return distance
    return 0

# Apply anomaly generation to selected trajectories
df.loc[df.index.isin(anomalous_indices), 'Parsed_Trajectory'] = df.loc[df.index.isin(anomalous_indices), 'Parsed_Trajectory'].apply(lambda traj: generate_anomalous_trajectory(traj, threshold=0.9))

# Create a copy of the filtered DataFrame for anomalous trajectories
df_anomalous = df[df.index.isin(anomalous_indices)].copy()

# Recalculate Distance, Duration, and Speed for each anomalous trajectory
df_anomalous['Distance(km)'] = df_anomalous['Parsed_Trajectory'].apply(calculate_trajectory_distance)  # in kilometers
df_anomalous['Duration(hr)'] = (pd.to_datetime(df_anomalous['End Time']) - pd.to_datetime(df_anomalous['Start Time'])).dt.total_seconds() / 3600 # in hours
df_anomalous['Speed(km/h)'] = df_anomalous['Distance(km)'] / df_anomalous['Duration(hr)'] # in km/hr

# Add Anomalous column with value 1
df_anomalous['Anomalous'] = 1

# Drop the Parsed_Trajectory column
df_anomalous.drop(columns=['Parsed_Trajectory'], inplace=True)

# Print df_anomalous DataFrame
print(df_anomalous)

# Save df_anomalous DataFrame to generated_anomalous_trajectories.csv file
output_file_path = os.path.join(main_directory, 'generated_anomalous_trajectories.csv')
df_anomalous.to_csv(output_file_path, index=False)
print(f"Anomalous trajectories saved to {output_file_path}")

# Print the total number of anomalous trajectories generated
print(f"Total number of anomalous trajectories generated: {len(anomalous_indices)}")

# Print the count of anomalous trajectories generated for each mode of transport
anomalous_modes_count = df_anomalous['Transportation Mode'].value_counts()
print("\nCount of anomalous trajectories generated for each mode of transport:")
print(anomalous_modes_count)

  c /= stddev[:, None]
  c /= stddev[None, :]


       User Transportation Mode           Date-Time  Altitude(ft)  \
2        10               Train 2008-03-29 01:32:52   -777.000000   
13       10                Walk 2008-04-01 11:30:37   -777.000000   
126      10                 Bus 2008-09-17 22:56:54      7.000000   
132      10                Walk 2008-09-18 00:53:08    167.000000   
145      10                Walk 2008-09-19 11:00:03    161.000000   
...     ...                 ...                 ...           ...   
56809   172                 Car 2008-06-28 05:03:23     95.100000   
56818   172                Walk 2008-07-18 11:14:30     98.400000   
56820   172                Walk 2008-07-18 11:14:40     98.400000   
56831   176                Bike 2007-11-30 09:33:10    272.309711   
56861   181                Walk 2007-12-07 10:06:05    226.377953   

                Start Time             End Time  Start Latitude(deg)  \
2      2008-03-29 01:27:50  2008-03-29 15:59:59            36.663282   
13     2008-04-01 11:30:37 

### 4.3 Combine Original & Generated Anomalous Trajectories (merged_trajectories.csv)

In [16]:
import os
import pandas as pd

# Load original combined trajectories dataset
combined_file = os.path.join(main_directory, 'combined_trajectories_predictions.csv')
combined_df = pd.read_csv(combined_file, parse_dates=['Start Time', 'End Time'])

# Add Anomalous column to original combined_df and set its value to 0
combined_df['Anomalous'] = 0

# Load generated anomalous trajectories
anomalous_file = os.path.join(main_directory, 'generated_anomalous_trajectories.csv')
anomalous_df = pd.read_csv(anomalous_file, parse_dates=['Start Time', 'End Time'])

# Combine trajectories
merged_df = pd.concat([combined_df, anomalous_df], ignore_index=True)

# Check for negative values in the Duration(hr) column and return the count
negative_duration_count = (df['Duration(hr)'] <= 0).sum()
print(f"Number of records with zero or negative Duration(hr): {negative_duration_count}")

# Check for negative values in the Distance(km) column and return the count
negative_distance_count = (df['Distance(km)'] <= 0).sum()
print(f"Number of records with zero or negative Distance(km): {negative_distance_count}")

# Check for negative values in the Speed(km/h) column and return the count
negative_speed_count = (df['Speed(km/h)'] <= 0).sum()
print(f"Number of records with zero or negative Speed(km/h): {negative_speed_count}")

# Remove records with negative or zero speed
merged_df = merged_df[merged_df['Duration(hr)'] > 0]

# Remove records with negative or zero speed
merged_df = merged_df[merged_df['Distance(km)'] > 0]

# Remove records with negative or zero speed
merged_df = merged_df[merged_df['Speed(km/h)'] > 0]

# Print merged_df DataFrame
print(merged_df)

# Save merged_df dataframe to merged_trajectories.csv file
output_merged_file = os.path.join(main_directory, 'merged_trajectories.csv')
merged_df.to_csv(output_merged_file, index=False)
print(f"Merged dataset saved to {output_merged_file}")

Number of records with zero or negative Duration(hr): 0
Number of records with zero or negative Distance(km): 1
Number of records with zero or negative Speed(km/h): 1
       User Transportation Mode            Date-Time  Altitude(ft)  \
0        10               Train  2008-03-28 14:54:40   -777.000000   
1        10               Train  2008-03-28 16:00:01   -777.000000   
2        10               Train  2008-03-29 01:32:52   -777.000000   
3        10               Train  2008-03-29 16:00:48   -777.000000   
4        10               Train  2008-03-30 16:00:39   -777.000000   
...     ...                 ...                  ...           ...   
59712   172                 Car  2008-06-28 05:03:23     95.100000   
59713   172                Walk  2008-07-18 11:14:30     98.400000   
59714   172                Walk  2008-07-18 11:14:40     98.400000   
59715   176                Bike  2007-11-30 09:33:10    272.309711   
59716   181                Walk  2007-12-07 10:06:05    226.377

# 5. Model to Detect Anomalies in Trajectories

### 5.1 Train & Evaluate Model - Combined Model: XGBoost, IsolationForest 

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from shapely.geometry import LineString
import joblib

# Load merged dataset
merged_file = os.path.join(main_directory, 'merged_trajectories.csv')
merged_df = pd.read_csv(merged_file, parse_dates=['Start Time', 'End Time'])

# Split dataset into normal and anomalous
normal_df = merged_df[merged_df['Anomalous'] == 0]
anomalous_df = merged_df[merged_df['Anomalous'] == 1]

# Split datasets into training and testing sets
normal_train, normal_test = train_test_split(normal_df, test_size=0.2, random_state=42)
anomalous_train, anomalous_test = train_test_split(anomalous_df, test_size=0.2, random_state=42)
train_df = pd.concat([normal_train, anomalous_train], ignore_index=True)
test_df = pd.concat([normal_test, anomalous_test], ignore_index=True)

# Function to parse each trajectory individually
def parse_trajectories(trajectory_string):
    # Remove 'LINESTRING (' and ')'
    coords_str = trajectory_string.replace('LINESTRING (', '').replace(')', '')
    
    # Split by ', ' to get individual coordinate pairs
    coord_pairs = coords_str.split(', ')
    
    if len(coord_pairs) < 2:
        raise ValueError(f"Insufficient coordinate pairs in LineString representation: {trajectory_string}")
    
    # Extract coordinates
    coords = []
    for coord_pair in coord_pairs:
        lon_lat = coord_pair.split()
        if len(lon_lat) != 2:
            raise ValueError(f"Invalid coordinate pair format: {coord_pair}")
        lon = float(lon_lat[0])
        lat = float(lon_lat[1])
        coords.append((lon, lat))
    
    return LineString(coords)

# Define features
features = ['Distance(km)', 'Duration(hr)', 'Speed(km/h)', 'Transportation Mode', 'Start Time', 'End Time',
            'Start Latitude(deg)', 'Start Longitude(deg)', 'End Latitude(deg)', 'End Longitude(deg)']

# Preprocess training set
le = LabelEncoder()
train_df['Transportation Mode'] = le.fit_transform(train_df['Transportation Mode'])
train_df['Start Time'] = pd.to_datetime(train_df['Start Time']).astype(np.int64) // 10**9
train_df['End Time'] = pd.to_datetime(train_df['End Time']).astype(np.int64) // 10**9
train_df['Parsed Trajectory'] = train_df['Trajectory'].apply(parse_trajectories)

# Initialize & perform Grid Search for XGBoost Model
param_grid_xgb = {
    'n_estimators': [300],
    'max_depth': [12],
    'learning_rate': [0.3],
    'colsample_bytree': [0.9],
    'subsample': [1.0],
    'random_state': [42]
}

xgb_model = XGBClassifier()
xgb_grid = GridSearchCV(xgb_model, param_grid_xgb, cv=3, scoring='accuracy', n_jobs=-1)
xgb_grid.fit(train_df[features], train_df['Anomalous'])

# Initialize & perform Grid Search for Isolation Forest Model
param_grid_if = {
    'n_estimators': [300],
    'max_samples': [2],
    'contamination': [0.1]
}

iso_forest = IsolationForest(random_state=42)
if_grid = GridSearchCV(iso_forest, param_grid_if, cv=3, scoring='accuracy', n_jobs=-1)
if_grid.fit(train_df[features])

# Save both models
xgb_model_file_path = os.path.join(main_directory, 'xgb_model.pkl')
joblib.dump(xgb_grid.best_estimator_, xgb_model_file_path)

iso_forest_file_path = os.path.join(main_directory, 'iso_forest_model.pkl')
joblib.dump(if_grid.best_estimator_, iso_forest_file_path)

# Preprocess testing set
test_df['Transportation Mode'] = le.transform(test_df['Transportation Mode'])
test_df['Start Time'] = pd.to_datetime(test_df['Start Time']).astype(np.int64) // 10**9
test_df['End Time'] = pd.to_datetime(test_df['End Time']).astype(np.int64) // 10**9
test_df['Parsed Trajectory'] = test_df['Trajectory'].apply(parse_trajectories)

# Predict anomalies
test_df['XGB_Predicted_Anomaly'] = xgb_grid.best_estimator_.predict(test_df[features])
test_df['IF_Predicted_Anomaly'] = if_grid.best_estimator_.predict(test_df[features])
test_df['IF_Predicted_Anomaly'] = test_df['IF_Predicted_Anomaly'].apply(lambda x: 1 if x == -1 else 0)

# Combine predictions using weighted voting
weights = {'XGB': 0.6, 'IF': 0.4}
test_df['Weighted_Predicted_Anomaly'] = np.zeros(len(test_df))
for model, preds in {
    'XGB': test_df['XGB_Predicted_Anomaly'],
    'IF': test_df['IF_Predicted_Anomaly']
}.items():
    test_df['Weighted_Predicted_Anomaly'] += weights[model] * preds

# Apply a threshold to determine final anomaly prediction
threshold = 0.5
test_df['Final_Predicted_Anomaly'] = (test_df['Weighted_Predicted_Anomaly'] > threshold).astype(int)

# Significant Variations in Speed or Duration
speed_threshold = 350  # in km/h
duration_threshold = 24  # in hours (1 day)

def detect_variation_anomalies(row):
    if row['Speed(km/h)'] > speed_threshold or row['Duration(hr)'] > duration_threshold:
        return 1
    return 0

test_df['Variation_Anomaly'] = test_df.apply(detect_variation_anomalies, axis=1)

# Combine all anomaly detections
test_df['Combined_Anomaly'] = np.maximum(test_df['Final_Predicted_Anomaly'], test_df['Variation_Anomaly'])

# Calculate the number of detected and generated anomalies
num_detected_anomalies = test_df['Combined_Anomaly'].sum()
num_anomalies_generated = test_df['Anomalous'].sum()

# Calculate the detection percentage
detection_percentage = (num_detected_anomalies / num_anomalies_generated) * 100 

print(f"Number of Anomalies Detected: {num_detected_anomalies}")
print(f"Number of Anomalies Generated: {num_anomalies_generated}")
print(f"Detection Percentage: {detection_percentage:}%")

# Drop columns related to anomalies
columns_to_drop = [col for col in test_df.columns if 'Anomaly' in col]
test_df = test_df.drop(columns=columns_to_drop)

# Print the final dataframe
print(test_df)



Number of Anomalies Detected: 491
Number of Anomalies Generated: 567
Detection Percentage: 86.59611992945327%
       User  Transportation Mode            Date-Time  Altitude(ft)  \
0        22                    1  2009-05-07 16:19:24         181.0   
1         0                    4  2009-07-04 04:31:25          14.0   
2       163                    4  2008-06-22 12:47:01           3.3   
3         8                    3  2008-11-19 08:22:23          94.0   
4         1                    4  2008-10-24 05:33:53         196.0   
...     ...                  ...                  ...           ...   
11937    84                    3  2008-10-25 09:57:02         494.0   
11938    25                    4  2009-06-18 07:19:33         269.0   
11939   112                    4  2008-06-11 16:01:27         154.2   
11940    70                    3  2008-09-28 07:22:40         120.0   
11941    23                    4  2009-01-17 03:08:49           0.0   

       Start Time    End Time  Start 

### 5.2 Detect Anomalies in Merged Dataset with Generated Anomalies (use merged_trajectories.csv)

In [6]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from shapely.geometry import LineString
from sklearn.metrics import classification_report, accuracy_score

# Define the functions for contextual anomaly detection
def parse_trajectories(trajectory_string):
    coords_str = trajectory_string.replace('LINESTRING (', '').replace(')', '')
    coord_pairs = coords_str.split(', ')
    if len(coord_pairs) < 2:
        raise ValueError(f"Insufficient coordinate pairs in LineString representation: {trajectory_string}")
    coords = [(float(pair.split()[0]), float(pair.split()[1])) for pair in coord_pairs]
    return LineString(coords)

# Significant Variations in Speed or Duration
speed_threshold = 350  # in km/h
duration_threshold = 24  # in hours (1 day)

def detect_variation_anomalies(row):
    if row['Speed(km/h)'] > speed_threshold or row['Duration(hr)'] > duration_threshold:
        return 1
    return 0

# Load the merged trajectories dataset
file_path = os.path.join(main_directory, 'merged_trajectories.csv')
df = pd.read_csv(file_path)

# Drop the 'Anomalous' column
df = df.drop(columns=['Anomalous'])

# Define the features
features = ['Distance(km)', 'Duration(hr)', 'Speed(km/h)', 'Transportation Mode', 'Start Time', 'End Time',
            'Start Latitude(deg)', 'Start Longitude(deg)', 'End Latitude(deg)', 'End Longitude(deg)']

# Initialize the label encoder
le = LabelEncoder()

# Convert Transportation Mode to numeric values using label encoder
df['Transportation Mode'] = le.fit_transform(df['Transportation Mode'])

# Convert Start Time and End Time to numeric values (timestamp)
df['Start Time'] = pd.to_datetime(df['Start Time']).astype(np.int64) // 10**9
df['End Time'] = pd.to_datetime(df['End Time']).astype(np.int64) // 10**9

# Check for missing or infinite values
initial_record_count = len(df)

for feature in features:
    df[feature] = pd.to_numeric(df[feature], errors='coerce')  # Convert to numeric, forcing errors to NaN
df = df.replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
df = df.dropna(subset=features)  # Drop rows with NaN in feature columns

# Record count after dropping NA values
record_count_after_dropping_na = len(df)
records_dropped = initial_record_count - record_count_after_dropping_na
print(f"Number of records dropped due to NA values: {records_dropped}")

# Load the trained models
xgb_model_file_path = os.path.join(main_directory, 'xgb_model.pkl')
iso_forest_file_path = os.path.join(main_directory, 'iso_forest_model.pkl')

xgb_model = joblib.load(xgb_model_file_path)
iso_forest = joblib.load(iso_forest_file_path)

# Parse trajectories for contextual anomaly detection
df['Parsed Trajectory'] = df['Trajectory'].apply(parse_trajectories)

# Predict anomalies with both models
df['XGB_Predicted_Anomaly'] = xgb_model.predict(df[features])
df['IF_Predicted_Anomaly'] = iso_forest.predict(df[features])
df['IF_Predicted_Anomaly'] = df['IF_Predicted_Anomaly'].apply(lambda x: 1 if x == -1 else 0)

# Define weights for each model
weights = {'XGB': 0.6, 'IF': 0.4}

# Combine predictions using weighted voting
df['Weighted_Predicted_Anomaly'] = np.zeros(len(df))
for model, preds in {
    'XGB': df['XGB_Predicted_Anomaly'],
    'IF': df['IF_Predicted_Anomaly']
}.items():
    df['Weighted_Predicted_Anomaly'] += weights[model] * preds

# Apply a threshold to determine final anomaly prediction
threshold = 0.5
df['Final_Predicted_Anomaly'] = (df['Weighted_Predicted_Anomaly'] > threshold).astype(int)

df['Variation_Anomaly'] = df.apply(detect_variation_anomalies, axis=1)

# Combine all anomaly detections
df['Combined_Anomaly'] = np.maximum(df['Final_Predicted_Anomaly'], df['Variation_Anomaly'])

# Convert Start & End Time and Transportation Mode back to their original forms
df['Transportation Mode'] = le.inverse_transform(df['Transportation Mode'])
df['Start Time'] = pd.to_datetime(df['Start Time'], unit='s')
df['End Time'] = pd.to_datetime(df['End Time'], unit='s')

# Print number of anomalies detected
num_anomalies = df['Combined_Anomaly'].sum()
print(f"Number of anomalies detected: {num_anomalies}")

# Print percentage of dataset that is anomaly
total_records = len(df)
percentage_anomalies = (num_anomalies / total_records) * 100
print(f"Percentage of dataset that are anomalous: {percentage_anomalies:.3f}%")

# Filter and print anomalous records
anomalous_records = df[df['Combined_Anomaly'] == 1]
print("Anomalous records:")
print(anomalous_records)

# Save the dataset with predicted anomalies to a new CSV file
output_file_path = os.path.join(main_directory, 'merged_trajectories_anomalies_detection.csv')
df.to_csv(output_file_path, index=False)
print(f"Merged trajectories with predicted anomalies saved to {output_file_path}")

Number of records dropped due to NA values: 0
Number of anomalies detected: 2964
Percentage of dataset that are anomalous: 4.964%
Anomalous records:
       User Transportation Mode            Date-Time  Altitude(ft)  \
198      10                Walk  2008-09-26 12:50:33     16.000000   
322      10               Train  2008-10-09 23:10:30     92.000000   
353      10                 Car  2008-10-16 00:01:50    135.000000   
407      10                Walk  2008-11-07 05:24:51    430.000000   
423      10                 Car  2008-12-07 10:19:25     10.000000   
...     ...                 ...                  ...           ...   
59699   172                 Bus  2008-06-27 02:02:12    187.000000   
59700   172                 Car  2008-06-27 02:05:36    187.000000   
59701   172                 Car  2008-06-28 05:03:23     95.100000   
59703   172                Walk  2008-07-18 11:14:40     98.400000   
59704   176                Bike  2007-11-30 09:33:10    272.309711   

          

### 5.3 Detect Anomalies in Combined Dataset w/o Generated Anomalies (use combined_trajectories_predictions.csv)

In [7]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from shapely.geometry import LineString
from sklearn.metrics import classification_report, accuracy_score

# Define the functions for contextual anomaly detection
def parse_trajectories(trajectory_string):
    coords_str = trajectory_string.replace('LINESTRING (', '').replace(')', '')
    coord_pairs = coords_str.split(', ')
    if len(coord_pairs) < 2:
        raise ValueError(f"Insufficient coordinate pairs in LineString representation: {trajectory_string}")
    coords = [(float(pair.split()[0]), float(pair.split()[1])) for pair in coord_pairs]
    return LineString(coords)

# Significant Variations in Speed or Duration
speed_threshold = 350  # in km/h
duration_threshold = 24  # in hours (1 day)

def detect_variation_anomalies(row):
    if row['Speed(km/h)'] > speed_threshold or row['Duration(hr)'] > duration_threshold:
        return 1
    return 0

# Load the merged trajectories dataset
file_path = os.path.join(main_directory, 'combined_trajectories_predictions.csv')
df = pd.read_csv(file_path)

# Define the features
features = ['Distance(km)', 'Duration(hr)', 'Speed(km/h)', 'Transportation Mode', 'Start Time', 'End Time',
            'Start Latitude(deg)', 'Start Longitude(deg)', 'End Latitude(deg)', 'End Longitude(deg)']

# Initialize the label encoder
le = LabelEncoder()

# Convert Transportation Mode to numeric values using label encoder
df['Transportation Mode'] = le.fit_transform(df['Transportation Mode'])

# Convert Start Time and End Time to numeric values (timestamp)
df['Start Time'] = pd.to_datetime(df['Start Time']).astype(np.int64) // 10**9
df['End Time'] = pd.to_datetime(df['End Time']).astype(np.int64) // 10**9

# Check for missing or infinite values
initial_record_count = len(df)

for feature in features:
    df[feature] = pd.to_numeric(df[feature], errors='coerce')  # Convert to numeric, forcing errors to NaN
df = df.replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
df = df.dropna(subset=features)  # Drop rows with NaN in feature columns

# Record count after dropping NA values
record_count_after_dropping_na = len(df)
records_dropped = initial_record_count - record_count_after_dropping_na
print(f"Number of records dropped due to NA values: {records_dropped}")

# Load the trained models
xgb_model_file_path = os.path.join(main_directory, 'xgb_model.pkl')
iso_forest_file_path = os.path.join(main_directory, 'iso_forest_model.pkl')

xgb_model = joblib.load(xgb_model_file_path)
iso_forest = joblib.load(iso_forest_file_path)

# Parse trajectories for contextual anomaly detection
df['Parsed Trajectory'] = df['Trajectory'].apply(parse_trajectories)

# Predict anomalies with both models
df['XGB_Predicted_Anomaly'] = xgb_model.predict(df[features])
df['IF_Predicted_Anomaly'] = iso_forest.predict(df[features])
df['IF_Predicted_Anomaly'] = df['IF_Predicted_Anomaly'].apply(lambda x: 1 if x == -1 else 0)

# Define weights for each model
weights = {'XGB': 0.6, 'IF': 0.4}

# Combine predictions using weighted voting
df['Weighted_Predicted_Anomaly'] = np.zeros(len(df))
for model, preds in {
    'XGB': df['XGB_Predicted_Anomaly'],
    'IF': df['IF_Predicted_Anomaly']
}.items():
    df['Weighted_Predicted_Anomaly'] += weights[model] * preds

# Apply a threshold to determine final anomaly prediction
threshold = 0.5
df['Final_Predicted_Anomaly'] = (df['Weighted_Predicted_Anomaly'] > threshold).astype(int)

df['Variation_Anomaly'] = df.apply(detect_variation_anomalies, axis=1)

# Combine all anomaly detections
df['Combined_Anomaly'] = np.maximum(df['Final_Predicted_Anomaly'], df['Variation_Anomaly'])

# Convert Start & End Time and Transportation Mode back to their original forms
df['Transportation Mode'] = le.inverse_transform(df['Transportation Mode'])
df['Start Time'] = pd.to_datetime(df['Start Time'], unit='s')
df['End Time'] = pd.to_datetime(df['End Time'], unit='s')

# Print number of anomalies detected
num_anomalies = df['Combined_Anomaly'].sum()
print(f"Number of anomalies detected: {num_anomalies}")

# Print percentage of dataset that is anomaly
total_records = len(df)
percentage_anomalies = (num_anomalies / total_records) * 100
print(f"Percentage of dataset that are anomalous: {percentage_anomalies:.3f}%")

# Filter and print anomalous records
anomalous_records = df[df['Combined_Anomaly'] == 1]
print("Anomalous records:")
print(anomalous_records)

# Save the dataset with predicted anomalies to a new CSV file
output_file_path = os.path.join(main_directory, 'combined_trajectories_predictions_detection.csv')
df.to_csv(output_file_path, index=False)
print(f"Merged trajectories with predicted anomalies saved to {output_file_path}")

Number of records dropped due to NA values: 0
Number of anomalies detected: 679
Percentage of dataset that are anomalous: 1.194%
Anomalous records:
       User Transportation Mode            Date-Time  Altitude(ft)  \
198      10                Walk  2008-09-26 12:50:33      16.00000   
322      10               Train  2008-10-09 23:10:30      92.00000   
353      10                 Car  2008-10-16 00:01:50     135.00000   
407      10                Walk  2008-11-07 05:24:51     430.00000   
423      10                 Car  2008-12-07 10:19:25      10.00000   
...     ...                 ...                  ...           ...   
55979   142                Walk  2011-01-22 08:53:21     106.00000   
56112   146                 Car  2007-08-01 14:19:12     311.67979   
56214   152                 Car  2008-08-23 03:10:56      81.00000   
56711   168                Walk  2011-08-17 22:40:53     528.00000   
56783   172                 Car  2008-06-27 05:19:25     387.10000   

           

# 6. Plot Trajectories 

### Kepler.gl

In [8]:
import os
import pandas as pd
from keplergl import KeplerGl

# Define file paths
file_path = os.path.join(main_directory, 'merged_trajectories_anomalies_detection.csv')
output_map_file = os.path.join(main_directory, 'trajectories_keplergl.html')

# Load the dataset
df = pd.read_csv(file_path)

# Sample 10% of the data
df_sampled = df.sample(frac=0.1, random_state=42)

# Define colors for different transportation modes and anomalies using RGB values
color_range = {
    "Walk": [255, 0, 0], 
    "Bike": [0, 255, 0],  
    "Train": [128, 0, 128], 
    "Car": [0, 0, 139],  
    "Bus": [255, 255, 0],  
    "Anomalous": [255, 0, 0]  
}

# Create separate DataFrames for each mode of transport and anomalies
mode_dfs = {mode: df_sampled[df_sampled['Transportation Mode'] == mode].copy() for mode in color_range.keys() if mode != "Anomalous"}
anomalous_dfs = df_sampled[df_sampled['Final_Predicted_Anomaly'] == 1].copy()

# Initialize Kepler.gl map
map_1 = KeplerGl()

# Add data to the map for each mode and anomalies
for mode, data in mode_dfs.items():
    map_1.add_data(data=data, name=mode)

# Add anomalous data
map_1.add_data(data=anomalous_dfs, name="Anomalous")

# Define the configuration for the map
config = {
    'version': 'v1',
    'config': {
        'visState': {
            'layers': [
                {
                    'id': f'{mode}_layer',
                    'type': 'line',
                    'config': {
                        'dataId': mode,
                        'label': f'{mode} Trajectories',
                        'color': color_range[mode],
                        'columns': {'geojson': 'Trajectory', 'color': 'Color'},
                        'isVisible': True,
                        'visConfig': {
                            'opacity': 0.8,
                            'thickness': 2,
                            'colorRange': {
                                'name': 'Custom Color Range',
                                'type': 'custom',
                                'colors': [color_range[mode]]
                            },
                            'sizeRange': [0, 10]
                        }
                    },
                    'visualChannels': {
                        'colorField': {'name': 'Color', 'type': 'rgb'},
                        'colorScale': 'ordinal'
                    }
                }
                for mode in color_range.keys() if mode != "Anomalous"
            ] + [
                {
                    'id': 'anomalous_layer',
                    'type': 'line',
                    'config': {
                        'dataId': 'Anomalous',
                        'label': 'Anomalous Trajectories',
                        'color': color_range["Anomalous"],
                        'columns': {'geojson': 'Trajectory', 'color': 'Color'},
                        'isVisible': True,
                        'visConfig': {
                            'opacity': 0.8,
                            'thickness': 2,
                            'colorRange': {
                                'name': 'Custom Color Range',
                                'type': 'custom',
                                'colors': [color_range["Anomalous"]]
                            },
                            'sizeRange': [0, 10]
                        }
                    },
                    'visualChannels': {
                        'colorField': {'name': 'Color', 'type': 'rgb'},
                        'colorScale': 'ordinal'
                    }
                }
            ]
        },
        'mapState': {
            'bearing': 0,
            'dragRotate': True,
            'latitude': 39.9042,
            'longitude': 116.4074,
            'pitch': 0,
            'zoom': 10,
            'isSplit': False
        },
        'mapStyle': {
            'styleType': 'dark',
            'topLayerGroups': {},
            'visibleLayerGroups': {
                'label': True,
                'road': True,
                'border': False,
                'building': True,
                'water': True,
                'land': True,
                '3d building': False
            },
            'threeDBuildingColor': [9.665468314072013, 17.18305478057247, 31.1442867897876],
            'mapStyles': {}
        }
    }
}

map_1.config = config

# Save the map configuration to an HTML file
map_1.save_to_html(file_name=output_map_file)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to C:/Users/wengk.WK/OneDrive - Nanyang Technological University/Desktop/Anomaly Dectection In Trajectories Using GPS Data/Geolife Trajectories 1.3/Data\trajectories_keplergl.html!
