# Final Project Data Analytics and IoT
## GeoLife Trajectories 

In [1]:
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Setting seed for reproducibility
np.random.seed(1234)  
PYTHONHASHSEED = 0
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.model_selection import train_test_split
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, LSTM
from keras.layers.core import Activation
from keras.utils import pad_sequences
from sklearn.metrics import mean_squared_error as mse
from keras.models import Sequential
from keras.layers import LSTM, Dense
import warnings
warnings.filterwarnings('ignore')

2023-02-06 14:04:51.128998: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# Create an empty list to store the DataFrames
df_list = []

# Loop over each of the 181 folders
for i in range(21):
    # Create the folder path for each folder
    folder_path = "./Data/{:03d}/Trajectory".format(i)

    # Get a list of all of the .plt files in the folder
    file_list = [f for f in os.listdir(folder_path) if f.endswith(".plt")]

    # Loop over each .plt file in the folder
    for file in file_list:
        # Create the file path for each .plt file
        file_path = os.path.join(folder_path, file)

        # Load the .plt file into a DataFrame
        df = pd.read_csv(file_path, delimiter=',', header=None, skiprows=7, 
                         names=["Latitude", "Longitude", "Altitude", "Reserved_1", "Reserved_2", "Date", "Time"])
        
        # Convert the Altitude column from feet to meters
        df['Altitude'] = df['Altitude'] * 0.3048

        # Convert the Date and Time columns into a single datetime column
        df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%Y-%m-%d %H:%M:%S')

        # Drop the original Date and Time columns
        df = df.drop(['Date', 'Time'], axis=1)

        ## Add user column

        df['user'] = "{:03d}".format(i)
        
        # Append the loaded DataFrame to the list
        df_list.append(df)

# Concatenate all of the DataFrames in the list into a single DataFrame
df = pd.concat(df_list)

In [12]:
df['label'] = ''

In [13]:
def is_between(time, row_to_check):
    if row_to_check['start_time'] <= time <= row_to_check['end_time']:
        return row_to_check['transportation_mode']

In [17]:
# add labels to df if they exist
for i in range(21):
    # Create the folder path for each folder
    folder_path = "./Data/{:03d}".format(i)
    # Check to see if a .txt file exists in the folder
    for f in os.listdir(folder_path):
        if f.endswith('.txt'):
            file_path = os.path.join(folder_path, f)
            # Load the .plt file into a DataFrame
            df_labels = pd.read_csv(file_path, delimiter='\t', header=None, skiprows=1,
                         names=["start_time", "end_time", "transportation_mode"])
            df_labels['user'] = "{:03d}".format(i)
            df_labels['start_time'] = pd.to_datetime(df_labels['start_time'], format='%Y/%m/%d %H:%M:%S')
            df_labels['end_time'] = pd.to_datetime(df_labels['end_time'], format='%Y/%m/%d %H:%M:%S')
            df_user = df[df['user'] == "{:03d}".format(i)]
            for j, row in df_labels.iterrows():
                mask = (df_user['Datetime'] >= row['start_time'])
                df_to_apply = df_user[mask]
                df[df['user'] == "{:03d}".format(i)].loc[mask, 'label'] = df_to_apply['Datetime'].apply(lambda x: is_between(x, row))
            break

10
row 0 of 434
row 1 of 434
row 2 of 434
row 3 of 434
row 4 of 434
row 5 of 434
row 6 of 434
row 7 of 434
row 8 of 434
row 9 of 434
row 10 of 434
row 11 of 434
row 12 of 434
row 13 of 434
row 14 of 434
row 15 of 434
row 16 of 434
row 17 of 434
row 18 of 434
row 19 of 434
row 20 of 434
row 21 of 434
row 22 of 434
row 23 of 434
row 24 of 434
row 25 of 434
row 26 of 434
row 27 of 434
row 28 of 434
row 29 of 434
row 30 of 434
row 31 of 434
row 32 of 434
row 33 of 434
row 34 of 434
row 35 of 434
row 36 of 434
row 37 of 434
row 38 of 434


KeyboardInterrupt: 

train       384968
walk         89235
bike         63710
taxi         56805
bus          51823
subway       20946
car           6584
              2420
airplane      1325
Name: label, dtype: int64

In [38]:
df_20['label'] = ''
for i, row in df_labels_20.iterrows():
    mask = (df_20['Datetime'] >= row['start_time'])
    df_to_apply = df_20[mask]
    df_20.loc[mask, 'label'] = df_to_apply['Datetime'].apply(lambda x: is_between(x, row))

In [67]:
df_20.label.value_counts()

Unnamed: 0,Latitude,Longitude,Altitude,Reserved_1,Reserved_2,Datetime,user,label
0,40.000168,116.327474,0.0,80.0,39915.314688,2009-04-12 07:33:09,0,
1,40.000055,116.327454,0.0,99.0,39915.314745,2009-04-12 07:33:14,0,
2,40.000021,116.327407,0.0,109.0,39915.314803,2009-04-12 07:33:19,0,
3,40.000035,116.327281,0.0,111.0,39915.314861,2009-04-12 07:33:24,0,
4,39.999983,116.327285,0.0,114.0,39915.314919,2009-04-12 07:33:29,0,


In [61]:
df_20.iloc[65000:65010]

In [None]:
df.Reserved_1

In [11]:
# get rid of the field 3 
df["Altitude"] = df["Reserved_1"]

# Drop the "Reserved_1" column
df = df.drop("Reserved_1", axis=1)

In [12]:
df.Altitude

0       492.000000
1       492.000000
2       492.000000
3       492.000000
4       493.000000
           ...    
1018    107.517093
1019    108.301296
1020    108.730738
1021    108.944337
1022    108.286214
Name: Altitude, Length: 24857636, dtype: float64

In [13]:
df.shape

(24857636, 5)

In [15]:
df.head(10)

Unnamed: 0,Latitude,Longitude,Altitude,Reserved_2,Datetime
0,39.984683,116.31845,492.0,39744.120255,2008-10-23 02:53:10
1,39.984686,116.318417,492.0,39744.120313,2008-10-23 02:53:15
2,39.984688,116.318385,492.0,39744.12037,2008-10-23 02:53:20
3,39.984655,116.318263,492.0,39744.120428,2008-10-23 02:53:25
4,39.984611,116.318026,493.0,39744.120486,2008-10-23 02:53:30
5,39.984608,116.317761,493.0,39744.120544,2008-10-23 02:53:35
6,39.984563,116.317517,496.0,39744.120602,2008-10-23 02:53:40
7,39.984539,116.317294,500.0,39744.12066,2008-10-23 02:53:45
8,39.984606,116.317065,505.0,39744.120718,2008-10-23 02:53:50
9,39.984568,116.316911,510.0,39744.120775,2008-10-23 02:53:55


In [16]:
# Convert the "Days_since_1230_1899" column to a datetime object
df["Date_Time"] = pd.to_datetime(df["Reserved_2"], unit='d', origin='12/30/1899')

In [17]:
df.head(10)

Unnamed: 0,Latitude,Longitude,Altitude,Reserved_2,Datetime,Date_Time
0,39.984683,116.31845,492.0,39744.120255,2008-10-23 02:53:10,2008-10-23 02:53:09.999997184
1,39.984686,116.318417,492.0,39744.120313,2008-10-23 02:53:15,2008-10-23 02:53:15.000000256
2,39.984688,116.318385,492.0,39744.12037,2008-10-23 02:53:20,2008-10-23 02:53:20.000002560
3,39.984655,116.318263,492.0,39744.120428,2008-10-23 02:53:25,2008-10-23 02:53:24.999996416
4,39.984611,116.318026,493.0,39744.120486,2008-10-23 02:53:30,2008-10-23 02:53:29.999998720
5,39.984608,116.317761,493.0,39744.120544,2008-10-23 02:53:35,2008-10-23 02:53:35.000001792
6,39.984563,116.317517,496.0,39744.120602,2008-10-23 02:53:40,2008-10-23 02:53:40.000004096
7,39.984539,116.317294,500.0,39744.12066,2008-10-23 02:53:45,2008-10-23 02:53:44.999997952
8,39.984606,116.317065,505.0,39744.120718,2008-10-23 02:53:50,2008-10-23 02:53:50.000000768
9,39.984568,116.316911,510.0,39744.120775,2008-10-23 02:53:55,2008-10-23 02:53:55.000003328


In [None]:
# Drop the "Reserved_2" column
df = df.drop("Reserved_2", axis=1)

In [20]:
# Drop the "Reserved_2" column
df = df.drop("Datetime", axis=1)

In [21]:
df.head(10)

Unnamed: 0,Latitude,Longitude,Altitude,Date_Time
0,39.984683,116.31845,492.0,2008-10-23 02:53:09.999997184
1,39.984686,116.318417,492.0,2008-10-23 02:53:15.000000256
2,39.984688,116.318385,492.0,2008-10-23 02:53:20.000002560
3,39.984655,116.318263,492.0,2008-10-23 02:53:24.999996416
4,39.984611,116.318026,493.0,2008-10-23 02:53:29.999998720
5,39.984608,116.317761,493.0,2008-10-23 02:53:35.000001792
6,39.984563,116.317517,496.0,2008-10-23 02:53:40.000004096
7,39.984539,116.317294,500.0,2008-10-23 02:53:44.999997952
8,39.984606,116.317065,505.0,2008-10-23 02:53:50.000000768
9,39.984568,116.316911,510.0,2008-10-23 02:53:55.000003328


# Preporcessing and clearning

In [23]:
# Replace -777 with NaN
df['Altitude'].replace(-777, pd.np.nan, inplace=True)

# Drop any rows with NaN values in the 'Altitude' field
df.dropna(subset=['Altitude'], inplace=True)

# Convert 'Altitude' field from feet to meters
df['Altitude'] = df['Altitude'] * 0.3048

In [24]:
df.head(10)

Unnamed: 0,Latitude,Longitude,Altitude,Date_Time
0,39.984683,116.31845,149.9616,2008-10-23 02:53:09.999997184
1,39.984686,116.318417,149.9616,2008-10-23 02:53:15.000000256
2,39.984688,116.318385,149.9616,2008-10-23 02:53:20.000002560
3,39.984655,116.318263,149.9616,2008-10-23 02:53:24.999996416
4,39.984611,116.318026,150.2664,2008-10-23 02:53:29.999998720
5,39.984608,116.317761,150.2664,2008-10-23 02:53:35.000001792
6,39.984563,116.317517,151.1808,2008-10-23 02:53:40.000004096
7,39.984539,116.317294,152.4,2008-10-23 02:53:44.999997952
8,39.984606,116.317065,153.924,2008-10-23 02:53:50.000000768
9,39.984568,116.316911,155.448,2008-10-23 02:53:55.000003328


In [31]:
# Replace any missing or invalid data with the mean value of the column
df.fillna(df.mean(), inplace=True)

# Normalize the data using MinMaxScaler
scaler = MinMaxScaler()
df[['Latitude', 'Longitude', 'Altitude']] = scaler.fit_transform(df[['Latitude', 'Longitude', 'Altitude']])

# Save the preprocessed data back to a new file
df.to_csv('preprocessed_dataset.csv', index=False)

In [2]:
# Opene the  preprocessed data back to a new file
# run this when you want run the program
df = pd.read_csv("preprocessed_dataset.csv")

In [3]:
df.shape

(24802163, 4)

In [4]:
df.dtypes

Latitude     float64
Longitude    float64
Altitude     float64
Date_Time     object
dtype: object

# Build the time serries Model use LSTM 
Model Building: Create a multi-layer LSTM network using the extracted features and normalized values as input. The output layer should have as many nodes as the number of features you want to predict (in this case, 3).

Training: Train the model using a suitable optimizer and loss function on the preprocessed data. Monitor the accuracy of the model on a validation set during training.

Testing: Finally, test the model on unseen data to evaluate its performance in predicting the GPS trajectory.

In [5]:
df['Date_Time'] = pd.to_datetime(df['Date_Time'])
df["timestamp"] = df["Date_Time"].astype(np.int64) // 10**9
df = df.drop("Date_Time", axis=1)
# data = df.values
df = df.astype(np.float32)

# Take only 20% of the data
df = df.iloc[:int(len(df)*0.2), :]

# Define the number of time steps and features
timesteps = 10
input_dim = 3

# Split the data into input sequences and corresponding outputs
X, y = [], []
for i in range(len(df) - timesteps - 1):
    X.append(df.iloc[i:(i+timesteps), :input_dim].values)
    y.append(df.iloc[i+timesteps, :input_dim].values)

# Convert the input sequences and outputs to numpy arrays
X = np.array(X)
y = np.array(y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Define the model
model = Sequential()
model.add(LSTM(128, input_shape=(timesteps, input_dim), return_sequences=True))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(input_dim, activation='linear'))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

In [7]:
# Train the model
history = model.fit(X_train, y_train, epochs=1, batch_size=64, validation_data=(X_val, y_val))

# Save the model
model.save("lstm_model.h5")

# Evaluate the model
scores = model.evaluate(X_val, y_val)
print("Validation loss: ", scores)

Validation loss:  1.7903958848819457e-07


In [None]:
# You can later load the saved model using:
# loaded_model = keras.models.load_model("lstm_model.h5")