Define an application (e.g., predicting the next location) based on the datasets, and give a solution.
The solution includes, but is not limited to, traditional analytics, machine learning, deep learning, and
LLM-related tasks. Implement your solution, and provide some experimental results to show that your
solution works.

In [1]:
import pandas as pd
import numpy as np
from shapely import wkt
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch

In [2]:
# Load data
# The CSV location can be overridden with the TRIPLEGS_CSV environment
# variable so the notebook also runs outside the original author's machine;
# when the variable is unset the original hard-coded path is used.
import os

df = pd.read_csv(os.environ.get(
    'TRIPLEGS_CSV',
    '/Users/linhtrankhanh/Documents/GitHub/HuMob-Data-Mining-P2/triplegs_data/triplegs_C.csv',
))

# Convert 'geom' to latitude and longitude
def extract_lat_long(geom):
    """Return ``(latitude, longitude)`` of the last vertex of a WKT geometry.

    Args:
        geom: WKT string of a geometry exposing ``.coords`` (e.g. a
            LINESTRING).

    Returns:
        A ``(latitude, longitude)`` tuple taken from the final coordinate.

    Raises:
        IndexError: if the parsed geometry has no coordinates.
    """
    line = wkt.loads(geom)
    # shapely yields coordinates as (x, y[, z]) tuples. Assumes the usual GIS
    # axis order x = longitude, y = latitude -- TODO confirm for this dataset.
    # Only the last vertex is needed, so index it directly instead of
    # unzipping the whole line; the slice drops an optional z value.
    x, y = line.coords[-1][:2]
    return y, x

# Split the per-row (first, second) tuples returned by extract_lat_long into
# the two coordinate columns.
end_points = df['geom'].map(extract_lat_long)
lat_values, long_values = zip(*end_points)
df['latitude'] = lat_values
df['longitude'] = long_values

# Extract features from datetime
trip_start = pd.to_datetime(df['started_at'])
trip_end = pd.to_datetime(df['finished_at'])
df['started_at'] = trip_start
df['finished_at'] = trip_end
df['day_of_week'] = trip_start.dt.dayofweek
df['hour_of_day'] = trip_start.dt.hour
# Duration in minutes (seconds / 60).
elapsed = trip_end - trip_start
df['trip_duration'] = elapsed.dt.total_seconds() / 60

# Creating sequences of past locations
# Each sample is `sequence_length` consecutive (latitude, longitude) rows of
# one user; the label is the row that immediately follows the window.
sequence_length = 10

# Windows are only meaningful if each user's rows are in chronological order,
# so sort by start time first. A stable sort keeps the file order for ties,
# and groupby preserves the row order inside each group.
ordered = df.sort_values('started_at', kind='mergesort')

sequences = []
labels = []

for user_id, group in ordered.groupby('user_id'):
    locations = group[['latitude', 'longitude']].values
    # Users with <= sequence_length rows yield an empty range and are skipped.
    for start in range(len(locations) - sequence_length):
        stop = start + sequence_length
        sequences.append(locations[start:stop])
        labels.append(locations[stop])

if not sequences:
    raise ValueError(
        f'No user has more than {sequence_length} rows; cannot build any sequence.'
    )

sequences = np.array(sequences)
labels = np.array(labels)

# NOTE(review): consecutive windows overlap, so a random split puts
# near-identical samples in both train and test; consider splitting by user
# or by time for an unbiased evaluation.
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define Transformer Model
class TransformerModel(nn.Module):
    """Transformer encoder that regresses a 2-value target from a sequence.

    Args:
        input_size: Number of features per time step.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per layer; must divide the model width.
        hidden_dim: Width of each layer's feed-forward sub-network.
        d_model: Optional internal model width. When given, inputs are first
            projected from ``input_size`` to ``d_model`` with a linear layer.
            When ``None`` (default) the encoder works directly on the raw
            features, i.e. the width equals ``input_size``.

    Input to ``forward`` is batch-first: ``(batch, seq_len, input_size)``.
    Output is ``(batch, 2)``.
    """

    def __init__(self, input_size, num_layers, num_heads, hidden_dim, d_model=None):
        super().__init__()
        width = input_size if d_model is None else d_model
        # Identity keeps the default configuration free of extra parameters.
        self.input_proj = nn.Identity() if d_model is None else nn.Linear(input_size, width)
        # Keep the template layer in a local variable: nn.TransformerEncoder
        # deep-copies it, so storing it on `self` would register a second,
        # never-used set of parameters in parameters()/state_dict().
        layer = nn.TransformerEncoderLayer(d_model=width, nhead=num_heads, dim_feedforward=hidden_dim, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.fc_out = nn.Linear(width, 2)  # Output 2 values (latitude and longitude)

    def forward(self, x):
        """Encode the sequence and predict from the last time step's output."""
        x = self.input_proj(x)
        x = self.transformer_encoder(x)
        # NOTE(review): no positional encoding is added, so the encoder has no
        # explicit notion of step order beyond which step is read out here.
        x = self.fc_out(x[:, -1, :])  # Use the output from the last time step
        return x

# Hyperparameters
input_size, num_layers, num_heads = 2, 3, 2  # input_size: Latitude and Longitude
hidden_dim = 128
lr, num_epochs, batch_size = 0.001, 50, 32


# Prepare data
def _float_dataset(features, targets):
    """Wrap a (features, targets) pair as a float32 TensorDataset."""
    feature_tensor = torch.tensor(features, dtype=torch.float32)
    target_tensor = torch.tensor(targets, dtype=torch.float32)
    return TensorDataset(feature_tensor, target_tensor)


train_dataset = _float_dataset(X_train, y_train)
test_dataset = _float_dataset(X_test, y_test)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, loss function, and optimizer
model = TransformerModel(
    input_size=input_size,
    num_layers=num_layers,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training loop
# One optimizer step per mini-batch; after each epoch the mean training loss
# over all samples seen in that epoch is printed.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    samples_seen = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        # MSELoss averages within the batch, so weight by batch size to get a
        # true per-sample mean even when the final batch is smaller.
        batch_count = X_batch.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count

    # Report the epoch average rather than the last batch's loss, which is
    # noisy and would be undefined for an empty loader.
    epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}')

In [None]:
# Evaluation
# Run the trained model over the test loader and report the mean absolute
# error between predicted and true targets (in the targets' own units).
from sklearn.metrics import mean_absolute_error

model.eval()
predictions, true_values = [], []

# Gradients are not needed for inference.
with torch.no_grad():
    for features, targets in test_loader:
        predicted = model(features)
        predictions.extend(predicted.numpy())
        true_values.extend(targets.numpy())

mae = mean_absolute_error(true_values, predictions)
print(f'Mean Absolute Error: {mae}')