In [None]:
!pip install transformers==4.31.0 tqdm sklearn

import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from collections import Counter
import gzip

In [None]:


# Step 1: Load the Check-in Data
def load_checkins(file_path):
    """
    Read a gzip-compressed, tab-separated Gowalla check-in file.

    :param file_path: path to the ``.gz`` file; it is opened in text mode and
        parsed without a header row
    :return: DataFrame with the columns user_id, check_in_time, latitude,
        longitude and location_id, where check_in_time has been converted
        with ``pd.to_datetime``
    """
    # Column names are supplied here because they are not read from the file.
    column_names = ["user_id", "check_in_time", "latitude", "longitude", "location_id"]

    with gzip.open(file_path, 'rt') as handle:
        checkins = pd.read_csv(handle, sep="\t", names=column_names)

    # Parse the timestamp strings so they can be sorted and differenced later.
    checkins["check_in_time"] = pd.to_datetime(checkins["check_in_time"])
    return checkins

# Name of the gzip-compressed check-in file, resolved relative to the working directory.
checkins_file = "loc-gowalla_totalCheckins.txt.gz"
# Parse the file into a DataFrame with a datetime check_in_time column.
gowalla_data = load_checkins(checkins_file)

# Step 2: Construct User Trajectories
def construct_trajectories(data, time_window="1D"):
    """
    Split each user's check-ins into trajectories wherever a long pause occurs.

    Check-ins are processed per user in chronological order. A new trajectory
    starts whenever the gap to the previous check-in is strictly greater than
    ``time_window``; check-ins are not bucketed into calendar windows.

    :param data: DataFrame of check-ins with at least the columns
        ``user_id``, ``check_in_time`` (datetime) and ``location_id``
    :param time_window: largest gap allowed between consecutive check-ins of
        one trajectory, as anything ``pd.Timedelta`` accepts (e.g., '1D')
    :return: DataFrame with one row per trajectory and the columns
        ``user_id``, ``trajectory_id`` (counted from 0 per user),
        ``timestamps`` and ``locations`` (lists in chronological order).
        The columns are present even when ``data`` has no rows.
    """
    output_columns = ["user_id", "trajectory_id", "timestamps", "locations"]
    # Loop-invariant, so parse the threshold once instead of once per user.
    max_gap = pd.Timedelta(time_window)
    trajectories = []

    # Group by user and sort by timestamp
    for user_id, group in data.groupby("user_id"):
        group = group.sort_values("check_in_time")
        # diff() is NaT on the first row, so the comparison is False there and
        # the running sum gives the user's first trajectory the id 0.
        group["trajectory_id"] = (group["check_in_time"].diff() > max_gap).cumsum()

        # Create a trajectory for each group
        for traj_id, traj_group in group.groupby("trajectory_id"):
            trajectories.append({
                "user_id": user_id,
                "trajectory_id": traj_id,
                "timestamps": traj_group["check_in_time"].tolist(),
                "locations": traj_group["location_id"].tolist(),
            })

    # Passing the columns explicitly keeps the schema stable for empty input,
    # so downstream code can still access trajectories["locations"].
    return pd.DataFrame(trajectories, columns=output_columns)

# Build the trajectories with the default time_window of "1D".
trajectories_df = construct_trajectories(gowalla_data)

# Step 3: Encode POIs
def encode_pois(trajectories):
    """
    Encode POIs into unique, contiguous numeric IDs.

    IDs are assigned in sorted order of the original location identifiers so
    the mapping is reproducible from run to run (iterating a bare ``set``
    yields an arbitrary order).

    :param trajectories: DataFrame with a ``locations`` column holding a list
        of location identifiers per row. It is modified in place: an
        ``encoded_locations`` column is added.
    :return: tuple ``(trajectories, location_mapping)`` where
        ``location_mapping`` maps each original identifier to its ID in
        ``range(len(location_mapping))``
    """
    unique_locations = sorted({loc for traj in trajectories["locations"] for loc in traj})
    location_mapping = {loc: idx for idx, loc in enumerate(unique_locations)}

    # Replace locations with their numeric IDs
    trajectories["encoded_locations"] = trajectories["locations"].apply(
        lambda locs: [location_mapping[loc] for loc in locs]
    )
    return trajectories, location_mapping

# Adds the "encoded_locations" column; poi_mapping maps each original
# location id to its encoded id.
trajectories_df, poi_mapping = encode_pois(trajectories_df)

# Save
# NOTE: the list-valued columns are written to the CSV as text, so they have
# to be parsed back into lists after the file is read again.
trajectories_df.to_csv("gowalla_trajectories.csv", index=False)

# Display some trajectories
print(trajectories_df.head())


In [6]:
!pip install transformers==4.31.0 tqdm sklearn

import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from collections import Counter

Collecting transformers==4.31.0
  Using cached transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [7]:
# Step 1: Load the Processed Data
import ast  # local to this cell: only needed to parse the list column below

trajectories_df = pd.read_csv("gowalla_trajectories.csv")

# Use a smaller subset for faster preliminary experiments
subset_size = 10000
# Cap at the number of available rows: sample() raises when n exceeds it.
trajectories_df = trajectories_df.sample(
    n=min(subset_size, len(trajectories_df)), random_state=42
).reset_index(drop=True)

# Each trajectory is stored in the CSV as the text of a Python list.
# literal_eval only parses literals, so unlike eval it cannot execute code
# that might be embedded in the file.
trajectories = trajectories_df["encoded_locations"].apply(ast.literal_eval).tolist()
labels = trajectories_df["user_id"].tolist()

# Step 2: Re-encode the Labels
# Map the raw user ids to contiguous class indices starting at 0.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

num_users = len(label_encoder.classes_)
print(f"Number of users (classes): {num_users}")
print(f"Label range: {min(labels)} to {max(labels)}")

# Train-test split
# NOTE(review): the split is random over trajectories and not stratified by
# user, so a user with few sampled trajectories can end up only in validation.
train_trajectories, val_trajectories, train_labels, val_labels = train_test_split(
    trajectories, labels, test_size=0.2, random_state=42
)

# Step 3: Define a Dataset Class for Trajectories
class GowallaDataset(Dataset):
    """
    Torch dataset that turns POI trajectories into tokenized model inputs.

    Each item is one trajectory rendered as space-separated POI ids and
    tokenized with padding/truncation to ``max_length``, plus its label.
    """

    def __init__(self, trajectories, labels, tokenizer, max_length):
        # trajectories and labels are indexed in parallel by __getitem__.
        self.trajectories, self.labels = trajectories, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.trajectories)

    def __getitem__(self, idx):
        pois = self.trajectories[idx]
        target = self.labels[idx]

        # The tokenizer receives the POI ids as one whitespace-separated string.
        text = " ".join(str(poi) for poi in pois)
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) removes the leading size-1 dimension of the tokenizer output.
        sample = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Step 4: Prepare Tokenizer and Datasets
# The pretrained DistilBERT tokenizer is applied to the POI-id strings that
# GowallaDataset builds.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Every tokenized trajectory is padded or truncated to this many tokens.
max_length = 64

train_dataset = GowallaDataset(train_trajectories, train_labels, tokenizer, max_length)
val_dataset = GowallaDataset(val_trajectories, val_labels, tokenizer, max_length)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Step 5: Define the Model
class TrajectoryDistilBERT(torch.nn.Module):
    """
    Pretrained DistilBERT encoder followed by one linear layer that produces
    a score per user.
    """

    def __init__(self, num_users):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        hidden_size = self.bert.config.dim
        self.classifier = torch.nn.Linear(hidden_size, num_users)

    def forward(self, input_ids, attention_mask):
        """Return the classifier output computed from the first token's hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0]  # first token
        return self.classifier(first_token)

model = TrajectoryDistilBERT(num_users)

# Detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

# Step 6: Setup Optimizer and Loss
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the defaults of the transformers
# implementation so the optimization hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = torch.nn.CrossEntropyLoss()

# Additional Metrics
def compute_metrics(outputs, labels, top_k=5):
    """
    Compute top-1 and top-k accuracy for one batch.

    :param outputs: score tensor of shape (batch_size, num_classes)
    :param labels: tensor of shape (batch_size) holding the true class indices
    :param top_k: number of highest-scoring classes that count as a hit;
        capped at num_classes so small label spaces do not raise
    :return: tuple ``(top1_acc, topk_acc)`` of floats in [0, 1];
        ``(0.0, 0.0)`` for an empty batch
    """
    batch_size = labels.size(0)
    if batch_size == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0

    with torch.no_grad():
        # Top-1 accuracy
        _, preds = torch.max(outputs, dim=1)
        top1_acc = (preds == labels).sum().item() / batch_size

        # Top-k accuracy: a row is a hit when any of its k best indices equals
        # the label. The comparison is vectorised instead of looping per row.
        k = min(top_k, outputs.size(1))
        _, topk_indices = torch.topk(outputs, k=k, dim=1)
        hits = (topk_indices == labels.unsqueeze(1)).any(dim=1)
        topk_acc = hits.sum().item() / batch_size

    return top1_acc, topk_acc

def evaluate_model(model, loader, device, loss_fn):
    """
    Evaluate ``model`` on every batch of ``loader`` without tracking gradients.

    The model is switched to eval mode and is left in that mode on return.

    :param model: module called as ``model(input_ids, attention_mask)``
    :param loader: iterable of batches, each a mapping with the keys
        'input_ids', 'attention_mask' and 'label'
    :param device: device the batch tensors are moved to
    :param loss_fn: called as ``loss_fn(outputs, labels)``; its result is
        weighted by the batch size, i.e. it is treated as a per-batch mean
    :return: tuple ``(avg_loss, avg_top1, avg_top5)`` averaged over samples;
        all three are NaN when the loader yields no samples
    """
    model.eval()
    total_loss = 0.0
    total_top1 = 0
    total_top5 = 0
    total_samples = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            batch_size = labels.size(0)
            # Weight by batch size so a smaller final batch is not over-counted.
            total_loss += loss.item() * batch_size

            top1_acc, top5_acc = compute_metrics(outputs, labels, top_k=5)
            total_top1 += top1_acc * batch_size
            total_top5 += top5_acc * batch_size
            total_samples += batch_size

    if total_samples == 0:
        # Nothing was evaluated: report "undefined" instead of dividing by zero.
        undefined = float("nan")
        return undefined, undefined, undefined

    avg_loss = total_loss / total_samples
    avg_top1 = total_top1 / total_samples
    avg_top5 = total_top5 / total_samples
    return avg_loss, avg_top1, avg_top5

def train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=3):
    """
    Train ``model`` for ``epochs`` passes over ``train_loader``.

    After each epoch the model is evaluated on both loaders with
    ``evaluate_model`` and the loss, top-1 and top-5 accuracy are printed.
    Nothing is returned.

    :param model: module called as ``model(input_ids, attention_mask)``
    :param train_loader: iterable of training batches (mappings with the keys
        'input_ids', 'attention_mask' and 'label')
    :param val_loader: iterable of validation batches with the same keys
    :param device: device the batch tensors are moved to
    :param loss_fn: called as ``loss_fn(outputs, labels)``
    :param optimizer: optimizer that updates the model's parameters
    :param epochs: number of passes over ``train_loader``
    """
    for epoch in range(epochs):
        model.train()
        train_iter = tqdm(train_loader, desc=f"Epoch {epoch+1} [Training]", leave=False)

        for batch in train_iter:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            # Show the latest batch loss on the progress bar.
            train_iter.set_description(f"Epoch {epoch+1} [Training] loss: {loss.item():.4f}")

        # The running-loss accumulator and its average were never read, so they
        # were removed; the reported train loss comes from the evaluation pass.
        # Evaluate on train set
        train_loss, train_top1, train_top5 = evaluate_model(model, train_loader, device, loss_fn)
        # Evaluate on validation set
        val_loss, val_top1, val_top5 = evaluate_model(model, val_loader, device, loss_fn)

        print(f"\nEpoch {epoch+1}:")
        print(f"Train Loss: {train_loss:.4f}, Train Top-1 Acc: {train_top1:.4f}, Train Top-5 Acc: {train_top5:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Top-1 Acc: {val_top1:.4f}, Val Top-5 Acc: {val_top5:.4f}")

# Train the model
# Runs 3 epochs; train and validation metrics are printed after each one.
train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=3)


Number of users (classes): 9066
Label range: 0 to 9065
Using device: cuda





Epoch 1:
Train Loss: 9.1221, Train Top-1 Acc: 0.0001, Train Top-5 Acc: 0.0004
Val Loss: 9.1432, Val Top-1 Acc: 0.0000, Val Top-5 Acc: 0.0005





Epoch 2:
Train Loss: 9.0192, Train Top-1 Acc: 0.0003, Train Top-5 Acc: 0.0019
Val Loss: 9.5721, Val Top-1 Acc: 0.0000, Val Top-5 Acc: 0.0000





Epoch 3:
Train Loss: 8.9334, Train Top-1 Acc: 0.0004, Train Top-5 Acc: 0.0020
Val Loss: 10.2561, Val Top-1 Acc: 0.0000, Val Top-5 Acc: 0.0000


### Lowering number of classes to 1000

In [8]:
# Step 1: Load and Filter the Data to Reduce Number of Classes
import ast  # local to this cell: only needed to parse the list column below

# Load the dataset
trajectories_df = pd.read_csv("gowalla_trajectories.csv")

# Count occurrences per user
# Each row is one trajectory, so this counts trajectories per user.
user_counts = Counter(trajectories_df['user_id'])

# Choose top N users (adjust N as desired)
N = 1000
top_users = {user for user, _ in user_counts.most_common(N)}

# Filter the DataFrame to only keep trajectories of top N users
filtered_df = trajectories_df[trajectories_df['user_id'].isin(top_users)].reset_index(drop=True)

# Prepare Data: Label Encoding and Train/Test Split

# Each trajectory is stored in the CSV as the text of a Python list.
# literal_eval only parses literals, so unlike eval it cannot execute code
# that might be embedded in the file.
trajectories = filtered_df["encoded_locations"].apply(ast.literal_eval).tolist()
labels = filtered_df["user_id"].tolist()

# Label encode the users
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

num_users = len(label_encoder.classes_)
print(f"Number of users (classes): {num_users}")
print(f"Label range: {min(labels)} to {max(labels)}")

train_trajectories, val_trajectories, train_labels, val_labels = train_test_split(
    trajectories, labels, test_size=0.2, random_state=42
)

# Step 2: Define the Dataset Class

class GowallaDataset(Dataset):
    """
    Torch dataset pairing each POI trajectory with its label.

    Items are produced lazily: the trajectory is rendered as a string of
    space-separated POI ids and tokenized with padding/truncation to
    ``max_length`` when it is requested.
    """

    def __init__(self, trajectories, labels, tokenizer, max_length):
        # trajectories and labels are indexed in parallel by __getitem__.
        self.trajectories, self.labels = trajectories, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.trajectories)

    def __getitem__(self, idx):
        pois = self.trajectories[idx]
        target = self.labels[idx]

        # Convert the trajectory (list of POIs) to a string
        text = " ".join(str(poi) for poi in pois)

        # Tokenize input sequence
        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # squeeze(0) removes the leading size-1 dimension of the tokenizer output.
        sample = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Step 3: Create Datasets and Loaders

# The pretrained DistilBERT tokenizer is applied to the POI-id strings that
# GowallaDataset builds.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Every tokenized trajectory is padded or truncated to this many tokens.
max_length = 64

train_dataset = GowallaDataset(train_trajectories, train_labels, tokenizer, max_length)
val_dataset = GowallaDataset(val_trajectories, val_labels, tokenizer, max_length)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Step 4: Define the Model

class TrajectoryDistilBERT(torch.nn.Module):
    """
    User classifier: pretrained DistilBERT encoder plus one linear layer
    applied to the hidden state of the first token.
    """

    def __init__(self, num_users):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        hidden_size = self.bert.config.dim
        self.classifier = torch.nn.Linear(hidden_size, num_users)

    def forward(self, input_ids, attention_mask):
        """Return one row of per-user scores for each input sequence."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # DistilBERT: last_hidden_state is (batch_size, seq_len, hidden_size)
        first_token = encoded.last_hidden_state[:, 0]
        return self.classifier(first_token)

model = TrajectoryDistilBERT(num_users)

# Detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

# Step 5: Setup Optimizer, Loss, and Metrics

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the defaults of the transformers
# implementation so the optimization hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = torch.nn.CrossEntropyLoss()

def compute_metrics(outputs, labels, top_k=5):
    """
    Compute top-1 and top-k accuracy for one batch.

    :param outputs: 2-D score tensor, one row per sample and one column per class
    :param labels: 1-D tensor holding the true class index of each sample
    :param top_k: number of highest-scoring classes that count as a hit;
        capped at the number of classes so small label spaces do not raise
    :return: tuple ``(top1_acc, topk_acc)`` of floats in [0, 1];
        ``(0.0, 0.0)`` for an empty batch
    """
    batch_size = labels.size(0)
    if batch_size == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0

    with torch.no_grad():
        _, preds = torch.max(outputs, dim=1)
        top1_acc = (preds == labels).sum().item() / batch_size

        # A row is a hit when any of its k best indices equals the label. The
        # comparison is vectorised instead of looping per row.
        k = min(top_k, outputs.size(1))
        _, topk_indices = torch.topk(outputs, k=k, dim=1)
        hits = (topk_indices == labels.unsqueeze(1)).any(dim=1)
        topk_acc = hits.sum().item() / batch_size

    return top1_acc, topk_acc

def evaluate_model(model, loader, device, loss_fn):
    """
    Evaluate ``model`` on every batch of ``loader`` without tracking gradients.

    The model is switched to eval mode and is left in that mode on return.

    :param model: module called as ``model(input_ids, attention_mask)``
    :param loader: iterable of batches, each a mapping with the keys
        'input_ids', 'attention_mask' and 'label'
    :param device: device the batch tensors are moved to
    :param loss_fn: called as ``loss_fn(outputs, labels)``; its result is
        weighted by the batch size, i.e. it is treated as a per-batch mean
    :return: tuple ``(avg_loss, avg_top1, avg_top5)`` averaged over samples;
        all three are NaN when the loader yields no samples
    """
    model.eval()
    total_loss = 0.0
    total_top1 = 0
    total_top5 = 0
    total_samples = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            batch_size = labels.size(0)
            # Weight by batch size so a smaller final batch is not over-counted.
            total_loss += loss.item() * batch_size

            top1_acc, top5_acc = compute_metrics(outputs, labels, top_k=5)
            total_top1 += top1_acc * batch_size
            total_top5 += top5_acc * batch_size
            total_samples += batch_size

    if total_samples == 0:
        # Nothing was evaluated: report "undefined" instead of dividing by zero.
        undefined = float("nan")
        return undefined, undefined, undefined

    avg_loss = total_loss / total_samples
    avg_top1 = total_top1 / total_samples
    avg_top5 = total_top5 / total_samples
    return avg_loss, avg_top1, avg_top5

# Step 6: Training Loop

def train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=3):
    """
    Train ``model`` for ``epochs`` passes over ``train_loader``.

    After each epoch the model is evaluated on both loaders with
    ``evaluate_model`` and the loss, top-1 and top-5 accuracy are printed.
    Nothing is returned.

    :param model: module called as ``model(input_ids, attention_mask)``
    :param train_loader: iterable of training batches (mappings with the keys
        'input_ids', 'attention_mask' and 'label')
    :param val_loader: iterable of validation batches with the same keys
    :param device: device the batch tensors are moved to
    :param loss_fn: called as ``loss_fn(outputs, labels)``
    :param optimizer: optimizer that updates the model's parameters
    :param epochs: number of passes over ``train_loader``
    """
    for epoch in range(epochs):
        model.train()
        train_iter = tqdm(train_loader, desc=f"Epoch {epoch+1} [Training]", leave=False)

        for batch in train_iter:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            # Show the latest batch loss on the progress bar. The running-loss
            # accumulator was never read, so it was removed.
            train_iter.set_description(f"Epoch {epoch+1} [Training] loss: {loss.item():.4f}")

        # Evaluate on train and val sets
        train_loss, train_top1, train_top5 = evaluate_model(model, train_loader, device, loss_fn)
        val_loss, val_top1, val_top5 = evaluate_model(model, val_loader, device, loss_fn)

        print(f"\nEpoch {epoch+1}:")
        print(f"Train Loss: {train_loss:.4f}, Train Top-1 Acc: {train_top1:.4f}, Train Top-5 Acc: {train_top5:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Top-1 Acc: {val_top1:.4f}, Val Top-5 Acc: {val_top5:.4f}")

# Run training
# Runs 3 epochs; train and validation metrics are printed after each one.
train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=3)


Collecting transformers==4.31.0
  Using cached transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Number of users (classes): 1000
Label range: 0 to 999




Using device: cuda





Epoch 1:
Train Loss: 5.9143, Train Top-1 Acc: 0.1454, Train Top-5 Acc: 0.2515
Val Loss: 6.0755, Val Top-1 Acc: 0.1248, Val Top-5 Acc: 0.2143





Epoch 2:
Train Loss: 4.9110, Train Top-1 Acc: 0.2624, Train Top-5 Acc: 0.3943
Val Loss: 5.2770, Val Top-1 Acc: 0.2217, Val Top-5 Acc: 0.3310





Epoch 3:
Train Loss: 4.1185, Train Top-1 Acc: 0.3573, Train Top-5 Acc: 0.5034
Val Loss: 4.6998, Val Top-1 Acc: 0.2954, Val Top-5 Acc: 0.3984
