In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import models
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoConfig
import random
from matplotlib import pyplot as plt
import pickle
from torch.utils.data import DataLoader, TensorDataset

In [2]:
def set_seed(seed):
    """Seed every random number generator this notebook draws from.

    Args:
        seed (int): Value handed to Python's `random` module, NumPy's global
            generator, and PyTorch's CPU and CUDA generators.
    """
    # The four generators are independent, so seeding order does not matter.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

In [3]:
class TokenEmbedding(nn.Module):
    """Turn each fixed-length patch into one d_model-dimensional token.

    A width-3 convolution lifts the input channels to d_model features, then
    a linear layer collapses the patch's time axis to a single value per
    feature.
    """

    def __init__(self, in_channels, seq_len, d_model=768):
        """
        Args:
            in_channels (int): Number of channels in every patch.
            seq_len (int): Number of time steps in every patch.
            d_model (int): Size of the produced token embedding
                (768 for BERT_base).
        """
        super().__init__()
        self.conv = nn.Conv1d(in_channels, d_model, kernel_size=3, padding=1, bias=False)
        self.linear = nn.Linear(seq_len, 1)

        # Re-initialise the convolution kernel (the only Conv1d in this module)
        # with Kaiming-normal weights. This runs after both layers exist so the
        # random draws happen in the same order as layer construction.
        nn.init.kaiming_normal_(self.conv.weight, mode='fan_in', nonlinearity='relu')

    def forward(self, x):
        """Map patches of shape (B*N, C, L) to tokens of shape (B*N, d_model)."""
        features = self.conv(x)           # (B*N, d_model, L)
        pooled = self.linear(features)    # (B*N, d_model, 1)
        return pooled.squeeze(-1)         # (B*N, d_model)
    
# Patching and Embedding
class PatchEmbedding(nn.Module):
    """Cut a multichannel sequence into overlapping patches and embed each one."""

    def __init__(self, in_channels=2, patch_size=32, stride=16, d_model=768, dropout=0.1, norm=True):
        """
        Args:
            in_channels (int): Number of input channels. 6 for IMU accelerometer+gyroscope.
            patch_size (int): Length of every patch.
            stride (int): Step between the starts of consecutive patches.
            d_model (int): Dimension of the model: 768 for BERT_base. 1024 for BERT_large.
            dropout (float): Rate of the `dropout` submodule. The submodule is
                created here but `forward` does not apply it.
            norm (bool): If True, instance-normalize the input before patching.
        """
        super().__init__()
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.stride = stride
        self.d_model = d_model
        self.norm = norm

        # Submodules are created in a fixed order: normalization, token
        # embedding, dropout. The normalization layer exists even when
        # `norm` is False; it is simply skipped in `forward`.
        self.instance_norm = nn.InstanceNorm1d(num_features=in_channels, affine=True)
        self.value_embedding = TokenEmbedding(in_channels, patch_size, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Embed x of shape (B, C, total_seq_len) into (B, num_patches, d_model)."""
        batch_size, _, _ = x.size()

        if self.norm:
            x = self.instance_norm(x)

        # Extend the sequence by `stride` copies of its final time step so the
        # tail of the signal is covered by a patch.
        tail = x[:, :, -1:].expand(-1, -1, self.stride)
        padded = torch.cat((x, tail), dim=2)  # (B, C, total_seq_len + stride)

        # Sliding windows along time: (B, C, num_patches, patch_size).
        windows = padded.unfold(2, self.patch_size, self.stride)
        num_patches = windows.size(2)

        # Put patches next to the batch axis and fold them into it: (B*N, C, L).
        patches = windows.permute(0, 2, 1, 3).reshape(
            batch_size * num_patches, self.in_channels, self.patch_size
        )

        tokens = self.value_embedding(patches)  # (B*N, d_model)
        return tokens.view(batch_size, -1, self.d_model)  # (B, N, d_model)
    
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=256):
        """
        Learnable positional embedding added to a sequence of patch tokens.
        Args:
            d_model (int): Dimension of the model (e.g., 768 for BERT_base).
            dropout (float): Dropout rate applied after the addition.
            max_len (int): Maximum sequence length (number of patches).
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # One learnable vector per position, shared across the batch.
        self.positional_embedding = nn.Parameter(torch.zeros(1, max_len, d_model))
        nn.init.normal_(self.positional_embedding, mean=0.0, std=0.02)

    def forward(self, x):
        """Add positional embeddings to x and apply dropout.

        Args:
            x (Tensor): Tokens of shape (batch_size, num_patches, d_model).
        Returns:
            Tensor with the same shape as x.
        Raises:
            ValueError: If num_patches exceeds the `max_len` this module was
                built with.
        """
        num_patches = x.size(1)
        max_len = self.positional_embedding.size(1)
        # Slicing past max_len would silently yield a shorter tensor and the
        # addition would then fail (or broadcast) in a confusing way.
        if num_patches > max_len:
            raise ValueError(
                f"sequence has {num_patches} patches but PositionalEncoding "
                f"was built with max_len={max_len}"
            )
        x = x + self.positional_embedding[:, :num_patches, :]
        return self.dropout(x)

In [4]:
class FineTunedLLM(nn.Module):
    # Class-name suffixes that identify a normalization layer. Backbones that
    # use RMSNorm ship their own module classes rather than nn.LayerNorm, so
    # they are matched by name.
    _NORM_NAME_SUFFIXES = ("LayerNorm", "RMSNorm")

    def __init__(self, patch_embed, model_name="bert-base-uncased", 
                 max_len=512, dropout=0.1, num_classes=6):
        """
        Pre-trained transformer backbone used as a sequence classifier.

        All backbone weights are frozen except its normalization layers. The
        patch embedding, the positional embedding and the classifier head
        are trained.
        Args:
            patch_embed (nn.Module): Patch embedding module.
            model_name (str): Name of the pre-trained model to load.
            max_len (int): Maximum sequence length for positional embeddings.
            dropout (float): Dropout rate.
            num_classes (int): Number of output classes of the classifier head.
        """
        super(FineTunedLLM, self).__init__()
        self.patch_embed = patch_embed

        # Load the pre-trained backbone and its configuration
        self.config = AutoConfig.from_pretrained(model_name)
        self.llm = AutoModel.from_pretrained(model_name)
        d_model = self.config.hidden_size # 768 for BERT_base, 1024 for BERT_large

        # Use custom PositionalEncoding
        self.positional_encoding = PositionalEncoding(d_model, dropout, max_len)
        
        self.classifier = nn.Linear(d_model, num_classes)

        # Freeze all backbone params, then re-enable the normalization layers
        for p in self.llm.parameters():
            p.requires_grad = False
        self._unfreeze_norm_layers(self.llm)

        # The backbone's own token embeddings are not used for input_ids.
        # Important: do NOT delete them — just avoid using them in forward.
        # Enable PE to be trained
        self.positional_encoding.positional_embedding.requires_grad = True

    @classmethod
    def _is_norm_layer(cls, module):
        """Return True for nn.LayerNorm and for classes named *LayerNorm / *RMSNorm."""
        if isinstance(module, nn.LayerNorm):
            return True
        return type(module).__name__.endswith(cls._NORM_NAME_SUFFIXES)

    @classmethod
    def _unfreeze_norm_layers(cls, backbone):
        """Mark the weight and bias of every normalization layer in `backbone` trainable."""
        for module in backbone.modules():
            if not cls._is_norm_layer(module):
                continue
            for attr in ("weight", "bias"):
                param = getattr(module, attr, None)
                if isinstance(param, nn.Parameter):
                    param.requires_grad = True

    def forward(self, x):
        """Classify a batch of raw sequences.

        Args:
            x (Tensor): Input of shape (batch_size, in_channels, total_seq_len).
        Returns:
            Tensor of logits with shape (batch_size, num_classes).
        """
        # Patch embedding
        embeddings = self.patch_embed(x) # (batch_size, num_patches, d_model)
        # Apply positional encoding
        embeddings = self.positional_encoding(embeddings)

        # Pass through LLM encoder (skip token embeddings etc.)
        if hasattr(self.llm, "encoder"):  # BERT-style
            last_hidden_state = self.llm.encoder(
                hidden_states=embeddings,
                attention_mask=None,
                return_dict=True
            ).last_hidden_state # (batch_size, num_patches, d_model)
        else:
            # GPT / LLaMA etc.
            # NOTE(review): on this path the backbone presumably adds its own
            # position embeddings on top of the custom ones — confirm.
            last_hidden_state = self.llm(inputs_embeds=embeddings, attention_mask=None).last_hidden_state

        # Use the mean of the last hidden state as the representation
        representation = last_hidden_state.mean(dim=1) # (batch_size, d_model)

        # Classifier
        logits = self.classifier(representation)
        return logits

In [5]:
# NOTE(review): the model-building cells below pass model_name="gpt2" explicitly
# and never read this variable, so this value does not affect the experiments.
model_name = "bert-base-uncased"

### HHAR

In [6]:
# Load the HHAR windows and their per-time-step labels.
hhar_data = np.load('../dataset/hhar/data_20_120.npy')
hhar_label = np.load('../dataset/hhar/label_20_120.npy')

# Reorder the axes from (N, 120, 6) to channels-first (N, 6, 120).
hhar_data = np.transpose(hhar_data, (0, 2, 1))

print(hhar_data.shape) # (9166, 6, 120) 6 = 3 axis * 2 (acc + gyro), 120 = sequence length
print(hhar_label.shape) # (9166, 120, 3) ('user', 'model', 'gt') gt = (bike, sit, stairsdown, stairsup, stand, walk): 6 classes

# Keep the activity column ('gt', index 2) of the first time step of every
# window, giving a 1D array of shape (9166,).
hhar_label = hhar_label[:, 0, 2]

(9166, 6, 120)
(9166, 120, 3)


In [7]:
def train_val_test_split(data, label, seed, test_size=0.4, val_size=0.5, batch_size=64):
    """Split arrays into train/val/test sets and wrap each in a DataLoader.

    The data is split twice: `test_size` of it is held out first, and that
    held-out part is then divided into validation and test sets, with
    `val_size` being the fraction that becomes the test set. The defaults
    give a 60/20/20 split.

    Args:
        data (array-like): Samples; converted to float32 tensors.
        label (array-like): One label per sample; converted to int64 tensors.
        seed (int): Seed for the global RNGs and for both splits.
        test_size (float): Fraction of all samples held out of training.
        val_size (float): Fraction of the held-out samples used as the test set.
        batch_size (int): Batch size of all three loaders.
    Returns:
        tuple: (train_loader, val_loader, test_loader). Only the training
        loader shuffles.
    """
    set_seed(seed)
    X_train, X_held, y_train, y_held = train_test_split(data, label, test_size=test_size, random_state=seed)
    X_val, X_test, y_val, y_test = train_test_split(X_held, y_held, test_size=val_size, random_state=seed)

    def _make_loader(features, targets, shuffle):
        # Build a tensor dataset and its loader for one split.
        dataset = torch.utils.data.TensorDataset(
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(targets, dtype=torch.long),
        )
        return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = _make_loader(X_train, y_train, shuffle=True)
    val_loader = _make_loader(X_val, y_val, shuffle=False)
    test_loader = _make_loader(X_test, y_test, shuffle=False)

    return train_loader, val_loader, test_loader


# Build the HHAR train/val/test loaders with the function's default split
# fractions, using 3431 as the seed for both the RNGs and the split.
set_seed(3431)
train_loader, val_loader, test_loader = train_val_test_split(
    data=hhar_data,
    label=hhar_label,
    seed=3431,
)

In [8]:
set_seed(3431)

# Pick the compute device. GPU index 2 is preferred; fall back to the first
# GPU when fewer than three are visible, and to the CPU when CUDA is missing,
# instead of failing on an invalid device ordinal.
if torch.cuda.is_available():
    device = torch.device('cuda:2' if torch.cuda.device_count() > 2 else 'cuda:0')
else:
    device = torch.device('cpu')

patch_size = 8
stride = 4
d_model = 768 # token embedding size; must equal the backbone's hidden size
norm = False  # set True to apply Instance Normalization before patching

# Patch embedding feeding a pre-trained GPT-2 backbone with a 6-class head.
patch_embed = PatchEmbedding(in_channels=6, patch_size=patch_size, stride=stride, d_model=d_model, norm=norm)
model = FineTunedLLM(patch_embed, model_name="gpt2", max_len=256, dropout=0.1)
model.to(device)



FineTunedLLM(
  (patch_embed): PatchEmbedding(
    (instance_norm): InstanceNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
    (value_embedding): TokenEmbedding(
      (conv): Conv1d(6, 768, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
      (linear): Linear(in_features=8, out_features=1, bias=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (llm): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Con

In [9]:
def train_model(model, train_loader, val_loader, optimizer, criterion,
                device, num_epochs=50, save_path='best_model.pth', norm=False,
                eval_interval=10):
    """Train a classifier and checkpoint the weights with the best validation accuracy.

    Validation runs every `eval_interval` epochs and additionally after the
    final epoch, so a run of any length is validated at least once.

    Args:
        model (nn.Module): Model mapping a batch of inputs to class logits.
        train_loader (iterable): Yields (inputs, labels) training batches.
        val_loader (iterable): Yields (inputs, labels) validation batches.
        optimizer (torch.optim.Optimizer): Optimizer over the model's parameters.
        criterion (callable): Loss taking (logits, labels).
        device (torch.device): Device the batches are moved to.
        num_epochs (int): Number of passes over `train_loader`.
        save_path (str): Where the best `state_dict` is written.
        norm (bool): Unused; accepted for backward compatibility with callers.
        eval_interval (int): Validate every this many epochs.
    Raises:
        ValueError: If `eval_interval` is smaller than 1 or `train_loader`
            yields no samples.
    """
    if eval_interval < 1:
        raise ValueError(f"eval_interval must be >= 1, got {eval_interval}")

    best_acc = 0.0

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            # Classification loss
            cls_loss = criterion(outputs, labels)

            cls_loss.backward()
            optimizer.step()

            # Weight the batch-mean loss by batch size so the epoch average
            # is per sample.
            train_loss += cls_loss.item() * inputs.size(0)
            predicted = outputs.argmax(dim=1)
            train_correct += (predicted == labels).sum().item()
            train_total += labels.size(0)

        if train_total == 0:
            raise ValueError("train_loader yielded no samples")

        train_loss /= train_total
        train_accuracy = train_correct / train_total
        print(f'Epoch [{epoch+1}/{num_epochs}] - Train Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.6f}')

        # --- Validation ---
        is_last_epoch = epoch == num_epochs - 1
        if (epoch + 1) % eval_interval == 0 or is_last_epoch:
            print(f"Epoch {epoch+1}", '-' * 40)
            eval_acc = evaluate_model(model, val_loader, device)
            print('-' * 50)
            if eval_acc > best_acc:
                best_acc = eval_acc
                torch.save(model.state_dict(), save_path)
                print(f"Best model saved with accuracy: {best_acc:.6f}")


def evaluate_model(model, test_loader, device, split='Test'):
    """Compute and print cross-entropy loss and accuracy over a data loader.

    The model is switched to eval mode and left in it.

    Args:
        model (nn.Module): Model mapping a batch of inputs to class logits.
        test_loader (iterable): Yields (inputs, labels) batches.
        device (torch.device): Device the batches are moved to.
        split (str): Label used in the printed summary (e.g. 'Test', 'Val').
    Returns:
        float: Fraction of correctly classified samples.
    Raises:
        ValueError: If `test_loader` yields no samples.
    """
    model.eval()
    test_correct = 0
    test_total = 0
    test_loss = 0.0
    criterion = nn.CrossEntropyLoss()
    with torch.no_grad():
        for data, labels in test_loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            loss = criterion(outputs, labels)
            # Weight the batch-mean loss by batch size so the final average
            # is per sample.
            test_loss += loss.item() * data.size(0)
            predicted = outputs.argmax(dim=1)
            test_correct += (predicted == labels).sum().item()
            test_total += labels.size(0)

    if test_total == 0:
        raise ValueError("test_loader yielded no samples")

    test_loss /= test_total
    test_accuracy = test_correct / test_total
    print(f'{split} Loss: {test_loss:.4f}, {split} Accuracy: {test_accuracy:.6f}')
    return test_accuracy

In [10]:
# Cross-entropy objective, optimised with Adam at a learning rate of 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train on HHAR; the weights with the best validation accuracy go to 'hhar.pth'.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    num_epochs=50,
    save_path='hhar.pth',
    norm=norm,
)

Epoch [1/50] - Train Loss: 1.5246, Accuracy: 0.398618
Epoch [2/50] - Train Loss: 1.3529, Accuracy: 0.437170
Epoch [3/50] - Train Loss: 1.1824, Accuracy: 0.524459
Epoch [4/50] - Train Loss: 0.7914, Accuracy: 0.726314
Epoch [5/50] - Train Loss: 0.5822, Accuracy: 0.812148
Epoch [6/50] - Train Loss: 0.4448, Accuracy: 0.862339
Epoch [7/50] - Train Loss: 0.2865, Accuracy: 0.907256
Epoch [8/50] - Train Loss: 0.2150, Accuracy: 0.933806
Epoch [9/50] - Train Loss: 0.1734, Accuracy: 0.948354
Epoch [10/50] - Train Loss: 0.1560, Accuracy: 0.952173
Epoch 10 ----------------------------------------
Test Loss: 0.1140, Test Accuracy: 0.963448
--------------------------------------------------
Best model saved with accuracy: 0.963448
Epoch [11/50] - Train Loss: 0.1358, Accuracy: 0.956901
Epoch [12/50] - Train Loss: 0.1274, Accuracy: 0.961629
Epoch [13/50] - Train Loss: 0.1146, Accuracy: 0.965630
Epoch [14/50] - Train Loss: 0.1133, Accuracy: 0.966721
Epoch [15/50] - Train Loss: 0.1011, Accuracy: 0.972904

In [11]:
# Report test metrics for the checkpoint that scored best on validation,
# not for whatever weights the last training epoch left in memory.
model.load_state_dict(torch.load('hhar.pth', map_location=device))
evaluate_model(model, test_loader, device)

Test Loss: 0.0295, Test Accuracy: 0.994547


0.9945474372955289

### UCI

In [12]:
# Load the UCI windows and their per-time-step labels.
uci_data = np.load('../dataset/uci/data_20_120.npy')
uci_label = np.load('../dataset/uci/label_20_120.npy')

print(uci_data.shape) # (2088, 120, 6) 120 = sequence length, 6 = 3 axis * 2 (acc + gyro)
print(uci_label.shape) # (2088, 120, 2) 6 classes (walk, upstairs, downstairs, sit, stand, lay), 30 participants

# Reorder the axes from (N, 120, 6) to channels-first (N, 6, 120).
uci_data = np.transpose(uci_data, (0, 2, 1))

# Keep the activity column (index 0) of the first time step of every window,
# giving a 1D array of shape (2088,).
uci_label = uci_label[:, 0, 0]
print(uci_label.shape)

# Class balance: number of windows per label value.
unique, counts = np.unique(uci_label, return_counts=True)
print(dict(zip(unique, counts)))

(2088, 120, 6)
(2088, 120, 2)
(2088,)
{0.0: 355, 1.0: 318, 2.0: 254, 3.0: 362, 4.0: 398, 5.0: 401}


In [13]:
# Build the UCI train/val/test loaders from a fixed seed.
set_seed(3431)
train_loader, val_loader, test_loader = train_val_test_split(
    data=uci_data,
    label=uci_label,
    seed=3431,
)

# Fresh patch embedding and GPT-2 backbone (6-class head) for this dataset.
patch_embed = PatchEmbedding(in_channels=6, patch_size=patch_size, stride=stride, d_model=d_model, norm=norm)
model = FineTunedLLM(patch_embed, model_name="gpt2", max_len=256, dropout=0.1)
model = model.to(device)

# Cross-entropy objective, optimised with Adam at a learning rate of 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train on UCI; the weights with the best validation accuracy go to 'uci.pth'.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    num_epochs=50,
    save_path='uci.pth',
    norm=norm,
)



Epoch [1/50] - Train Loss: 1.6101, Accuracy: 0.359425
Epoch [2/50] - Train Loss: 1.1431, Accuracy: 0.514377
Epoch [3/50] - Train Loss: 1.0505, Accuracy: 0.541534
Epoch [4/50] - Train Loss: 0.9938, Accuracy: 0.550319
Epoch [5/50] - Train Loss: 0.9530, Accuracy: 0.587859
Epoch [6/50] - Train Loss: 0.9336, Accuracy: 0.599042
Epoch [7/50] - Train Loss: 0.8434, Accuracy: 0.651757
Epoch [8/50] - Train Loss: 0.7889, Accuracy: 0.678914
Epoch [9/50] - Train Loss: 0.6750, Accuracy: 0.729233
Epoch [10/50] - Train Loss: 0.6257, Accuracy: 0.736422
Epoch 10 ----------------------------------------
Test Loss: 0.6920, Test Accuracy: 0.679426
--------------------------------------------------
Best model saved with accuracy: 0.679426
Epoch [11/50] - Train Loss: 0.5611, Accuracy: 0.790735
Epoch [12/50] - Train Loss: 0.4507, Accuracy: 0.844249
Epoch [13/50] - Train Loss: 0.3947, Accuracy: 0.866613
Epoch [14/50] - Train Loss: 0.3470, Accuracy: 0.882588
Epoch [15/50] - Train Loss: 0.3346, Accuracy: 0.875399

In [14]:
# Report test metrics for the checkpoint that scored best on validation,
# not for whatever weights the last training epoch left in memory.
model.load_state_dict(torch.load('uci.pth', map_location=device))
evaluate_model(model, test_loader, device)

Test Loss: 0.1942, Test Accuracy: 0.911483


0.9114832535885168

### Motion

In [15]:
# motion
# Load the Motion windows and their per-time-step labels.
motion_data = np.load('../dataset/motion/data_20_120.npy')
motion_label = np.load('../dataset/motion/label_20_120.npy')

print(motion_data.shape) # (4534, 120, 6) 120 = sequence length, 6 = 3 axis * 2 (acc + gyro)
print(motion_label.shape) # (4534, 120, 2) 6 classes: (stairdown, stairup, sit, stand, walk, jog) + 24 participants

# Optional acceleration scaling, currently disabled: divide the first three
# channels by 9.8.
# motion_data[:, :, :3] /= 9.8

# Reorder the axes from (N, 120, 6) to channels-first (N, 6, 120).
motion_data = np.transpose(motion_data, (0, 2, 1))

# Keep the activity column (index 0) of the first time step of every window,
# giving a 1D array of shape (4534,).
motion_label = motion_label[:, 0, 0]
print(motion_label.shape)

# Class balance: number of windows per label value.
unique, counts = np.unique(motion_label, return_counts=True)
print(dict(zip(unique, counts)))

(4534, 120, 6)
(4534, 120, 2)
(4534,)
{0.0: 402, 1.0: 490, 2.0: 1105, 3.0: 999, 4.0: 1112, 5.0: 426}


In [16]:
# Build the Motion train/val/test loaders from a fixed seed.
set_seed(3431)
train_loader, val_loader, test_loader = train_val_test_split(
    data=motion_data,
    label=motion_label,
    seed=3431,
)

# Fresh patch embedding and GPT-2 backbone (6-class head) for this dataset.
patch_embed = PatchEmbedding(in_channels=6, patch_size=patch_size, stride=stride, d_model=d_model, norm=norm)
model = FineTunedLLM(patch_embed, model_name="gpt2", max_len=256, dropout=0.1)
model = model.to(device)

# Cross-entropy objective, optimised with Adam at a learning rate of 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train on Motion; the weights with the best validation accuracy go to 'motion.pth'.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    num_epochs=50,
    save_path='motion.pth',
    norm=norm,
)

Epoch [1/50] - Train Loss: 1.4349, Accuracy: 0.454412
Epoch [2/50] - Train Loss: 0.8997, Accuracy: 0.644853
Epoch [3/50] - Train Loss: 0.5055, Accuracy: 0.823529
Epoch [4/50] - Train Loss: 0.3303, Accuracy: 0.881618
Epoch [5/50] - Train Loss: 0.2800, Accuracy: 0.911397
Epoch [6/50] - Train Loss: 0.2284, Accuracy: 0.930147
Epoch [7/50] - Train Loss: 0.2125, Accuracy: 0.931618
Epoch [8/50] - Train Loss: 0.1934, Accuracy: 0.943750
Epoch [9/50] - Train Loss: 0.1973, Accuracy: 0.938603
Epoch [10/50] - Train Loss: 0.1835, Accuracy: 0.946691
Epoch 10 ----------------------------------------
Test Loss: 0.1881, Test Accuracy: 0.941566
--------------------------------------------------
Best model saved with accuracy: 0.941566
Epoch [11/50] - Train Loss: 0.1592, Accuracy: 0.960294
Epoch [12/50] - Train Loss: 0.1506, Accuracy: 0.959926
Epoch [13/50] - Train Loss: 0.1500, Accuracy: 0.961397
Epoch [14/50] - Train Loss: 0.1440, Accuracy: 0.958088
Epoch [15/50] - Train Loss: 0.1436, Accuracy: 0.957353

In [17]:
# Report test metrics for the checkpoint that scored best on validation,
# not for whatever weights the last training epoch left in memory.
model.load_state_dict(torch.load('motion.pth', map_location=device))
evaluate_model(model, test_loader, device)

Test Loss: 0.1163, Test Accuracy: 0.976847


0.9768467475192943

### Shoaib

In [18]:
# Load the Shoaib windows and their per-time-step labels.
shoaib_data = np.load('../dataset/shoaib/data_20_120.npy')
shoaib_label = np.load('../dataset/shoaib/label_20_120.npy')

print(shoaib_data.shape) # (10500, 120, 9) 120 = sequence length, 9 = 3 axis * 3 (acc + gyro + mag)
print(shoaib_label.shape) # (10500, 120, 3) 7 classes: (walking, sitting, standing, jogging, biking, upstairs, downstairs) + 10 participants

# Drop the magnetometer (last 3 channels), keeping the first 6.
shoaib_data = shoaib_data[:, :, :6]
# Reorder the axes from (N, 120, 6) to channels-first (N, 6, 120).
shoaib_data = np.transpose(shoaib_data, (0, 2, 1))

# Keep the activity column (index 0) of the first time step of every window,
# giving a 1D array of shape (10500,).
shoaib_label = shoaib_label[:, 0, 0]
print(shoaib_label.shape)

# Class balance: number of windows per label value.
unique, counts = np.unique(shoaib_label, return_counts=True)
print(dict(zip(unique, counts)))

(10500, 120, 9)
(10500, 120, 3)
(10500,)
{0.0: 1650, 1.0: 1500, 2.0: 1500, 3.0: 1500, 4.0: 1500, 5.0: 1350, 6.0: 1500}


In [19]:
# Build the Shoaib train/val/test loaders from a fixed seed.
set_seed(3431)

train_loader, val_loader, test_loader = train_val_test_split(
    data=shoaib_data,
    label=shoaib_label,
    seed=3431,
)

# Fresh patch embedding and GPT-2 backbone; Shoaib has 7 activity classes.
patch_embed = PatchEmbedding(in_channels=6, patch_size=patch_size, stride=stride, d_model=d_model, norm=norm)
model = FineTunedLLM(patch_embed, model_name="gpt2", max_len=256, dropout=0.1, num_classes=7)
model = model.to(device)

# Cross-entropy objective, optimised with Adam at a learning rate of 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train on Shoaib; the weights with the best validation accuracy go to 'shoaib.pth'.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    num_epochs=50,
    save_path='shoaib.pth',
    norm=norm,
)

Epoch [1/50] - Train Loss: 1.4924, Accuracy: 0.405556
Epoch [2/50] - Train Loss: 0.8509, Accuracy: 0.692698
Epoch [3/50] - Train Loss: 0.4280, Accuracy: 0.838730
Epoch [4/50] - Train Loss: 0.2966, Accuracy: 0.890952
Epoch [5/50] - Train Loss: 0.2309, Accuracy: 0.918254
Epoch [6/50] - Train Loss: 0.2013, Accuracy: 0.924921
Epoch [7/50] - Train Loss: 0.1692, Accuracy: 0.940159
Epoch [8/50] - Train Loss: 0.1680, Accuracy: 0.937460
Epoch [9/50] - Train Loss: 0.1460, Accuracy: 0.948571
Epoch [10/50] - Train Loss: 0.1377, Accuracy: 0.950794
Epoch 10 ----------------------------------------
Test Loss: 0.1419, Test Accuracy: 0.950000
--------------------------------------------------
Best model saved with accuracy: 0.950000
Epoch [11/50] - Train Loss: 0.1220, Accuracy: 0.958095
Epoch [12/50] - Train Loss: 0.1169, Accuracy: 0.959683
Epoch [13/50] - Train Loss: 0.1059, Accuracy: 0.961587
Epoch [14/50] - Train Loss: 0.1093, Accuracy: 0.960159
Epoch [15/50] - Train Loss: 0.0932, Accuracy: 0.966508

In [20]:
# Report test metrics for the checkpoint that scored best on validation,
# not for whatever weights the last training epoch left in memory.
model.load_state_dict(torch.load('shoaib.pth', map_location=device))
evaluate_model(model, test_loader, device)

Test Loss: 0.1094, Test Accuracy: 0.964762


0.9647619047619047