In [1]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from matplotlib.ticker import MaxNLocator

import torchmetrics
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

from multiprocessing import cpu_count

In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Notebook-wide plotting defaults: seaborn theme first, custom palette on top.
sns.set(font_scale=1.2, palette='muted', style='whitegrid')

HAPPY_COLORS_PALETTE = [
    "#01BEFE",
    "#FFDD00",
    "#FF7D00",
    "#FF006D",
    "#ADFF02",
    "#8F00FF",
]

# Replace the 'muted' palette chosen above with the custom colours.
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default figure size for every plot in the notebook.
rcParams['figure.figsize'] = (16, 10)

# Enable tqdm's pandas integration.
tqdm.pandas()

In [3]:
pl.seed_everything(42)

Seed set to 42


42

In [4]:
# Load the sensor readings (X_train) and the per-series targets (y_train)
# from the local competition data folder.
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        './career-con-2019/X_train.csv',
        './career-con-2019/y_train.csv',
    )
)

In [5]:
y_train.head()

Unnamed: 0,series_id,group_id,surface
0,0,13,fine_concrete
1,1,31,concrete
2,2,20,concrete
3,3,31,concrete
4,4,22,soft_tiles


In [6]:
# Turn surface names into integer class ids. The fitted encoder is kept
# because its `classes_` are reused further down for report/matrix labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train['surface'])
encoded_labels = label_encoder.transform(y_train['surface'])
encoded_labels[:5]

array([2, 1, 1, 1, 6])

In [7]:
label_encoder.classes_

array(['carpet', 'concrete', 'fine_concrete', 'hard_tiles',
       'hard_tiles_large_space', 'soft_pvc', 'soft_tiles', 'tiled',
       'wood'], dtype=object)

In [8]:
# Store the integer class id next to the surface name so every series row
# carries its numeric target (adds/overwrites the 'label' column in place).
y_train['label'] = encoded_labels
y_train.head()

Unnamed: 0,series_id,group_id,surface,label
0,0,13,fine_concrete,2
1,1,31,concrete,1
2,2,20,concrete,1
3,3,31,concrete,1
4,4,22,soft_tiles,6


In [9]:
# Everything after the first three columns of X_train is used as a model
# input; the skipped columns include the `series_id` key used for grouping.
FEATURE_COLUMNS = list(X_train.columns[3:])
FEATURE_COLUMNS

['orientation_X',
 'orientation_Y',
 'orientation_Z',
 'orientation_W',
 'angular_velocity_X',
 'angular_velocity_Y',
 'angular_velocity_Z',
 'linear_acceleration_X',
 'linear_acceleration_Y',
 'linear_acceleration_Z']

In [10]:
LABEL_COLUMNS = y_train.columns.tolist()

In [11]:
LABEL_COLUMNS

['series_id', 'group_id', 'surface', 'label']

In [12]:
# Sanity check: the number of series that have exactly 128 rows in X_train
# must equal the number of rows in y_train.
rows_per_series = X_train.series_id.value_counts()
(rows_per_series == 128).sum() == len(y_train)

True

In [13]:
# Build the series_id -> label lookup once instead of filtering all of
# y_train with a boolean mask for every series. `drop_duplicates` keeps the
# first row per series_id, matching the previous `.iloc[0]` choice.
label_by_series = (
    y_train.drop_duplicates(subset='series_id')
    .set_index('series_id')['label']
    .to_dict()
)

# One (feature frame, label) pair per series, in groupby (sorted key) order.
sequences = [
    (group[FEATURE_COLUMNS], label_by_series[series_id])
    for series_id, group in X_train.groupby('series_id')
]

In [14]:
# Pin the split's RNG so re-running this cell yields the same partition
# instead of depending on how much of the global NumPy random state has
# already been consumed.
train_sequences, test_sequences = train_test_split(
    sequences,
    test_size=0.2,
    random_state=42,
)

In [15]:
len(train_sequences), len(test_sequences)

(3048, 762)

In [16]:
class SurfaceDataset(Dataset):
  """Map-style dataset over ``(features, label)`` pairs.

  Args:
    sequences: indexable collection of 2-tuples. The first element must
      expose ``to_numpy()`` (a pandas DataFrame in this notebook); the
      second is the integer class id of that sequence.
  """

  def __init__(self, sequences):
    self.sequences = sequences

  def __len__(self):
    """Number of stored sequences."""
    return len(self.sequences)

  def __getitem__(self, idx):
    """Return sample ``idx`` as a dict of tensors.

    Returns:
      ``{"sequence": float32 tensor built from the feature values,
         "labels": int64 tensor of shape (1,) holding the class id}``
    """
    sequence, label = self.sequences[idx]
    return dict(
        sequence=torch.tensor(sequence.to_numpy(), dtype=torch.float32),
        # Build the label directly as int64: going through a float32 tensor
        # first (the old `torch.Tensor([label]).long()`) rounds large ids.
        labels=torch.tensor([label], dtype=torch.long),
    )

In [17]:
class SurfaceDataModule(pl.LightningDataModule):
  """Lightning data module serving the train and held-out sequence lists.

  The validation and test loaders both read ``self.test_dataset``.
  """

  def __init__(self, train_sequences, test_sequences, batch_size=8):
    super().__init__()
    self.train_sequences = train_sequences
    self.test_sequences = test_sequences
    self.batch_size = batch_size

  def setup(self, stage=None):
    """Wrap both sequence lists in ``SurfaceDataset``; ``stage`` is not used."""
    self.train_dataset = SurfaceDataset(self.train_sequences)
    self.test_dataset = SurfaceDataset(self.test_sequences)

  def _loader(self, dataset, shuffle):
    """Build a DataLoader with this module's batch size and one worker per CPU."""
    return DataLoader(
      dataset,
      batch_size=self.batch_size,
      shuffle=shuffle,
      num_workers=cpu_count()
    )

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return self._loader(self.train_dataset, shuffle=True)

  def val_dataloader(self):
    """Unshuffled loader over the held-out dataset (same data as the test loader)."""
    return self._loader(self.test_dataset, shuffle=False)

  def test_dataloader(self):
    """Unshuffled loader over the held-out dataset."""
    return self._loader(self.test_dataset, shuffle=False)

In [18]:
# Training hyperparameters: epoch cap for the trainer, batch size for the loaders.
N_EPOCHS, BATCH_SIZE = 20, 64

data_module = SurfaceDataModule(
  train_sequences=train_sequences,
  test_sequences=test_sequences,
  batch_size=BATCH_SIZE,
)

In [19]:
class SequenceModel(nn.Module):
  """Stacked LSTM followed by a linear layer that scores every class.

  Args:
    n_features: size of each time step's feature vector (LSTM ``input_size``).
    n_classes: number of output logits.
    n_hidden: LSTM hidden size.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability applied between stacked LSTM layers.
      Defaults to 0.75, the value that used to be hard-coded. Forced to 0
      when ``n_layers == 1``, where there is no between-layer connection.
  """

  def __init__(self, n_features, n_classes, n_hidden=256, n_layers=3, dropout=0.75):
    super().__init__()

    self.lstm = nn.LSTM(
        input_size=n_features,
        hidden_size=n_hidden,
        num_layers=n_layers,
        batch_first=True,
        # nn.LSTM warns about non-zero dropout on a single-layer network.
        dropout=dropout if n_layers > 1 else 0.0,
    )

    self.classifier = nn.Linear(n_hidden, n_classes)

  def forward(self, x):
    """Return class logits for a batch-first input ``(batch, seq_len, n_features)``."""
    self.lstm.flatten_parameters()
    _, (hidden, _) = self.lstm(x)

    # `hidden` stacks the final state of every layer; classify from the top one.
    return self.classifier(hidden[-1])


In [20]:
class SurfacePredictor(pl.LightningModule):
  """Lightning wrapper that trains ``SequenceModel`` with cross-entropy loss.

  Args:
    n_features: number of input features per time step.
    n_classes: number of target classes.
  """

  def __init__(self,n_features:int, n_classes: int):
    super().__init__()
    self.model = SequenceModel(n_features, n_classes)
    self.criterion = nn.CrossEntropyLoss()
    self.n_classes = n_classes

  def forward(self, x, labels=None):
    """Run the model; return ``(loss, logits)``.

    ``loss`` is the cross-entropy against ``labels`` when they are given,
    otherwise the integer ``0``.
    """
    output = self.model(x)
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def _shared_step(self, batch, stage):
    """Compute and log loss and accuracy for one batch.

    ``stage`` is the metric-name prefix ("train", "val" or "test"), so the
    logged keys are ``{stage}_loss`` and ``{stage}_accuracy``.
    """
    sequences = batch["sequence"]
    # Flatten to 1-D explicitly. A bare `.squeeze()` also drops the batch
    # axis when the batch holds a single sample, leaving a 0-d target that
    # no longer lines up with the (batch, n_classes) logits.
    labels = batch["labels"].reshape(-1)
    loss, outputs = self(sequences, labels)
    predictions = torch.argmax(outputs, dim=1)
    step_accuracy = torchmetrics.functional.accuracy(
        predictions, labels, task="multiclass", num_classes=self.n_classes
    )

    self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)
    self.log(f"{stage}_accuracy", step_accuracy, prog_bar=True, logger=True)
    return {"loss": loss, "accuracy": step_accuracy}

  def training_step(self, batch, batch_idx):
    """Log ``train_loss`` / ``train_accuracy`` and return them with the loss."""
    return self._shared_step(batch, "train")

  def validation_step(self, batch, batch_idx):
    """Log ``val_loss`` / ``val_accuracy`` for one validation batch."""
    return self._shared_step(batch, "val")

  def test_step(self, batch, batch_idx):
    """Log ``test_loss`` / ``test_accuracy`` for one test batch."""
    return self._shared_step(batch, "test")

  def configure_optimizers(self):
    """Adam over all parameters with a learning rate of 0.001."""
    return optim.Adam(self.parameters(), lr=0.001)

In [21]:
# Model dimensions come from the data: one input per feature column and
# one output per encoded surface class.
model = SurfacePredictor(
  n_classes=len(label_encoder.classes_),
  n_features=len(FEATURE_COLUMNS),
)

In [22]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

Reusing TensorBoard on port 6006 (pid 35261), started 0:01:54 ago. (Use '!kill 35261' to kill it.)

In [23]:
# Keep only the single checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

# Stop training once val_loss has not improved for 10 consecutive checks.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10)

trainer = pl.Trainer(
    callbacks=[early_stopping_callback, checkpoint_callback],
    max_epochs=N_EPOCHS,
    # The default logging interval (50 steps) is longer than one training
    # epoch here (48 batches), so Lightning warned that step-level training
    # metrics would not be logged. Log more often than that.
    log_every_n_steps=10,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [24]:
trainer.fit(model, data_module)

/home/nick/anaconda3/envs/wearables/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /home/nick/Documents/ss24/ds4w-user-identification/checkpoints exists and is not empty.

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | SequenceModel    | 1.3 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.318     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/nick/anaconda3/envs/wearables/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (48) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 48: 'val_loss' reached 2.00916 (best 2.00916), saving model to '/home/nick/Documents/ss24/ds4w-user-identification/checkpoints/best-checkpoint-v10.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 96: 'val_loss' reached 1.95475 (best 1.95475), saving model to '/home/nick/Documents/ss24/ds4w-user-identification/checkpoints/best-checkpoint-v10.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 144: 'val_loss' reached 1.93585 (best 1.93585), saving model to '/home/nick/Documents/ss24/ds4w-user-identification/checkpoints/best-checkpoint-v10.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 192: 'val_loss' reached 1.84139 (best 1.84139), saving model to '/home/nick/Documents/ss24/ds4w-user-identification/checkpoints/best-checkpoint-v10.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 240: 'val_loss' reached 1.80959 (best 1.80959), saving model to '/home/nick/Documents/ss24/ds4w-user-identification/checkpoints/best-checkpoint-v10.ckpt' as top 1


: 

In [None]:
trainer.test(datamodule=data_module)

Restoring states from the checkpoint path at /home/nick/Documents/ss24/ds4w-user-identification/checkpoints/best-checkpoint-v7.ckpt
Loaded model weights from the checkpoint at /home/nick/Documents/ss24/ds4w-user-identification/checkpoints/best-checkpoint-v7.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy         0.3215222954750061
        test_loss            1.788115382194519
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 1.788115382194519, 'test_accuracy': 0.3215222954750061}]

In [None]:
# Rebuild the predictor from the best checkpoint recorded by the trainer's
# checkpoint callback, then put it in inference mode.
best_checkpoint_path = trainer.checkpoint_callback.best_model_path
trained_model = SurfacePredictor.load_from_checkpoint(
  best_checkpoint_path,
  n_classes=len(label_encoder.classes_),
  n_features=len(FEATURE_COLUMNS),
)
trained_model.eval()
trained_model.freeze()

In [None]:
# `tqdm` is already imported in the notebook's import cell, so it is not
# re-imported here.
test_dataset = SurfaceDataset(test_sequences)

predictions = []
labels = []

# Score the held-out sequences one at a time, collecting the predicted and
# true class ids as plain Python ints for the scikit-learn reports below.
for item in tqdm(test_dataset):
  # The model expects a batch axis, so add one in front of the single sequence.
  _, output = trained_model(item["sequence"].unsqueeze(dim=0))
  predictions.append(torch.argmax(output, dim=1).item())
  labels.append(item["labels"].item())

  0%|          | 0/762 [00:00<?, ?it/s]

In [None]:
# Some classes are never predicted, which leaves their precision undefined.
# Report those as 0 explicitly instead of relying on the default, which
# prints the same 0.00 values but emits a warning per metric.
print(
    classification_report(
        labels,
        predictions,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)

                        precision    recall  f1-score   support

                carpet       0.00      0.00      0.00        47
              concrete       0.40      0.83      0.54       172
         fine_concrete       0.00      0.00      0.00        65
            hard_tiles       0.00      0.00      0.00         5
hard_tiles_large_space       0.00      0.00      0.00        63
              soft_pvc       0.34      0.42      0.37       141
            soft_tiles       0.00      0.00      0.00        63
                 tiled       0.00      0.00      0.00        89
                  wood       0.19      0.38      0.25       117

              accuracy                           0.32       762
             macro avg       0.10      0.18      0.13       762
          weighted avg       0.18      0.32      0.23       762



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
def show_confusion_matrix(confusion_matrix):
  """Draw ``confusion_matrix`` as an annotated blue heatmap with labelled axes.

  Row tick labels are kept horizontal, column tick labels are rotated by
  30 degrees; both are right-aligned.
  """
  hmap = sns.heatmap(confusion_matrix, cmap="Blues", fmt="d", annot=True)
  for axis, angle in ((hmap.yaxis, 0), (hmap.xaxis, 30)):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
  plt.ylabel('True surface')
  plt.xlabel('Predicted surface')

In [None]:
# Pass every encoded class id explicitly so the matrix always has one
# row/column per class, even if a class is missing from both the true and
# predicted lists. Otherwise the matrix would shrink and no longer match
# the index/columns given to the DataFrame.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(labels, predictions, labels=class_ids)

# Rows are true surfaces, columns are predicted surfaces.
df_cm = pd.DataFrame(
    cm, index=label_encoder.classes_, columns=label_encoder.classes_
)
df_cm

Unnamed: 0,carpet,concrete,fine_concrete,hard_tiles,hard_tiles_large_space,soft_pvc,soft_tiles,tiled,wood
carpet,0,20,0,0,0,4,0,0,23
concrete,0,142,0,0,0,7,0,0,23
fine_concrete,0,14,0,0,0,8,0,0,43
hard_tiles,0,0,0,0,0,5,0,0,0
hard_tiles_large_space,0,48,0,0,0,7,0,0,8
soft_pvc,0,18,0,0,0,59,0,0,64
soft_tiles,0,4,0,0,0,39,0,0,20
tiled,0,73,0,0,0,8,0,0,8
wood,0,34,0,0,0,39,0,0,44
