In [25]:
import os
import json
import torch
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [3]:
### DATA INGESTION

class SensorDataset(Dataset):
    """Dataset of sensor recordings stored as one CSV per sample.

    Labels come from a single JSON file that maps CSV file names to labels.
    Only files that are present in ``csv_dir`` *and* have an entry in the
    label file are part of the dataset.

    Args:
        csv_dir: Directory containing the CSV files.
        label_file: Path to the JSON file mapping file name -> label.
    """

    def __init__(self, csv_dir, label_file):
        self.csv_dir = csv_dir
        with open(label_file, 'r') as f:
            self.labels_dict = json.load(f)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable across runs and machines.
        self.file_list = sorted(
            f for f in os.listdir(csv_dir) if f in self.labels_dict
        )

    def __len__(self):
        """Return the number of labelled CSV files found in ``csv_dir``."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(features, label)`` tuple of float32 tensors. ``features``
            holds every CSV column except the first one; ``label`` is the
            value stored for this file in the label JSON.
        """
        file_name = self.file_list[idx]
        file_path = os.path.join(self.csv_dir, file_name)

        df = pd.read_csv(file_path)

        # The first column is dropped (presumably a timestamp/index column —
        # TODO confirm against the data schema).
        features = df.iloc[:, 1:].values
        features = torch.tensor(features, dtype=torch.float32)

        label = self.labels_dict[file_name]
        label = torch.tensor(label, dtype=torch.float32)

        return features, label

def create_dataloader(csv_dir, label_file, batch_size=4):
    """
    Build a shuffling DataLoader over a SensorDataset.

    Features are read from the CSV files in `csv_dir`; labels are read from
    the single JSON file `label_file`.
    """
    return DataLoader(
        SensorDataset(csv_dir, label_file),
        batch_size=batch_size,
        shuffle=True,
    )

In [44]:
### MODELS

class LSTMClassifier(nn.Module):
    """Sequence classifier: stacked LSTM followed by a linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_size); the LSTM is
                built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, num_classes) computed from the LSTM
            output at the last time step.
        """
        # new_zeros allocates directly on x's device with x's dtype, so the
        # initial state also matches when the model runs in float64/float16.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        out, _ = self.lstm(x, (h0, c0))
        # Only the last time step feeds the classification head.
        out = self.fc(out[:, -1, :])
        return out

# MLP: three hidden layers (128 -> 64 -> 32) followed by a linear output layer
class MLPClassifier(nn.Module):
    """Fully connected classifier applied to the flattened input.

    Each sample is flattened before the first linear layer, so
    ``input_size`` must equal the number of elements in one sample.

    Args:
        input_size: Number of values per sample after flattening.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability after the first hidden layer.
    """

    def __init__(self, input_size, num_classes, dropout_rate=0.2):
        super().__init__()
        # Layer order is significant: it fixes the state_dict keys.
        stack = [
            nn.Flatten(),
            # hidden block 1: linear -> batch norm -> ReLU -> dropout
            nn.Linear(input_size, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            # hidden block 2
            nn.Linear(128, 64),
            nn.ReLU(),
            # hidden block 3
            nn.Linear(64, 32),
            nn.ReLU(),
            # output logits
            nn.Linear(32, num_classes),
        ]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Return logits of shape (batch, num_classes)."""
        logits = self.network(x)
        return logits
    
# 1D CNN
class CNN1DClassifier(nn.Module):
    """1D convolutional classifier with global average pooling.

    Args:
        input_size: Number of input channels (features per time step).
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        conv_layers = [
            nn.Conv1d(input_size, 32, kernel_size=3),
            nn.ReLU(),
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3),
            nn.ReLU(),
            # Collapse the time axis to length 1.
            nn.AdaptiveAvgPool1d(1),
        ]
        self.conv_block = nn.Sequential(*conv_layers)
        self.fc = nn.Linear(64, num_classes)

    def forward(self, x):
        """Return logits for ``x``.

        Dimensions 1 and 2 of ``x`` are swapped before the convolutions, so
        the input is expected as (batch, seq_len, input_size).
        """
        channels_first = torch.transpose(x, 1, 2)
        pooled = self.conv_block(channels_first)
        return self.fc(pooled.squeeze(-1))

In [46]:
### TRAINING

def train_model(dataloader, model, criterion, optimiser, epochs=10):
    """Train ``model`` in place and print the mean batch loss per epoch.

    The model is moved to CUDA when available, otherwise it stays on the CPU.

    Args:
        dataloader: Iterable yielding ``(features, labels)`` batches. Labels
            are cast to ``long`` before being passed to ``criterion``.
        model: The ``nn.Module`` to optimise.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimiser: Optimiser built over ``model``'s parameters.
        epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        # Count batches as they are consumed instead of calling len(), which
        # is not defined for every iterable and is 0 for an empty loader.
        num_batches = 0

        for batch_features, batch_labels in dataloader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device).long()

            optimiser.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimiser.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; cannot compute an average loss")

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

    # return loss.item()         # For Optuna, eventually

# Hyperparameters are declared before anything that consumes them, so the
# values listed here are the values actually used by the run below.
lr = 0.001
batch_size = 3
epochs = 20

dataloader = create_dataloader("data/dummy/dataset", "data/dummy/dataset/labels.json", batch_size=batch_size)

# Alternative architectures (uncomment exactly one `model = ...` line).
# NOTE: MLPClassifier flattens each sample before its first Linear layer, so
# its input_size must be the total number of values per sample, not the
# per-time-step feature count — verify before enabling it.
# model = LSTMClassifier(input_size=8, hidden_size=64, num_layers=2, num_classes=8)
# model = MLPClassifier(input_size=8, num_classes=8)
model = CNN1DClassifier(input_size=8, num_classes=8)

criterion = nn.CrossEntropyLoss()
optimiser = optim.Adam(model.parameters(), lr=lr)

train_model(dataloader, model, criterion, optimiser, epochs)

Epoch 1/20 - Loss: 17.7142
Epoch 2/20 - Loss: 3.0993
Epoch 3/20 - Loss: 3.9404
Epoch 4/20 - Loss: 1.9289
Epoch 5/20 - Loss: 2.4119
Epoch 6/20 - Loss: 2.1205
Epoch 7/20 - Loss: 1.4929
Epoch 8/20 - Loss: 1.1879
Epoch 9/20 - Loss: 1.4211
Epoch 10/20 - Loss: 1.5772
Epoch 11/20 - Loss: 1.0593
Epoch 12/20 - Loss: 1.1646
Epoch 13/20 - Loss: 1.0457
Epoch 14/20 - Loss: 1.7416
Epoch 15/20 - Loss: 0.9086
Epoch 16/20 - Loss: 1.3006
Epoch 17/20 - Loss: 0.8041
Epoch 18/20 - Loss: 0.7933
Epoch 19/20 - Loss: 0.6908
Epoch 20/20 - Loss: 0.8727


In [48]:
### PREDICTION AND EVALUATE

def get_features_from_csv(file_path):
    """Read the CSV at ``file_path`` and return all columns but the first as a NumPy array."""
    table = pd.read_csv(file_path)
    # Column 0 is skipped; everything to its right is treated as features.
    return table.iloc[:, 1:].values

def predict_csv(model, features):
    """Classify a single sample and report the winning class probability.

    Args:
        model: Trained ``nn.Module`` returning logits of shape (1, num_classes)
            for an input with a leading batch dimension of 1.
        features: Array-like of one sample's features; a batch dimension is
            added in front before the forward pass.

    Returns:
        ``(pred_class, confidence)``: the arg-max class index (int) and its
        softmax probability (float).
    """
    model.eval()

    # Run on whatever device the model already lives on. Picking CUDA just
    # because it is available breaks when the model was left on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(input_tensor)
        probs = torch.softmax(logits, dim=1)

        pred_class = torch.argmax(probs, dim=1).item()
        confidence = torch.max(probs).item()

    return pred_class, confidence

def evaluate_folder(model, folder_path, label_path, num_classes=8):
    """Run ``model`` on every labelled CSV and compute classification metrics.

    Args:
        model: Trained model passed through to ``predict_csv``.
        folder_path: Directory containing the CSV files named in the label file.
        label_path: JSON file mapping CSV file name -> integer class label.
        num_classes: Number of classes; metrics are reported for labels
            ``0 .. num_classes - 1``. Defaults to 8.

    Returns:
        ``(y_true, y_pred, accuracy, conf_matrix, report)`` where ``y_true``
        and ``y_pred`` are lists in label-file order, ``accuracy`` is the
        overall accuracy, ``conf_matrix`` is the confusion matrix over all
        ``num_classes`` labels and ``report`` is the text classification
        report.
    """
    with open(label_path, 'r') as f:
        ground_truth = json.load(f)

    # Fix the label set explicitly so classes absent from this folder still
    # get a row/column in the confusion matrix and the report.
    class_ids = list(range(num_classes))

    y_true = []
    y_pred = []

    for file_name, label in ground_truth.items():
        file_path = os.path.join(folder_path, file_name)
        features = get_features_from_csv(file_path)

        # The confidence value is not needed for the aggregate metrics.
        pred, _ = predict_csv(model, features)
        y_true.append(label)
        y_pred.append(pred)

    accuracy = accuracy_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
    report = classification_report(y_true, y_pred, labels=class_ids, zero_division=0)

    return y_true, y_pred, accuracy, conf_matrix, report

# Evaluate the trained model on the dummy dataset and print each result in
# turn: true labels, predictions, accuracy, confusion matrix, report.
folder_path = "data/dummy/dataset"
evaluation_results = evaluate_folder(model, folder_path, "data/dummy/dataset/labels.json")
for item in evaluation_results:
    print(item)

[6, 1, 4, 6, 7]
[7, 1, 4, 7, 7]
0.6
[[0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 2]
 [0 0 0 0 0 0 0 1]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      1.00      1.00         1
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       1.00      1.00      1.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         2
           7       0.33      1.00      0.50         1

   micro avg       0.60      0.60      0.60         5
   macro avg       0.29      0.38      0.31         5
weighted avg       0.47      0.60      0.50         5

