In [None]:
!python3 preprocess_data.py -h

In [None]:
pip install torch

In [None]:
pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 torchaudio==2.0.2+cu117 --index-url https://download.pytorch.org/whl/cu117

In [None]:
pip install scikit-learn

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import time
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from torch.quantization import QuantStub, DeQuantStub

In [2]:
# Load the datasets from the provided files
X_train = np.load('./mesl_data/x_train.npy')
X_test = np.load('./mesl_data/x_test.npy')
y_train = np.load('./mesl_data/y_train.npy')
y_test = np.load('./mesl_data/y_test.npy')


# Verify the shapes of the data
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Check the unique values in y_train and y_test
print("Unique values in y_train:", np.unique(y_train))
print("Unique values in y_test:", np.unique(y_test))

# Ensure all labels are within the correct range
assert np.all((y_train >= 0) & (y_train < 26)), "y_train contains out-of-range values"
assert np.all((y_test >= 0) & (y_test < 26)), "y_test contains out-of-range values"

# Normalize data
def normalize(data):
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    return (data - mean) / std

# Normalize the data
X_train = normalize(X_train)
X_test = normalize(X_test)

# Convert data to float32
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.int64)  # Ensuring labels are in int64
y_test = y_test.astype(np.int64)    # Ensuring labels are in int64

# Reshape data for the model
X_train = X_train.reshape((-1, 1, X_train.shape[1], X_train.shape[2]))
X_test = X_test.reshape((-1, 1, X_test.shape[1], X_test.shape[2]))

print(f"Train data shape after reshape: {X_train.shape}")
print(f"Test data shape after reshape: {X_test.shape}")

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


X_train shape: (4953, 150, 9)
X_test shape: (1320, 150, 9)
y_train shape: (4953,)
y_test shape: (1320,)
Unique values in y_train: [ 0  1  3  5  7  8 11 15 21 23 25]
Unique values in y_test: [ 0  1  3  5  7  8 11 15 21 23 25]
Train data shape after reshape: (4953, 1, 150, 9)
Test data shape after reshape: (1320, 1, 150, 9)
Using device: cuda


In [3]:
# Hardcoded parameters for the new dataset
NB_SENSOR_CHANNELS = 9
NUM_CLASSES = 26  # Updated number of classes
SLIDING_WINDOW_LENGTH = 150
BATCH_SIZE = 16
NUM_FILTERS = 64
FILTER_SIZE = 5
NUM_UNITS_LSTM = 128
LEARNING_RATE = 0.0001
NUM_EPOCHS = 150

In [4]:
'''
class QuantizedLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size, bias=True):
        super(QuantizedLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        self.ih = nn.Linear(input_size, 4 * hidden_size, bias=bias)
        self.hh = nn.Linear(hidden_size, 4 * hidden_size, bias=bias)

        self.quant1 = QuantStub()
        self.dequant1 = DeQuantStub()

        self.quant2 = QuantStub()
        self.dequant2 = DeQuantStub()

    def forward(self, input, hx):
        hx, cx = hx

        input = self.quant1(input)
        hx = self.quant1(hx)
        cx = self.quant2(cx)

        gates = self.ih(input) + self.hh(hx)
        i, f, g, o = gates.chunk(4, 1)

        i = torch.sigmoid(i)
        f = torch.sigmoid(f)
        g = torch.tanh(g)
        o = torch.sigmoid(o)

        cy = f * cx + i * g
        hy = o * torch.tanh(cy)

        hy = self.dequant1(hy)
        cy = self.dequant2(cy)

        return hy, cy

    def _init_hidden(self, batch_size, device):
        weight = next(self.parameters()).data
        return (weight.new(batch_size, self.hidden_size).zero_().to(device),
                weight.new(batch_size, self.hidden_size).zero_().to(device))
    
'''
import torch
import torch.nn as nn
import torch.quantization
'''
class QATDeepConvLSTM(nn.Module):
    def __init__(self):
        super(QATDeepConvLSTM, self).__init__()
        self.quant = torch.quantization.QuantStub()
        self.conv1 = nn.Conv2d(1, NUM_FILTERS, (FILTER_SIZE, 1))
        self.conv2 = nn.Conv2d(NUM_FILTERS, NUM_FILTERS, (FILTER_SIZE, 1))
        self.conv3 = nn.Conv2d(NUM_FILTERS, NUM_FILTERS, (FILTER_SIZE, 1))
        self.conv4 = nn.Conv2d(NUM_FILTERS, NUM_FILTERS, (FILTER_SIZE, 1))
        self.lstm1 = nn.LSTM(NUM_FILTERS * NB_SENSOR_CHANNELS, NUM_UNITS_LSTM, batch_first=True)
        self.lstm2 = nn.LSTM(NUM_UNITS_LSTM, NUM_UNITS_LSTM, batch_first=True)
        self.fc = nn.Linear(NUM_UNITS_LSTM, NUM_CLASSES)
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.relu3 = nn.ReLU()
        self.relu4 = nn.ReLU()


        self.quant1 = torch.quantization.QuantStub()
        self.quant2 = torch.quantization.QuantStub()
        self.quant3 = torch.quantization.QuantStub()
        self.quant4 = torch.quantization.QuantStub()
        self.dequant1 = torch.quantization.DeQuantStub()
        self.dequant2 = torch.quantization.DeQuantStub()
        self.dequant3 = torch.quantization.DeQuantStub()
        self.dequant4 = torch.quantization.DeQuantStub()

        self.quant_lstm1 = torch.quantization.QuantStub()
        self.quant_lstm2 = torch.quantization.QuantStub()
        self.dequant_lstm1 = torch.quantization.DeQuantStub()
        self.dequant_lstm2 = torch.quantization.DeQuantStub()

        self.quant_fc = torch.quantization.QuantStub()
        self.dequant_fc = torch.quantization.DeQuantStub()
        # Weight initialization
        nn.init.kaiming_uniform_(self.conv1.weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.conv2.weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.conv3.weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.conv4.weight, nonlinearity='relu')
        nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, x):
        #x = self.quant1(x)
        x = self.relu1(self.conv1(x))
        #x = self.dequant1(x)
        x = self.quant2(x)
        x = self.relu2(self.conv2(x))
        x = self.dequant2(x)
        x = self.quant3(x)
        x = self.relu3(self.conv3(x))
        x = self.dequant3(x)
        x = self.quant4(x)
        x = self.relu4(self.conv4(x))
        x = self.dequant4(x)
        x = x.permute(0, 2, 1, 3).contiguous().view(x.size(0), x.size(2), -1)
        
        x = self.quant_lstm1(x)
        x, _ = self.lstm1(x)
        x = self.dequant_lstm1(x)

        x = self.quant_lstm2(x)
        x, _ = self.lstm2(x)
        x = self.dequant_lstm2(x)

        x = self.quant_fc(x)
        x = self.fc(x[:, -1, :])
        x = self.dequant_fc(x)

        #x = self.dequant(x)
        return x
'''
# Define the QAT-ready network
class QATDeepConvLSTM(nn.Module):
    def __init__(self):
        super(QATDeepConvLSTM, self).__init__()
        self.quant = torch.quantization.QuantStub()
        self.conv1 = nn.Conv2d(1, NUM_FILTERS, (FILTER_SIZE, 1))
        self.conv2 = nn.Conv2d(NUM_FILTERS, NUM_FILTERS, (FILTER_SIZE, 1))
        self.conv3 = nn.Conv2d(NUM_FILTERS, NUM_FILTERS, (FILTER_SIZE, 1))
        self.conv4 = nn.Conv2d(NUM_FILTERS, NUM_FILTERS, (FILTER_SIZE, 1))
        self.relu = nn.ReLU()
        self.lstm1 = nn.LSTM(NUM_FILTERS * NB_SENSOR_CHANNELS, NUM_UNITS_LSTM, batch_first=True)
        self.lstm2 = nn.LSTM(NUM_UNITS_LSTM, NUM_UNITS_LSTM, batch_first=True)
        self.fc = nn.Linear(NUM_UNITS_LSTM, NUM_CLASSES)
        self.dequant = torch.quantization.DeQuantStub()
        
    def forward(self, x):
        x = self.quant(x)
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        x = self.relu(self.conv4(x))
        x = x.permute(0, 2, 1, 3).contiguous().view(x.size(0), x.size(2), -1)
        
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        
        x = self.fc(x[:, -1, :])
        x = self.dequant(x)
        
        return x


In [5]:
import torch.quantization as quant
from torch.quantization.observer import MovingAverageMinMaxObserver, default_weight_observer
from torch.utils.data import DataLoader, Subset, TensorDataset

class CustomObserver(MovingAverageMinMaxObserver):
    def calculate_qparams(self):
        scale, _ = super().calculate_qparams()
        zero_point = torch.tensor(0, dtype=torch.int32)
        return scale, zero_point
# Create datasets and dataloaders
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test), torch.tensor(y_test))
train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Initialize the QAT-ready model, loss function, and optimizer
model = QATDeepConvLSTM().to(device)

'''
model.qconfig = quant.QConfig(
    activation=quant.FakeQuantize.with_args(observer=CustomObserver, quant_min=0, quant_max=255, dtype=torch.quint8, qscheme=torch.per_tensor_affine),
    weight=quant.FakeQuantize.with_args(observer=default_weight_observer, quant_min=-128, quant_max=127, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
)
'''
torch.backends.quantized.engine = 'x86'
#model = torch.quantization.QuantWrapper(model)

model.qconfig = torch.quantization.get_default_qat_qconfig('x86')

torch.quantization.prepare_qat(model, inplace=True)
print("Model initialized and prepared for QAT")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Calibrate the model with representative data
calibration_size = int(0.1 * len(train_data))
calibration_indices = np.random.choice(len(train_data), calibration_size, replace=False)
calibration_subset = Subset(train_data, calibration_indices)
calibration_loader = DataLoader(calibration_subset, batch_size=BATCH_SIZE, shuffle=False)


Model initialized and prepared for QAT




In [6]:
device = torch.device("cuda")
model.to(device)
# Training loop with QAT
for epoch in range(1):
    model.train()
    train_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        train_loss += loss.item()

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Loss: {train_loss / len(train_loader)}")




  return torch.fused_moving_avg_obs_fake_quant(
  return torch.fused_moving_avg_obs_fake_quant(


Epoch 1/150, Loss: 2.6766617205835157


In [7]:
model

QATDeepConvLSTM(
  (quant): QuantStub(
    (activation_post_process): FusedMovingAvgObsFakeQuantize(
      fake_quant_enabled=tensor([1], device='cuda:0'), observer_enabled=tensor([1], device='cuda:0'), scale=tensor([0.1443], device='cuda:0'), zero_point=tensor([57], device='cuda:0', dtype=torch.int32), dtype=torch.quint8, quant_min=0, quant_max=127, qscheme=torch.per_tensor_affine, reduce_range=True
      (activation_post_process): MovingAverageMinMaxObserver(min_val=-8.175763130187988, max_val=10.149259567260742)
    )
  )
  (conv1): Conv2d(
    1, 64, kernel_size=(5, 1), stride=(1, 1)
    (weight_fake_quant): FusedMovingAvgObsFakeQuantize(
      fake_quant_enabled=tensor([1], device='cuda:0'), observer_enabled=tensor([1], device='cuda:0'), scale=tensor([0.0032, 0.0017, 0.0026, 0.0029, 0.0034, 0.0034, 0.0031, 0.0034, 0.0030,
              0.0034, 0.0031, 0.0032, 0.0035, 0.0031, 0.0027, 0.0027, 0.0024, 0.0017,
              0.0032, 0.0035, 0.0028, 0.0017, 0.0031, 0.0011, 0.0033, 0.002

In [8]:

# Convert the model to a quantized version
model.to('cpu')
model.eval()
model_int8 = torch.quantization.convert(model, inplace=False)
print("Model converted to quantized version")

# Save the quantized model parameters
torch.save(model_int8.state_dict(), 'weights/QATDeepConvLSTM_trained_mesl_data.pth')



Model converted to quantized version


In [9]:
def fold_batch_norm(conv, bn):
    # Fold BatchNorm parameters into Convolution layer
    with torch.no_grad():
        scale_factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
        conv.weight.copy_(conv.weight * scale_factor.reshape([-1, 1, 1, 1]))
        if conv.bias is None:
            conv.bias = torch.nn.Parameter(torch.zeros(conv.weight.size(0), dtype=conv.weight.dtype, device=conv.weight.device))
        conv.bias.copy_((conv.bias - bn.running_mean) * scale_factor + bn.bias)
    return conv

# Fold BatchNorm layers
model.conv1 = fold_batch_norm(model.conv1, model.bn1)
model.conv2 = fold_batch_norm(model.conv2, model.bn2)
model.conv3 = fold_batch_norm(model.conv3, model.bn2)
model.conv4 = fold_batch_norm(model.conv4, model.bn2)

# Remove BatchNorm layers
model.bn1 = None
model.bn2 = None
model.bn3 = None
model.bn4 = None


AttributeError: 'QATDeepConvLSTM' object has no attribute 'bn1'

In [10]:
import torch
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
torch.backends.quantized.engine = 'x86'
# Ensure the model is in evaluation mode
model_int8.eval()
model_int8.to('cpu')
test_pred = []
test_true = []

batch_times = []
start_time = time.time()
with torch.no_grad():
    for inputs, targets in test_loader:
        batch_start_time = time.time()
        
        # Move inputs and targets to CPU and quantize the inputs
        inputs = inputs.to('cpu')
        #inputs_quantized = torch.quantize_per_tensor(inputs, scale=0.1, zero_point=0, dtype=torch.qint8)
        #inputs_quantized.to('cpu')
        targets = targets.to('cpu')
        
        # Perform inference
        outputs = model_int8(inputs)
        
        # Dequantize the outputs for further processing
        outputs_dequantized = outputs.dequantize()
        
        _, preds = torch.max(outputs_dequantized, 1)
        batch_end_time = time.time()
        
        batch_times.append(batch_end_time - batch_start_time)
        
        test_pred.extend(preds.numpy())
        test_true.extend(targets.numpy())

end_time = time.time()

# Calculate metrics
accuracy = accuracy_score(test_true, test_pred)
macro_precision = precision_score(test_true, test_pred, average='macro')
macro_recall = recall_score(test_true, test_pred, average='macro')
macro_f1 = f1_score(test_true, test_pred, average='macro')

# Results presentation
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall: {macro_recall:.4f}")
print(f"Macro F1-score: {macro_f1:.4f}")

# Performance time
total_inference_time = end_time - start_time
average_batch_time = sum(batch_times) / len(batch_times)
average_sample_time = total_inference_time / len(test_loader.dataset)

print(f"Total inference time on CPU: {total_inference_time:.4f} seconds")
print(f"Average inference time per batch: {average_batch_time:.6f} seconds")
print(f"Average inference time per sample: {average_sample_time:.6f} seconds")


Test Accuracy: 0.6220
Macro Precision: 0.6498
Macro Recall: 0.6220
Macro F1-score: 0.6030
Total inference time on CPU: 2.1234 seconds
Average inference time per batch: 0.025475 seconds
Average inference time per sample: 0.001609 seconds


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Verify PyTorch version
print(f"PyTorch version: {torch.__version__}")

# Set the quantized engine to 'qnnpack'
torch.backends.quantized.engine = 'qnnpack'

# Verify supported quantization engines
print(f"Supported quantized engines: {torch.backends.quantized.supported_engines}")
