# Note:
- This notebook file may contain methods or algorithms that are NOT covered by the teaching content of BT4222 and hence will not be assessed in your midterm exam.
- It serves to increase your exposure, in both depth and breadth, to practical methods for addressing the specific project topic. We believe it will be helpful for your current project and also for your future internship endeavors.

# **Import Library**

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

# **Define Network Structure**
 The network structure is adapted from
"Hierarchical Attentional Hybrid Neural Networks for Document Classification",
Figure 1. For a fair comparison, we encode the whole document as a vector of length 50 and add 3 CNN layers to extract information from it.
These layers are then followed by one block as described in the paper.
```
conv1.weight     torch.Size([32, 1, 3])
conv1.bias       torch.Size([32])
Bn1.weight       torch.Size([32])
Bn1.bias         torch.Size([32])
Bn1.running_mean         torch.Size([32])
Bn1.running_var          torch.Size([32])
Bn1.num_batches_tracked          torch.Size([])
conv2.weight     torch.Size([32, 1, 3])
conv2.bias       torch.Size([32])
Bn2.weight       torch.Size([32])
Bn2.bias         torch.Size([32])
Bn2.running_mean         torch.Size([32])
Bn2.running_var          torch.Size([32])
Bn2.num_batches_tracked          torch.Size([])
conv3.weight     torch.Size([32, 1, 3])
conv3.bias       torch.Size([32])
Bn3.weight       torch.Size([32])
Bn3.bias         torch.Size([32])
Bn3.running_mean         torch.Size([32])
Bn3.running_var          torch.Size([32])
Bn3.num_batches_tracked          torch.Size([])
bi_lstm1.weight_ih_l0    torch.Size([400, 960])
bi_lstm1.weight_hh_l0    torch.Size([400, 100])
bi_lstm1.bias_ih_l0      torch.Size([400])
bi_lstm1.bias_hh_l0      torch.Size([400])
fc1.weight       torch.Size([100, 100])
fc1.bias         torch.Size([100])
self_attn_1.in_proj_weight       torch.Size([300, 100])
self_attn_1.in_proj_bias         torch.Size([300])
self_attn_1.out_proj.weight      torch.Size([100, 100])
self_attn_1.out_proj.bias        torch.Size([100])
fc2.weight       torch.Size([5, 100])
fc2.bias         torch.Size([5])
```
self.self_attn_1: This is a multi-head self-attention mechanism that is a core component of the Transformer model. The nn.MultiheadAttention module expects input of size (seq_len, batch, embed_dim), where seq_len is the sequence length, batch is the batch size, and embed_dim is the embedding dimension (must be divisible by the number of attention heads). It applies multiple "heads" of attention to the input, and then concatenates the output of these heads and linearly transforms it. This allows the network to focus on different parts of the input for each head.


In [None]:
class Net(nn.Module):
    """CNN + LSTM + self-attention classifier over a single-channel vector.

    Three parallel Conv1d -> BatchNorm -> ReLU -> MaxPool branches read the
    same input. Their outputs are concatenated along the channel axis,
    flattened to 960 features and passed through an LSTM, a fully connected
    layer, multi-head self-attention and a final 5-way linear classifier.

    The LSTM's ``input_size=960`` pins the expected input shape: 3 branches
    * 32 channels * 10 pooled positions, i.e. an input of shape
    ``(batch, 1, 50)``.
    """

    def __init__(self):
        super().__init__()
        # Branch 1: 1 -> 32 channels, kernel 3, stride 1, padding 1 (length
        # preserving), then batch norm and a 5x down-sampling max pool.
        self.conv1 = nn.Conv1d(1, 32, 3, 1, 1, bias=True)
        self.Bn1 = nn.BatchNorm1d(32)
        self.pool1 = nn.MaxPool1d(kernel_size=5, stride=5)

        # Branch 2: same layout as branch 1, separate weights.
        self.conv2 = nn.Conv1d(1, 32, 3, 1, 1, bias=True)
        self.Bn2 = nn.BatchNorm1d(32)
        self.pool2 = nn.MaxPool1d(kernel_size=5, stride=5)

        # Branch 3: same layout as branch 1, separate weights.
        self.conv3 = nn.Conv1d(1, 32, 3, 1, 1, bias=True)
        self.Bn3 = nn.BatchNorm1d(32)
        self.pool3 = nn.MaxPool1d(kernel_size=5, stride=5)

        # Unidirectional LSTM (despite the attribute name, which is kept so
        # saved state dicts keep loading): 960 input features, 100 hidden.
        self.bi_lstm1 = nn.LSTM(input_size=960, hidden_size=100, num_layers=1, batch_first=True, bidirectional=False)
        # Fully connected layer applied to the LSTM output.
        self.fc1 = nn.Linear(100, 100, bias=True)
        # Multi-head self-attention; expects (seq_len, batch, embed_dim).
        self.self_attn_1 = nn.MultiheadAttention(embed_dim=100, num_heads=4)
        # Final classification layer producing 5 class scores.
        self.fc2 = nn.Linear(100, 5, bias=True)

    def forward(self, x):
        """Return unnormalised class scores of shape ``(batch, 5)``.

        Args:
            x: Float tensor of shape ``(batch, 1, length)`` where ``length``
                pools down to 10 positions (e.g. 50), so that the flattened
                feature size matches the LSTM's 960 inputs.
        """
        # Each branch uses its own pooling layer (all pools are identical
        # and parameter-free, so this matches the layer definitions above).
        branch1 = self.pool1(F.relu(self.Bn1(self.conv1(x))))
        branch2 = self.pool2(F.relu(self.Bn2(self.conv2(x))))
        branch3 = self.pool3(F.relu(self.Bn3(self.conv3(x))))

        # Concatenate along the channel axis, then flatten to (batch, 960).
        features = torch.flatten(torch.cat((branch1, branch2, branch3), 1), 1)

        # nn.LSTM treats a 2-D tensor as ONE unbatched sequence, which would
        # carry hidden state from one sample of the batch to the next. Add an
        # explicit sequence axis of length 1 so every sample is processed
        # independently: (batch, 1, 960) with batch_first=True.
        seq = features.unsqueeze(1)
        seq, _ = self.bi_lstm1(seq)
        seq = F.relu(self.fc1(seq))

        # MultiheadAttention wants (seq_len, batch, embed_dim) -> (1, batch, 100).
        attn_in = seq.permute(1, 0, 2)
        attn_out, _ = self.self_attn_1(attn_in, attn_in, attn_in)

        # Back to (batch, 100) for the classifier.
        out = attn_out.permute(1, 0, 2).reshape(-1, 100)
        return self.fc2(out)



# **Training and Testing**

In [None]:
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch over ``train_loader``.

    Args:
        args: Object exposing ``log_interval`` (log every N-th batch).
        model: Module being trained; switched to training mode here.
        device: Device the batches are moved to before the forward pass.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Epoch number, used only in the progress message.
    """
    model.train()

    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        # Shift labels down by one (1-5 becomes 0-4) and cast to long, the
        # dtype the cross-entropy loss requires for class indices.
        labels = (labels.to(device) - 1).long()

        # Standard step: reset grads, forward, loss, backward, update.
        optimizer.zero_grad()
        loss = F.cross_entropy(model(inputs), labels)
        loss.backward()
        optimizer.step()

        if step % args.log_interval != 0:
            continue

        seen = step * len(inputs)
        total = len(train_loader.dataset)
        percent = 100. * step / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, seen, total, percent, loss.item()))



def test(model, device, test_loader):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0
    correct = 0

    with torch.no_grad():  # Deactivates autograd, reduces memory usage and speeds up computations
        for data, target in test_loader:  # Loop over each batch from the testing set
            data, target = data.to(device), target.to(device)  # Move the data to the device that is used
            target = target-1  # Adjust the target values
            output = model(data)  # Run forward pass (model predictions)
            pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max log-probability as the predicted output
            correct += pred.eq(target.view_as(pred)).sum().item()  # Count correct predictions

    test_loss /= len(test_loader.dataset)  # Calculate the average loss

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, correct, len(test_loader.dataset),100. * correct / len(test_loader.dataset)))
    return correct  # Return the number of correctly classified samples


# **Hyperparameter**

We use only the CPU here as an example. The learning rate is set to 1.

In [None]:
class Args:
    """Plain container holding the run's hyperparameters and settings."""

    # Seed handed to torch.manual_seed for reproducibility.
    seed: int = 1

    # Device flags: use_cuda picks "cuda" vs "cpu" when the device is built;
    # no_cuda is not read anywhere in the visible code.
    use_cuda: bool = False
    no_cuda: bool = False

    # Optimisation schedule: number of epochs, initial learning rate, and the
    # multiplicative decay factor passed to the StepLR scheduler.
    epochs: int = 10
    lr: float = 1.0
    gamma: float = 0.7

    # Print a training progress line every `log_interval` batches.
    log_interval: int = 10


args = Args()

# **Load Data**

In [None]:
from google.colab import drive

import gdown

# Google Drive file IDs keyed by the local filename each download is saved as.
# Insertion order is the download order.
_DRIVE_FILES = {
    'train_vectors.pt': '1CCIfElCaURQbuYvHZiL445UQIRzmmuM7',
    'train_labels.pt': '1bwkg7XdmH6Mkp_tkAakCbxJMWNAXJU43',
    'test_vectors.pt': '1fprUkqC9Qb-y1eDRZt0gA4-4gS941TUo',
    'test_labels.pt': '1VwOqpW7DZPhqAGDrreVwhtzCB2lUc_LD',
}

# Fetch every file into the current working directory, showing progress.
for output, file_id in _DRIVE_FILES.items():
    url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(url, output, quiet=False)

# NOTE: torch.load unpickles the file contents; only load files you trust.
train_vectors = torch.load('train_vectors.pt')
train_labels = torch.load('train_labels.pt')
test_vectors = torch.load('test_vectors.pt')
test_labels = torch.load('test_labels.pt')





Downloading...
From: https://drive.google.com/uc?id=1CCIfElCaURQbuYvHZiL445UQIRzmmuM7
To: /content/train_vectors.pt
100%|██████████| 80.0M/80.0M [00:03<00:00, 25.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1bwkg7XdmH6Mkp_tkAakCbxJMWNAXJU43
To: /content/train_labels.pt
100%|██████████| 3.20M/3.20M [00:00<00:00, 16.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fprUkqC9Qb-y1eDRZt0gA4-4gS941TUo
To: /content/test_vectors.pt
100%|██████████| 20.0M/20.0M [00:00<00:00, 45.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1VwOqpW7DZPhqAGDrreVwhtzCB2lUc_LD
To: /content/test_labels.pt
100%|██████████| 801k/801k [00:00<00:00, 5.89MB/s]


# **Start training and testing**

In [None]:
# Seed first so model initialisation and shuffling are reproducible.
torch.manual_seed(args.seed)

device = torch.device("cuda" if args.use_cuda else "cpu")
model = Net().to(device)

# Show every entry of the model's state dict together with its shape.
for param_tensor, tensor in model.state_dict().items():
    print(param_tensor, "\t", tensor.size())

# Form the training and testing datasets and their batch loaders.
train_dataset = TensorDataset(train_vectors, train_labels)
test_dataset = TensorDataset(test_vectors, test_labels)
train_loader = DataLoader(train_dataset, batch_size=640, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=640, shuffle=False)

# Optimizer plus a scheduler that multiplies the learning rate by gamma
# after every epoch.
optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

# Model training: keep the checkpoint of the best (or tied-best) epoch.
# ACC holds the highest count of correctly classified test samples so far.
ACC = 0
for epoch in range(1, args.epochs + 1):
    train(args, model, device, train_loader, optimizer, epoch)
    ACC_ = test(model, device, test_loader)
    if ACC_ >= ACC:
        ACC = ACC_
        torch.save(model.state_dict(), "cnn_lstm_att.pt")

    scheduler.step()

print(ACC)


conv1.weight 	 torch.Size([32, 1, 3])
conv1.bias 	 torch.Size([32])
Bn1.weight 	 torch.Size([32])
Bn1.bias 	 torch.Size([32])
Bn1.running_mean 	 torch.Size([32])
Bn1.running_var 	 torch.Size([32])
Bn1.num_batches_tracked 	 torch.Size([])
conv2.weight 	 torch.Size([32, 1, 3])
conv2.bias 	 torch.Size([32])
Bn2.weight 	 torch.Size([32])
Bn2.bias 	 torch.Size([32])
Bn2.running_mean 	 torch.Size([32])
Bn2.running_var 	 torch.Size([32])
Bn2.num_batches_tracked 	 torch.Size([])
conv3.weight 	 torch.Size([32, 1, 3])
conv3.bias 	 torch.Size([32])
Bn3.weight 	 torch.Size([32])
Bn3.bias 	 torch.Size([32])
Bn3.running_mean 	 torch.Size([32])
Bn3.running_var 	 torch.Size([32])
Bn3.num_batches_tracked 	 torch.Size([])
bi_lstm1.weight_ih_l0 	 torch.Size([400, 960])
bi_lstm1.weight_hh_l0 	 torch.Size([400, 100])
bi_lstm1.bias_ih_l0 	 torch.Size([400])
bi_lstm1.bias_hh_l0 	 torch.Size([400])
fc1.weight 	 torch.Size([100, 100])
fc1.bias 	 torch.Size([100])
self_attn_1.in_proj_weight 	 torch.Size([300, 1