# Classic ML model applied to BoW sequence representation


In [2]:
import pandas as pd
import numpy as np

In [37]:
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Read processed data

In [12]:
dev_df = pd.read_csv('/content/drive/MyDrive/instaDeep/data/dev_filtered.csv')
test_df = pd.read_csv('/content/drive/MyDrive/instaDeep/data/test_filtered.csv')
train_df = pd.read_csv('/content/drive/MyDrive/instaDeep/data/train_filtered.csv')



---



---



### BoW as a Data Encoding

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Tokenizes the sequences into individual characters
cv = CountVectorizer(lowercase=False, analyzer='char')

In [15]:
# Learn the vocabulary and set indices for amino acids that are used to generate vector encodings
cv.fit(train_df['sequence'])

In [16]:
cv.vocabulary_

{'F': 5,
 'V': 19,
 'S': 16,
 'G': 6,
 'A': 0,
 'T': 17,
 'Q': 14,
 'I': 8,
 'N': 12,
 'E': 4,
 'K': 9,
 'L': 10,
 'D': 3,
 'M': 11,
 'R': 15,
 'H': 7,
 'Y': 22,
 'C': 2,
 'P': 13,
 'W': 20,
 'X': 21,
 'U': 18,
 'B': 1}

In [17]:
size_vocabulary = len(cv.get_feature_names_out())
print(size_vocabulary)

23


In [18]:
cv.get_feature_names_out()

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
       'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'], dtype=object)

In [19]:
# Encode amino acids using BoW technique
X_train = cv.transform(train_df['sequence'])
X_dev = cv.transform(dev_df['sequence'])
X_test = cv.transform(test_df['sequence'])

#### Example

In [20]:
X_train.shape

(360946, 23)

In [21]:
X_train[0]

<1x23 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [22]:
feat_vect = np.array(X_train[0].todense())[0]
feat_vect.shape

(23,)

In [23]:
feat_vect

# Feature vector corresponding to the 1st training sample

array([15,  0,  2,  9, 12,  5, 10,  2, 13,  7, 21,  1, 14,  1,  8,  2, 10,
       13,  0, 17,  1,  0,  1])

In [24]:
train_df.iloc[0]['sequence']

'FVVSGSATQITANQIEKLEEADEFENLLVINLDMKTVLAGVSDEIVNRIVNNLGQNNIVVVHTSTLVRDFDGFSEDSLNAELTKANLANVITDFLAELTQKVVAQKELILITLGGETSYKCCSAIGATQLQLIDEVAPAIALSLDHNAQWIVTKSGNLGGVNTL'

In [25]:
[amino_acid for amino_acid in train_df.iloc[0]['sequence'] if amino_acid == 'F']

['F', 'F', 'F', 'F', 'F']

In [26]:
[amino_acid for amino_acid in train_df.iloc[0]['sequence'] if amino_acid == 'V']

num_ocurrs_v = len([amino_acid for amino_acid in train_df.iloc[0]['sequence'] if amino_acid == 'V'])

print(f' At index {cv.vocabulary_["V"]}: the number of occurrences "V" in input sequence is: {num_ocurrs_v}')

 At index 19: the number of occurrences "V" in input sequence is: 17


In [28]:
# Get target labels 
train_labels = train_df["true_label_encoded"].values
test_labels = test_df["true_label_encoded"].values
dev_labels = dev_df["true_label_encoded"].values

In [None]:
num_classes = len(train_labels.unique())

### Define the DataLoader

In [39]:
"""
    Define a custom dataset class that will be used to load the data
"""

class ProteinDataset(Dataset):
    def __init__(self, encoded_sequences, labels):
        self.sequences = torch.tensor(encoded_sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.int64)
    
    def __getitem__(self, index):
        x = self.sequences[index]
        y = self.labels[index]
        return x, y

    def __len__(self):
        return len(self.sequences)

In [40]:
train_ds = ProteinDataset(X_train.todense(), train_labels)

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=64,
    shuffle=True
)

In [41]:
dev_ds = ProteinDataset(X_dev.todense(), dev_labels)

dev_loader = DataLoader(
    dataset=dev_ds,
    batch_size=64,
    shuffle=False,
)

In [42]:
test_ds = ProteinDataset(X_test.todense(), test_labels)

test_loader = DataLoader(
    dataset=test_ds,
    batch_size=64,
    shuffle=False,
)

## Softmax Regression Classifier

In [38]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [43]:
class SoftmaxRegression(torch.nn.Module):
    """ Naive Softmax regression module """
    
    def __init__(self, input_dim, output_dim):
        """
        Args:
            input_dim  (int): input vector size.
            output_dim (int): output vector size (number of classes).
        """
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        # pass x through a parameterized linear transformation
        y = self.linear(x)

        # pass the result through softmax over the last dimension to generate
        # a probability distribution vector over the classes:
        y = torch.nn.functional.softmax(y, dim=-1)
        return y

In [45]:
# Creating the model and moving it to the GPU
model = SoftmaxRegression(size_vocabulary, num_classes)
model.to(device)

In [172]:
## Test the DataLoader
# for batch_ind, (inputs, labels) in enumerate(train_loader):
#     break

In [46]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [47]:
def train(model, dataloader, loss_fn, optimizer):
    """
    Args:
        model  (torch.nn.Module): model.
        dataloader (torch.utils.data.DataLoader): data loader object to use for training.
    Returns:
        loss_total (float): loss value.
        acc_total  (float): accuracy.
    """
    # num_sample: number of samples explored
    num_sample = 0.

    # loss_total, acc_total
    # variables to collect overall loss and accuracy
    loss_total = 0.
    acc_total = 0.
    
    model.train()

    for inputs, labels in dataloader:
        num_sample += float(inputs.size(0))

        # .zero_grad() to clear the gradients
        optimizer.zero_grad()

        # computing the model predictions
        preds = model(inputs.to(device)) 

        # computing the loss value for the mini-batch
        loss = loss_fn(preds, labels.to(device))

        # computing the gradient w.r.t. to parameters
        loss.backward()

        # update learnable parameters 
        optimizer.step()

        # cummulating loss in loss_total
        loss_total += float(loss.detach().item())

        # cummulating number of correct classifications in acc_total
        acc_total += float(
            (torch.argmax(preds.detach(), dim=-1) == torch.argmax(labels.to(device), dim=-1)).sum()
        )

    # dividing by total number of visited samples
    loss_total = loss_total / num_sample
    acc_total = acc_total / num_sample

    return loss_total, acc_total

In [48]:
def evaluate(model, dataloader, loss_fn, optimizer):
    """
    Args:
        model  (torch.nn.Module): model.
        dataloader (torch.utils.data.DataLoader): 
            data loader object to use for training.
    Returns:
        loss_total (float): loss value.
        acc_total  (float): accuracy.
    """
    num_sample = 0.

    loss_total = 0.
    acc_total = 0.

    model.eval()

    for inputs, labels in dataloader:
        num_sample += float(inputs.size(0))

        preds = model(inputs.to(device)).detach()
        loss_total += float(loss_fn(preds, labels.to(device)).detach().item())
        acc_total += float(
            (torch.argmax(preds, dim=-1) == torch.argmax(labels.to(device), dim=-1)).sum()
        )

    loss_total = loss_total / num_sample
    acc_total = acc_total / num_sample

    return loss_total, acc_total

In [52]:
max_epochs = 10

for epoch in range(max_epochs):
    train_loss_total, train_acc_total = train(model, train_loader, loss_fn, optimizer)
    val_loss_total, val_acc_total = evaluate(model, dev_loader, loss_fn, optimizer)

    print(f'[EPOCH:{epoch+1:3d}/{max_epochs}]',
        f'train.loss: {train_loss_total:.4f}',
        f'train.acc: {100*train_acc_total:3.2f}%',
        f'val.loss: {val_loss_total:.4f}',
        f'val.acc: {100*val_acc_total:3.2f}%')

[EPOCH:  1/10] train.loss: 0.2097 train.acc: 0.15% val.loss: 0.2098 val.acc: 0.20%
[EPOCH:  2/10] train.loss: 0.2097 train.acc: 0.13% val.loss: 0.2098 val.acc: 0.15%
[EPOCH:  3/10] train.loss: 0.2097 train.acc: 0.14% val.loss: 0.2098 val.acc: 0.15%
[EPOCH:  4/10] train.loss: 0.2097 train.acc: 0.13% val.loss: 0.2098 val.acc: 0.19%
[EPOCH:  5/10] train.loss: 0.2097 train.acc: 0.13% val.loss: 0.2098 val.acc: 0.17%
[EPOCH:  6/10] train.loss: 0.2097 train.acc: 0.14% val.loss: 0.2098 val.acc: 0.15%
[EPOCH:  7/10] train.loss: 0.2097 train.acc: 0.13% val.loss: 0.2098 val.acc: 0.15%
[EPOCH:  8/10] train.loss: 0.2097 train.acc: 0.13% val.loss: 0.2097 val.acc: 0.17%
[EPOCH:  9/10] train.loss: 0.2097 train.acc: 0.12% val.loss: 0.2097 val.acc: 0.16%
[EPOCH: 10/10] train.loss: 0.2097 train.acc: 0.13% val.loss: 0.2097 val.acc: 0.15%


In [53]:
test_loss_total, test_acc_total = evaluate(model, test_loader, loss_fn, optimizer)

print(f'[Test set performance]',
        f'test.loss: {test_loss_total:.4f}',
        f'test.acc: {100*test_acc_total:3.2f}%')

[Test set performance] test.loss: 0.2097 test.acc: 0.14%


**Observation**: The loss on the training and validation datasets stays the same and the accuracy on the training dataset does not increase and on the validation dataset fluctuates. 

Next steps to do:
- Try more epochs for training
- Add regularization (weight decay)
- Increase complexity of the model (more FC layers)

In [54]:
# Creating the model and moving it to the GPU (if available):
model = SoftmaxRegression(size_vocabulary, num_classes)
model.to(device)

SoftmaxRegression(
  (linear): Linear(in_features=23, out_features=896, bias=True)
)

In [57]:
optimizer = torch.optim.Adam(model.parameters(), 
                             weight_decay=0.0001
                             )

In [58]:
max_epochs = 30

for epoch in range(max_epochs):
    train_loss_total, train_acc_total = train(model, train_loader, loss_fn, optimizer)
    val_loss_total, val_acc_total = evaluate(model, dev_loader, loss_fn, optimizer)

    print(f'[EPOCH:{epoch+1:3d}/{max_epochs}]',
        f'train.loss: {train_loss_total:.4f}',
        f'train.acc: {100*train_acc_total:3.2f}%',
        f'val.loss: {val_loss_total:.4f}',
        f'val.acc: {100*val_acc_total:3.2f}%')

[EPOCH:  1/30] train.loss: 0.2114 train.acc: 0.09% val.loss: 0.2113 val.acc: 0.29%
[EPOCH:  2/30] train.loss: 0.2112 train.acc: 0.09% val.loss: 0.2113 val.acc: 0.13%
[EPOCH:  3/30] train.loss: 0.2112 train.acc: 0.13% val.loss: 0.2113 val.acc: 0.12%
[EPOCH:  4/30] train.loss: 0.2112 train.acc: 0.12% val.loss: 0.2113 val.acc: 0.27%
[EPOCH:  5/30] train.loss: 0.2112 train.acc: 0.13% val.loss: 0.2113 val.acc: 0.15%
[EPOCH:  6/30] train.loss: 0.2112 train.acc: 0.11% val.loss: 0.2113 val.acc: 0.18%
[EPOCH:  7/30] train.loss: 0.2112 train.acc: 0.11% val.loss: 0.2113 val.acc: 0.12%
[EPOCH:  8/30] train.loss: 0.2112 train.acc: 0.12% val.loss: 0.2113 val.acc: 0.13%
[EPOCH:  9/30] train.loss: 0.2112 train.acc: 0.13% val.loss: 0.2113 val.acc: 0.32%
[EPOCH: 10/30] train.loss: 0.2112 train.acc: 0.12% val.loss: 0.2113 val.acc: 0.22%
[EPOCH: 11/30] train.loss: 0.2112 train.acc: 0.13% val.loss: 0.2113 val.acc: 0.18%
[EPOCH: 12/30] train.loss: 0.2112 train.acc: 0.11% val.loss: 0.2113 val.acc: 0.20%
[EPO

In [59]:
test_loss_total, test_acc_total = evaluate(model, test_loader, loss_fn, optimizer)

print(f'[Test set performance]',
        f'test.loss: {test_loss_total:.4f}',
        f'test.acc: {100*test_acc_total:3.2f}%')

[Test set performance] test.loss: 0.2112 test.acc: 0.11%


**Observations**: Adding L2 regularization does not prevent the val.acc from fluctuating / decreasing. Training for more epochs did not help to overcome issues with low performance on the training dataset. 

The model suffers from underfitting. 
Potential improvements:
- add more FC layers prior to softmax
- have better hand-crafted feature representations of the protein sequence 
- switch to sequence models (to learn features without applying domain knowledge)

In [65]:
class FCSoftmaxRegression(torch.nn.Module):
    """ Softmax regression module with two hidden layers (dims hardcoded) """

    def __init__(self, input_dim, output_dim):
        """
        Args:
            input_dim  (int): input vector size.
            output_dim (int): output vector size (number of classes).
        """
        super().__init__()
        # self.linear = torch.nn.Linear(input_dim, output_dim)

        self.fc1 = torch.nn.Linear(input_dim, 128)
        self.fc2 = torch.nn.Linear(128, 256)
        self.fc3 = torch.nn.Linear(256, output_dim)

    def forward(self, x):
        # pass x through a parameterized linear transformation
        # y = self.linear(x)

        x = self.fc1(x)
        x = torch.nn.functional.relu(x)

        # pass through second hidden layer
        x = self.fc2(x)
        x = torch.nn.functional.relu(x)

        # pass through the third (output) layer
        x = self.fc3(x)


        # pass the result through softmax over the last dimension to generate
        # a probability distribution vector over the classes:
        y = torch.nn.functional.softmax(x, dim=-1)
        return y

In [70]:
# Creating the model and moving it to the GPU (if available):
model = FCSoftmaxRegression(size_vocabulary, num_classes)
model.to(device)

FCSoftmaxRegression(
  (fc1): Linear(in_features=23, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=896, bias=True)
)

In [71]:
optimizer = torch.optim.Adam(model.parameters(), 
                             weight_decay=0
                             )

In [72]:
max_epochs = 15

for epoch in range(max_epochs):
    train_loss_total, train_acc_total = train(model, train_loader, loss_fn, optimizer)
    val_loss_total, val_acc_total = evaluate(model, dev_loader, loss_fn, optimizer)

    print(f'[EPOCH:{epoch+1:3d}/{max_epochs}]',
        f'train.loss: {train_loss_total:.4f}',
        f'train.acc: {100*train_acc_total:3.2f}%',
        f'val.loss: {val_loss_total:.4f}',
        f'val.acc: {100*val_acc_total:3.2f}%')

[EPOCH:  1/15] train.loss: 0.2111 train.acc: 0.10% val.loss: 0.2109 val.acc: 0.06%
[EPOCH:  2/15] train.loss: 0.2109 train.acc: 0.23% val.loss: 0.2109 val.acc: 0.43%
[EPOCH:  3/15] train.loss: 0.2108 train.acc: 0.22% val.loss: 0.2109 val.acc: 0.09%
[EPOCH:  4/15] train.loss: 0.2108 train.acc: 0.18% val.loss: 0.2108 val.acc: 0.35%
[EPOCH:  5/15] train.loss: 0.2108 train.acc: 0.20% val.loss: 0.2109 val.acc: 0.19%
[EPOCH:  6/15] train.loss: 0.2108 train.acc: 0.22% val.loss: 0.2109 val.acc: 0.28%
[EPOCH:  7/15] train.loss: 0.2108 train.acc: 0.16% val.loss: 0.2108 val.acc: 0.45%
[EPOCH:  8/15] train.loss: 0.2109 train.acc: 0.14% val.loss: 0.2109 val.acc: 0.17%
[EPOCH:  9/15] train.loss: 0.2108 train.acc: 0.22% val.loss: 0.2109 val.acc: 0.33%
[EPOCH: 10/15] train.loss: 0.2109 train.acc: 0.34% val.loss: 0.2108 val.acc: 0.30%
[EPOCH: 11/15] train.loss: 0.2108 train.acc: 0.20% val.loss: 0.2109 val.acc: 0.09%
[EPOCH: 12/15] train.loss: 0.2108 train.acc: 0.15% val.loss: 0.2110 val.acc: 0.44%
[EPO

In [73]:
test_loss_total, test_acc_total = evaluate(model, test_loader, loss_fn, optimizer)

print(f'[Test set performance]',
        f'test.loss: {test_loss_total:.4f}',
        f'test.acc: {100*test_acc_total:3.2f}%')

[Test set performance] test.loss: 0.2109 test.acc: 0.64%


**Observation**: Increasing the model complexity has helped with underfitting problem. Hence, to further improve the performance, more complex models can be used over BoW encoding or other known encoding techniques.

Potential improvements:
- Use another technique for encoding the protein sequence, e.g. 2-gram BoW, 3-gram BoW, (combination of both)
- Use deeper NN (potentially add regularization: L2, dropout)
- Since the training is not stable, early stopping might be helpful to get the best-performing model
- Hyperparameter tuning, e.g. the default learning rate used (no tuning).

In [76]:
lr = optimizer.param_groups[0]['lr']
lr

0.001