# Classic ML model applied to BoW sequence representation


In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Read processed data

In [4]:
dev_df = pd.read_csv('/content/drive/MyDrive/instaDeep/data/dev_filtered.csv')
test_df = pd.read_csv('/content/drive/MyDrive/instaDeep/data/test_filtered.csv')
train_df = pd.read_csv('/content/drive/MyDrive/instaDeep/data/train_filtered.csv')



---



---



### BoW as a Data Encoding

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Tokenizes the sequences into individual characters
cv = CountVectorizer(lowercase=False, analyzer='char')

In [7]:
# Learn the vocabulary and set indices for amino acids that are used to generate vector encodings
cv.fit(train_df['sequence'])

In [8]:
cv.vocabulary_

{'F': 5,
 'V': 19,
 'S': 16,
 'G': 6,
 'A': 0,
 'T': 17,
 'Q': 14,
 'I': 8,
 'N': 12,
 'E': 4,
 'K': 9,
 'L': 10,
 'D': 3,
 'M': 11,
 'R': 15,
 'H': 7,
 'Y': 22,
 'C': 2,
 'P': 13,
 'W': 20,
 'X': 21,
 'U': 18,
 'B': 1}

In [9]:
size_vocabulary = len(cv.get_feature_names_out())
print(size_vocabulary)

23


In [10]:
cv.get_feature_names_out()

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
       'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'], dtype=object)

In [11]:
# Encode amino acids using BoW technique
X_train = cv.transform(train_df['sequence'])
X_dev = cv.transform(dev_df['sequence'])
X_test = cv.transform(test_df['sequence'])

#### Example

In [12]:
X_train.shape

(360946, 23)

In [13]:
X_train[0]

<1x23 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [14]:
feat_vect = np.array(X_train[0].todense())[0]
feat_vect.shape

(23,)

In [15]:
feat_vect

# Feature vector corresponding to the 1st training sample

array([15,  0,  2,  9, 12,  5, 10,  2, 13,  7, 21,  1, 14,  1,  8,  2, 10,
       13,  0, 17,  1,  0,  1])

In [16]:
train_df.iloc[0]['sequence']

'FVVSGSATQITANQIEKLEEADEFENLLVINLDMKTVLAGVSDEIVNRIVNNLGQNNIVVVHTSTLVRDFDGFSEDSLNAELTKANLANVITDFLAELTQKVVAQKELILITLGGETSYKCCSAIGATQLQLIDEVAPAIALSLDHNAQWIVTKSGNLGGVNTL'

In [17]:
[amino_acid for amino_acid in train_df.iloc[0]['sequence'] if amino_acid == 'F']

['F', 'F', 'F', 'F', 'F']

In [18]:
[amino_acid for amino_acid in train_df.iloc[0]['sequence'] if amino_acid == 'V']

num_ocurrs_v = len([amino_acid for amino_acid in train_df.iloc[0]['sequence'] if amino_acid == 'V'])

print(f' At index {cv.vocabulary_["V"]}: the number of occurrences "V" in input sequence is: {num_ocurrs_v}')

 At index 19: the number of occurrences "V" in input sequence is: 17


In [19]:
# Get target labels 
train_labels = train_df["true_label_encoded"].values
test_labels = test_df["true_label_encoded"].values
dev_labels = dev_df["true_label_encoded"].values

In [22]:
num_classes = len(np.unique(train_labels))
num_classes

896

### Define the DataLoader

In [23]:
"""
    Define a custom dataset class that will be used to load the data
"""

class ProteinDataset(Dataset):
    def __init__(self, encoded_sequences, labels):
        self.sequences = torch.tensor(encoded_sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.int64)
    
    def __getitem__(self, index):
        x = self.sequences[index]
        y = self.labels[index]
        return x, y

    def __len__(self):
        return len(self.sequences)

In [24]:
train_ds = ProteinDataset(X_train.todense(), train_labels)

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=128,
    shuffle=True
)

In [25]:
dev_ds = ProteinDataset(X_dev.todense(), dev_labels)

dev_loader = DataLoader(
    dataset=dev_ds,
    batch_size=128,
    shuffle=False,
)

In [26]:
test_ds = ProteinDataset(X_test.todense(), test_labels)

test_loader = DataLoader(
    dataset=test_ds,
    batch_size=128,
    shuffle=False,
)

## Softmax Regression Classifier

In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [28]:
class SoftmaxRegression(torch.nn.Module):
    """ Naive Softmax regression module """
    
    def __init__(self, input_dim, output_dim):
        """
        Args:
            input_dim  (int): input vector size.
            output_dim (int): output vector size (number of classes).
        """
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        # pass x through a parameterized linear transformation
        y = self.linear(x)

        # pass the result through softmax over the last dimension to generate
        # a probability distribution vector over the classes:
        y = torch.nn.functional.softmax(y, dim=-1)
        return y

In [29]:
# Creating the model and moving it to the GPU
model = SoftmaxRegression(size_vocabulary, num_classes)
model.to(device)

SoftmaxRegression(
  (linear): Linear(in_features=23, out_features=896, bias=True)
)

In [None]:
## Test the DataLoader
# for batch_ind, (inputs, labels) in enumerate(train_loader):
#     break

In [30]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [31]:
def train(model, dataloader, loss_fn, optimizer):
    """
    Args:
        model  (torch.nn.Module): model.
        dataloader (torch.utils.data.DataLoader): data loader object to use for training.
    Returns:
        loss_total (float): loss value.
        acc_total  (float): accuracy.
    """
    # num_sample: number of samples explored
    num_sample = 0.

    # loss_total, acc_total
    # variables to collect overall loss and accuracy
    loss_total = 0.
    acc_total = 0.
    
    model.train()

    for inputs, labels in dataloader:
        num_sample += float(inputs.size(0))

        # .zero_grad() to clear the gradients
        optimizer.zero_grad()

        # computing the model predictions
        preds = model(inputs.to(device)) 

        # computing the loss value for the mini-batch
        loss = loss_fn(preds, labels.to(device))

        # computing the gradient w.r.t. to parameters
        loss.backward()

        # update learnable parameters 
        optimizer.step()

        # cummulating loss in loss_total
        loss_total += float(loss.detach().item())

        # cummulating number of correct classifications in acc_total
        acc_total += float(
            (torch.argmax(preds.detach(), dim=-1) == labels.to(preds.device)).sum()
        )

    # dividing by total number of visited samples
    loss_total = loss_total / num_sample
    acc_total = acc_total / num_sample

    return loss_total, acc_total

In [32]:
def evaluate(model, dataloader, loss_fn, optimizer):
    """
    Args:
        model  (torch.nn.Module): model.
        dataloader (torch.utils.data.DataLoader): 
            data loader object to use for training.
    Returns:
        loss_total (float): loss value.
        acc_total  (float): accuracy.
    """
    num_sample = 0.

    loss_total = 0.
    acc_total = 0.

    model.eval()

    for inputs, labels in dataloader:
        num_sample += float(inputs.size(0))

        preds = model(inputs.to(device)).detach()
        loss_total += float(loss_fn(preds, labels.to(device)).detach().item())
        acc_total += float(
            (torch.argmax(preds, dim=-1) == labels.to(preds.device)).sum()
        )

    loss_total = loss_total / num_sample
    acc_total = acc_total / num_sample

    return loss_total, acc_total

In [33]:
max_epochs = 10

for epoch in range(max_epochs):
    train_loss_total, train_acc_total = train(model, train_loader, loss_fn, optimizer)
    val_loss_total, val_acc_total = evaluate(model, dev_loader, loss_fn, optimizer)

    print(f'[EPOCH:{epoch+1:3d}/{max_epochs}]',
        f'train.loss: {train_loss_total:.4f}',
        f'train.acc: {100*train_acc_total:3.2f}%',
        f'val.loss: {val_loss_total:.4f}',
        f'val.acc: {100*val_acc_total:3.2f}%')

[EPOCH:  1/10] train.loss: 0.0528 train.acc: 5.39% val.loss: 0.0527 val.acc: 7.36%
[EPOCH:  2/10] train.loss: 0.0526 train.acc: 7.77% val.loss: 0.0526 val.acc: 8.03%
[EPOCH:  3/10] train.loss: 0.0526 train.acc: 8.19% val.loss: 0.0526 val.acc: 8.25%
[EPOCH:  4/10] train.loss: 0.0525 train.acc: 8.38% val.loss: 0.0526 val.acc: 8.42%
[EPOCH:  5/10] train.loss: 0.0525 train.acc: 8.51% val.loss: 0.0526 val.acc: 8.55%
[EPOCH:  6/10] train.loss: 0.0525 train.acc: 8.68% val.loss: 0.0525 val.acc: 8.79%
[EPOCH:  7/10] train.loss: 0.0525 train.acc: 8.86% val.loss: 0.0525 val.acc: 8.86%
[EPOCH:  8/10] train.loss: 0.0525 train.acc: 8.93% val.loss: 0.0525 val.acc: 8.93%
[EPOCH:  9/10] train.loss: 0.0525 train.acc: 8.97% val.loss: 0.0525 val.acc: 8.99%
[EPOCH: 10/10] train.loss: 0.0525 train.acc: 9.01% val.loss: 0.0525 val.acc: 9.00%


In [34]:
test_loss_total, test_acc_total = evaluate(model, test_loader, loss_fn, optimizer)

print(f'[Test set performance]',
        f'test.loss: {test_loss_total:.4f}',
        f'test.acc: {100*test_acc_total:3.2f}%')

[Test set performance] test.loss: 0.0525 test.acc: 8.90%


**Observation**: The loss on the training and validation datasets stays the same and the accuracy on the training dataset does increase slightly and on the validation dataset also slightly increases. 

The model suffers from underfitting.  

Next steps to do:
- Try more epochs for training
- Increase complexity of the model (add more FC layers prior to softmax)
- Have better hand-crafted feature representations of the protein sequence 
- Switch to sequence models (to learn features without applying domain knowledge)

In [40]:
class FCSoftmaxRegression(torch.nn.Module):
    """ Softmax regression module with two hidden layers (dims hardcoded) """

    def __init__(self, input_dim, output_dim):
        """
        Args:
            input_dim  (int): input vector size.
            output_dim (int): output vector size (number of classes).
        """
        super().__init__()
        # self.linear = torch.nn.Linear(input_dim, output_dim)

        self.fc1 = torch.nn.Linear(input_dim, 128)
        self.fc2 = torch.nn.Linear(128, 256)
        self.fc3 = torch.nn.Linear(256, output_dim)

    def forward(self, x):
        # pass x through a parameterized linear transformation
        # y = self.linear(x)

        x = self.fc1(x)
        x = torch.nn.functional.relu(x)

        # pass through second hidden layer
        x = self.fc2(x)
        x = torch.nn.functional.relu(x)

        # pass through the third (output) layer
        x = self.fc3(x)


        # pass the result through softmax over the last dimension to generate
        # a probability distribution vector over the classes:
        y = torch.nn.functional.softmax(x, dim=-1)
        return y

In [41]:
# Creating the model and moving it to the GPU (if available):
model = FCSoftmaxRegression(size_vocabulary, num_classes)
model.to(device)

FCSoftmaxRegression(
  (fc1): Linear(in_features=23, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=896, bias=True)
)

In [42]:
optimizer = torch.optim.Adam(model.parameters(), 
                             weight_decay=0.001
                             )

In [43]:
max_epochs = 30

for epoch in range(max_epochs):
    train_loss_total, train_acc_total = train(model, train_loader, loss_fn, optimizer)
    val_loss_total, val_acc_total = evaluate(model, dev_loader, loss_fn, optimizer)

    print(f'[EPOCH:{epoch+1:3d}/{max_epochs}]',
        f'train.loss: {train_loss_total:.4f}',
        f'train.acc: {100*train_acc_total:3.2f}%',
        f'val.loss: {val_loss_total:.4f}',
        f'val.acc: {100*val_acc_total:3.2f}%')

[EPOCH:  1/30] train.loss: 0.0526 train.acc: 7.52% val.loss: 0.0525 val.acc: 8.77%
[EPOCH:  2/30] train.loss: 0.0524 train.acc: 9.09% val.loss: 0.0524 val.acc: 9.50%
[EPOCH:  3/30] train.loss: 0.0524 train.acc: 9.49% val.loss: 0.0524 val.acc: 9.86%
[EPOCH:  4/30] train.loss: 0.0524 train.acc: 9.88% val.loss: 0.0524 val.acc: 10.08%
[EPOCH:  5/30] train.loss: 0.0523 train.acc: 10.30% val.loss: 0.0524 val.acc: 10.49%
[EPOCH:  6/30] train.loss: 0.0523 train.acc: 10.69% val.loss: 0.0523 val.acc: 10.94%
[EPOCH:  7/30] train.loss: 0.0523 train.acc: 10.86% val.loss: 0.0523 val.acc: 10.99%
[EPOCH:  8/30] train.loss: 0.0523 train.acc: 11.04% val.loss: 0.0523 val.acc: 11.18%
[EPOCH:  9/30] train.loss: 0.0523 train.acc: 11.19% val.loss: 0.0523 val.acc: 11.29%
[EPOCH: 10/30] train.loss: 0.0522 train.acc: 11.26% val.loss: 0.0523 val.acc: 11.22%
[EPOCH: 11/30] train.loss: 0.0522 train.acc: 11.34% val.loss: 0.0523 val.acc: 11.33%
[EPOCH: 12/30] train.loss: 0.0522 train.acc: 11.41% val.loss: 0.0523 val

In [44]:
test_loss_total, test_acc_total = evaluate(model, test_loader, loss_fn, optimizer)

print(f'[Test set performance]',
        f'test.loss: {test_loss_total:.4f}',
        f'test.acc: {100*test_acc_total:3.2f}%')

[Test set performance] test.loss: 0.0522 test.acc: 11.69%


**Observation**: Increasing the model complexity has marginally helped with underfitting problem by increasing accuracy by 2-3%. 

Hence, to further improve the performance, more complex models can be used over BoW encoding or other known encoding techniques.  

Potential improvements:
- Use another technique for encoding the protein sequence, e.g. 2-gram BoW, 3-gram BoW, (combination of both)
- Use deeper NN (potentially add regularization: L2, dropout)
- Hyperparameter tuning, e.g. the default learning rate used (no tuning)

In [46]:
lr = optimizer.param_groups[0]['lr']
lr

0.001

## Generate Predictions

In [48]:
model.eval()

acc_total = 0.

# Keep test predictions
test_preds = np.empty((0,), dtype=np.int64)

for inputs, labels in test_loader:
    
    preds = model(inputs.to(device)).detach()
    acc_total += float(
        (torch.argmax(preds, dim=-1) == labels.to(preds.device)).sum()
    )

    preds = torch.argmax(preds, dim=-1).cpu().numpy()  # convert to numpy array
    test_preds = np.concatenate([test_preds, preds])


acc_total = acc_total / (len(test_ds))

In [49]:
acc_total

0.11689329958108381

## Save results

In [50]:
test_df['softmaxreg_preds'] = test_preds

In [51]:
test_df.to_csv('/content/drive/MyDrive/instaDeep/data/softmaxreg_preds.csv', index=False)