In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd

In [31]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")

Running on the GPU


In [32]:
import torch.nn as nn

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads=1, dropout=0.1):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        self.q_linear = nn.Linear(embedding_dim, embedding_dim)
        self.k_linear = nn.Linear(embedding_dim, embedding_dim)
        self.v_linear = nn.Linear(embedding_dim, embedding_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, query: torch.Tensor, key : torch.Tensor, value : torch.Tensor, type, mask=None, mask2=None):
        # query shape: [batch_size, query_len, embedding_dim]
        # key shape: [batch_size, key_len, embedding_dim]
        # value shape: [batch_size, value_len, embedding_dim]
        batch_size = query.shape[0]

        # Linear transformations
        Q = self.q_linear(query)
        K = self.k_linear(key)
        V = self.v_linear(value)

        # Reshape for multi-head attention
        Q = Q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # print("hiii")
        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            if type == 'encoder':
                mask = mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
                scores = scores.masked_fill(mask == 0, float('-inf'))
            elif type == 'decoder':
                mask = mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
                scores = scores.masked_fill(mask == 0, float('-inf'))
                mask2 = mask2.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
                scores = scores.masked_fill(mask2 == 0, float('-inf'))
            elif type == 'encoder-decoder':
                final_mask = mask2.transpose(-2, -1) @ mask
                final_mask = final_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
                scores = scores.masked_fill(final_mask == 0, float('-inf'))


        attn_weights = nn.functional.softmax(scores, dim=-1)
        attn_weights = attn_weights.masked_fill(torch.isnan(attn_weights), 0)
        attn_weights = self.dropout(attn_weights)

        # print(attn_weights.shape)
        attn_output = torch.matmul(attn_weights, V)

        # Reshape
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.embedding_dim)
        # print(attn_output.shape)
        return attn_output

In [33]:
import torch.nn as nn

class PositionwiseFeedforward(nn.Module):
    def __init__(self, embedding_dim, dff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embedding_dim, dff)
        self.fc2 = nn.Linear(dff, embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x shape: [batch_size, context_size, embedding_dim]
        x = nn.functional.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [34]:
import torch.nn as nn

class EncoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads, dropout):
        super().__init__()
        self.self_attn = MultiHeadSelfAttention(embedding_dim, num_heads)
        self.feed_forward = PositionwiseFeedforward(embedding_dim, 4*embedding_dim)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        # src shape: [batch_size, src_len, hidden_dim]
        # src_mask shape: [batch_size, 1, 1, src_len]
        # print("Hello")
        # print(src.shape)
        src2 = self.norm1(src)
        # print("Hello2")
        src = src + self.dropout(self.self_attn(src2, src2, src2, 'encoder', src_mask))
        src2 = self.norm2(src)
        src = src + self.dropout(self.feed_forward(src2))
        # print("Hello3")
        return src

In [35]:
import math
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, context_size, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(context_size, embedding_dim)
        position = torch.arange(0, context_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: [batch_size, seq_len, embedding_dim]
        x = x + self.pe[:x.size(1), :]
        return self.dropout(x)

In [36]:
import torch.nn as nn

class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, num_layers, num_heads, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.context_size = context_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout = dropout

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoding = PositionalEncoding(embedding_dim, context_size, dropout)
        self.layers = nn.ModuleList([EncoderLayer(embedding_dim, num_heads, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, src, src_lens):
        # src shape: [batch_size, src_len]
        src_mask = self.calculate_mask(src, src_lens)
        # src_mask shape: [batch_size, 1, 1, src_len]
        src = self.embedding(src)
        # src shape: [batch_size, src_len, context_size]

        src = self.pos_encoding(src)

        for layer in self.layers:
            src = layer(src, src_mask)
        src = self.norm(src)
        return src, src_mask

    def calculate_mask(self, src, src_lens):
        # src shape: [batch_size, context_size]
        # src_lens shape: [batch_size]
        # mask shape: [batch_size, hidden_dim, hidden_dim]
        batch_size = src.shape[0]
        context_size = src.shape[1]
        mask = torch.arange(context_size).expand(batch_size, context_size).to(device) < src_lens.clone().detach().unsqueeze(1).to(device)
        mask = mask.float()
        mask = mask.unsqueeze(1)
        mask = mask.transpose(1, 2) @ mask
        return mask

In [37]:
class DecoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads, dropout):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.self_attn = MultiHeadSelfAttention(embedding_dim, num_heads)
        self.enc_attn = MultiHeadSelfAttention(embedding_dim, num_heads)
        self.feed_forward = PositionwiseFeedforward(embedding_dim, 4*embedding_dim)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.norm3 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        # trg shape: [batch_size, context_size, embedding_dim]
        # enc_src shape: [batch_size, context_size, embedding_dim]

        trg_len = trg.shape[1]
        trg_triangle = torch.tril(torch.ones((trg.shape[0], trg_len, trg_len), device=trg.device)).bool().float()
        trg2 = self.norm1(trg)

        trg = trg + self.dropout(self.self_attn(trg2, trg2, trg2, 'decoder', trg_mask, trg_triangle))

        trg2 = self.norm2(trg)
        trg = trg + self.dropout(self.enc_attn(trg2, enc_src, enc_src, 'encoder-decoder', src_mask, trg_mask))

        trg2 = self.norm3(trg)
        trg = trg + self.dropout(self.feed_forward(trg2))
        return trg

In [38]:
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, num_layers, num_heads, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.context_size = context_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout = dropout

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoding = PositionalEncoding(embedding_dim, context_size, dropout)
        self.layers = nn.ModuleList([DecoderLayer(embedding_dim, num_heads, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(embedding_dim)
        self.fc_out = nn.Linear(embedding_dim, vocab_size)

    def forward(self, trg, trg_lens, enc_src, src_mask, inference=False):
        # trg shape: [batch_size, context_size]
        # enc_src shape: [batch_size, context_size, embedding_dim]
        if inference == False:
            trg_mask = self.calculate_mask(trg, trg_lens)
            # trg_mask shape: [batch_size, context_size, context_size]
            trg = self.embedding(trg)
            # trg shape: [batch_size, context_size, embedding_dim]
            trg = self.pos_encoding(trg)

            for layer in self.layers:
                trg = layer(trg, enc_src, trg_mask, src_mask)

            trg = self.norm(trg)
            output = self.fc_out(trg)
            return output
        else:
            return self.inference(enc_src, src_mask, trg.shape[1], torch.tensor([32883]), torch.tensor([[32884]], device=trg.device))

    # def inference(self, enc_src, src_mask, max_len, start_token, end_token):
    #     # enc_src shape: [batch_size, src_len, hidden_dim]
    #     # src_mask shape: [batch_size, 1, 1, src_len]
    #     # max_len: maximum length of the generated sequence
    #     # start_token: integer representing the start token
    #     # end_token: integer representing the end token
    #     # print(start_token)
    #     batch_size = enc_src.shape[0]
    #     trg = torch.tensor([[start_token]] * batch_size, device=enc_src.device)
    #     # trg shape: [batch_size, 1]
    #     output = []
    #     for i in range(max_len):
    #         trg_mask = self.calculate_mask(trg)
    #         # trg_mask shape: [batch_size, 1, i+1, i+1]
    #         trg_embedded = self.embedding(trg) * math.sqrt(self.hidden_dim)
    #         # trg_embedded shape: [batch_size, i+1, hidden_dim]
    #         trg_embedded = self.pos_encoding(trg_embedded)
    #         # print(trg_embedded.shape)
    #         for layer in self.layers:
    #             trg_embedded = layer(trg_embedded, enc_src, trg_mask, src_mask)
    #         trg_embedded = self.norm(trg_embedded)
    #         # print(trg_embedded.shape)
    #         logits = self.fc_out(trg_embedded[:, -1])

    #         # logits shape: [batch_size, vocab_size]
    #         probs = F.softmax(logits, dim=-1)
    #         # print(probs.shape)
    #         # probs shape: [batch_size, vocab_size]
    #         next_token = torch.argmax(probs, dim=-1, keepdim=True)
    #         # next_token shape: [batch_size, 1]
    #         output.append(logits.unsqueeze(1))
    #         trg = torch.cat([trg, next_token], dim=-1)
    #         # trg shape: [batch_size, i+2]
    #         if torch.all(next_token == end_token):
    #             break
    #     output = torch.cat(output, dim=1)
    #     # print(output.shape)
    #     # output shape: [batch_size, seq_len]
    #     return output

    def calculate_mask(self, trg, trg_lens):
        # trg shape: [batch_size, context_size]
        # trg_lens shape: [batch_size]
        # mask shape: [batch_size, context_size, context_size]
        batch_size = trg.shape[0]
        context_size = trg.shape[1]
        mask = torch.arange(context_size).expand(batch_size, context_size).to(device) < trg_lens.clone().detach().unsqueeze(1).to(device)
        mask = mask.float()
        mask = mask.unsqueeze(1)
        mask = mask.transpose(1, 2) @ mask
        return mask

In [39]:
class MyTransformer(nn.Module):
    def __init__(self, encoder_vocab_size, embedding_dim, context_size, num_layers, num_heads, dropout):
        super().__init__()
        self.encoder = TransformerEncoder(encoder_vocab_size, embedding_dim, context_size, num_layers, num_heads, dropout)
        self.fc = nn.Linear(context_size, 1)

    def forward(self, src, src_lens, inference=False):
        # src shape: [batch_size, context_size]
        # trg shape: [batch_size, context_size]
        # src_lens shape: [batch_size]
        # trg_lens shape: [batch_size]

        enc_src, src_mask = self.encoder(src, src_lens)
        output = self.fc(enc_src[:,0,:])

        # enc_src shape: [batch_size, context_size, embedding_dim]
        # src_mask shape: [batch_size, context_size, context_size]

        # output = self.decoder(trg, trg_lens, enc_src, src_mask, inference)
        # output shape: [batch_size, context_size, decoder_vocab_size]
        return output

In [40]:

# Load CSV data
train_df = pd.read_csv('codemix-main/train.csv')
test_df = pd.read_csv('codemix-main/test.csv')
validation_df = pd.read_csv('codemix-main/valid.csv')


In [41]:
train_df

Unnamed: 0.1,Unnamed: 0,labels,tweets
0,0,0,ather farouqui general secretary of ghar empha...
1,1,0,by passing of is started ji jaggo nahi to sama...
2,2,1,swadu duniya geeta parjapat manjeetgill royal ...
3,3,1,hurry up kahin ye offer miss na ho jaye p p p p p
4,4,1,s logic hasne ke paise milte hai to alag alag ...
...,...,...,...
114995,114995,0,jab pakistan me flods aaty hain tw or pani cho...
114996,114996,0,vineeta chadha wo jo ke time sab ne waste kiya...
114997,114997,0,mujhe bhi do na caption credits akhil thakur
114998,114998,0,people of balochistangilgit and pok have thank...


In [42]:

# Extract text and labels
train_sentences = train_df['tweets'].tolist()
train_labels = train_df['labels'].tolist()

test_sentences = test_df['tweets'].tolist()
test_labels = test_df['labels'].tolist()

validation_sentences = validation_df['tweets'].tolist()
validation_labels = validation_df['labels'].tolist()


In [43]:
config = {
    'method': 'random',
    'name': 'Transformer',
    'metric': {
        'name': 'bleu_score',
        'goal': 'maximize',
    },
    'parameters': {
        'batch_size': {'value': 16},
        'num_epochs': {'value': 100},
        'learning_rate': {'values': [0.01, 0.005, 0.001]},
        'embedding_dim': {'values': [64, 96, 128]},
        'context_size': {'value': 64},
        'dropout': {'values': [0.1, 0.2]},
        'optimizer': {'values': ['Adam', 'RMSprop']},
        'num_layers': {'values': [2, 3]},
        'num_heads' : {'values' : [2, 4, 8]},
        'model_path' : {'value' : './Transformer.pt'},
        'drive_path' : {'value' : './'},
    }
}

In [44]:

import re
def clean_sentences_eng(sentences):
    # cleaned_sentences = []
    # for sentence in sentences:
    #     cleaned_sentence = sentence.lower().strip()
    #     cleaned_sentence = re.sub(r'[^a-zA-Z\s.!?\']', '', cleaned_sentence)
    #     cleaned_sentence = re.sub(r' +', ' ', cleaned_sentence)
    #     cleaned_sentences.append(cleaned_sentence)
    # return cleaned_sentences
    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = sentence.lower().strip()
        # cleaned_sentence = cleaned_sentence.replace('[^a-zA-Z0-9\s.\']',' ')
        # cleaned_sentence = cleaned_sentence.replace(' +', ' ')
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences

def clean_sentences_french(sentences):
    # cleaned_sentences = []
    # for sentence in sentences:
    #     cleaned_sentence = sentence.lower().strip()
    #     cleaned_sentence = re.sub(r"[^a-zA-Zàâçéèêëîïôûùüÿñæœ.!?]+", r" ", cleaned_sentence)
    #     cleaned_sentence = re.sub(r' +', ' ', cleaned_sentence)
    #     cleaned_sentences.append(cleaned_sentence)
    # return cleaned_sentences
    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = sentence.lower().strip()
        # cleaned_sentence = cleaned_sentence.replace('[^a-zA-Z0-9\s.\']',' ')
        # cleaned_sentence = cleaned_sentence.replace(' +', ' ')
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences

In [45]:
train_eng_sentences = clean_sentences_eng(train_sentences)
val_eng_sentences = clean_sentences_eng(validation_sentences)
test_eng_sentences = clean_sentences_eng(test_sentences)

In [46]:
import nltk
nltk.download('punkt')
unique_english_train_words = set(nltk.word_tokenize(' '.join(train_eng_sentences)))

[nltk_data] Error loading punkt: <urlopen error [Errno 104] Connection
[nltk_data]     reset by peer>


In [47]:
print(len(unique_english_train_words))

57643


In [48]:
def get_vocabulary(dataset):
    vocab = list(set(dataset))
    vocab.append('<unk>')
    vocab.append('<pad>')
    vocab.append('<sos>')
    vocab.append('<eos>')
    # vocab_size = len(vocab)

    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for idx, word in enumerate(vocab)}

    return vocab, word_to_idx, idx_to_word

In [49]:
english_vocab, english_word_to_idx, english_idx_to_word = get_vocabulary(unique_english_train_words)
encoder_vocab_size = len(english_vocab)

In [50]:
class EarlyStopping():
    def __init__(self, patience=3, min_delta=0.01):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, loss):
        if self.best_loss is None:
            self.best_loss = loss
        elif self.best_loss - loss > self.min_delta:
            self.best_loss = loss
            self.counter = 0
        elif self.best_loss - loss < self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [51]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class TranslationDataset(Dataset):
    def __init__(self, src_sentences, labels, src_word_to_idx, context_size):
        self.src_sentences = src_sentences
        self.context_size = context_size
        self.src_word_to_idx = src_word_to_idx
        self.labels = labels

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, idx):
        src_sentence = self.src_sentences[idx]
        label = self.labels[idx]
        src_words = nltk.word_tokenize(src_sentence)

        src_tokens = [self.src_word_to_idx['<sos>']] + [self.src_word_to_idx.get(word, self.src_word_to_idx['<unk>']) for word in src_words] + [self.src_word_to_idx['<eos>']]

        len_src_tokens = len(src_tokens) if len(src_tokens) <= self.context_size else self.context_size

        if len(src_tokens) >= self.context_size:
            src_tokens = src_tokens[:self.context_size]
        else:
            src_tokens = src_tokens + [self.src_word_to_idx['<pad>']] * (self.context_size - len(src_tokens))

        return torch.tensor(src_tokens), torch.tensor(label).float(), len_src_tokens

In [52]:
from tqdm import tqdm
import pickle
def train(model: MyTransformer, criterion_train: nn.CrossEntropyLoss, criterion_val: nn.CrossEntropyLoss, optimizer: optim.Adam, num_epochs: int, data_loader_train: DataLoader, data_loader_val: DataLoader, es: EarlyStopping):
    print("Hyperparameters set")
    print("Training started...")

    epoch_no = []
    train_loss_epoch = []
    val_loss_epoch = []
    model.to(device)

    for epoch in range(num_epochs):
        train_total = 0
        validation_total = 0
        train_correct = 0
        validation_correct = 0
        train_loss = 0
        model.train()

        pbar = tqdm(enumerate(data_loader_train), total=len(data_loader_train))
        for i, (src, labels, src_lens) in pbar:

            optimizer.zero_grad()
            output = model(src.to(device), src_lens.to(device))
            output = output.squeeze(1)
            labels = labels.to(device)
            loss = criterion_train(output.to(device), labels)
            loss.backward()
            optimizer.step()
            predicted_class = torch.sigmoid(output) > 0.5
            train_correct += torch.sum(predicted_class == labels).item()
            train_total += len(predicted_class)
            train_loss += loss.item()


            pbar.set_description(f'Epoch: {epoch + 1}, Train Loss: {train_loss / (i + 1):.4f}')

        epoch_no.append(epoch + 1)
        train_loss_epoch.append(train_loss / len(data_loader_train))

        pbar = tqdm(enumerate(data_loader_val), total=len(data_loader_val))

        val_loss = 0
        model.eval()

        for i, (src, labels, src_lens) in pbar:
            with torch.no_grad():

                output = model(src.to(device), src_lens.to(device))
                output = output.squeeze(1)
                labels = labels.to(device)
                loss = criterion_val(output.to(device), labels)
                val_loss += loss.item()
                predicted_class = torch.sigmoid(output) > 0.5
                validation_correct += torch.sum(predicted_class == labels).item()
                validation_total += len(predicted_class)

                pbar.set_description(f'Epoch: {epoch + 1}, Val Loss: {val_loss / (i + 1):.4f}')

        val_loss_epoch.append(val_loss / len(data_loader_val))
        print(f'Epoch: {epoch + 1}, Train Loss: {train_loss_epoch[-1]:.4f}, Val Loss: {val_loss_epoch[-1]:.4f}')
        print(f'Epoch: {epoch + 1}, Train Accuracy: {(train_correct/train_total):.4f}, Val Accuracy: {(validation_correct/validation_total):.4f}\n\n')
    return output,epoch_no, train_loss_epoch, val_loss_epoch

In [62]:
from tqdm import tqdm
import pickle
from sklearn.metrics import f1_score, precision_score, recall_score

def test(model: MyTransformer, criterion_test: nn.CrossEntropyLoss, data_loader_test: DataLoader):
    print("Testing...")
    model.to(device)
    test_loss = 0
    test_total = 0
    test_correct = 0
    true_labels = []
    predicted_labels = []
    pbar = tqdm(enumerate(data_loader_test), total=len(data_loader_test))
    
    for i, (src, labels, src_lens) in pbar:
        with torch.no_grad():
            output = model(src.to(device), src_lens.to(device))
            output = output.squeeze(1)
            labels = labels.to(device)
            loss = criterion_val(output.to(device), labels)
            test_loss += loss.item()
            predicted_class = torch.sigmoid(output) > 0.5
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted_class.cpu().numpy())
            test_correct += torch.sum(predicted_class == labels).item()
            test_total += len(predicted_class)
    
    accuracy = test_correct / test_total
    f1 = f1_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    
    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}\n')


In [54]:
# Fixed Hyperparameters

context_size = config['parameters']['context_size']['value']
model_path = config['parameters']['model_path']['value']
batch_size = config['parameters']['batch_size']['value']
drive_path = config['parameters']['drive_path']['value']
num_epochs = config['parameters']['num_epochs']['value']

In [55]:
cfg = config
learning_rate = 0.01
embedding_dim = 64
dropout = 0.1
optimizer_name = 'Adam'
num_layers = 2
num_heads = 2
num_epochs = 10
batch_size = 32
context_size = 64

In [56]:
train_dataset = TranslationDataset(train_eng_sentences, train_labels, english_word_to_idx, context_size)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_dataset = TranslationDataset(val_eng_sentences, validation_labels, english_word_to_idx, context_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataset = TranslationDataset(test_eng_sentences, test_labels, english_word_to_idx, context_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [57]:

model = MyTransformer(encoder_vocab_size, embedding_dim, context_size, num_layers, num_heads, dropout)
criterion_train = nn.BCEWithLogitsLoss()
criterion_val = nn.BCEWithLogitsLoss()
criterion_test = nn.BCEWithLogitsLoss()
optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=learning_rate)
es = EarlyStopping(patience=3, min_delta=0.005)

In [58]:
output,epoch_no, train_loss_epoch, val_loss_epoch = train(model, criterion_train, criterion_val, optimizer,
                                          num_epochs, train_dataloader, val_dataloader, es)

Hyperparameters set
Training started...


Epoch: 1, Train Loss: 0.5035: 100%|██████████| 938/938 [00:20<00:00, 45.27it/s]
Epoch: 1, Val Loss: 0.4452: 100%|██████████| 469/469 [00:04<00:00, 111.87it/s]


Epoch: 1, Train Loss: 0.5035, Val Loss: 0.4452
Epoch: 1, Train Accuracy: 0.7840, Val Accuracy: 0.8299




Epoch: 2, Train Loss: 0.4375: 100%|██████████| 938/938 [00:20<00:00, 45.85it/s]
Epoch: 2, Val Loss: 0.4432: 100%|██████████| 469/469 [00:04<00:00, 108.76it/s]


Epoch: 2, Train Loss: 0.4375, Val Loss: 0.4432
Epoch: 2, Train Accuracy: 0.8336, Val Accuracy: 0.8271




Epoch: 3, Train Loss: 0.4191: 100%|██████████| 938/938 [00:20<00:00, 46.79it/s]
Epoch: 3, Val Loss: 0.4408: 100%|██████████| 469/469 [00:04<00:00, 109.10it/s]


Epoch: 3, Train Loss: 0.4191, Val Loss: 0.4408
Epoch: 3, Train Accuracy: 0.8408, Val Accuracy: 0.8279




Epoch: 4, Train Loss: 0.3973: 100%|██████████| 938/938 [00:19<00:00, 47.73it/s]
Epoch: 4, Val Loss: 0.4348: 100%|██████████| 469/469 [00:04<00:00, 108.06it/s]


Epoch: 4, Train Loss: 0.3973, Val Loss: 0.4348
Epoch: 4, Train Accuracy: 0.8498, Val Accuracy: 0.8271




Epoch: 5, Train Loss: 0.3822: 100%|██████████| 938/938 [00:20<00:00, 44.89it/s]
Epoch: 5, Val Loss: 0.4147: 100%|██████████| 469/469 [00:04<00:00, 106.86it/s]


Epoch: 5, Train Loss: 0.3822, Val Loss: 0.4147
Epoch: 5, Train Accuracy: 0.8555, Val Accuracy: 0.8385




Epoch: 6, Train Loss: 0.3741: 100%|██████████| 938/938 [00:19<00:00, 46.94it/s]
Epoch: 6, Val Loss: 0.4345: 100%|██████████| 469/469 [00:03<00:00, 119.45it/s]


Epoch: 6, Train Loss: 0.3741, Val Loss: 0.4345
Epoch: 6, Train Accuracy: 0.8548, Val Accuracy: 0.8317




Epoch: 7, Train Loss: 0.3690: 100%|██████████| 938/938 [00:19<00:00, 48.38it/s]
Epoch: 7, Val Loss: 0.4353: 100%|██████████| 469/469 [00:04<00:00, 110.28it/s]


Epoch: 7, Train Loss: 0.3690, Val Loss: 0.4353
Epoch: 7, Train Accuracy: 0.8576, Val Accuracy: 0.8329




Epoch: 8, Train Loss: 0.3583: 100%|██████████| 938/938 [00:18<00:00, 51.05it/s]
Epoch: 8, Val Loss: 0.4723: 100%|██████████| 469/469 [00:04<00:00, 111.36it/s]


Epoch: 8, Train Loss: 0.3583, Val Loss: 0.4723
Epoch: 8, Train Accuracy: 0.8652, Val Accuracy: 0.8367




Epoch: 9, Train Loss: 0.3382: 100%|██████████| 938/938 [00:21<00:00, 44.16it/s]
Epoch: 9, Val Loss: 0.4482: 100%|██████████| 469/469 [00:04<00:00, 109.58it/s]


Epoch: 9, Train Loss: 0.3382, Val Loss: 0.4482
Epoch: 9, Train Accuracy: 0.8746, Val Accuracy: 0.8384




Epoch: 10, Train Loss: 0.3259: 100%|██████████| 938/938 [00:16<00:00, 56.51it/s]
Epoch: 10, Val Loss: 0.4449: 100%|██████████| 469/469 [00:04<00:00, 110.71it/s]


Epoch: 10, Train Loss: 0.3259, Val Loss: 0.4449
Epoch: 10, Train Accuracy: 0.8783, Val Accuracy: 0.8378




In [63]:
test(model, criterion_test, test_dataloader)

Testing...


100%|██████████| 469/469 [00:03<00:00, 138.47it/s]

Test Accuracy: 0.8433
F1 Score: 0.8322
Precision: 0.9050
Recall: 0.7703




