## import librairies


In [1]:
!pip install transformers
!pip install scikit-learn
!pip install pandas


[0m

In [2]:
import pandas as pd
import time  
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


In [3]:
import pandas as pd
df_final = pd.read_csv('/workspace/data/final_diag50.csv')
#batching data into samples
df_final1 = df_final.sample(frac=0.3)
df_final2 = df_final.sample(frac=0.7)
df_final3 = df_final.sample(frac=1)
#checking size
print(df_final1.shape)
print(df_final2.shape)
print(df_final3.shape)


(20110, 4)
(46924, 4)
(67034, 4)


In [4]:
# Load 30% of the dataset
data1 = df_final1.copy()
# Load 70% of the dataset
data2 = df_final2.copy()
# Load full dataset
data3 = df_final3.copy()
#train/test split was 80/20%
# Split the dataset into training and testing sets
train_df1, test_df1 = train_test_split(data1, test_size=0.2, random_state=42)
train_df2, test_df2 = train_test_split(data2, test_size=0.2, random_state=42)
train_df3, test_df3 = train_test_split(data3, test_size=0.2, random_state=42)


In [5]:
from transformers import BertTokenizer

# Load the ClinicalBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

def tokenize_texts(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

train_encodings1 = tokenize_texts(train_df1['clean_text'].tolist())
test_encodings1 = tokenize_texts(test_df1['clean_text'].tolist())
train_encodings2 = tokenize_texts(train_df2['clean_text'].tolist())
test_encodings2 = tokenize_texts(test_df2['clean_text'].tolist())
train_encodings3 = tokenize_texts(train_df3['clean_text'].tolist())
test_encodings3 = tokenize_texts(test_df3['clean_text'].tolist())


## Working on first sample=30% dataset

In [6]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings1, train_df1['labels'].tolist())
test_dataset = TextDataset(test_encodings1, test_df1['labels'].tolist())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [7]:
##Setting models definition
import torch.nn as nn
from transformers import BertModel

class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        return self.fc(output[:, -1, :])

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        return self.fc(output[:, -1, :])

class BERTForClassification(nn.Module):
    def __init__(self, output_dim):
        super().__init__()
        self.bert = BertModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
        self.fc = nn.Linear(self.bert.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return self.fc(outputs['pooler_output'])


In [8]:
#RNN Tuned MODEL FOR SAMPLE 1
import torch
import warnings
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

class AdjustedRNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.4):
        super(AdjustedRNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)  # Use nn.RNN instead of nn.LSTM
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)  # Use rnn_out instead of lstm_out
        # Use the last hidden state for classification
        output = self.fc(self.dropout(rnn_out[:, -1, :]))
        return output

# Instantiate model
model = AdjustedRNNModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=50, dropout=0.4).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.00002)


# Assuming `train_labels` is a list containing all the labels in your training dataset
train_labels = [label for batch in train_loader for label in batch['labels'].tolist()]

# 1. Compute class distribution
class_counts = Counter(train_labels)

# 2. Calculate the weights
max_count = max(class_counts.values())
class_weights = {class_id: max_count / count for class_id, count in class_counts.items()}
weights = [class_weights[class_id] for class_id in sorted(class_weights.keys())]

weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)

# 3. Use the weights in the loss function
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

#criterion = nn.CrossEntropyLoss()  # use this if data is not imbalanced

# Add Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Train the model
best_f1 = 0.0  # for early stopping based on F1 score

for epoch in range(10):
    # Measure the time at the start of the epoch
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    val_loss = 0  # to compute average validation loss for scheduler

    for batch in test_loader:
        with torch.no_grad():
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    scheduler.step(val_loss / len(test_loader))  # scheduler step based on avg val loss
    report1 = classification_report(true_labels, predictions)
    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    # Implementing early stopping based on F1 score
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        #torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping")
            print(report1)
            break


else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report1)  # Print the classification report after the last epoch


Epoch: 0, Training Time: 48.929654121398926 seconds, Testing Time: 7.71307635307312 seconds
Epoch: 1, Training Time: 48.75060176849365 seconds, Testing Time: 7.745631456375122 seconds
Epoch: 2, Training Time: 48.88056039810181 seconds, Testing Time: 7.739435911178589 seconds
Epoch: 3, Training Time: 48.550743103027344 seconds, Testing Time: 7.671591520309448 seconds
Epoch: 4, Training Time: 48.7392373085022 seconds, Testing Time: 7.772187232971191 seconds
Epoch: 5, Training Time: 48.98300004005432 seconds, Testing Time: 7.7655980587005615 seconds
Epoch: 6, Training Time: 48.68782591819763 seconds, Testing Time: 7.665295124053955 seconds
Epoch: 7, Training Time: 48.916528940200806 seconds, Testing Time: 7.774595022201538 seconds
Early stopping
              precision    recall  f1-score   support

           0       0.07      0.04      0.05        51
           1       0.02      0.02      0.02        56
           2       0.02      0.04      0.03        68
           3       0.00      0

In [9]:
##LSTM tined MODEL FOR SAMPLE 1

import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class AdjustedLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.2):
        super(AdjustedLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Use the last hidden state for classification
        output = self.fc(self.dropout(lstm_out[:, -1, :]))
        return output

# Instantiate model
model = AdjustedLSTMModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=50, dropout=0.2).to(device)
optimizer = optim.AdamW(model.parameters(),lr=0.001)

# If your dataset is imbalanced, compute class weights
# weights = # Compute based on class distribution
# criterion = nn.CrossEntropyLoss(weight=weights)


# Assuming `train_labels` is a list containing all the labels in your training dataset
train_labels = [label for batch in train_loader for label in batch['labels'].tolist()]

# 1. Compute class distribution
class_counts = Counter(train_labels)

# 2. Calculate the weights
max_count = max(class_counts.values())
class_weights = {class_id: max_count / count for class_id, count in class_counts.items()}
weights = [class_weights[class_id] for class_id in sorted(class_weights.keys())]

weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)

# 3. Use the weights in the loss function
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

#criterion = nn.CrossEntropyLoss()  # use this if data is not imbalanced

# Add Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Train the model
best_f1 = 0.0  # for early stopping based on F1 score

for epoch in range(10):
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    val_loss = 0  # to compute average validation loss for scheduler

    for batch in test_loader:
        with torch.no_grad():
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    scheduler.step(val_loss / len(test_loader))  # scheduler step based on avg val loss
    report2 = classification_report(true_labels, predictions)

    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    # Implementing early stopping based on F1 score
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        #torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping")
            print(report2)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report2)  # Print the classification report after the last epoch


Epoch: 0, Training Time: 120.55844593048096 seconds, Testing Time: 16.22673225402832 seconds
Epoch: 1, Training Time: 120.63873291015625 seconds, Testing Time: 16.181991815567017 seconds
Epoch: 2, Training Time: 125.98930525779724 seconds, Testing Time: 17.474894285202026 seconds
Epoch: 3, Training Time: 122.63699316978455 seconds, Testing Time: 16.10278820991516 seconds
Epoch: 4, Training Time: 120.7151620388031 seconds, Testing Time: 16.190575122833252 seconds
Epoch: 5, Training Time: 120.8771026134491 seconds, Testing Time: 16.11627984046936 seconds
Epoch: 6, Training Time: 121.14670324325562 seconds, Testing Time: 16.228591918945312 seconds
Epoch: 7, Training Time: 120.92098712921143 seconds, Testing Time: 16.175971031188965 seconds
Epoch: 8, Training Time: 121.5137414932251 seconds, Testing Time: 16.196144580841064 seconds
Epoch: 9, Training Time: 121.9709324836731 seconds, Testing Time: 16.269481658935547 seconds
Training completed.
              precision    recall  f1-score   s

In [10]:
##BiLSTM tuned MODEL FOR SAMPLE 1
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AdjustedBiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.2):
        super(AdjustedBiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Add this line to store the hidden_dim as an instance variable
        self.hidden_dim = hidden_dim
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, output_dim)  # Multiply by 2 because it's bidirectional
        self.dropout = nn.Dropout(dropout)



    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Use the last hidden state for classification. We concatenate the last hidden state from both directions
        output = self.fc(self.dropout(torch.cat((lstm_out[:, -1, :self.hidden_dim], lstm_out[:, 0, self.hidden_dim:]), dim=1)))
        return output



# Instantiate model
model = AdjustedBiLSTMModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=50, dropout=0.2).to(device)

#model = AdjustedLSTMModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=10, dropout=0.2).to(device)
optimizer = optim.AdamW(model.parameters(),lr=0.001)

# If your dataset is imbalanced, compute class weights
# weights = # Compute based on class distribution
# criterion = nn.CrossEntropyLoss(weight=weights)


# Assuming `train_labels` is a list containing all the labels in your training dataset
train_labels = [label for batch in train_loader for label in batch['labels'].tolist()]

# 1. Compute class distribution
class_counts = Counter(train_labels)

# 2. Calculate the weights
max_count = max(class_counts.values())
class_weights = {class_id: max_count / count for class_id, count in class_counts.items()}
weights = [class_weights[class_id] for class_id in sorted(class_weights.keys())]

weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)

# 3. Use the weights in the loss function
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

#criterion = nn.CrossEntropyLoss()  # use this if data is not imbalanced

# Add Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Train the model
best_f1 = 0.0  # for early stopping based on F1 score

for epoch in range(10):

    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    val_loss = 0  # to compute average validation loss for scheduler

    for batch in test_loader:
        with torch.no_grad():
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    scheduler.step(val_loss / len(test_loader))  # scheduler step based on avg val loss
    report3 = classification_report(true_labels, predictions)

    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    
    # Implementing early stopping based on F1 score
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        #torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping")
            print(report3)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report3)  # Print the classification report after the last epoch


Epoch: 0, Training Time: 146.2873899936676 seconds, Testing Time: 16.79489278793335 seconds
Epoch: 1, Training Time: 153.2348906993866 seconds, Testing Time: 17.7624249458313 seconds
Epoch: 2, Training Time: 145.9025137424469 seconds, Testing Time: 16.88110375404358 seconds
Epoch: 3, Training Time: 146.2739658355713 seconds, Testing Time: 16.773165464401245 seconds
Epoch: 4, Training Time: 145.75145030021667 seconds, Testing Time: 16.625985860824585 seconds
Epoch: 5, Training Time: 144.3977952003479 seconds, Testing Time: 16.76985812187195 seconds
Epoch: 6, Training Time: 143.4182629585266 seconds, Testing Time: 16.611148595809937 seconds
Epoch: 7, Training Time: 143.43671202659607 seconds, Testing Time: 16.652474403381348 seconds
Epoch: 8, Training Time: 144.37139749526978 seconds, Testing Time: 16.579089164733887 seconds
Epoch: 9, Training Time: 145.12491631507874 seconds, Testing Time: 17.73709535598755 seconds
Training completed.
              precision    recall  f1-score   suppor

In [11]:
##BERT MODEL FOR SAMPLE 1
import torch.optim as optim
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTForClassification(50).to(device)  # replace NUM_CLASSES with the number of unique labels in your dataset
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Early stopping parameters
patience = 3
best_valid_loss = float('inf')
counter = 0

for epoch in range(10):
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        criterion = nn.CrossEntropyLoss()
        inputs, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
        outputs = model(inputs, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    # Validation phase
    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    for batch in test_loader:
        with torch.no_grad():
            inputs, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            outputs = model(inputs, attention_mask)
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    report4 = classification_report(true_labels, predictions)

    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    
    # Early stopping logic
    if val_f1 < best_valid_loss:
        best_valid_loss = val_f1
        counter = 0
        #torch.save(model.state_dict(), 'best_model_bert2.pkl')  # Save the model
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            print(report4)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report4)  # Print the classification report after the last epoch


Epoch: 0, Training Time: 2336.5735466480255 seconds, Testing Time: 221.35874676704407 seconds
Epoch: 1, Training Time: 3212.2437999248505 seconds, Testing Time: 445.291729927063 seconds
Epoch: 2, Training Time: 4971.916688919067 seconds, Testing Time: 445.376238822937 seconds
Epoch: 3, Training Time: 4966.547714710236 seconds, Testing Time: 471.38681292533875 seconds
Early stopping triggered.
              precision    recall  f1-score   support

           0       0.42      0.33      0.37        51
           1       0.54      0.50      0.52        56
           2       0.42      0.46      0.44        68
           3       0.86      0.62      0.72        48
           4       0.49      0.31      0.38        78
           5       0.62      0.15      0.24        54
           6       0.50      0.10      0.16        21
           7       0.60      0.67      0.63       562
           8       0.57      0.49      0.53        47
           9       0.87      0.98      0.92        42
         

## Working on first sample=70% dataset

In [12]:
train_dataset = TextDataset(train_encodings2, train_df2['labels'].tolist())
test_dataset = TextDataset(test_encodings2, test_df2['labels'].tolist())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [13]:
#RNN Tuned MODEL FOR SAMPLE 2
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AdjustedRNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.4):
        super(AdjustedRNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)  # Use nn.RNN instead of nn.LSTM
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)  # Use rnn_out instead of lstm_out
        # Use the last hidden state for classification
        output = self.fc(self.dropout(rnn_out[:, -1, :]))
        return output

# Instantiate model
model = AdjustedRNNModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=50, dropout=0.4).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.00002)


# Assuming `train_labels` is a list containing all the labels in your training dataset
train_labels = [label for batch in train_loader for label in batch['labels'].tolist()]

# 1. Compute class distribution
class_counts = Counter(train_labels)

# 2. Calculate the weights
max_count = max(class_counts.values())
class_weights = {class_id: max_count / count for class_id, count in class_counts.items()}
weights = [class_weights[class_id] for class_id in sorted(class_weights.keys())]

weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)

# 3. Use the weights in the loss function
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

#criterion = nn.CrossEntropyLoss()  # use this if data is not imbalanced

# Add Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Train the model
best_f1 = 0.0  # for early stopping based on F1 score

for epoch in range(10):
    # Measure the time at the start of the epoch
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    val_loss = 0  # to compute average validation loss for scheduler

    for batch in test_loader:
        with torch.no_grad():
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    scheduler.step(val_loss / len(test_loader))  # scheduler step based on avg val loss
    report5 = classification_report(true_labels, predictions)

    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    
    # Implementing early stopping based on F1 score
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        #torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping")
            print(report5)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report5)  # Print the classification report after the last epoch



Epoch: 0, Training Time: 114.36370539665222 seconds, Testing Time: 18.12068200111389 seconds
Epoch: 1, Training Time: 114.2175395488739 seconds, Testing Time: 17.95778775215149 seconds
Epoch: 2, Training Time: 113.63109350204468 seconds, Testing Time: 18.02319598197937 seconds
Epoch: 3, Training Time: 114.19601917266846 seconds, Testing Time: 18.204540967941284 seconds
Epoch: 4, Training Time: 114.50356435775757 seconds, Testing Time: 18.11637830734253 seconds
Epoch: 5, Training Time: 115.2248466014862 seconds, Testing Time: 18.261039972305298 seconds
Epoch: 6, Training Time: 115.04927849769592 seconds, Testing Time: 18.096057176589966 seconds
Epoch: 7, Training Time: 115.31235361099243 seconds, Testing Time: 18.29442596435547 seconds
Epoch: 8, Training Time: 115.63009595870972 seconds, Testing Time: 18.163508653640747 seconds
Epoch: 9, Training Time: 115.81140446662903 seconds, Testing Time: 18.332648754119873 seconds
Early stopping
              precision    recall  f1-score   suppor

In [14]:
##LSTM tined MODEL FOR SAMPLE 1
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class AdjustedLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.2):
        super(AdjustedLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Use the last hidden state for classification
        output = self.fc(self.dropout(lstm_out[:, -1, :]))
        return output

# Instantiate model
model = AdjustedLSTMModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=50, dropout=0.2).to(device)
optimizer = optim.AdamW(model.parameters(),lr=0.001)

# If your dataset is imbalanced, compute class weights
# weights = # Compute based on class distribution
# criterion = nn.CrossEntropyLoss(weight=weights)


# Assuming `train_labels` is a list containing all the labels in your training dataset
train_labels = [label for batch in train_loader for label in batch['labels'].tolist()]

# 1. Compute class distribution
class_counts = Counter(train_labels)

# 2. Calculate the weights
max_count = max(class_counts.values())
class_weights = {class_id: max_count / count for class_id, count in class_counts.items()}
weights = [class_weights[class_id] for class_id in sorted(class_weights.keys())]

weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)

# 3. Use the weights in the loss function
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

#criterion = nn.CrossEntropyLoss()  # use this if data is not imbalanced

# Add Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Train the model
best_f1 = 0.0  # for early stopping based on F1 score

for epoch in range(10):
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    val_loss = 0  # to compute average validation loss for scheduler

    for batch in test_loader:
        with torch.no_grad():
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    scheduler.step(val_loss / len(test_loader))  # scheduler step based on avg val loss
    report6 = classification_report(true_labels, predictions)

    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    
    # Implementing early stopping based on F1 score
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        #torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping")
            print(report6)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report6)  # Print the classification report after the last epoch


Epoch: 0, Training Time: 282.30864095687866 seconds, Testing Time: 37.554548501968384 seconds
Epoch: 1, Training Time: 282.4274568557739 seconds, Testing Time: 37.24467945098877 seconds
Epoch: 2, Training Time: 284.0395698547363 seconds, Testing Time: 37.68706464767456 seconds
Epoch: 3, Training Time: 281.14775109291077 seconds, Testing Time: 37.163119316101074 seconds
Epoch: 4, Training Time: 299.72270131111145 seconds, Testing Time: 37.15981864929199 seconds
Epoch: 5, Training Time: 281.7582960128784 seconds, Testing Time: 37.40096712112427 seconds
Epoch: 6, Training Time: 279.7129969596863 seconds, Testing Time: 37.12498164176941 seconds
Epoch: 7, Training Time: 281.18375396728516 seconds, Testing Time: 37.20955753326416 seconds
Epoch: 8, Training Time: 281.5672118663788 seconds, Testing Time: 37.38908553123474 seconds
Epoch: 9, Training Time: 281.1390874385834 seconds, Testing Time: 37.462230920791626 seconds
Training completed.
              precision    recall  f1-score   support

In [15]:
##BiLSTM tuned MODEL FOR SAMPLE 1
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AdjustedBiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.2):
        super(AdjustedBiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Add this line to store the hidden_dim as an instance variable
        self.hidden_dim = hidden_dim
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, output_dim)  # Multiply by 2 because it's bidirectional
        self.dropout = nn.Dropout(dropout)



    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Use the last hidden state for classification. We concatenate the last hidden state from both directions
        output = self.fc(self.dropout(torch.cat((lstm_out[:, -1, :self.hidden_dim], lstm_out[:, 0, self.hidden_dim:]), dim=1)))
        return output



# Instantiate model
model = AdjustedBiLSTMModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=50, dropout=0.2).to(device)

#model = AdjustedLSTMModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=10, dropout=0.2).to(device)
optimizer = optim.AdamW(model.parameters(),lr=0.001)

# If your dataset is imbalanced, compute class weights
# weights = # Compute based on class distribution
# criterion = nn.CrossEntropyLoss(weight=weights)


# Assuming `train_labels` is a list containing all the labels in your training dataset
train_labels = [label for batch in train_loader for label in batch['labels'].tolist()]

# 1. Compute class distribution
class_counts = Counter(train_labels)

# 2. Calculate the weights
max_count = max(class_counts.values())
class_weights = {class_id: max_count / count for class_id, count in class_counts.items()}
weights = [class_weights[class_id] for class_id in sorted(class_weights.keys())]

weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)

# 3. Use the weights in the loss function
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

#criterion = nn.CrossEntropyLoss()  # use this if data is not imbalanced

# Add Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Train the model
best_f1 = 0.0  # for early stopping based on F1 score

for epoch in range(10):
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    val_loss = 0  # to compute average validation loss for scheduler

    for batch in test_loader:
        with torch.no_grad():
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    scheduler.step(val_loss / len(test_loader))  # scheduler step based on avg val loss
    report7 = classification_report(true_labels, predictions)
    
    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    
    # Implementing early stopping based on F1 score
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        #torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping")
            print(report7)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report7)  # Print the classification report after the last epoch


Epoch: 0, Training Time: 334.9066274166107 seconds, Testing Time: 38.58301305770874 seconds
Epoch: 1, Training Time: 336.1887230873108 seconds, Testing Time: 38.69426941871643 seconds
Epoch: 2, Training Time: 337.580931186676 seconds, Testing Time: 40.988815784454346 seconds
Epoch: 3, Training Time: 276.0445907115936 seconds, Testing Time: 7.127222537994385 seconds
Epoch: 4, Training Time: 65.3902382850647 seconds, Testing Time: 7.106908559799194 seconds
Epoch: 5, Training Time: 65.43103122711182 seconds, Testing Time: 7.102538824081421 seconds
Epoch: 6, Training Time: 65.38250827789307 seconds, Testing Time: 7.115699052810669 seconds
Epoch: 7, Training Time: 65.39116549491882 seconds, Testing Time: 7.106505870819092 seconds
Epoch: 8, Training Time: 65.44267249107361 seconds, Testing Time: 7.130494832992554 seconds
Epoch: 9, Training Time: 65.33995389938354 seconds, Testing Time: 7.109229326248169 seconds
Training completed.
              precision    recall  f1-score   support

      

In [None]:
##BERT MODEL FOR SAMPLE 2
import torch.optim as optim
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTForClassification(50).to(device)  # replace NUM_CLASSES with the number of unique labels in your dataset
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Early stopping parameters
patience = 3
best_valid_loss = float('inf')
counter = 0

for epoch in range(10):
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        criterion = nn.CrossEntropyLoss()
        inputs, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
        outputs = model(inputs, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    # Validation phase
    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    for batch in test_loader:
        with torch.no_grad():
            inputs, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            outputs = model(inputs, attention_mask)
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    report8 = classification_report(true_labels, predictions)
    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    
    # Early stopping logic
    if val_f1 < best_valid_loss:
        best_valid_loss = val_f1
        counter = 0
        #torch.save(model.state_dict(), 'best_model_bert2.pkl')  # Save the model
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            print(report8)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report8)  # Print the classification report after the last epoch


Epoch: 0, Training Time: 5067.567667484283 seconds, Testing Time: 537.2134208679199 seconds
Epoch: 1, Training Time: 9908.9524102211 seconds, Testing Time: 1077.0321941375732 seconds
Epoch: 2, Training Time: 11579.810034036636 seconds, Testing Time: 1047.3660085201263 seconds


## working on full dataframe

In [25]:
train_dataset = TextDataset(train_encodings3, train_df3['labels'].tolist())
test_dataset = TextDataset(test_encodings3, test_df3['labels'].tolist())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [26]:
#RNN Tuned MODEL FOR SAMPLE 3
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AdjustedRNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.4):
        super(AdjustedRNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)  # Use nn.RNN instead of nn.LSTM
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)  # Use rnn_out instead of lstm_out
        # Use the last hidden state for classification
        output = self.fc(self.dropout(rnn_out[:, -1, :]))
        return output

# Instantiate model
model = AdjustedRNNModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=50, dropout=0.4).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.00002)


# Assuming `train_labels` is a list containing all the labels in your training dataset
train_labels = [label for batch in train_loader for label in batch['labels'].tolist()]

# 1. Compute class distribution
class_counts = Counter(train_labels)

# 2. Calculate the weights
max_count = max(class_counts.values())
class_weights = {class_id: max_count / count for class_id, count in class_counts.items()}
weights = [class_weights[class_id] for class_id in sorted(class_weights.keys())]

weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)

# 3. Use the weights in the loss function
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

#criterion = nn.CrossEntropyLoss()  # use this if data is not imbalanced

# Add Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Train the model
best_f1 = 0.0  # for early stopping based on F1 score

for epoch in range(10):
    # Measure the time at the start of the epoch
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    val_loss = 0  # to compute average validation loss for scheduler

    for batch in test_loader:
        with torch.no_grad():
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    scheduler.step(val_loss / len(test_loader))  # scheduler step based on avg val loss
    report9 = classification_report(true_labels, predictions)

    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    
    # Implementing early stopping based on F1 score
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), '/workspace/outputs/simplernn_model2.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping")
            print(report9)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report9)  # Print the classification report after the last epoch



Epoch: 0, Training Time: 29.0020489692688 seconds, Testing Time: 3.5043396949768066 seconds
Epoch: 1, Training Time: 28.951069355010986 seconds, Testing Time: 3.5032193660736084 seconds
Epoch: 2, Training Time: 28.958198070526123 seconds, Testing Time: 3.505051851272583 seconds
Epoch: 3, Training Time: 28.955583095550537 seconds, Testing Time: 3.503267288208008 seconds
Epoch: 4, Training Time: 28.830087661743164 seconds, Testing Time: 3.5017645359039307 seconds
Epoch: 5, Training Time: 28.56299901008606 seconds, Testing Time: 3.5034854412078857 seconds
Epoch: 6, Training Time: 28.769779205322266 seconds, Testing Time: 3.5019419193267822 seconds
Epoch: 7, Training Time: 28.95014190673828 seconds, Testing Time: 3.50072979927063 seconds
Epoch: 8, Training Time: 28.95990514755249 seconds, Testing Time: 3.50467848777771 seconds
Epoch: 9, Training Time: 28.962185621261597 seconds, Testing Time: 3.506392240524292 seconds
Training completed.
              precision    recall  f1-score   suppor

In [27]:
##LSTM tined MODEL FOR SAMPLE 3
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class AdjustedLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.2):
        super(AdjustedLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Use the last hidden state for classification
        output = self.fc(self.dropout(lstm_out[:, -1, :]))
        return output

# Instantiate model
model = AdjustedLSTMModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=50, dropout=0.2).to(device)
optimizer = optim.AdamW(model.parameters(),lr=0.001)

# If your dataset is imbalanced, compute class weights
# weights = # Compute based on class distribution
# criterion = nn.CrossEntropyLoss(weight=weights)


# Assuming `train_labels` is a list containing all the labels in your training dataset
train_labels = [label for batch in train_loader for label in batch['labels'].tolist()]

# 1. Compute class distribution
class_counts = Counter(train_labels)

# 2. Calculate the weights
max_count = max(class_counts.values())
class_weights = {class_id: max_count / count for class_id, count in class_counts.items()}
weights = [class_weights[class_id] for class_id in sorted(class_weights.keys())]

weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)

# 3. Use the weights in the loss function
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

#criterion = nn.CrossEntropyLoss()  # use this if data is not imbalanced

# Add Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Train the model
best_f1 = 0.0  # for early stopping based on F1 score

for epoch in range(10):
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    val_loss = 0  # to compute average validation loss for scheduler

    for batch in test_loader:
        with torch.no_grad():
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    scheduler.step(val_loss / len(test_loader))  # scheduler step based on avg val loss
    report10 = classification_report(true_labels, predictions)

    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    
    # Implementing early stopping based on F1 score
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), '/workspace/outputs/lstm_model2.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping")
            print(report10)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report10)  # Print the classification report after the last epoch



Epoch: 0, Training Time: 87.47678446769714 seconds, Testing Time: 9.961474418640137 seconds
Epoch: 1, Training Time: 87.53887391090393 seconds, Testing Time: 9.962533473968506 seconds
Epoch: 2, Training Time: 87.53276824951172 seconds, Testing Time: 9.965057849884033 seconds
Epoch: 3, Training Time: 87.57044863700867 seconds, Testing Time: 9.96553635597229 seconds
Epoch: 4, Training Time: 87.55779552459717 seconds, Testing Time: 9.96853232383728 seconds
Epoch: 5, Training Time: 87.6807816028595 seconds, Testing Time: 9.964990854263306 seconds
Epoch: 6, Training Time: 87.55305337905884 seconds, Testing Time: 9.96816635131836 seconds
Epoch: 7, Training Time: 87.68879103660583 seconds, Testing Time: 9.970109701156616 seconds
Epoch: 8, Training Time: 87.74234414100647 seconds, Testing Time: 9.967970609664917 seconds
Epoch: 9, Training Time: 87.57272601127625 seconds, Testing Time: 9.968755960464478 seconds
Training completed.
              precision    recall  f1-score   support

         

In [28]:
##BiLSTM tuned MODEL FOR SAMPLE 3
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AdjustedBiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout=0.2):
        super(AdjustedBiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Add this line to store the hidden_dim as an instance variable
        self.hidden_dim = hidden_dim
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, output_dim)  # Multiply by 2 because it's bidirectional
        self.dropout = nn.Dropout(dropout)



    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Use the last hidden state for classification. We concatenate the last hidden state from both directions
        output = self.fc(self.dropout(torch.cat((lstm_out[:, -1, :self.hidden_dim], lstm_out[:, 0, self.hidden_dim:]), dim=1)))
        return output



# Instantiate model
model = AdjustedBiLSTMModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=50, dropout=0.2).to(device)

#model = AdjustedLSTMModel(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=256, output_dim=10, dropout=0.2).to(device)
optimizer = optim.AdamW(model.parameters(),lr=0.001)

# If your dataset is imbalanced, compute class weights
# weights = # Compute based on class distribution
# criterion = nn.CrossEntropyLoss(weight=weights)


# Assuming `train_labels` is a list containing all the labels in your training dataset
train_labels = [label for batch in train_loader for label in batch['labels'].tolist()]

# 1. Compute class distribution
class_counts = Counter(train_labels)

# 2. Calculate the weights
max_count = max(class_counts.values())
class_weights = {class_id: max_count / count for class_id, count in class_counts.items()}
weights = [class_weights[class_id] for class_id in sorted(class_weights.keys())]

weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)

# 3. Use the weights in the loss function
criterion = nn.CrossEntropyLoss(weight=weights_tensor)

#criterion = nn.CrossEntropyLoss()  # use this if data is not imbalanced

# Add Learning Rate Scheduler
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

# Train the model
best_f1 = 0.0  # for early stopping based on F1 score

for epoch in range(10):
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    val_loss = 0  # to compute average validation loss for scheduler

    for batch in test_loader:
        with torch.no_grad():
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    scheduler.step(val_loss / len(test_loader))  # scheduler step based on avg val loss
    report11 = classification_report(true_labels, predictions)

    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    
    # Implementing early stopping based on F1 score
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), '/workspace/outputs/bilstm_model2.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping")
            print(report11)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report11)  # Print the classification report after the last epoch



Epoch: 0, Training Time: 93.64659595489502 seconds, Testing Time: 10.138993740081787 seconds
Epoch: 1, Training Time: 93.73997354507446 seconds, Testing Time: 10.150611639022827 seconds
Epoch: 2, Training Time: 93.68841218948364 seconds, Testing Time: 10.130943059921265 seconds
Epoch: 3, Training Time: 93.75651669502258 seconds, Testing Time: 10.119308710098267 seconds
Epoch: 4, Training Time: 93.71237206459045 seconds, Testing Time: 10.127362251281738 seconds
Epoch: 5, Training Time: 93.73949456214905 seconds, Testing Time: 10.132631778717041 seconds
Epoch: 6, Training Time: 93.7659215927124 seconds, Testing Time: 10.141343832015991 seconds
Epoch: 7, Training Time: 93.7303318977356 seconds, Testing Time: 10.140599489212036 seconds
Epoch: 8, Training Time: 93.7246823310852 seconds, Testing Time: 10.1498441696167 seconds
Epoch: 9, Training Time: 93.70581793785095 seconds, Testing Time: 10.124911069869995 seconds
Training completed.
              precision    recall  f1-score   support



In [29]:
##BERT MODEL FOR SAMPLE 1
import torch.optim as optim
import warnings
import torch
from collections import Counter
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTForClassification(50).to(device)  # replace NUM_CLASSES with the number of unique labels in your dataset
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Early stopping parameters
patience = 3
best_valid_loss = float('inf')
counter = 0

for epoch in range(10):
    epoch_start_time = time.time()
    model.train()
    training_start_time = time.time()  # Start measuring time
    for batch in train_loader:
        optimizer.zero_grad()
        criterion = nn.CrossEntropyLoss()
        inputs, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
        outputs = model(inputs, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    training_end_time = time.time()  # End measuring time
    training_time = training_end_time - training_start_time  # Calculate elapsed time for training

    # Validation phase
    model.eval()
    testing_start_time = time.time()  # Start measuring time
    predictions, true_labels = [], []
    for batch in test_loader:
        with torch.no_grad():
            inputs, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            outputs = model(inputs, attention_mask)
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    testing_end_time = time.time()  # End measuring time
    testing_time = testing_end_time - testing_start_time  # Calculate elapsed time for testing

    val_f1 = f1_score(true_labels, predictions, average='weighted')
    report12 = classification_report(true_labels, predictions)
    print(f"Epoch: {epoch}, Training Time: {training_time} seconds, Testing Time: {testing_time} seconds")
    # Early stopping logic
    if val_f1 < best_valid_loss:
        best_valid_loss = val_f1
        counter = 0
        torch.save(model.state_dict(), '/workspace/outputs/Bert_model2.pth')  # Save the model
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            print(report12)
            break

else:  # This block will be executed if the for loop completes normally, i.e., if early stopping does not occur.
    print("Training completed.")
    print(report12)  # Print the classification report after the last epoch



Epoch: 0, Training Time: 6347.1341671943665 seconds, Testing Time: 602.6011655330658 seconds
Epoch: 1, Training Time: 6302.424460172653 seconds, Testing Time: 595.2031095027924 seconds
Epoch: 2, Training Time: 6327.777872562408 seconds, Testing Time: 594.8712770938873 seconds
Epoch: 3, Training Time: 6358.15244603157 seconds, Testing Time: 601.5906183719635 seconds
Early stopping triggered.
              precision    recall  f1-score   support

           0       0.73      0.84      0.78       160
           1       0.96      0.72      0.82       196
           2       0.90      0.75      0.82       186
           3       0.96      0.89      0.93       140
           4       0.67      0.79      0.73       278
           5       0.88      0.62      0.73       194
           6       0.75      0.73      0.74       100
           7       0.82      0.85      0.84      1969
           8       0.89      0.86      0.88       170
           9       0.86      0.82      0.84       132
          1