In [None]:
# Attach Google Drive to the Colab filesystem so the dataset folder is reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os

In [None]:
# Make the Drive root the working directory, then echo it back as a sanity check.
desired_directory = '/content/drive/MyDrive'
os.chdir(desired_directory)

current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

Current Working Directory: /content/drive/MyDrive


In [None]:
cd NLP_AUTUMN_ASSIGNMENT_DATA/NLP_AUTUMN_ASSIGNMENT_DATA

/content/drive/MyDrive/NLP_AUTUMN_ASSIGNMENT_DATA/NLP_AUTUMN_ASSIGNMENT_DATA


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from transformers import BertTokenizer, BertModel
from torch.utils.data import TensorDataset, random_split, DataLoader,Dataset
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup,AdamW
import torch
import torch.nn as nn


In [None]:
# Run on the GPU when the runtime exposes one, otherwise fall back to the CPU.
_backend = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_backend)
device

device(type='cuda')

In [None]:
# Headers assigned to the two columns of the header-less TSV files.
column_names = [
    "comment",
    "label",
]

In [None]:
def _read_split(filename):
    """Read one tab-separated, header-less split file using ``column_names``."""
    return pd.read_csv(filename, sep='\t', header=None, names=column_names)


train = _read_split('NLP_ass_train.tsv')
test = _read_split('NLP_ass_test.tsv')
valid = _read_split('NLP_ass_valid.tsv')

In [None]:
# Map the textual class labels to integer class ids.
label_mapping = {'hatespeech': 2, 'offensive': 1, 'normal': 0}


def _encode_labels(frame, split_name):
    """Return ``frame['label']`` encoded as int64 class ids.

    ``Series.map`` silently turns any value missing from ``label_mapping`` into
    NaN, so unknown labels are reported explicitly instead of propagating into
    the loss computation.

    Args:
        frame: DataFrame with a ``label`` column.
        split_name: Name of the split, used only in the error message.

    Returns:
        An int64 Series of class ids aligned with ``frame``.

    Raises:
        ValueError: If a label is neither a key of ``label_mapping`` nor an
            already-encoded class id.
    """
    labels = frame['label']
    # Re-running this cell must not wipe the labels: already-encoded ids pass through.
    if labels.isin(list(label_mapping.values())).all():
        return labels.astype('int64')

    encoded = labels.map(label_mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"{split_name}: labels not in label_mapping: {unknown}")
    return encoded.astype('int64')


train['label'] = _encode_labels(train, 'train')

test['label'] = _encode_labels(test, 'test')

valid['label'] = _encode_labels(valid, 'valid')

In [None]:
# Matches every character that is not an ASCII letter, digit or whitespace.
_NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')


def clean_text(text):
    """Lowercase ``text`` and strip everything except ASCII letters, digits and whitespace.

    Args:
        text: The raw comment. Missing values (``None`` or float NaN, which is
            what pandas produces for an empty cell) are treated as an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return ''
        text = str(text)
    return _NON_ALNUM_PATTERN.sub('', text.lower())

In [None]:
# Normalize the comment column of every split with clean_text.
for _split_frame in (train, valid, test):
    _split_frame['comment'] = _split_frame['comment'].apply(clean_text)

In [None]:
# Pull the comment text and the class label out of each split as arrays.
x_train, y_train = train['comment'].values, train['label'].values

# Test labels are kept as class ids (no one-hot encoding).
x_test, y_test = test['comment'].values, test['label'].values

x_valid, y_valid = valid['comment'].values, valid['label'].values

In [None]:

# Tokenizer matching the 'bert-base-cased' checkpoint.
# NOTE(review): clean_text lowercases every comment before tokenization, while
# this is the cased vocabulary -- confirm that pairing is intended.
TOKENIZER_CHECKPOINT = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class TextClassificationDataset(Dataset):
    """Dataset pairing tokenized texts with their labels.

    Every text is tokenized once, at construction time, with padding to
    ``max_len``, truncation enabled and PyTorch tensors requested.
    """

    def __init__(self, texts,labels,tokenizer,max_len):
        """Store the labels and tokenize each text with ``tokenizer``."""
        self.labels = list(labels)

        encoded_texts = []
        for raw_text in texts:
            encoded_texts.append(
                tokenizer(
                    raw_text,
                    padding='max_length',
                    max_length=max_len,
                    truncation=True,
                    return_tensors="pt",
                )
            )
        self.texts = encoded_texts

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label stored at position ``idx``."""
        return self.labels[idx]

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at position ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenizer_output, label)`` pair at position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)


In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a single linear classification head.

    Args:
        dropout: Dropout probability applied to the pooled BERT output.
        num_classes: Number of output logits (defaults to the notebook's 3 classes).
        model_name: Hugging Face checkpoint to load the encoder from.
    """

    def __init__(self, dropout=0.3, num_classes=3, model_name='bert-base-cased'):

        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config instead of hard-coding 768,
        # so a checkpoint with a different hidden size still works.
        self.fc1 = nn.Linear(self.bert.config.hidden_size, num_classes)
        # Not used in forward(); kept so the module's attributes stay unchanged.
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return unnormalized class logits for a batch.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT as ``attention_mask``.

        Returns:
            The output of the linear head applied to the dropped-out pooled
            output; no activation is applied here.
        """
        # With return_dict=False the encoder returns a tuple whose second item
        # is the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        return self.fc1(self.dropout(pooled_output))

In [None]:
# Sequence length and batch size shared by all three splits.
MAX_LEN = 50
BATCH_SIZE = 16

# Tokenize every split up front.
train_dataset = TextClassificationDataset(x_train, y_train, tokenizer, max_len=MAX_LEN)
valid_dataset = TextClassificationDataset(x_valid, y_valid, tokenizer, max_len=MAX_LEN)
test_dataset = TextClassificationDataset(x_test, y_test, tokenizer, max_len=MAX_LEN)

# Only the training loader shuffles; validation and test keep file order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
epochs = 10

# Model and optimizer.
model = BertClassifier()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# The scheduler is stepped once per batch, so its horizon is batches-per-epoch
# times the number of epochs; the rate decays linearly with no warmup.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Cross-entropy over the classifier's logits.
loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

In [None]:
# Move the model to the device
model = model.to(device)

# Highest number of correct validation predictions seen so far; drives checkpointing.
best_val_acc = 0

# Training loop
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    total_loss = 0
    total_val_loss = 0
    val_acc = 0

    # ---- Training ----
    # Switch modes once per phase rather than once per batch.
    model.train()
    for batch in train_dataloader:
        encodings, labels = batch
        # The dataset tokenizes one text at a time with return_tensors="pt", so
        # collated tensors carry an extra singleton dimension. Squeeze it from
        # both the ids and the mask so the encoder receives
        # (batch, seq_len) inputs, the shape BertModel documents.
        input_ids = encodings['input_ids'].squeeze(1).to(device)
        attention_mask = encodings['attention_mask'].squeeze(1).to(device)
        labels = labels.to(device).long()

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Update parameters, then advance the per-batch learning-rate schedule
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    print(f'Training loss: {total_loss / len(train_dataloader)}')

    # ---- Validation ----
    model.eval()
    with torch.no_grad():
        for batch in val_dataloader:
            encodings, labels = batch
            input_ids = encodings['input_ids'].squeeze(1).to(device)
            attention_mask = encodings['attention_mask'].squeeze(1).to(device)
            labels = labels.to(device).long()

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

            total_val_loss += loss.item()
            # Accumulate the count of correct predictions; normalized when printed.
            val_acc += (outputs.argmax(dim=1) == labels).sum().item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f'Validation loss: {avg_val_loss} val acc : {val_acc/len(valid_dataset)}')

    # Save the model whenever validation accuracy improves
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model_weights.pt')



Epoch 1/10
----------
Training loss: 0.8717484475421311
Validation loss: 0.7590122929781922 val acc : 0.6644120707596254
Epoch 2/10
----------
Training loss: 0.7084252090810986
Validation loss: 0.7363159149146277 val acc : 0.6815816857440167
Epoch 3/10
----------
Training loss: 0.6002207356902022
Validation loss: 0.7586286922624289 val acc : 0.6774193548387096
Epoch 4/10
----------
Training loss: 0.4852936806664521
Validation loss: 0.8045997521108832 val acc : 0.6727367325702394
Epoch 5/10
----------
Training loss: 0.3759387329127833
Validation loss: 0.9193296527936439 val acc : 0.6831425598335068
Epoch 6/10
----------
Training loss: 0.29252059688973636
Validation loss: 1.013591032413658 val acc : 0.6716961498439126
Epoch 7/10
----------
Training loss: 0.23082741054855366
Validation loss: 1.1374058489772407 val acc : 0.6711758584807492
Epoch 8/10
----------
Training loss: 0.19321571778130414
Validation loss: 1.1721505199920785 val acc : 0.6711758584807492
Epoch 9/10
----------
Training

In [None]:
# Load the best model weights. map_location lets a checkpoint that was saved
# from GPU tensors load on whatever device this session is using.
model.load_state_dict(torch.load('best_model_weights.pt', map_location=device))

# Test the model
model.eval()
total_test_loss = 0
test_acc = 0
pred = []  # per-batch predicted class ids (left on `device`)
true = []  # per-batch ground-truth labels, as yielded by the dataloader

with torch.no_grad():
    for batch in test_dataloader:
        encodings, labels = batch
        # Squeeze the singleton dimension from both ids and mask so the encoder
        # receives (batch, seq_len) inputs.
        input_ids = encodings['input_ids'].squeeze(1).to(device)
        attention_mask = encodings['attention_mask'].squeeze(1).to(device)
        device_labels = labels.to(device)

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, device_labels.long())

        batch_pred = outputs.argmax(dim=1)
        pred.append(batch_pred)
        # Append the dataloader's own tensor, not the device copy: a later cell
        # concatenates these and calls .numpy() directly.
        true.append(labels)

        total_test_loss += loss.item()
        test_acc += (batch_pred == device_labels).sum().item()

avg_test_loss = total_test_loss / len(test_dataloader)
print(f'Test loss: {avg_test_loss}  Test acc {test_acc/len(test_dataset)}')


Test loss: 0.9910401027064678  Test acc 0.6559251559251559


In [None]:
# Stitch the per-batch label tensors into one NumPy vector (replaces the earlier y_test).
_all_true = torch.cat(true, axis=0)
y_test = _all_true.numpy()

In [None]:
# Stitch the per-batch predictions together, then bring them to host memory as NumPy.
_all_pred = torch.cat(pred, axis=0)
test_pred = _all_pred.detach().cpu().numpy()

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 on the test split.
macro_f1 = f1_score(y_test, test_pred, average='macro')
macro_f1

0.6390040730481744