### Mounting drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

In [None]:
# Make the Drive root the working directory so later relative paths resolve against it.
desired_directory = '/content/drive/MyDrive'

os.chdir(desired_directory)

# Read the working directory back to confirm the change took effect.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

Current Working Directory: /content/drive/MyDrive


In [None]:
cd NLP_ass_2_3

/content/drive/MyDrive/NLP_ass_2_3


### Preprocessing data

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Fetch the NLTK resources used below: tokenizer model, stop-word list and WordNet data.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

def _preprocessing_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    cached = getattr(_preprocessing_resources, '_cached', None)
    if cached is None:
        # Reading the stop-word corpus is comparatively slow, so do it once rather than
        # once per comment.
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _preprocessing_resources._cached = cached
    return cached


def preprocess_text(text):
    """Clean a raw comment and return its lemmatized, stop-word-free tokens.

    Steps, in order: strip `<...>` markup, replace every non-letter character with a
    space, lowercase, tokenize with NLTK, drop English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        The processed tokens (a list, not a joined string); callers iterate over it
        word by word.
    """
    stop_words, lemmatizer = _preprocessing_resources()

    # Remove markup first so tag names do not survive as ordinary words.
    text = re.sub(r'<.*?>', '', text)
    # Everything that is not an ASCII letter (digits, punctuation, emoji) becomes a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()

    tokens = nltk.word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Loading the model

In [None]:

# Suppress UserWarnings raised from the gensim module before gensim is imported.
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.keyedvectors import KeyedVectors

# Configure root logging at INFO level with a "time : level : message" format.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:

# Load pretrained word vectors stored in binary word2vec format into `model`.
# NOTE(review): the file name suggests the 300-dimensional GoogleNews vectors — confirm.
# NOTE(review): this absolute path repeats the project directory already entered above.
path = '/content/drive/MyDrive/NLP_ass_2_3/GoogleNews-vectors-negative300.bin.gz'
model = KeyedVectors.load_word2vec_format(path, binary=True)

### Input feature generation

In [None]:
import numpy as np
import pandas as pd
# Read the header-less, tab-separated training split: the first column is the comment
# text and the second column is the label string.
data = pd.read_csv('NLP_ass_train.tsv', delimiter='\t', header=None)
text_column = data.columns[0]
text_data = data[text_column].tolist()

# Encode labels as class ids: "normal" -> 0, "hatespeech" -> 1, anything else -> 2.
labels_column = data.columns[1]
labels_list = [
    0 if raw_label == "normal" else 1 if raw_label == "hatespeech" else 2
    for raw_label in data[labels_column].tolist()
]

def generate_input_features(comment, model):
    """Embed a comment as the average of its in-vocabulary word vectors.

    The comment is tokenized with `preprocess_text`; tokens that `model` does not
    contain are skipped. If no token is found in `model`, the all-zero vector of
    length `model.vector_size` is returned.
    """
    total = np.zeros(model.vector_size)
    hits = 0

    for token in preprocess_text(comment):
        if token not in model:
            continue
        total += model[token]
        hits += 1

    # Divide only when at least one vector was added, to avoid dividing by zero.
    if hits:
        total /= hits
    return total

# Embed every training comment and stack the resulting vectors into one array.
input_features = np.array(
    [generate_input_features(comment, model) for comment in text_data]
)
#input_features[1]

import torch
from torch import nn
from functools import reduce
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class CustomDataset(Dataset):
    """Pairs up two index-aligned containers as (input, target) samples.

    `transform`, when given, is called as `transform(x_item, y_item)` and must return
    a two-element result that replaces the pair.
    """

    def __init__(self, x, y, transform=None):
        self.x, self.y, self.transform = x, y, transform

    def __len__(self):
        # The dataset length follows the inputs; `y` is expected to be index-aligned.
        return len(self.x)

    def __getitem__(self, idx):
        features, target = self.x[idx], self.y[idx]
        if not self.transform:
            return features, target

        features, target = self.transform(features, target)
        return features, target


class DNN(nn.Module):
    """Fully connected ReLU network that outputs raw class logits.

    Parameters
    ----------
    dim_in : int
        Number of input features.
    dim_out : int
        Number of outputs (classes).
    width : int
        Size of every hidden layer.
    depth : int
        Number of hidden layers. With 0 the network is a single linear map from
        `dim_in` to `dim_out`.
    """

    def __init__(self, dim_in, dim_out, width, depth):
        super().__init__()
        self.depth = depth
        self.layers = nn.ModuleList(
            [nn.Linear(dim_in if i == 0 else width, width, bias=True) for i in range(self.depth)]
        )
        # Without hidden layers the output layer sees the raw input, so it must accept
        # dim_in features; otherwise it follows a hidden layer of size `width`.
        self.output_layer = nn.Linear(width if len(self.layers) > 0 else dim_in, dim_out)
        self.relu = nn.ReLU()
        # Kept as an attribute for compatibility; forward() never applies it.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply each hidden layer followed by ReLU, then the linear output layer."""
        for layer in self.layers:
            x = self.relu(layer(x))
        # No activation here: the logits are returned as-is.
        return self.output_layer(x)


def _encode_labels(raw_labels):
    """Map label strings to class ids: "normal" -> 0, "hatespeech" -> 1, anything else -> 2."""
    return [
        0 if raw_label == "normal" else 1 if raw_label == "hatespeech" else 2
        for raw_label in raw_labels
    ]


# ---- Test split: first column is the comment text, second column the label string ----
data2 = pd.read_csv('NLP_ass_test.tsv', delimiter='\t', header=None)
text2_column = data2.columns[0]
text2_data = data2[text2_column].tolist()

labels_column2 = data2.columns[1]
labels_list2 = _encode_labels(data2[labels_column2].tolist())

input_features2 = np.array(
    [generate_input_features(comment, model) for comment in text2_data]
)

# ---- Validation split ----
data3 = pd.read_csv('NLP_ass_valid.tsv', delimiter='\t', header=None)
text3_column = data3.columns[0]
text3_data = data3[text3_column].tolist()

labels_column3 = data3.columns[1]
# Each validation label is encoded from its own value. The previous loop tested the
# test-split labels in its "hatespeech" branch, which corrupted the validation targets.
labels_list3 = _encode_labels(data3[labels_column3].tolist())

input_features3 = np.array(
    [generate_input_features(comment, model) for comment in text3_data]
)


# Wrap the feature matrices and integer class labels of each split as tensors.
x_train = torch.tensor(input_features)
y_train = torch.tensor(labels_list)

x_test = torch.tensor(input_features2)
y_test = torch.tensor(labels_list2)

x_valid = torch.tensor(input_features3)
y_valid = torch.tensor(labels_list3)

# Mini-batch loaders; only the training loader is shuffled.
batch_size = 32

train_dataset = CustomDataset(x_train, y_train)
test_dataset = CustomDataset(x_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

dim_in = len(x_train[0])
num_classes = 3

# Three hidden layers of width 512, trained with SGD + momentum on cross-entropy.
model_dnn = DNN(dim_in, num_classes, 512, 3)
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.01
optimizer = torch.optim.SGD(model_dnn.parameters(), lr=learning_rate, momentum=0.9)

log_epochs = 10   # evaluate and log every `log_epochs` epochs
num_epochs = 500
log_weight = 10

model_dnn.to(device)

# float32 copies on the training device, built once instead of on every logging step.
x_train_eval = x_train.to(torch.float32).to(device)
y_train_eval = y_train.to(device)
x_valid_eval = x_valid.to(torch.float32).to(device)
y_valid_eval = y_valid.to(device)

Train_losses = []
Val_losses = []
# Best validation accuracy seen so far. It is initialised once, outside the loop, so a
# checkpoint is written only when the model actually improves.
best_valid_acc = 0
for epoch in range(num_epochs):
    model_dnn.train()
    for x_batch, y_batch in train_dataloader:
        x_batch = x_batch.to(torch.float32).to(device)
        y_batch = y_batch.to(device)
        optimizer.zero_grad()
        pred = model_dnn(x_batch)
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()

    if epoch % log_epochs == 0:
        # Evaluation needs no gradients; one forward pass per split is enough.
        model_dnn.eval()
        with torch.no_grad():
            loss_full = loss_fn(model_dnn(x_train_eval), y_train_eval)
            valid_logits = model_dnn(x_valid_eval)
            val_loss = loss_fn(valid_logits, y_valid_eval)
        Train_losses.append(loss_full.item())
        Val_losses.append(val_loss.item())

        valid_pred = torch.argmax(valid_logits, dim=1)
        # Boolean mask of correct predictions (True where prediction == label).
        zero_mask = (valid_pred == y_valid_eval)
        valid_acc = zero_mask.sum().item() / len(y_valid)

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            torch.save(model_dnn.state_dict(), 'best_model_weights.pt')
        print(f'Epoch {epoch} Train Loss {loss_full.item():.4f}  valid loss {val_loss.item():.4f}  valid acc {valid_acc}')

        # The full-train loss only changes on logging epochs, so the stopping rule is
        # checked here rather than against a stale value on every epoch.
        if loss_full.item() < 0.01:
            print(f'Early stopping at epoch {epoch} because loss is below 0.01')
            break

# Restore the checkpoint written during training. map_location keeps the load working
# when the weights were saved on a different device than the current one.
model_dnn.load_state_dict(torch.load('best_model_weights.pt', map_location=device))
model_dnn.eval()

# Inference only: no autograd graph is needed, and each split is passed through once.
with torch.no_grad():
    test_logits = model_dnn(x_test.to(torch.float32).to(device))
    test_loss_full = loss_fn(test_logits, y_test.to(device))
    train_pred = torch.argmax(model_dnn(x_train.to(torch.float32).to(device)), dim=1)
    test_pred = torch.argmax(test_logits, dim=1)
print(test_loss_full)

# Accuracy = number of correct predictions / number of samples.
zero_mask = (train_pred == y_train.to(device))
train_acc = zero_mask.sum().item() / len(y_train)
# Printed explicitly: only the last expression of a cell is displayed automatically.
print(f'train acc {train_acc}')

zero_mask = (test_pred == y_test.to(device))
test_acc = zero_mask.sum().item() / len(y_test)
test_acc

Epoch 0 Train Loss 1.0755  valid loss 1.0905  valid acc 0.40634755463059313
Epoch 10 Train Loss 0.7480  valid loss 1.2152  valid acc 0.4474505723204995
Epoch 20 Train Loss 0.5929  valid loss 1.2686  valid acc 0.4308012486992716
Epoch 30 Train Loss 0.3598  valid loss 1.4383  valid acc 0.45265348595213317
Epoch 40 Train Loss 0.2330  valid loss 1.8477  valid acc 0.4625390218522373
Epoch 50 Train Loss 0.1498  valid loss 2.5840  valid acc 0.41467221644120705
Epoch 60 Train Loss 0.1034  valid loss 3.5580  valid acc 0.41467221644120705
Epoch 70 Train Loss 0.0173  valid loss 4.3319  valid acc 0.43548387096774194
Epoch 80 Train Loss 0.0402  valid loss 3.3829  valid acc 0.441207075962539
Epoch 90 Train Loss 0.0081  valid loss 5.3853  valid acc 0.43184183142559834
Early stopping at epoch 90 because loss is below 0.01
tensor(3.4058, device='cuda:0', grad_fn=<NllLossBackward0>)


0.577962577962578

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 on the test split; predictions are moved to the CPU and converted
# to a NumPy array before being handed to scikit-learn.
f1_score(y_test.detach(), test_pred.detach().cpu().numpy(), average='macro')

0.5664371998723438

In [None]:

import csv

def find_common_sentences(file1_path, file2_path):
    """Count rows of the second TSV whose first field also occurs in the first TSV.

    Both files are read as tab-separated; rows with no fields are ignored. The count
    is per row of `file2_path`, so a sentence repeated there is counted each time.
    """
    with open(file1_path, 'r', newline='') as reference_file:
        reference_sentences = {
            fields[0]
            for fields in csv.reader(reference_file, delimiter='\t')
            if fields
        }

    with open(file2_path, 'r', newline='') as candidate_file:
        return sum(
            1
            for fields in csv.reader(candidate_file, delimiter='\t')
            if fields and fields[0] in reference_sentences
        )


# The training split is the reference; the test and validation splits are each checked
# against it for sentences that also appear in training.
file1_path = 'NLP_ass_train.tsv'
file2_path = 'NLP_ass_test.tsv'
file3_path = 'NLP_ass_valid.tsv'

common_count = find_common_sentences(file1_path, file2_path)
common_count1 = find_common_sentences(file1_path, file3_path)

print(
    f"Number of common sentences between test and train datasets: {common_count}",
    f"Number of common sentences between validation and train datasets: {common_count1}",
    sep="\n",
)


Number of common sentences between test and train datasets: 6
Number of common sentences between validation and train datasets: 3
