importing the required packages and modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import sklearn
from sklearn.model_selection import train_test_split
seed = 277241
random.seed(seed)
import nltk
import gensim
import gensim.downloader as api
from sklearn.metrics import classification_report
import torch
import torch.optim as optim
from nltk.tokenize import word_tokenize
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
import spacy
import string
from sklearn import metrics
import numpy as np
from torch import nn
from transformers import BertTokenizer, BertModel
from torch.optim import Adam
from tqdm import tqdm
import gc
import re

In [2]:
from nltk.corpus import stopwords# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
torch.cuda.empty_cache()#Clear CUDA cache to free up memory

In [4]:
use_cuda = torch.cuda.is_available()#Check if NVIDIA GPU is available
#If yes, use CUDA to perform computations

if use_cuda:
    print("GPU acceleration enabled")
else:
    print("GPU acceleration NOT enabled. If using Colab, have you changed the runtime type and selected GPU as the hardware accelerator?")

device = torch.device("cuda" if use_cuda else "cpu")
print(device)

GPU acceleration enabled
cuda


In [5]:
torch.manual_seed(seed)#Set seed to ensuring the code and outputs are reproducible
torch.backends.cudnn.deterministic = True

 **DATA READING AND PRE PROCESSING**

In [6]:
train_data1 = pd.read_csv('propaganda_train.tsv', sep='\t')#training data
print(f'No. of records in the training dataset:{train_data1.count()}')
train_data1

No. of records in the training dataset:label                2414
tagged_in_context    2414
dtype: int64


Unnamed: 0,label,tagged_in_context
0,not_propaganda,"No, <BOS> he <EOS> will not be confirmed."
1,not_propaganda,This declassification effort <BOS> won’t make ...
2,flag_waving,The Obama administration misled the <BOS> Amer...
3,not_propaganda,“It looks like we’re capturing the demise of t...
4,not_propaganda,"<BOS> Location: Westerville, Ohio <EOS>"
...,...,...
2409,not_propaganda,<BOS> We support and appreciate <EOS> your bus...
2410,not_propaganda,International Atomic Energy Agency (IAEA) Dire...
2411,not_propaganda,What has been done: there has been work on for...
2412,not_propaganda,This is <BOS> the law of gradualness not the g...


In [7]:
validation_data1 = pd.read_csv('propaganda_val.tsv', sep='\t')
print(f'No. of records in the validation dataset:{validation_data1.count()}')
validation_data1

No. of records in the validation dataset:label                580
tagged_in_context    580
dtype: int64


Unnamed: 0,label,tagged_in_context
0,not_propaganda,"On average, between 300 and 600 infections are..."
1,causal_oversimplification,Mostly because <BOS> the country would not las...
2,appeal_to_fear_prejudice,Lyndon Johnson <BOS> gets Earl Warren and Sen....
3,not_propaganda,<BOS> You <EOS> may opt out at anytime.
4,repetition,It must be exacted from him directly in order ...
...,...,...
575,not_propaganda,"NewsCatholic Church, <BOS> Family, Marriage <E..."
576,not_propaganda,"Remember our saying, modern day fairy <BOS> ta..."
577,not_propaganda,Why <BOS> not <EOS> open up to Iran with massi...
578,flag_waving,<BOS> He also sang an Islamic State fight song...


For binary classification, we do not need to specify the propaganda type instead we can make other column with the labels of propaganda and not_propaganda.

In [8]:
train_data1['binary_label'] = np.where(train_data1['label']!='not_propaganda', 'propaganda', train_data1['label'])
validation_data1['binary_label'] = np.where(validation_data1['label']!='not_propaganda', 'propaganda', validation_data1['label'])

In [9]:
train_data1.groupby('binary_label').count()

Unnamed: 0_level_0,label,tagged_in_context
binary_label,Unnamed: 1_level_1,Unnamed: 2_level_1
not_propaganda,1191,1191
propaganda,1223,1223


In [10]:
validation_data1.groupby('binary_label').count()

Unnamed: 0_level_0,label,tagged_in_context
binary_label,Unnamed: 1_level_1,Unnamed: 2_level_1
not_propaganda,301,301
propaganda,279,279


In [11]:
#Split the data into train and test sets.
train_df, test_df = train_test_split(train_data1, test_size=0.2, random_state=seed)
test_df

Unnamed: 0,label,tagged_in_context,binary_label
899,not_propaganda,Visit The Virginian-Pilot <BOS> at <EOS> pilot...,not_propaganda
1165,loaded_language,Somewhere in the “community standards” that Fa...,propaganda
337,repetition,"As a matter of fact, these <BOS> counterfeits ...",propaganda
272,appeal_to_fear_prejudice,<BOS> The West falls as it fails to see Europe...,propaganda
2295,not_propaganda,"In the following, we shall present to our read...",not_propaganda
...,...,...,...
1183,flag_waving,Hungary's Prime Minister Viktor Orban has been...,propaganda
1241,loaded_language,I do not know when Pope Benedict took these me...,propaganda
2047,appeal_to_fear_prejudice,"Of course, <BOS> if they did, then someone mig...",propaganda
846,"name_calling,labeling","In his 2005 book Memory and Identity, John Pau...",propaganda


 (Task 1)The training dataset is balanced when we consider the binary classification task. This is a binary classification task since we only have to predict if the text belongs to the class - 'propaganda' or 'not_propaganda'.

we splited the training dataset into a separate train and test dataset. The testing dataset will contain 20% of the data currently in the training class as done above.

 **Task -1 Binary Classification of propaganda text**

In [12]:
train_bi_classify = train_df[['tagged_in_context', 'binary_label']].copy()#selection of required columns
test_bi_classify = test_df[['tagged_in_context', 'binary_label']].copy()
validation_bi_classify = validation_data1[['tagged_in_context', 'binary_label']].copy()

In [13]:
binary_labels = {label:i for i,label in enumerate(train_bi_classify['binary_label'].sort_values().unique().tolist())}#labels for Binary Classification
binary_labels

{'not_propaganda': 0, 'propaganda': 1}

In [14]:
reverse_index_binary_label ={v:k for (k,v) in binary_labels.items()}
reverse_index_binary_label

{0: 'not_propaganda', 1: 'propaganda'}

We will build classifiers using two entirely different approaches in order to complete **task 1 binary
text classification**. They are as follows:

1) **Bi-LSTMs**(Bidirectional Long Short-Term Memory) are a type of recurrent neural network ar-
chitecture that can effectively model sequential data like text by capturing long-range dependen-
cies.They have shown strong performance on many natural language processing tasks like text clas-
sification, tagging, and text summarization.

2) A pretrained large language model **BERT** (Bidirectional Encoder Representations from Trans-
formers) will be used to generate a contextualised embedding, which will be used downstream to
perform text classification tasks by a neural network

**Task -1 Bi-LSTM**

In [15]:
# Preprocess data
def preprocess_text(text):
    # Remove <BOS> and <EOS> tokens
    text = text.replace('<BOS>', '').replace('<EOS>', '').strip()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and convert to lowercase
    filtered_tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    # Join the tokens back into a sentence
    processed_text = ' '.join(filtered_tokens)
    return processed_text

train_bi_classify['tagged_in_context'] = train_bi_classify['tagged_in_context'].apply(preprocess_text)
test_bi_classify['tagged_in_context'] = test_bi_classify['tagged_in_context'].apply(preprocess_text)


In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.callbacks import EarlyStopping


In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_bi_classify['tagged_in_context'])
X_train_seq = tokenizer.texts_to_sequences(train_bi_classify['tagged_in_context'])
X_train_pad = pad_sequences(X_train_seq, padding='post')

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_bi_classify['binary_label'])

# Define neural language model (Bi-LSTM)
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=X_train_pad.shape[1]),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(128)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 unit for binary classification
])

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2, callbacks=[EarlyStopping(patience=3)])

# Evaluate model on validation set
X_val_seq = tokenizer.texts_to_sequences(validation_bi_classify['tagged_in_context'])
X_val_pad = pad_sequences(X_val_seq, maxlen=X_train_pad.shape[1], padding='post')
y_val = label_encoder.transform(validation_bi_classify['binary_label'])

loss, accuracy = model.evaluate(X_val_pad, y_val)
print(f'Validation Loss: {loss}, Validation Accuracy: {accuracy}')

# Save the Bi-LSTM Binary classifier
#model.save("bi_lstm_binary_classifier.h5")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Validation Loss: 1.2572959661483765, Validation Accuracy: 0.6362069249153137


In [18]:
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report

# Load the trained model
#model = load_model("bi_lstm_binary_classifier.h5")

# Evaluate model on validation set
predictions = model.predict(X_val_pad)
y_pred = (predictions > 0.5).astype(int)  # Convert probabilities to binary predictions

# Generate classification report
report = classification_report(y_val, y_pred, target_names=label_encoder.classes_)
print(report)


                precision    recall  f1-score   support

not_propaganda       0.63      0.73      0.68       301
    propaganda       0.65      0.53      0.58       279

      accuracy                           0.64       580
     macro avg       0.64      0.63      0.63       580
  weighted avg       0.64      0.64      0.63       580



**Task-1 BERT**

In [19]:
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')


class Dataset(torch.utils.data.Dataset):#Took from Advanced NLP Lab 10 Solutions Notebook
    def __init__(self, df, label_dict, column='tagged_in_context'):#This class will defines a Dataset that will be used to load the prepare the data for training, validation and testing
        self.labels = [label_dict[label] for label in df['label']]
        self.texts = [tokenizer.encode_plus(text.lower(), padding='max_length', max_length=512, truncation=True, return_tensors="pt") for text in df[column]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [20]:
def prepare_inputs(input1, label, device):
    label = label.to(device)
    mask = input1['attention_mask'].to(device) #it prepares the input for classification task
    input_id = input1['input_ids'].squeeze(1).to(device)
    return (input_id, mask, label)

In [21]:
class BertBinaryClassifier(nn.Module):# Took from the Advanced NLP Lab 10 Solutions Notebook
    """
    Neural network structure for binary classification, adapted from Advanced NLP Week 6 Lab solutions.

    Args:
        dropout (float): Dropout probability to prevent overfitting.
        num_classes (int): Number of output classes.
    """

    def __init__(self, dropout=0.5, num_classes=2):

        super(BertBinaryClassifier, self).__init__()

        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Dropout layer to randomly deactivate neurons
        self.dropout = nn.Dropout(dropout)

        # Linear layer for classification
        self.linear = nn.Linear(768, num_classes)

        # ReLU activation function
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """
        Forward pass of the neural network.

        Args:
            input_id (torch.Tensor): Input tensor containing input IDs.
            mask (torch.Tensor): Input tensor containing attention mask.

        Returns:
            torch.Tensor: Output tensor after passing through the network.
        """

        # Get BERT embeddings
        last_hidden_layer, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)

        # Apply dropout
        dropout_output = self.dropout(pooled_output)

        # Linear layer for classification
        linear_output = self.linear(dropout_output)

        # Apply ReLU activation
        final_layer = self.relu(linear_output)

        return final_layer


In [22]:
def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):# Took from the Advanced NLP Lab 6 Solutions Notebook
    """
    Function to train the BERT Binary Classifier.

    Args:
        model (torch.nn.Module): Instance of the model class to be used for training.
        train_data (torch.utils.data.Dataset): Training dataset.
        val_data (torch.utils.data.Dataset): Validation dataset.
        learning_rate (float): Learning rate for optimization.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training and validation dataloaders.

    Returns:
        None. Prints training and validation loss/accuracy for each epoch.
    """
    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        model.train()
        for train_input, train_label in tqdm(train_dataloader):

            input_id, mask, train_label = prepare_inputs(train_input, train_label, device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0
        model.eval()
        with torch.no_grad():
            for val_input, val_label in val_dataloader:

                input_id, mask, val_label = prepare_inputs(val_input, val_label, device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label.long())

                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        print(f'Epoch: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data):.3f} | Train Accuracy: {total_acc_train / len(train_data):.3f}')
        print(f'Val Loss: {total_loss_val / len(val_data):.3f} | Val Accuracy: {total_acc_val / len(val_data):.3f}')


In [23]:
batch_size = 4
def evaluate(model, test_dataset, batch_size=4):# Took from the Advanced NLP Lab 10 Solutions Notebook
    """
    Function to evaluate the model on unseen test data.

    Args:
        model (torch.nn.Module): Instance of the model class to be evaluated.
        test_dataset (torch.utils.data.Dataset): Test dataset.
        batch_size (int): Batch size for evaluation dataloader.

    Returns:
        predictions (list): List of tensors containing model predictions for each data point in the test dataset.
        acc (float): Total accuracy of the model.
    """
    model.eval()
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():
        count = 0
        predictions = []
        for test_input, test_label in tqdm(test_dataloader):
            count += batch_size
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            predictions.append(output.argmax(dim=1))  # save the prediction for further analysis
            acc = (output.argmax(dim=1) == test_label).sum().item()

            total_acc_test += acc
            if count % 100 == 0:
                print(f'Accuracy so far = {total_acc_test / count: .3f}')

    print(f'Test accuracy: {total_acc_test / len(test_dataset): .3f}')
    return predictions, (total_acc_test / len(test_dataset))


In [24]:
train_bert_classify, test_bert_classify = train_test_split(train_data1, test_size=0.2, random_state=seed )
train_bert_classify = train_bert_classify[['tagged_in_context', 'binary_label']].copy()
test_bert_classify = test_bert_classify[['tagged_in_context', 'binary_label']].copy()
valid_bert_classify = validation_data1[['tagged_in_context', 'binary_label']].copy()

train_bert_classify.rename({'binary_label': 'label'}, axis=1, inplace=True)
test_bert_classify.rename({'binary_label': 'label'}, axis=1, inplace=True)
valid_bert_classify.rename({'binary_label': 'label'}, axis=1, inplace=True)

In [25]:
bert_train_binary = Dataset(train_bert_classify, binary_labels)
bert_test_binary = Dataset(test_bert_classify, binary_labels)
bert_validation_binary = Dataset(valid_bert_classify, binary_labels)

In [26]:
EPOCHS=1
bert_binary_classifier=BertBinaryClassifier(num_classes=len(binary_labels.keys()))
LR=1e-05

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [27]:
predictions, acc=evaluate(bert_binary_classifier, bert_test_binary, 1)

 21%|██▏       | 103/483 [00:05<00:13, 28.90it/s]

Accuracy so far =  0.550


 42%|████▏     | 205/483 [00:08<00:09, 28.66it/s]

Accuracy so far =  0.515


 63%|██████▎   | 304/483 [00:12<00:06, 28.17it/s]

Accuracy so far =  0.480


 83%|████████▎ | 403/483 [00:15<00:02, 28.39it/s]

Accuracy so far =  0.485


100%|██████████| 483/483 [00:18<00:00, 25.79it/s]

Test accuracy:  0.501





In [28]:
predicted_labels = [ reverse_index_binary_label[y.item()] for y in predictions]


print(classification_report(test_bert_classify['label'], predicted_labels))

                precision    recall  f1-score   support

not_propaganda       0.33      0.02      0.03       237
    propaganda       0.51      0.97      0.66       246

      accuracy                           0.50       483
     macro avg       0.42      0.49      0.35       483
  weighted avg       0.42      0.50      0.35       483



**Task-2 Multiclass Classification**

In [29]:
train_multiclass_fil = train_data1[train_data1['label']!='not_propaganda']
test_multiclass_fil = test_df[test_df['label']!='not_propaganda']
validation_multiclass_fil = validation_data1[validation_data1['label']!='not_propaganda']

In [30]:
train_multiclass_fil.groupby('label').count()

Unnamed: 0_level_0,tagged_in_context,binary_label
label,Unnamed: 1_level_1,Unnamed: 2_level_1
appeal_to_fear_prejudice,151,151
causal_oversimplification,158,158
doubt,144,144
"exaggeration,minimisation",164,164
flag_waving,148,148
loaded_language,154,154
"name_calling,labeling",157,157
repetition,147,147


In [31]:
multiclass_labels = {label:i for i,label in enumerate(train_multiclass_fil['label'].sort_values().unique().tolist())}#multiclass Classification labels
multiclass_labels

{'appeal_to_fear_prejudice': 0,
 'causal_oversimplification': 1,
 'doubt': 2,
 'exaggeration,minimisation': 3,
 'flag_waving': 4,
 'loaded_language': 5,
 'name_calling,labeling': 6,
 'repetition': 7}

In [32]:
reverse_index_multiclass_label ={v:k for (k,v) in multiclass_labels.items()}
reverse_index_multiclass_label

{0: 'appeal_to_fear_prejudice',
 1: 'causal_oversimplification',
 2: 'doubt',
 3: 'exaggeration,minimisation',
 4: 'flag_waving',
 5: 'loaded_language',
 6: 'name_calling,labeling',
 7: 'repetition'}

In [33]:
train_multiclass = train_multiclass_fil[['label','tagged_in_context']].copy()
test_multiclass = test_multiclass_fil[['label','tagged_in_context']].copy()
validation_multiclass = validation_multiclass_fil[['label','tagged_in_context']].copy()

In [34]:
def spanreturn(s):
  result = re.search('<BOS>(.*)<EOS>', s)
  return(result.group(1))

train_multiclass['propaganda'] =train_multiclass['tagged_in_context'].apply(spanreturn)
test_multiclass['propaganda'] = test_multiclass['tagged_in_context'].apply(spanreturn)
validation_multiclass['propaganda'] = validation_multiclass['tagged_in_context'].apply(spanreturn)
train_multiclass

Unnamed: 0,label,tagged_in_context,propaganda
2,flag_waving,The Obama administration misled the <BOS> Amer...,American people
5,loaded_language,"Hitler <BOS> annihilated <EOS> 400,000 Germans...",annihilated
8,doubt,"As noted above, at this point literally every ...",so-called evidence
10,"name_calling,labeling",His account was suspended for violating Twitte...,hateful conduct
12,appeal_to_fear_prejudice,A couple of seemingly unrelated events this pa...,point to Iran’s positioning itself for more a...
...,...,...,...
2403,repetition,You might think that would have cured him of f...,Nazi
2405,"exaggeration,minimisation",“I would like to take this opportunity to make...,absolutely no place for anti-Semitism
2406,flag_waving,Trump To Jeff Sessions: Put An End to Russia P...,Prosecutors Doing Mueller’s ‘Dirty Work Are A...
2407,causal_oversimplification,<BOS> Neither the Democrat leadership nor the ...,Neither the Democrat leadership nor the Democ...


**BERT-for multiclass classification**

In [35]:
train_data_multi = Dataset(train_multiclass, multiclass_labels)
test_data_multi = Dataset(test_multiclass, multiclass_labels)
validation_data_multi = Dataset(validation_multiclass, multiclass_labels)

In [36]:
class BertMulticlassClassifier(nn.Module):
    """
    Neural network structure for multiclass classification, adapted from Advanced NLP Week 10 Lab solutions.

    Args:
        num_classes (int): Number of output classes.
    """
    def __init__(self, num_classes=8):
        super(BertMulticlassClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(768, 400)
        self.fc2 = nn.Linear(400, 100)
        self.fc3 = nn.Linear(100, num_classes)
        self.leakyrelu = nn.LeakyReLU(0.1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """
        Forward pass of the neural network.

        Args:
            input_ids (torch.Tensor): Input tensor containing input IDs.
            attention_mask (torch.Tensor): Input tensor containing attention mask.

        Returns:
            torch.Tensor: Output tensor after passing through the network.
        """
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        x = self.fc1(pooled_output)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.leakyrelu(x)
        final_output = self.softmax(x)

        return final_output


In [37]:
EPOCHS=3
LR=1e-6
model=BertMulticlassClassifier(num_classes=len(multiclass_labels.keys()))

In [38]:
train(model,train_data_multi,validation_data_multi,LR,EPOCHS, 4)

100%|██████████| 306/306 [01:54<00:00,  2.67it/s]


Epoch: 1 | Train Loss: 0.520 | Train Accuracy: 0.112
Val Loss: 1.043 | Val Accuracy: 0.176


100%|██████████| 306/306 [01:53<00:00,  2.70it/s]


Epoch: 2 | Train Loss: 0.520 | Train Accuracy: 0.150
Val Loss: 1.043 | Val Accuracy: 0.197


100%|██████████| 306/306 [01:53<00:00,  2.70it/s]


Epoch: 3 | Train Loss: 0.520 | Train Accuracy: 0.170
Val Loss: 1.042 | Val Accuracy: 0.165


In [43]:
predictions, acc=evaluate(model, test_data_multi)

 44%|████▍     | 26/59 [00:03<00:04,  6.80it/s]

Accuracy so far =  0.220


 86%|████████▋ | 51/59 [00:07<00:01,  6.61it/s]

Accuracy so far =  0.210


100%|██████████| 59/59 [00:08<00:00,  6.78it/s]

Test accuracy:  0.226





**GloVe for Multiclass Classification**

In [39]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [40]:
wordvec = api.load('glove-wiki-gigaword-300')



preprocessing

In [41]:
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation


def sentenceVector(sentence):
    """
    Function to calculate the vector representation of a given sentence using pre-trained word embeddings.

    Args:
        sentence (list): List of words in the sentence.

    Returns:
        numpy.ndarray: Vector representation of the sentence.
    """
    vector_size = wordvec.vector_size
    vector_result = np.zeros(vector_size)
    counter = 1
    for word in sentence:
        if word in wordvec:
            counter += 1
            vector_result += wordvec[word]
    vector_result /= counter
    return vector_result

def spacyTokenizer(sentence):
    """
    Function to tokenize, lemmatize, and filter out stopwords and punctuations from a given sentence.

    Args:
        sentence (str): Input sentence.

    Returns:
        list: Tokenized, lemmatized, and filtered list of tokens.
    """
    # Load the pre-trained spaCy model
    doc = nlp(sentence)

    # Lemmatize each token and convert each token into lowercase
    lemmatized_tokens = [word.lemma_.lower().strip() for word in doc]

    # Remove stop words and punctuations
    filtered_tokens = [word for word in lemmatized_tokens if word not in stop_words and word not in punctuations]

    # Return preprocessed list of tokens
    return filtered_tokens


In [42]:
class GloveEmbeddedMultiClassDataset(torch.utils.data.Dataset):
    """
    Dataset class for loading and preparing data for training, validation, and testing.

    Args:
        df (pandas.DataFrame): DataFrame containing the dataset.
        label_dict (dict): Dictionary mapping class labels to numerical indices.
        context_column (str): Name of the column containing context texts.
        span_column (str): Name of the column containing span texts.
    """

    def __init__(self, df, label_dict, context_column='tagged_in_context', span_column='propaganda'):
        self.labels = [label_dict[label] for label in df['label']]
        self.context_texts = [torch.from_numpy(np.array(sentenceVector(spacyTokenizer(text))).astype(np.float32)) for text in df[context_column]]
        self.span_texts = [torch.from_numpy(np.array(sentenceVector(spacyTokenizer(text))).astype(np.float32)) for text in df[span_column]]

    def classes(self):
        """
        Get the list of class labels.

        Returns:
            list: List of class labels.
        """
        return self.labels

    def __len__(self):
        """
        Get the total number of samples in the dataset.

        Returns:
            int: Total number of samples.
        """
        return len(self.labels)

    def get_batch_labels(self, idx):
        """
        Get batch labels for the given index.

        Args:
            idx (int): Index of the batch.

        Returns:
            numpy.ndarray: Batch labels.
        """
        return np.array(self.labels[idx])

    def get_batch_context_texts(self, idx):
        """
        Get batch context texts for the given index.

        Args:
            idx (int): Index of the batch.

        Returns:
            torch.Tensor: Batch context texts.
        """
        return self.context_texts[idx]

    def get_batch_span_texts(self, idx):
        """
        Get batch span texts for the given index.

        Args:
            idx (int): Index of the batch.

        Returns:
            torch.Tensor: Batch span texts.
        """
        return self.span_texts[idx]

    def __getitem__(self, idx):
        """
        Get the item from the dataset for the given index.

        Args:
            idx (int): Index of the item.

        Returns:
            tuple: Tuple containing batch context texts, batch span texts, and batch labels.
        """
        batch_context_texts = self.get_batch_context_texts(idx)
        batch_span_texts = self.get_batch_span_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_context_texts, batch_span_texts, batch_y


In [43]:
glove_multi_training = GloveEmbeddedMultiClassDataset(train_multiclass, multiclass_labels)
glove_multi_testing = GloveEmbeddedMultiClassDataset(test_multiclass, multiclass_labels)
glove_multi_validation = GloveEmbeddedMultiClassDataset(validation_multiclass, multiclass_labels)

In [44]:
class GloveMultiClassifierNet(nn.Module):
    """
    Neural network structure for multi-class classification, adapted from Advanced NLP Lab solutions.

    Args:
        input_size (int): Size of the input data.
        hidden_size (int): Size of the hidden layers.
        output_size (int): Number of output classes.
        dropout (float): Dropout rate to prevent overfitting.
    """
    def __init__(self, input_size, hidden_size, output_size, dropout=0.3):
        super(GloveMultiClassifierNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, output_size)
        self.dropout = nn.Dropout(dropout)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, inten, outen):
        """
        Forward pass of the neural network.

        Args:
            inten (torch.Tensor): Input tensor for feature 1.
            outen (torch.Tensor): Input tensor for feature 2.

        Returns:
            torch.Tensor: Output tensor after passing through the network.
        """
        x = torch.cat((inten, outen), dim=1)
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.fc4(x)
        x = nn.functional.softmax(x, dim=1)
        return x


In [45]:
def train_GloveMultiClassClassifier(model, train_data, val_data, learning_rate, epochs, batch_size=4):
    """
    Train the GloVe Multi-Class Classifier model.

    Args:
        model (torch.nn.Module): The model instance to be used for training.
        train_data (torch.utils.data.Dataset): The training dataset.
        val_data (torch.utils.data.Dataset): The validation dataset.
        learning_rate (float): The learning rate for optimizer.
        epochs (int): Number of epochs for training.
        batch_size (int, optional): Batch size for training. Defaults to 4.

    Returns:
        None
    """
    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=4)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        model.train()
        for tagged_context, span_text, train_label in tqdm(train_dataloader):
            tagged_context = tagged_context.cuda()
            span_text = span_text.cuda()
            train_label = train_label.cuda()
            output = model(tagged_context, span_text)

            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0
        model.eval()
        with torch.no_grad():
            for val_tagged_context, val_span_text, val_label in val_dataloader:

                val_tagged_context = val_tagged_context.cuda()
                val_span_text = val_span_text.cuda()
                val_label = val_label.cuda()

                output = model(val_tagged_context, val_span_text)

                batch_loss = criterion(output, val_label.long())

                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        print(f'Epochs: {epoch_num+1} | Train Loss: {total_loss_train / len(train_data):.3f} | Train Accuracy: {total_acc_train/len(train_data):.3f}')
        print(f'Val loss: {total_loss_val/len(val_data):.3f} | Val Accuracy: {total_acc_val / len(val_data):.3f}')


In [46]:
def evaluate_GloveMultiClassClassifier(model, test_dataset, batch_size=4):
    """
    Evaluate the GloVe Multi-Class Classifier model on the test dataset.

    Args:
        model (torch.nn.Module): The instance of the model class for evaluation.
        test_dataset (torch.utils.data.Dataset): The test dataset.
        batch_size (int, optional): Batch size for evaluation. Defaults to 4.

    Returns:
        predictions (list): An array of tensors containing the predictions of the model for each data point in the test dataset.
        accuracy (float): The total accuracy of the model.
    """
    model.eval()
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():
        count = 0
        predictions = []
        for test_context, test_span, test_label in tqdm(test_dataloader):
            count += batch_size
            test_context = test_context.to(device)
            test_span = test_span.to(device)
            test_label = test_label.to(device)
            output = model(test_context, test_span)
            predictions.append(output.argmax(dim=1))  # Save the prediction for further analysis
            acc = (output.argmax(dim=1) == test_label).sum().item()

            total_acc_test += acc
            if count % 100 == 0:
                print(f'Accuracy so far = {total_acc_test / count:.3f}')

    print(f'Test accuracy: {total_acc_test / len(test_dataset):.3f}')
    return predictions, (total_acc_test / len(test_dataset))


In [47]:
INPUT_SIZE = 600
HIDDEN_SIZE = 300
OUTPUT_SIZE = len(multiclass_labels.keys())


EPOCHS=20
LR=1e-03
gloveMulticlassClassifier = GloveMultiClassifierNet(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)

In [48]:
train_GloveMultiClassClassifier(gloveMulticlassClassifier,glove_multi_training,glove_multi_validation,LR,EPOCHS)

100%|██████████| 306/306 [00:02<00:00, 132.73it/s]


Epochs: 1 | Train Loss: 0.504 | Train Accuracy: 0.232
Val loss: 0.484 | Val Accuracy: 0.376


100%|██████████| 306/306 [00:01<00:00, 270.23it/s]


Epochs: 2 | Train Loss: 0.486 | Train Accuracy: 0.336
Val loss: 0.476 | Val Accuracy: 0.373


100%|██████████| 306/306 [00:00<00:00, 358.77it/s]


Epochs: 3 | Train Loss: 0.476 | Train Accuracy: 0.375
Val loss: 0.467 | Val Accuracy: 0.412


100%|██████████| 306/306 [00:00<00:00, 378.13it/s]


Epochs: 4 | Train Loss: 0.467 | Train Accuracy: 0.409
Val loss: 0.467 | Val Accuracy: 0.409


100%|██████████| 306/306 [00:00<00:00, 362.65it/s]


Epochs: 5 | Train Loss: 0.465 | Train Accuracy: 0.415
Val loss: 0.464 | Val Accuracy: 0.427


100%|██████████| 306/306 [00:00<00:00, 373.42it/s]


Epochs: 6 | Train Loss: 0.460 | Train Accuracy: 0.431
Val loss: 0.465 | Val Accuracy: 0.423


100%|██████████| 306/306 [00:00<00:00, 388.10it/s]


Epochs: 7 | Train Loss: 0.460 | Train Accuracy: 0.438
Val loss: 0.466 | Val Accuracy: 0.412


100%|██████████| 306/306 [00:00<00:00, 376.30it/s]


Epochs: 8 | Train Loss: 0.453 | Train Accuracy: 0.467
Val loss: 0.460 | Val Accuracy: 0.444


100%|██████████| 306/306 [00:00<00:00, 380.41it/s]


Epochs: 9 | Train Loss: 0.450 | Train Accuracy: 0.466
Val loss: 0.461 | Val Accuracy: 0.437


100%|██████████| 306/306 [00:00<00:00, 374.20it/s]


Epochs: 10 | Train Loss: 0.451 | Train Accuracy: 0.472
Val loss: 0.461 | Val Accuracy: 0.437


100%|██████████| 306/306 [00:00<00:00, 375.19it/s]


Epochs: 11 | Train Loss: 0.447 | Train Accuracy: 0.486
Val loss: 0.469 | Val Accuracy: 0.401


100%|██████████| 306/306 [00:00<00:00, 377.51it/s]


Epochs: 12 | Train Loss: 0.446 | Train Accuracy: 0.488
Val loss: 0.461 | Val Accuracy: 0.430


100%|██████████| 306/306 [00:00<00:00, 384.80it/s]


Epochs: 13 | Train Loss: 0.441 | Train Accuracy: 0.509
Val loss: 0.464 | Val Accuracy: 0.412


100%|██████████| 306/306 [00:00<00:00, 322.18it/s]


Epochs: 14 | Train Loss: 0.443 | Train Accuracy: 0.502
Val loss: 0.459 | Val Accuracy: 0.409


100%|██████████| 306/306 [00:01<00:00, 277.17it/s]


Epochs: 15 | Train Loss: 0.438 | Train Accuracy: 0.527
Val loss: 0.466 | Val Accuracy: 0.405


100%|██████████| 306/306 [00:01<00:00, 272.07it/s]


Epochs: 16 | Train Loss: 0.441 | Train Accuracy: 0.515
Val loss: 0.464 | Val Accuracy: 0.427


100%|██████████| 306/306 [00:00<00:00, 355.63it/s]


Epochs: 17 | Train Loss: 0.438 | Train Accuracy: 0.521
Val loss: 0.460 | Val Accuracy: 0.434


100%|██████████| 306/306 [00:00<00:00, 367.93it/s]


Epochs: 18 | Train Loss: 0.437 | Train Accuracy: 0.523
Val loss: 0.462 | Val Accuracy: 0.427


100%|██████████| 306/306 [00:00<00:00, 379.06it/s]


Epochs: 19 | Train Loss: 0.434 | Train Accuracy: 0.544
Val loss: 0.452 | Val Accuracy: 0.480


100%|██████████| 306/306 [00:00<00:00, 374.67it/s]

Epochs: 20 | Train Loss: 0.435 | Train Accuracy: 0.533
Val loss: 0.459 | Val Accuracy: 0.430





In [49]:
predictions, acc = evaluate_GloveMultiClassClassifier(gloveMulticlassClassifier, glove_multi_testing,1)

100%|██████████| 246/246 [00:00<00:00, 1170.02it/s]

Accuracy so far = 0.710
Accuracy so far = 0.705
Test accuracy: 0.720





In [50]:
predicted_labels = [ reverse_index_multiclass_label[y.item()] for y in predictions]


print(classification_report(test_multiclass['label'], predicted_labels))

                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.46      0.69      0.55        32
causal_oversimplification       0.79      0.72      0.75        32
                    doubt       0.69      0.81      0.75        27
exaggeration,minimisation       0.82      0.77      0.79        30
              flag_waving       0.83      0.83      0.83        24
          loaded_language       0.90      0.74      0.81        38
    name_calling,labeling       0.71      0.83      0.77        30
               repetition       0.74      0.42      0.54        33

                 accuracy                           0.72       246
                macro avg       0.74      0.73      0.72       246
             weighted avg       0.75      0.72      0.72       246

