In [13]:
import pandas as pd
import xlrd
import os
import nltk
from tabulate import tabulate
from transformers import DataCollatorWithPadding, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\urasa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
dir_path = "..\\UKP_sentential_argument_mining\\data\\"
df = pd.read_csv(os.path.join(dir_path, "abortion.tsv"), sep="\t", header=0)

for file_name in os.listdir(dir_path):
    file_path = os.path.join(dir_path, file_name)
    if os.path.isfile(file_path) and file_name != "abortion.tsv":
        try:
            temp_df = pd.read_csv(file_path, sep="\t", header=0)
            df = pd.concat([df, temp_df], ignore_index=True)
        except pd.errors.ParserError as e:
            print(f"Error reading {file_path}: {e}")

In [3]:
df.drop(columns=["retrievedUrl", "archivedUrl", "sentenceHash"], inplace=True)
# in this case we don't care if the argument is against or in favor of the topic
df["is_argument"] = df["annotation"].replace({"Argument_against": 1, "Argument_for": 1, "NoArgument": 0})
df.drop(columns=["annotation"], inplace=True)

  df["is_argument"] = df["annotation"].replace({"Argument_against": 1, "Argument_for": 1, "NoArgument": 0})


In [None]:
claim_indicators = [
    "accordingly",
    "assersts",
    "as a result",
    "believe that",
    "think that",
    "consequently",
    "conclude that",
    "clearly",
    "demonstrates that",
    "entails",
    "follows that",
    "hence",
    "implies",
    "in short",
    "in conclusion",
    "indicates that",
    "it follows that",
    "it is highly probable that",
    "should be clear that",
    "proves that",
    "shows that",
    "so",
    "suggests ",
    "claims",
    "therefore",
    "thus",
    "to sum up",
    "we may deduce","argues"
    "affirms",
    "contends",
    "demonstrates",
    "evidently",
    "justifies",
    "maintains",
    "positively",
    "points out",
    "presumably",
    "purports",
    "reasons",
    "seems",
    "supposes",
    "verifies",
    "in essence",
    "fundamentally",
    "invariably",
    "predicts",
    "invariably",
    "predicts",
    "there can be no doubt that",
    "there is no doubt that",
    "there is no question that",
    "it is apparent that",
    "it is likely that",
    "it is plausible that",
    "it is probable that"
]

In [21]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  
device = torch.device('cpu')



In [43]:
# BERT with extra steps
import torch.nn as nn
from transformers import BertModel

class BERTX(nn.Module):
    def __init__(self, num_labels=2):
        super(BERTX, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # freeze BERT layerss
        for param in self.bert.parameters():
            param.requires_grad = False
        
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 256)  
        self.relu = nn.ReLU()  
        self.dropout = nn.Dropout(0.3)  # 30% of neurons randomly set to 0
        self.fc2 = nn.Linear(256, num_labels) 

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        # Take the pooled output from BERT
        pooled_output = outputs[1] #CLS token apparently represents a summary of the entire sequence
        x = self.fc1(pooled_output)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            return loss, logits
    
        return logits
    
model = BERTX(num_labels=2).to(device)

for param in model.bert.parameters():
    param.requires_grad = False



In [6]:
def tokenize_df(df, col_name):
    tokens = tokenizer(df[col_name].tolist(), padding=True, truncation=True, max_length=256, return_tensors="pt")
    df["input_ids"] = tokens["input_ids"].tolist()
    df["attention_mask"] = tokens["attention_mask"].tolist()
    return df

In [25]:

class TextDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(sentence, 
                                  truncation=True, 
                                  padding='max_length', 
                                  max_length=self.max_len, 
                                  return_tensors='pt')
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [8]:
df.head()

Unnamed: 0,topic,sentence,set,is_argument
0,abortion,This means it has to steer monetary policy to ...,val,0
1,abortion,Where did you get that ?,train,0
2,abortion,Nathanson later became pro-life .,val,0
3,abortion,In this case we may never do evil ( directly a...,train,1
4,abortion,With that I would like to give everyone someth...,test,0


In [26]:
train_df = df[df['set'] == 'train']
val_df = df[df['set'] == 'val']
test_df = df[df['set'] == 'test']

train_sentences = train_df['sentence'].tolist()
val_sentences = val_df['sentence'].tolist()
test_sentences = test_df['sentence'].tolist()

train_labels = train_df['is_argument'].tolist()
val_labels = val_df['is_argument'].tolist()
test_labels = test_df['is_argument'].tolist()

In [27]:
max_len = 128

train_dataset = TextDataset(train_sentences, train_labels, tokenizer, max_len)
val_dataset = TextDataset(val_sentences, val_labels, tokenizer, max_len)
test_dataset = TextDataset(test_sentences, test_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [28]:
# TIME TO TRAIN
for batch in train_loader:
    print(batch)

{'input_ids': tensor([[ 101, 2023, 7543,  ...,    0,    0,    0],
        [ 101, 2021, 2012,  ...,    0,    0,    0],
        [ 101, 2057, 2342,  ...,    0,    0,    0],
        ...,
        [ 101, 1998, 2076,  ...,    0,    0,    0],
        [ 101, 2044, 1996,  ...,    0,    0,    0],
        [ 101, 9553, 4482,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0])}
{'input_ids': tensor([[ 101, 1999, 2337,  ...,    0,    0,    0],
        [ 101, 2021, 2070,  ...,    0,    0,    0],
        [ 101, 2027, 2089,  ...,    0,    0,    0],
        ...,
        [ 101, 2122, 3463,  ...,    0,    0,    0],
        [ 101, 3572, 2073,  ...,    0,    0,    0],
        [ 101, 2947, 2009,  ...,    0,    0,    0]]), 'attention_ma

KeyboardInterrupt: 

In [29]:
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()



In [37]:
epochs = 3
model = model.to(device)

In [40]:
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        loss, outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}')

    # Validation
    model.eval()
    total_val_loss = 0
    val_preds, val_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            loss, outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_val_loss += loss.item()

            preds = torch.argmax(outputs, dim=-1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f'Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


Epoch 1/3, Training Loss: 0.7051
Epoch 1/3, Validation Loss: 0.7044, Validation Accuracy: 0.4497
Epoch 2/3, Training Loss: 0.7053
Epoch 2/3, Validation Loss: 0.7044, Validation Accuracy: 0.4497
Epoch 3/3, Training Loss: 0.7044
Epoch 3/3, Validation Loss: 0.7044, Validation Accuracy: 0.4497


AttributeError: 'BERTX' object has no attribute 'save_pretrained'

In [41]:
# Save the model's state_dict (parameters)
torch.save(model.state_dict(), './bertX_model.pth')

('./BERTX_TOKENIZER\\tokenizer_config.json',
 './BERTX_TOKENIZER\\special_tokens_map.json',
 './BERTX_TOKENIZER\\vocab.txt',
 './BERTX_TOKENIZER\\added_tokens.json')

In [44]:
# Test evaluation
bertX = BERTX(num_labels=2)
bertX.load_state_dict(torch.load('./bertX_model.pth'))
bertX.to(device)

bertX.eval()
test_preds, test_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = bertX(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs, dim=-1)

        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(test_labels, test_preds)
print(f'Test Accuracy: {test_accuracy:.4f}')

Test Accuracy: 0.4536
