In [1]:

# Install dependencies only if allowed (optional)
# %pip install transformers datasets scikit-learn pandas torch

import torch
import pandas as pd
from datasets import Dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load and clean data
df = pd.read_csv("hasty_generalization_data.csv")
df = df.dropna(subset=["comment_text", "suspected_fallacy"])


In [3]:

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.rename_column("suspected_fallacy", "labels")

# Encode labels
label_list = dataset.unique("labels")
class_label = ClassLabel(names=label_list)
dataset = dataset.cast_column("labels", class_label)


Casting the dataset: 100%|██████████| 99/99 [00:00<00:00, 6046.83 examples/s]


In [4]:

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_fn(example):
    return tokenizer(example["comment_text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_fn, batched=True)


Map: 100%|██████████| 99/99 [00:00<00:00, 3547.51 examples/s]


In [5]:

# Train/test split
split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split["train"]
eval_dataset = split["test"]

# Set PyTorch format
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


In [6]:

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(label_list)
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:

# Training setup
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=8)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()
for epoch in range(3):
    print(f"Epoch {epoch+1}")
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch["labels"].float()  
        inputs = {k: v for k, v in batch.items() if k != "labels"}

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Epoch 1


100%|██████████| 10/10 [02:14<00:00, 13.44s/it]


Epoch 2


100%|██████████| 10/10 [02:07<00:00, 12.75s/it]


Epoch 3


100%|██████████| 10/10 [01:56<00:00, 11.67s/it]


In [8]:

# Evaluation
model.eval()
correct = 0
total = 0
for batch in eval_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    preds = outputs.logits.argmax(dim=-1)
    correct += (preds == batch["labels"]).sum().item()
    total += batch["labels"].size(0)

print("Accuracy:", correct / total)


Accuracy: 1.0
