In [3]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch
import pandas as pd
import re
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_scheduler

In [4]:
# Load the cleaned dataset; the split below reads its 'text' and 'label' columns.
df = pd.read_csv('/kaggle/input/cleaned-dataset/cleaned_dataset.csv')

In [5]:
# Hold out 20% of the rows as the test split; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

In [6]:
# astype(str) alone turns a missing value (NaN) into the literal text "nan",
# which would then be tokenized as if it were a real word. Replace missing
# entries with an empty string first, then coerce everything else to str.
train_texts = train_texts.fillna('').astype(str)
test_texts = test_texts.fillna('').astype(str)


In [7]:
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Step 3: Tokenize the input texts
def tokenize_texts(texts, tokenizer, max_length=128, device='cpu'):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings (e.g. a pandas Series); it is materialized
            as a list before being handed to the tokenizer.
        tokenizer: Callable invoked with the list of texts plus the
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            keyword arguments.
        max_length: Length every sequence is padded or truncated to.
        device: Device each returned tensor is moved to.

    Returns:
        A plain ``dict`` mapping each tokenizer output name to its tensor on
        ``device``.
    """
    tokenizer_kwargs = dict(
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',  # ask for PyTorch tensors
    )
    batch = tokenizer(list(texts), **tokenizer_kwargs)

    # Relocate every output tensor onto the requested device.
    on_device = {}
    for name, tensor in batch.items():
        on_device[name] = tensor.to(device)
    return on_device

# Specify the device (GPU or CPU) used for the model and for each batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Keep the full tokenized dataset in host memory (tokenize_texts defaults to
# device='cpu'). The training loop already moves every batch to `device`, so
# parking all examples on the GPU only takes memory away from the model, and
# CPU tensors can be saved and reloaded on machines without CUDA.
train_encodings = tokenize_texts(train_texts, tokenizer)
test_encodings = tokenize_texts(test_texts, tokenizer)

In [9]:
# Step 4: Convert labels to tensors
# Both label Series are converted the same way: their underlying array is
# copied into a new tensor, and the original names are rebound to the tensors.
train_labels, test_labels = (
    torch.tensor(label_split.values)
    for label_split in (train_labels, test_labels)
)

In [10]:
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer outputs with their labels.

    ``encodings`` is a mapping from field name to an indexable container and
    ``labels`` is an indexable, sized container; both are indexed with the
    same position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus its label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Pair each split's encodings with its labels so a DataLoader can index them.
train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

In [11]:
# Pretrained 'bert-base-uncased' weights with a 2-label classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Step 7: Define the DataLoader, optimizer, and loss function
# Hyperparameters are named once here instead of being scattered as literals.
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3  # must equal the `epochs` value passed to train_model

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# The scheduler is stepped once per batch, so its horizon is
# batches-per-epoch times the number of epochs that will actually be run.
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * NUM_EPOCHS,
)
loss_fn = nn.CrossEntropyLoss()

In [13]:
# Output shapes for verification
shape_report = (
    ("Training Encodings Shape:", train_encodings['input_ids']),
    ("Testing Encodings Shape:", test_encodings['input_ids']),
    ("Training Labels Shape:", train_labels),
    ("Testing Labels Shape:", test_labels),
)
for shape_caption, shape_source in shape_report:
    print(shape_caption, shape_source.shape)

Training Encodings Shape: torch.Size([162708, 128])
Testing Encodings Shape: torch.Size([40677, 128])
Training Labels Shape: torch.Size([162708])
Testing Labels Shape: torch.Size([40677])


In [14]:
# Save the tokenized data (optional, for future use)
def _to_cpu(encodings):
    """Return a new dict with every tensor on the CPU (CPU tensors are reused)."""
    return {name: tensor.cpu() for name, tensor in encodings.items()}


# Tensors saved while on a CUDA device are restored onto that device by
# default, which fails on a CPU-only machine. Persist CPU copies so the files
# can be loaded anywhere.
torch.save((_to_cpu(train_encodings), train_labels.cpu()), 'train_data.pt')
torch.save((_to_cpu(test_encodings), test_labels.cpu()), 'test_data.pt')
print("Tokenized inputs and labels saved for BERT.")

Tokenized inputs and labels saved for BERT.


In [19]:
def _train_one_epoch(model, train_loader, optimizer, scheduler, loss_fn, device):
    """Run one optimization pass over ``train_loader`` and return the mean batch loss.

    Batches are counted while iterating, so the loader does not need to
    support ``len()``. Returns ``float('nan')`` when it yields no batches.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        if loss_fn is None:
            # No criterion supplied: let the model compute its built-in loss.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        else:
            # Honor the caller's criterion instead of silently ignoring it.
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

        running_loss += loss.item()
        num_batches += 1

    return running_loss / num_batches if num_batches else float('nan')


def _evaluate_accuracy(model, data_loader, device):
    """Return the fraction of examples in ``data_loader`` predicted correctly.

    Runs in eval mode with gradients disabled. Returns ``float('nan')`` when
    the loader yields no examples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    return correct / total if total else float('nan')


def train_model(model, train_loader, test_loader, optimizer, scheduler, loss_fn, device, epochs=3):
    """Train ``model`` for ``epochs`` epochs, reporting loss and accuracy each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.logits`` (and ``.loss`` when labels are given).
        train_loader: Iterable of dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        test_loader: Iterable of the same batch format, evaluated after each
            training epoch.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        loss_fn: Criterion called as ``loss_fn(logits, labels)``. Pass ``None``
            to use the loss the model computes itself from ``labels``.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is reported with ``print``; an empty loader is reported
        as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    for epoch in range(epochs):
        avg_loss = _train_one_epoch(
            model, train_loader, optimizer, scheduler, loss_fn, device
        )
        print(f"Epoch {epoch+1}, Training Loss: {avg_loss:.4f}")

        # Validation
        accuracy = _evaluate_accuracy(model, test_loader, device)
        print(f"Epoch {epoch + 1}, Validation Accuracy: {accuracy:.4f}")

In [20]:
# Fine-tune for 3 epochs; train_model prints the training loss and the
# accuracy on test_loader after each epoch.
train_model(model, train_loader, test_loader, optimizer, scheduler, loss_fn, device, epochs=3)

Epoch 1, Training Loss: 0.2458
Epoch 1, Validation Accuracy: 0.8756
Epoch 2, Training Loss: 0.1911
Epoch 2, Validation Accuracy: 0.8781
Epoch 3, Training Loss: 0.1705
Epoch 3, Validation Accuracy: 0.8755


In [21]:
# Save the trained model and tokenizer
# Both artifacts go into the same directory, model first, then tokenizer.
for artifact in (model, tokenizer):
    artifact.save_pretrained("bert_finetuned")
print("Model and tokenizer saved.")


Model and tokenizer saved.


In [22]:
# Output shapes for verification
shape_report = (
    ("Training Encodings Shape:", train_encodings['input_ids']),
    ("Testing Encodings Shape:", test_encodings['input_ids']),
    ("Training Labels Shape:", train_labels),
    ("Testing Labels Shape:", test_labels),
)
for shape_caption, shape_source in shape_report:
    print(shape_caption, shape_source.shape)

Training Encodings Shape: torch.Size([162708, 128])
Testing Encodings Shape: torch.Size([40677, 128])
Training Labels Shape: torch.Size([162708])
Testing Labels Shape: torch.Size([40677])
