In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm


In [None]:
# Mount Google Drive at /content/drive (Colab-only API) so files stored on
# Drive can be read and written through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [6]:
# Read the train / validation / test splits (JSON-lines files on the mounted Drive)
train_data, val_data, test_data = (
    pd.read_json(split_path, lines=True)
    for split_path in (
        '/content/drive/MyDrive/task-1/data/train.jsonl',
        '/content/drive/MyDrive/task-1/data/val.jsonl',
        '/content/drive/MyDrive/task-1/data/test.jsonl',
    )
)

# Build a single text feature per row: the space-joined post text followed by
# the space-joined target paragraphs
for split_frame in (train_data, val_data, test_data):
    joined_post = split_frame['postText'].apply(lambda parts: ' '.join(parts))
    joined_paragraphs = split_frame['targetParagraphs'].apply(lambda parts: ' '.join(parts))
    split_frame['combined_text'] = joined_post + ' ' + joined_paragraphs

# Collapse list-valued tags to their first element; any non-list value passes
# through unchanged. Only the labelled splits (train, val) are touched.
for labelled_frame in (train_data, val_data):
    labelled_frame['tags'] = labelled_frame['tags'].apply(
        lambda tag: tag if not isinstance(tag, list) else tag[0]
    )

# Fit the label encoder on the training tags and reuse the same mapping for
# the validation tags
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['tags'])
val_data['label'] = label_encoder.transform(val_data['tags'])

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every split with identical settings: truncate to at most 512 tokens
# and pad the sequences of each call to a common length
tokenizer_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_data['combined_text'].tolist(), **tokenizer_options)
val_encodings = tokenizer(val_data['combined_text'].tolist(), **tokenizer_options)
test_encodings = tokenizer(test_data['combined_text'].tolist(), **tokenizer_options)

class SpoilerDataset(Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            sequence. It must contain an ``'input_ids'`` entry, which
            determines the dataset length.
        labels: Optional per-example labels, indexable by position. When
            omitted, items carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from ``encodings['input_ids']``."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor; a ``'labels'`` tensor
        is appended only when labels were supplied at construction time.
        """
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])

        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Tunable settings for the fine-tuning run
SEED = 42
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
num_epochs = 3

# Seed PyTorch's RNG before anything random happens (classifier-head
# initialisation, training-loader shuffling, dropout) so repeated runs start
# from the same state. GPU kernels may still introduce small non-determinism.
torch.manual_seed(SEED)


def _predict_labels(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode without tracking gradients.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        loader: Iterable of batches, each a dict of tensors accepted by ``model``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A ``(predicted_ids, gold_ids)`` tuple of lists. ``predicted_ids`` holds
        the arg-max class index per example; ``gold_ids`` holds the values of
        the batch ``'labels'`` entry and stays empty when batches have none.
    """
    model.eval()
    predicted_ids, gold_ids = [], []
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            logits = model(**batch).logits
        predicted_ids.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        if 'labels' in batch:
            gold_ids.extend(batch['labels'].cpu().numpy())
    return predicted_ids, gold_ids


# Create dataset objects (the test split has no labels)
train_dataset = SpoilerDataset(train_encodings, train_data['label'].tolist())
val_dataset = SpoilerDataset(val_encodings, val_data['label'].tolist())
test_dataset = SpoilerDataset(test_encodings)

# Load pre-trained BERT for sequence classification. The size of the output
# head follows the classes seen by the label encoder instead of a fixed number.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(label_encoder.classes_)
)

# Move the model to the GPU if available. This is done before the optimizer
# is constructed, as recommended by the torch.optim documentation.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Prepare the data loaders; only the training data is shuffled so that
# validation and test predictions stay aligned with the row order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Set up the optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps=1e-6 and
# weight_decay=0.0 mirror that deprecated implementation's defaults.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0
)
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Training loop: one optimizer step and one scheduler step per batch
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
progress_bar.close()

# Validation: compare predictions against the gold labels carried by the batches
predictions, true_labels = _predict_labels(model, val_loader, device)

from sklearn.metrics import classification_report
print(classification_report(true_labels, predictions, target_names=label_encoder.classes_))

# Make predictions on the test set (no labels available, so only the
# predicted ids are kept)
test_predictions = _predict_labels(model, test_loader, device)[0]

# Map numerical labels back to original tags
predicted_tags = label_encoder.inverse_transform(test_predictions)

# Prepare predictions for submission
submission = pd.DataFrame({
    'id': range(len(test_data)),  # Positional ids 0..n-1 in test-file row order
    'spoilerType': predicted_tags
})

# Save the predictions to a CSV file
submission.to_csv('/content/drive/MyDrive/task-1/data/prediction_task1.csv', index=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1200 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       multi       0.69      0.44      0.54        84
     passage       0.56      0.66      0.61       154
      phrase       0.58      0.59      0.58       162

    accuracy                           0.58       400
   macro avg       0.61      0.56      0.58       400
weighted avg       0.59      0.58      0.58       400

