# In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer
import pandas as pd



# Dataset wrapper around the spoiler JSON-lines files (the trained model itself is loaded further below)

class SpoilerDataset(Dataset):
    """Map-style dataset built from a JSON-lines file of posts.

    Every non-blank line of the file is parsed as one JSON object. The model
    input text is ``postText[0] + " [SEP] " + " ".join(targetParagraphs)``.
    When a record has a ``tags`` key its first tag is mapped to an integer
    label (``'phrase'`` -> 0, ``'passage'`` -> 1, anything else -> 2); when it
    has a ``spoiler`` key the first spoiler string is kept as well.

    Args:
        file: Path of the UTF-8 encoded JSON-lines file to read.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` that returns a mapping of
            tensors (presumably a Hugging Face tokenizer; confirm with caller).
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If only some of the records carry ``tags`` (or
            ``spoiler``), because the per-index annotations could then no
            longer be matched to their posts.
    """

    # First-tag -> class index; tags not listed here fall back to _OTHER_LABEL.
    _TAG_TO_LABEL = {'phrase': 0, 'passage': 1}
    _OTHER_LABEL = 2

    def __init__(self, file, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.posts = []
        self.labels = []
        self.spoilers = []
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # are not JSON documents and would make json.loads raise.
                if not line.strip():
                    continue
                data = json.loads(line)
                self.posts.append(data['postText'][0] + " [SEP] " + " ".join(data['targetParagraphs']))
                if 'tags' in data:
                    self.labels.append(self._TAG_TO_LABEL.get(data['tags'][0], self._OTHER_LABEL))
                if 'spoiler' in data:
                    self.spoilers.append(data['spoiler'][0])

        # labels/spoilers are looked up by post index in __getitem__, so they
        # must be either absent altogether or present for every single post.
        for key, values in (('tags', self.labels), ('spoiler', self.spoilers)):
            if values and len(values) != len(self.posts):
                raise ValueError(
                    f"{file}: '{key}' is present in {len(values)} of {len(self.posts)} records; "
                    "it must be present in all records or in none"
                )

    def __len__(self):
        """Return the number of posts read from the file."""
        return len(self.posts)

    def __getitem__(self, idx):
        """Tokenize post *idx* and return its tensors plus any annotations.

        The result holds the tokenizer outputs with their leading axis removed
        and, when the file provided them, ``'labels'`` (int) and ``'spoilers'``
        (str) for the same post.
        """
        encodings = self.tokenizer(self.posts[idx], truncation=True, padding='max_length',
                                   max_length=self.max_length, return_tensors='pt')
        # Remove only the leading (presumably batch-of-one) axis. A bare
        # torch.squeeze would also collapse the sequence axis when it has size 1.
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        if self.labels:
            item['labels'] = self.labels[idx]
        if self.spoilers:
            item['spoilers'] = self.spoilers[idx]
        return item

def predict(model, dataloader, device):
    """Classify every batch of *dataloader* and return the predicted tag names.

    The model is put into evaluation mode and run without gradient tracking.
    For each batch, ``input_ids`` and ``attention_mask`` are moved to *device*,
    the row-wise highest logit is selected, and its index is translated to
    ``'phrase'`` (0), ``'passage'`` (1) or ``'multi'`` (2).

    Returns:
        A flat list of tag-name strings, in the order the batches were yielded.
    """
    names = {0: 'phrase', 1: 'passage', 2: 'multi'}
    results = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            logits = model(token_ids, attention_mask=mask).logits
            # Index of the largest logit along dim 1, one per row.
            winners = logits.max(dim=1).indices
            results.extend(names[winner.item()] for winner in winners)

    return results


# Build the test dataset and loader, load the classifier saved under ./model,
# then write one predicted spoiler type per test post to Task1.csv.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
test_dataset = SpoilerDataset(file='test.jsonl', tokenizer=tokenizer, max_length=512)
# shuffle=False keeps predictions in file order, so the row number can serve as the id.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)
model = BertForSequenceClassification.from_pretrained('./model')
model = model.to(device)

predictions = predict(model=model, dataloader=test_dataloader, device=device)

# Columns of the output file: running index and predicted tag name.
output_data = {
    'id': range(len(predictions)),
    'SpoilerType': predictions,
}
out = pd.DataFrame(data=output_data)
out.to_csv('Task1.csv', index=False)

