In [2]:
!pip install torch
!pip install transformers
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install jsonlines
!pip install nltk

Collecting fsspec (from huggingface-hub<1.0,>=0.11.0->transformers)
  Obtaining dependency information for fsspec from https://files.pythonhosted.org/packages/e3/bd/4c0a4619494188a9db5d77e2100ab7d544a42e76b2447869d8e124e981d8/fsspec-2023.6.0-py3-none-any.whl.metadata
  Downloading fsspec-2023.6.0-py3-none-any.whl.metadata (6.7 kB)
Using cached fsspec-2023.6.0-py3-none-any.whl (163 kB)
Installing collected packages: fsspec
Successfully installed fsspec-2023.6.0
Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/9a/f2/0ad053856debbe90c83de1b4f05915f85fd2146f20faf9daa3b320d36df3/pandas-2.0.3-cp39-cp39-win_amd64.whl.metadata
  Downloading pandas-2.0.3-cp39-cp39-win_amd64.whl.metadata (18 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2023.3-py2.py3-none-any.whl (502 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Using cached pandas-2.0.3-cp39-cp39-win_amd64.whl (

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, BertForMaskedLM, AdamW

#Define custom dataset
class SpoilerDataset(Dataset):
    def __init__(self, file, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.posts = []
        self.labels = []
        self.spoilers = []
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                self.posts.append(data['postText'][0] + " [SEP] " + " ".join(data['targetParagraphs']))
                if 'tags' in data:
                    if data['tags'][0] == 'phrase':
                        self.labels.append(0)
                    elif data['tags'][0] == 'passage':
                        self.labels.append(1)
                    else:
                        self.labels.append(2)
                if 'spoiler' in data:
                    self.spoilers.append(data['spoiler'][0])

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(self.posts[idx], truncation=True, padding='max_length', max_length=self.max_length,
                                   return_tensors='pt')
        item = {key: torch.squeeze(val) for key, val in encodings.items()}
        if self.labels:
            item['labels'] = self.labels[idx]
        if self.spoilers:
            item['spoilers'] = self.spoilers[idx]
        return item

#define training loop
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'spoilers'}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    return total_loss / len(dataloader)

#define loss evaluation
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'spoilers'}
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(dataloader)

#define accuracy evaluation
def evl_acc(model, dataloader, device):
    model.eval()  # switch the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.logits, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

#initialize model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = SpoilerDataset('train.jsonl', tokenizer, 512)
val_dataset = SpoilerDataset('val.jsonl', tokenizer, 512)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3).to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

#train model
for epoch in range(5):
    train_loss = train(model, train_dataloader, optimizer, device)
    val_loss = evaluate(model, val_dataloader, device)
    train_acc = evl_acc(model,train_dataloader,device)
    val_acc = evl_acc(model,val_dataloader,device)
    print(f'Epoch: {epoch + 1}, Train loss: {train_loss}, Val loss: {val_loss}, Train Accuracy:{train_acc}, Val Accuracy:{val_acc}')

model.save_pretrained('./model')




In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, EncoderDecoderModel, AdamW
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

#define custom dataset
class SpoilerDataset(Dataset):
    def __init__(self, file, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.posts = []
        self.spoilers = []
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                self.posts.append(data['postText'][0] + " [SEP] " + " ".join(data['targetParagraphs']))
                if 'spoiler' in data:
                    self.spoilers.append(data['spoiler'][0])

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.posts[idx], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        if idx < len(self.spoilers):
            targets = self.tokenizer(self.spoilers[idx], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        else:
            targets = self.tokenizer("", truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        inputs['labels'] = targets['input_ids']
        return inputs
    
def bleu_score(references, predictions):
    smoothing = SmoothingFunction().method4
    bleu_score_1gram = corpus_bleu(references, predictions, weights=(1, 0, 0, 0), smoothing_function=smoothing)
    bleu_score_2gram = corpus_bleu(references, predictions, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
    bleu_score_3gram = corpus_bleu(references, predictions, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothing)
    bleu_score_4gram = corpus_bleu(references, predictions, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)
    bleu_score = (bleu_score_1gram + bleu_score_2gram + bleu_score_3gram + bleu_score_4gram) / 4
    return bleu_score

#define training function
def train_model(model, dataloader, tokenizer, optimizer, device):
    model.train()
    total_loss = 0
    train_references = []
    train_predictions= []
   
    
    for i, batch in enumerate(dataloader):
        inputs = {key: val.reshape(val.shape[0], -1).to(device) for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),max_norm=1.0)
        
        optimizer.step()
        optimizer.zero_grad()
       
        references = [tokenizer.decode(ref, skip_special_tokens=True, clean_up_tokenization_spaces = True) for ref in inputs ['input_ids']]
        predictions = [tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=True) for pred in outputs.logits.argmax(dim=-1)]
        train_references.extend([ref.split() for ref in references])
        train_predictions.extend([pred.split() for pred in predictions])
        total_loss += loss.item()
    
    
        
    train_bleu_score = bleu_score(train_references, train_predictions)
    train_avg_loss = total_loss/len(dataloader)
    return train_avg_loss, train_bleu_score

#define vlaidation funciton
def evaluate_model(model, dataloader, tokenizer, device):
    model.eval()
    total_loss = 0
    val_references = []
    val_predictions = []
    

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            inputs = {key: val.reshape(val.shape[0], -1).to(device) for key, val in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            references = [tokenizer.decode(ref, skip_special_tokens=True, clean_up_tokenization_spaces = True) for ref in inputs ['input_ids']]
            predictions = [tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=True) for pred in outputs.logits.argmax(dim=-1)]
            val_references.extend([ref.split() for ref in references])
            val_predictions.extend([pred.split() for pred in predictions])
            total_loss += loss.item()

     
    val_bleu_score = bleu_score(val_references, val_predictions)
    val_avg_loss = total_loss/len(dataloader)
    return val_avg_loss, val_bleu_score



# setup parameters
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 512
batch_size = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load data
train_data = SpoilerDataset('train.jsonl', tokenizer, max_length)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_data = SpoilerDataset('val.jsonl',tokenizer,max_length)
val_loader = DataLoader(val_data, batch_size=batch_size,shuffle=True)
test_data = SpoilerDataset('test.jsonl', tokenizer, max_length)
test_loader = DataLoader(test_data, batch_size=batch_size)

# Initialize model
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'gpt2')
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
 
# Training loops

for epoch in range(10):
    train_avg_loss, train_bleu_score = train_model(model, train_loader, tokenizer, optimizer, device)
    val_avg_loss, val_bleu_score = evaluate_model(model, val_loader, tokenizer, device)
    print(f'Epoch: {epoch+1}, train_Loss:  {train_avg_loss}, train_Bleu:{train_bleu_score}, val_loss:{val_avg_loss}, val_Bleu:{val_bleu_score}')

    torch.save(model.state_dict(), f'model_{epoch}.pt')
    
                   
print("Training completed.")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: 1, train_Loss:  0.2341787366895005, train_Bleu:0.019562304155386633, val_loss:0.2030882819648832, val_Bleu:0.10155130855558121
Epoch: 2, train_Loss:  0.20659009742317722, train_Bleu:0.038623308233637146, val_loss:0.20105948029085993, val_Bleu:0.08267351781902328
Epoch: 3, train_Loss:  0.20051007234142162, train_Bleu:0.052216120067962786, val_loss:0.20037772485986352, val_Bleu:0.0442685122291905
Epoch: 4, train_Loss:  0.1968097859225236, train_Bleu:0.057968304867180666, val_loss:0.20028146174736322, val_Bleu:0.05770751499782942
Epoch: 5, train_Loss:  0.19308153297868558, train_Bleu:0.05802414610192202, val_loss:0.20082444941625, val_Bleu:0.05206817719418919
Epoch: 6, train_Loss:  0.1893930799991358, train_Bleu:0.05557095756669179, val_loss:0.19999172817915678, val_Bleu:0.037674375729368645
Epoch: 7, train_Loss:  0.18505812409799546, train_Bleu:0.052794544516083224, val_loss:0.2032849051617086, val_Bleu:0.0528049819306212
Epoch: 8, train_Loss:  0.1802583827485796, train_Bleu:0.047

"# Generate spoiler\nmodel.eval()\ninput_data = next(iter(test_loader))\ninputs = {key: val.to(device) for key, val in input_data.items()}\ngenerated_ids = model.generate(input_ids=inputs['input_ids'], max_length=150, num_beams=2)\ngenerated_spoilers = [tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for generated_id in generated_ids]\n\nfor spoiler in generated_spoilers:\n    print(spoiler)"