In [1]:
%pip install torch transformers numpy pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Imports

# Core Imports
import pandas as pd
import torch
import os

# Model Architecture
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AdamW, get_linear_schedule_with_warmup

# Data Manipulation
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# Etc
from torch.utils.tensorboard import SummaryWriter # logging
from tqdm import tqdm # progress bar




### Load, Process, and Split Data

In [3]:
# Load the dataset

file_path = '/kaggle/input/two-sentence-horror-jan-2015-apr-2023/reddit_cleansed_data.csv'
df = pd.read_csv(file_path)


In [4]:
# Preprocess the text (basic cleaning)

df['title'] = df['title'].str.strip().str.lower()
df['selftext'] = df['selftext'].str.strip().str.lower()


In [5]:
# Split the data into training and test sets (80/20 split)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


### Model Architecture

In [6]:
# Load BART and its tokenizer

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

### Data Generator

In [7]:
class HorrorStoriesDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        story = self.dataframe.iloc[idx]
        title, selftext = story['title'], story['selftext']

        # Encoding the inputs
        input_encoding = self.tokenizer(
            title,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Encoding the labels (selftext)
        label_encoding = self.tokenizer(
            selftext,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': label_encoding['input_ids'].squeeze(0)
        }


In [8]:
# Create the dataset instances
train_dataset = HorrorStoriesDataset(train_df, tokenizer)
test_dataset = HorrorStoriesDataset(test_df, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

### Train Model

In [9]:
# Use GPU

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [10]:
# Model parameters

epochs = 3  # BERT paper (origin model) recommends fine-tuning for 2-4 epochs
learning_rate = 5e-5
gradient_accumulation_steps = 2
best_val_loss = float('inf')
early_stopping_patience = 3
early_stopping_counter = 0


In [11]:
# Optimizer and scheduler setup

optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# TensorBoard writer
writer = SummaryWriter()
save_path = './bart_model'



In [12]:
# Training loop with tqdm
for epoch in range(epochs):
    # Set model to training mode
    model.train()
    total_train_loss = 0

    # Create progress bar
    train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]", leave=False)
    
    # Training loop
    for step, batch in enumerate(train_progress_bar):
        # Get batch data on device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update params
        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        # Update progress bar
        train_progress_bar.set_postfix({'train_loss': f'{loss.item():.4f}'})

    # Calculate average loss over all batches
    avg_train_loss = total_train_loss / len(train_loader)
    writer.add_scalar('Loss/Train', avg_train_loss, epoch)

    # Validation loop with tqdm
    model.eval()
    total_eval_loss = 0
    val_progress_bar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} [Validate]", leave=False)

    with torch.no_grad():
        # For each batch in the validation dataloader
        for i, batch in enumerate(val_progress_bar):
            # Get batch data on device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            eval_loss = outputs.loss.item()
            total_eval_loss += eval_loss

            # Update progress bar
            avg_eval_loss_so_far = total_eval_loss / (i + 1)
            val_progress_bar.set_postfix({'avg_val_loss': f'{avg_eval_loss_so_far:.4f}'})

    # Calculate average loss over all batches
    avg_val_loss = total_eval_loss / len(test_loader)
    writer.add_scalar('Loss/Val', avg_val_loss, epoch)

    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss} | Val Loss: {avg_val_loss}")

    # Checkpointing
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        model.save_pretrained(save_path)
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break

# Close TensorBoard writer
writer.close()

                                                                                              

Epoch 1/3 | Train Loss: 0.22458247996241928 | Val Loss: 0.12914172655919032


                                                                                              

Epoch 2/3 | Train Loss: 0.13077665878091005 | Val Loss: 0.12742529754695922


                                                                                              

Epoch 3/3 | Train Loss: 0.12318229066434278 | Val Loss: 0.12701901581646582


### Export Model

In [None]:
# Save model and tokenizer to save_path
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

### Inference (Generate Stories)

In [13]:
# Switch to eval mode
model.eval()

input_sentences = ['I got out of bed this morning.',
                   'I was horrified when I got my test results back.',
                   'My parents told me not to go upstairs.',
                   'There was a knock on the door.',
                   'I was walking home from school.',
                   'My friend told me to go to the bathroom.',
                   'There was a loud noise coming from the basement.',
                   'There was a ghost.',
                   'I heard someone whispering in my ear.'
]
generated_stories = []

for input_sentence in input_sentences:
    # Encode the input prompt
    input_ids = tokenizer.encode(input_sentence, return_tensors='pt').to(device)

    # Generate the output
    generated_ids = model.generate(input_ids, max_length=50)
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    generated_stories.append(input_sentence + ' ' + generated_text)


In [14]:
for generated_story in generated_stories:
    print(generated_story)
    print('\n')

I got out of bed this morning. when i woke up, i saw my reflection in the mirror.


I was horrified when I got my test results back. it was only when i got home that i realized they weren’t human.


My parents told me not to go upstairs. i don’t know what’s worse, the fact that i’m the only one down here, or that i can hear them screaming.


There was a knock on the door. it was the only way i could get out of the basement.


I was walking home from school. but when i turned around, i saw a man with a knife in his hand.


My friend told me to go to the bathroom. i didn’t expect him to come back.


There was a loud noise coming from the basement. it was only when i turned on the lights that i realized the noise wasn't coming from the basement.


There was a ghost. it was the only thing keeping me alive.


I heard someone whispering in my ear. i thought it was just a hallucination, until i heard a voice whisper back, “don’t worry, you’re not alone.”


