# Shakespearean Text Generation using BERT with PyTorch

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vuhung16au/nlp-learning-journey/blob/main/docs/shakespearean-text-BERT.ipynb)

## Overview

This notebook demonstrates how to use BERT (Bidirectional Encoder Representations from Transformers) for generating Shakespearean-style text using PyTorch. While BERT is primarily designed for understanding tasks rather than generation, we can leverage its masked language modeling capabilities to create text in the style of Shakespeare.

**Note**: This repository prioritizes PyTorch over TensorFlow. This notebook has been updated to use PyTorch implementations.

## What You'll Learn

- How to adapt BERT for text generation using masked language modeling
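- Applying a pre-trained BERT model to Shakespearean text using PyTorch (fine-tuning BERT on Shakespeare's works is a natural next step that this notebook does not implement)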
- Implementing iterative text generation with BERT
- Comparing BERT-based generation with traditional autoregressive models
- Educational insights about BERT's bidirectional nature

## Prerequisites

Basic understanding of transformers, BERT architecture, and Python programming.

In [None]:
# Environment Detection and Setup (Required for all notebooks in this repository)
import sys
import subprocess
import os
import time

# Detect the runtime environment
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

print(f"Environment detected:")
print(f"  - Local: {IS_LOCAL}")
print(f"  - Google Colab: {IS_COLAB}")
print(f"  - Kaggle: {IS_KAGGLE}")

# Platform-specific system setup
if IS_COLAB:
    print("\nSetting up Google Colab environment...")
    !apt update -qq
    !apt install -y -qq libpq-dev
elif IS_KAGGLE:
    print("\nSetting up Kaggle environment...")
    # Kaggle usually has most packages pre-installed
else:
    print("\nSetting up local environment...")

# PyTorch logging setup
def setup_pytorch_logging():
    """Ensure the root log directory for this environment exists.

    The location depends on the detected runtime: ``/content/pytorch_logs``
    on Colab, ``./pytorch_logs`` on Kaggle, and ``pytorch_logs`` under the
    current working directory otherwise.

    Returns:
        The path of the (possibly newly created) root log directory.
    """
    log_root = (
        "/content/pytorch_logs" if IS_COLAB
        else "./pytorch_logs" if IS_KAGGLE
        else os.path.join(os.getcwd(), "pytorch_logs")
    )
    # exist_ok makes repeated calls harmless.
    os.makedirs(log_root, exist_ok=True)
    return log_root

def get_run_logdir(experiment_name="run"):
    """Build a timestamped run directory path under the root log directory.

    Args:
        experiment_name: Prefix for the run directory name. It is used
            verbatim, so it may safely contain ``%`` characters.

    Returns:
        A path of the form ``<root>/<experiment_name>_YYYY_MM_DD-HH_MM_SS``.
        Only the root directory is created; the run directory itself is not.
        The timestamp has one-second resolution, so two calls within the
        same second with the same name yield the same path.
    """
    root_logdir = setup_pytorch_logging()
    # Format the timestamp on its own: passing the experiment name through
    # strftime would interpret any '%' in it as a format directive.
    timestamp = time.strftime("%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, f"{experiment_name}_{timestamp}")

# Install required packages for this notebook
required_packages = [
    "transformers",
    "torch",
    "numpy",
    "pandas",
    "matplotlib",
    "seaborn",
    "tqdm",
    "requests"
]

print("\nInstalling required packages...")
for package in required_packages:
    if IS_COLAB or IS_KAGGLE:
        !pip install -q {package}
    else:
        subprocess.run([sys.executable, "-m", "pip", "install", "-q", package], 
                      capture_output=True)
    print(f"✓ {package}")

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import (
    BertTokenizer, BertForMaskedLM, BertConfig,
    pipeline
)
import re
import random
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Report the PyTorch build and whether a CUDA device can be used.
print("All imports successful!")
print(f"PyTorch version: {torch.__version__}")
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    # Device index 0 is the one reported.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## BERT Text Generator Class

We'll create a PyTorch-based text generator using BERT's masked language modeling capabilities.

In [None]:
class BERTTextGenerator:
    """PyTorch-based BERT text generator using masked language modeling.

    The generator loads a ``BertForMaskedLM`` checkpoint and its tokenizer,
    and offers two operations: ranking candidate words for ``[MASK]``
    positions, and growing a prompt one token at a time by repeatedly
    appending a ``[MASK]`` and sampling a replacement for it.

    Loading is best-effort: if it fails, ``model`` stays ``None`` and the
    public methods return an empty result / a placeholder string instead of
    raising.
    """

    def __init__(self, model_name="bert-base-uncased"):
        """Initialize the BERT text generator and try to load the model.

        Args:
            model_name: Name or path passed to ``from_pretrained``.
        """
        self.model_name = model_name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = None
        self.model = None
        self.load_model()

    def load_model(self):
        """Load the BERT tokenizer and masked-LM model.

        Returns:
            ``True`` when both were loaded and the model was moved to
            ``self.device`` in eval mode, ``False`` otherwise. On failure the
            error is printed and ``self.tokenizer`` / ``self.model`` are left
            as they were before the call.
        """
        try:
            tokenizer = BertTokenizer.from_pretrained(self.model_name)
            # Use PyTorch version
            model = BertForMaskedLM.from_pretrained(self.model_name)
            model.to(self.device)
            model.eval()
        except Exception as e:
            # Deliberate best-effort: report and let the caller carry on.
            print(f"✗ Could not load BERT model: {e}")
            return False

        # Publish only after every step succeeded, so a failure part-way
        # through (e.g. moving to the device) cannot leave a half-ready model.
        self.tokenizer = tokenizer
        self.model = model
        print(f"✓ Loaded BERT model: {self.model_name}")
        return True

    def predict_masked_words(self, text, num_predictions=5):
        """Predict words for [MASK] tokens in the text.

        Args:
            text: A single input string containing zero or more ``[MASK]``
                tokens.
            num_predictions: Number of candidates to return per mask; capped
                at the size of the model's output vocabulary.

        Returns:
            One list per ``[MASK]`` (in order of appearance). Each list holds
            dicts with ``'token'`` (decoded text), ``'score'`` (raw logit) and
            ``'probability'`` (softmax over the whole vocabulary at that
            position), ordered from most to least likely. Returns ``[]`` when
            the model is not loaded or the text has no mask.
        """
        if self.model is None:
            return []

        # Tokenize input
        inputs = self.tokenizer(text, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Get predictions
        with torch.no_grad():
            logits = self.model(**inputs).logits

        # Column indices of the mask tokens; a single input sequence is
        # assumed, so only batch row 0 of the logits is read below.
        mask_token_id = self.tokenizer.mask_token_id
        mask_positions = (inputs['input_ids'] == mask_token_id).nonzero(as_tuple=True)[1]

        results = []
        for pos in mask_positions:
            position_logits = logits[0, pos]
            # Normalise once per position rather than once per candidate.
            position_probs = torch.softmax(position_logits, dim=-1)
            k = min(num_predictions, position_logits.size(-1))
            top_k = torch.topk(position_logits, k)

            results.append([
                {
                    'token': self.tokenizer.decode([token_id]),
                    'score': score,
                    'probability': position_probs[token_id].item(),
                }
                for score, token_id in zip(top_k.values.tolist(), top_k.indices.tolist())
            ])

        return results

    @staticmethod
    def _append_token(text, token):
        """Append a decoded token to text, gluing WordPiece continuations.

        Tokens that start with ``##`` continue the previous word, so they are
        attached without a space and without the ``##`` marker; every other
        token is appended after a single space.
        """
        token = token.strip()
        if token.startswith("##") and len(token) > 2:
            return text + token[2:]
        return text + " " + token

    def generate_shakespearean_text(self, prompt, max_length=50, temperature=1.0):
        """Generate Shakespearean-style text using iterative masking.

        Each step appends ``[MASK]`` to the text so far, takes the ten most
        likely fillers for that mask, and samples one of them.

        Args:
            prompt: Starting text; leading/trailing whitespace is stripped.
            max_length: Maximum number of tokens to append.
            temperature: Scales the candidate logits before sampling. Values
                above 1 flatten the distribution, values below 1 sharpen it.
                A value of 0 or less selects the top candidate
                deterministically instead of sampling.

        Returns:
            The prompt followed by the generated tokens, or the string
            ``"Model not loaded"`` when no model is available. Generation
            stops early after a ``.``, ``!`` or ``?`` token.
        """
        if self.model is None:
            return "Model not loaded"

        # Start with the prompt
        generated_text = prompt.strip()

        # Special tokens ([SEP], [PAD], [MASK], ...) are never valid output:
        # written back as text they would be re-read as control tokens.
        special_tokens = set(self.tokenizer.all_special_tokens)

        # Generate additional words iteratively
        for _ in range(max_length):
            # Add a mask token
            masked_text = generated_text + " [MASK]"

            # Get predictions
            predictions = self.predict_masked_words(masked_text, num_predictions=10)

            if not predictions:
                break

            # The appended mask is always the last one, even when the prompt
            # itself already contains [MASK] tokens.
            candidates = [
                p for p in predictions[-1]
                if p['token'].strip() and p['token'].strip() not in special_tokens
            ]
            if not candidates:
                break

            scores = torch.tensor([p['score'] for p in candidates])
            if temperature <= 0:
                # Dividing by a non-positive temperature would yield NaN or
                # an inverted distribution; fall back to greedy selection.
                selected_idx = int(torch.argmax(scores).item())
            else:
                # Apply temperature scaling, then sample a token
                probabilities = torch.softmax(scores / temperature, dim=-1)
                selected_idx = torch.multinomial(probabilities, 1).item()
            selected_token = candidates[selected_idx]['token'].strip()

            # Add the selected token
            generated_text = self._append_token(generated_text, selected_token)

            # Stop if we hit punctuation that ends a sentence
            if selected_token in ('.', '!', '?'):
                break

        return generated_text

# Initialize the generator. Loading is best-effort, so only announce success
# when a model is actually available.
bert_generator = BERTTextGenerator()
if bert_generator.model is not None:
    print("BERT text generator initialized!")
else:
    print("BERT text generator could not be initialized; see the error above.")

## Testing BERT's Masked Language Modeling

Let's test BERT's ability to fill in masked words in Shakespearean contexts.

In [None]:
# Probe the masked-LM head with well-known Shakespeare lines, each hiding
# exactly one word behind [MASK].
test_sentences = [
    "To be or not to [MASK], that is the question.",
    "Romeo, Romeo, wherefore art thou [MASK]?",
    "All the world's a [MASK], and all the men and women merely players.",
    "What light through yonder [MASK] breaks?",
    "Fair is [MASK] and foul is fair.",
]
separator = "-" * 50

print("🎭 Testing BERT's Shakespearean Knowledge:\n")

for sentence in test_sentences:
    print(f"Input: {sentence}")
    predictions = bert_generator.predict_masked_words(sentence, num_predictions=3)

    # An empty result means there was nothing to rank (no model or no mask).
    if predictions:
        print("Top predictions:")
        # Only the first mask's candidates are shown.
        for rank, pred in enumerate(predictions[0][:3], start=1):
            print(f"  {rank}. {pred['token']} (confidence: {pred['probability']:.3f})")

    print(separator)

## Generating Shakespearean-Style Text

Now let's use our generator to create new Shakespearean-style text starting from various prompts.

In [None]:
# Continue a handful of opening phrases, once per sampling temperature, to
# show how temperature changes the output.
prompts = [
    "Shall I compare thee",
    "Once upon a midnight",
    "Love is",
    "The fair maiden",
    "In the forest deep",
]
temperatures = (0.8, 1.2)

print("🎭 Generating Shakespearean-Style Text:\n")

for prompt in prompts:
    print(f"Prompt: '{prompt}'")

    # One generation per temperature setting.
    for temp in temperatures:
        generated = bert_generator.generate_shakespearean_text(
            prompt,
            max_length=15,
            temperature=temp,
        )
        print(f"  Temperature {temp}: {generated}")

    print("-" * 60)

## Interactive Text Generation

Let's create an interactive function where you can input your own prompts.

In [None]:
def interactive_generation(prompt="To be or not", max_length=20, temperature=1.0,
                           num_variations=3):
    """Generate and print several continuations of one prompt.

    Args:
        prompt: Text to continue.
        max_length: Maximum number of tokens appended per variation.
        temperature: Sampling temperature passed to the generator.
        num_variations: How many independent continuations to produce.

    Returns:
        The last variation generated, or ``None`` when ``num_variations`` is
        less than 1 and nothing was generated.
    """
    print(f"\n🎭 Generating from prompt: '{prompt}'")
    print(f"Parameters: max_length={max_length}, temperature={temperature}\n")

    # Pre-bind so the return below is defined even if the loop never runs.
    generated = None

    # Generate multiple variations
    for i in range(num_variations):
        generated = bert_generator.generate_shakespearean_text(
            prompt, max_length=max_length, temperature=temperature
        )
        print(f"Variation {i+1}: {generated}")

    return generated

# Example run; `result` holds the value returned by interactive_generation.
result = interactive_generation("The course of true love", max_length=15, temperature=1.0)

# Try other settings by editing and uncommenting a call such as:
# interactive_generation("My love is", max_length=10, temperature=0.7)

## Analysis: BERT vs Autoregressive Models

Let's analyze the characteristics of BERT's bidirectional, mask-based generation—average output length and vocabulary diversity—to see how it differs from what traditional autoregressive models produce.

In [None]:
# Comparison analysis
def analyze_generation_quality(prompts, num_samples=5, max_length=15, temperature=1.0):
    """Analyze the quality and characteristics of BERT-generated text.

    For every prompt, ``num_samples`` continuations are generated and
    printed, then summarised with two simple statistics.

    Args:
        prompts: Iterable of prompt strings.
        num_samples: Number of generations per prompt.
        max_length: Maximum number of tokens appended per generation.
        temperature: Sampling temperature passed to the generator.

    Returns:
        A list with one dict per prompt containing ``'prompt'``,
        ``'avg_length'`` (mean whitespace-separated word count, prompt
        included; ``0.0`` when no samples were generated), ``'unique_words'``
        (count of distinct lower-cased words across all generations) and
        ``'generations'`` (the generated strings).
    """
    results = []

    for prompt in prompts:
        print(f"\nAnalyzing prompt: '{prompt}'")

        generations = []
        for i in range(num_samples):
            generated = bert_generator.generate_shakespearean_text(
                prompt, max_length=max_length, temperature=temperature
            )
            generations.append(generated)
            print(f"  {i+1}. {generated}")

        # Simple analysis. Guard the mean so that asking for zero samples
        # yields an empty summary instead of a ZeroDivisionError.
        if generations:
            avg_length = sum(len(g.split()) for g in generations) / len(generations)
        else:
            avg_length = 0.0
        unique_words = {word for g in generations for word in g.lower().split()}

        results.append({
            'prompt': prompt,
            'avg_length': avg_length,
            'unique_words': len(unique_words),
            'generations': generations,
        })

    return results

# Prompts covering an abstract theme, a character and a place.
analysis_prompts = ["Love is", "The king", "In fair Verona"]

analysis_results = analyze_generation_quality(analysis_prompts, num_samples=3)

# Per-prompt summary of the statistics collected above.
print("\n📊 Generation Analysis Summary:")
for result in analysis_results:
    prompt_label = result['prompt']
    mean_words = result['avg_length']
    vocabulary_size = result['unique_words']
    print(f"Prompt '{prompt_label}':")
    print(f"  - Average length: {mean_words:.1f} words")
    print(f"  - Unique vocabulary: {vocabulary_size} words")

## Conclusion and Key Insights

This notebook demonstrates PyTorch-based text generation using BERT's masked language modeling capabilities.

In [None]:
# Closing demo: a few more fill-in-the-blank prompts, then a recap.
print("🌍 Multilingual Capabilities Demo:")
print("Testing BERT with Vietnamese/English mixed content...\n")

# NOTE: every prompt below is English. The base BERT model has limited
# Vietnamese capabilities; multilingual BERT is the better choice for
# Vietnamese input.
multilingual_prompts = [
    "Love is [MASK] and beautiful",
    "My name is [MASK]",
    "Hello [MASK] world",
]

for prompt in multilingual_prompts:
    print(f"Input: {prompt}")
    predictions = bert_generator.predict_masked_words(prompt, num_predictions=3)

    if predictions:
        print("Predictions:")
        # Show the candidates for the first mask only.
        for rank, pred in enumerate(predictions[0][:3], start=1):
            print(f"  {rank}. {pred['token']} (confidence: {pred['probability']:.3f})")
    print()

print("\n✅ PyTorch BERT Text Generation Demo Complete!")
print("\n📝 Key Takeaways:")
for takeaway in (
    "1. BERT uses bidirectional context for masked language modeling",
    "2. PyTorch provides flexible control over model behavior",
    "3. Temperature scaling affects generation diversity",
    "4. BERT is better for understanding than generation tasks",
    "5. For Vietnamese support, use multilingual models like mBERT",
):
    print(takeaway)

# Fixed Vietnamese/English phrase pairs printed for reference.
print("\n🇻🇳🇺🇸 Vietnamese/English Examples:")
for example in (
    "English: 'My name is John' → Vietnamese: 'Tên tôi là John'",
    "English: 'Hello' → Vietnamese: 'Xin chào'",
    "English: 'Thank you' → Vietnamese: 'Cảm ơn'",
):
    print(example)