In [26]:
# gradual pruning
# layer-wise pruning
# Mixed Precision Training: Utilize PyTorch’s AMP (Automatic Mixed Precision) to reduce memory usage and speed up training and inference without significant loss in accuracy.
# Post-Training Quantization: Apply quantization techniques to the pruned model to further reduce the model size and computational requirements.
# Distillation Post-Pruning: After pruning, apply knowledge distillation from the original model to the pruned model. This can help recover some of the lost accuracy due to pruning by aligning the outputs of the pruned model with those of the original.
# Memory Mapping: If the dataset is large, use memory mapping techniques to avoid loading the entire dataset into memory, reducing the memory footprint.
# Perplexity: Measures how well a probabilistic model predicts a sample of text

In [27]:
pip install datasets



In [28]:
import os
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset
import torch.nn.functional as F

# Load the pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
teacher_model = GPT2LMHeadModel.from_pretrained(model_name)  # Teacher model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Initialize the pruned model (student model)
pruned_model = GPT2LMHeadModel.from_pretrained(model_name)
# Assume pruning has been performed on pruned_model

# Add padding token to the tokenizer and resize model embeddings accordingly
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    pruned_model.resize_token_embeddings(len(tokenizer))
    teacher_model.resize_token_embeddings(len(tokenizer))

def gradual_layerwise_pruning(model, start_pruning_fractions, end_pruning_fractions, num_steps=3, sample_size=100000):
    """
    Gradually increase the pruning fraction for each layer from start to end over a number of steps.
    """
    for step in range(num_steps):
        current_pruning_fractions = [
            start + (end - start) * (step / num_steps)
            for start, end in zip(start_pruning_fractions, end_pruning_fractions)
        ]

        # Apply pruning logic here if needed
        # For now, this is a placeholder to show gradual changes
        print(f"Pruning step {step + 1}/{num_steps}: current pruning fractions {current_pruning_fractions}")

num_layers = sum(1 for _ in pruned_model.named_parameters() if 'weight' in _[0])
start_pruning_fractions = [0.0] * num_layers
end_pruning_fractions = [0.1, 0.15, 0.2] * (num_layers // 3) + [0.1] * (num_layers % 3)

gradual_layerwise_pruning(pruned_model, start_pruning_fractions, end_pruning_fractions, num_steps=100)

# Load dataset with streaming enabled
dataset_name = 'wikitext'
dataset_config = 'wikitext-103-raw-v1'
dataset = load_dataset(dataset_name, dataset_config, split='train', streaming=True)

# Estimate the number of examples and set batch size
estimated_num_examples = 1000
batch_size = 100

# Prepare dataset function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

# Tokenize the dataset in chunks
def preprocess_and_tokenize(dataset, tokenizer):
    for batch in dataset:
        yield tokenize_function({'text': batch['text']})

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Calculate max_steps
num_epochs = 1
steps_per_epoch = estimated_num_examples // batch_size
max_steps = num_epochs * steps_per_epoch

# Define training arguments with Mixed Precision Training enabled
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,
    load_best_model_at_end=True,
    max_steps=max_steps,  # Specify the number of steps
)

# Initialize the Trainer with the pruned model (student model)
trainer = Trainer(
    model=pruned_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=preprocess_and_tokenize(dataset, tokenizer),
    eval_dataset=preprocess_and_tokenize(dataset, tokenizer)
)

# Custom distillation training loop
def distillation_loss(student_outputs, teacher_outputs, temperature=2.0):
    """
    Compute distillation loss between student and teacher model outputs.
    """
    student_logits = student_outputs.logits / temperature
    teacher_logits = teacher_outputs.logits / temperature
    return F.kl_div(F.log_softmax(student_logits, dim=-1), F.softmax(teacher_logits, dim=-1), reduction='batchmean')

def distillation_train_loop(trainer, teacher_model, train_dataset, tokenizer, distillation_loss):
    """
    Custom training loop for distillation.
    """
    # Implement custom training logic here
    pass

# Run the custom distillation training loop
distillation_train_loop(trainer, teacher_model, preprocess_and_tokenize(dataset, tokenizer), tokenizer, distillation_loss)

# Save the fine-tuned student model
model_save_path = './distilled-pruned-gpt2'
pruned_model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Load the fine-tuned model for evaluation
from transformers import pipeline

generator = pipeline('text-generation', model=pruned_model, tokenizer=tokenizer)
output = generator("Once upon a time", max_length=50)
print(output)

# Post-Training Quantization
# Create directory for saving quantized model
quantized_model_save_path = './quantized-distilled-pruned-gpt2'
os.makedirs(quantized_model_save_path, exist_ok=True)

# Convert model to quantized version
quantized_model = torch.quantization.quantize_dynamic(
    pruned_model,
    {torch.nn.Linear},
    dtype=torch.qint8
)

# Save the quantized model
torch.save(quantized_model.state_dict(), os.path.join(quantized_model_save_path, 'pytorch_model.bin'))

# Load the quantized model for evaluation
quantized_model.load_state_dict(torch.load(os.path.join(quantized_model_save_path, 'pytorch_model.bin')))
generator_quantized = pipeline('text-generation', model=quantized_model, tokenizer=tokenizer)

# Generate some text with the quantized model
output_quantized = generator_quantized("Once upon a time", max_length=50)
print(output_quantized)

Pruning step 1/100: current pruning fractions [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pruning step 2/100: current pruning fractions [0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.001, 0.0015, 0.002, 0.0

max_steps is given, it will override any value given in num_train_epochs
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'Once upon a time, what seems to you that the lightest person could make a living being with that power, you felt the power to control it."\n\n\nA little later, with the arrival of the angel of death by his side, it'}]
[{'generated_text': 'Once upon a time and the ancillh, that in time in in of, other\n18\n|- is as only in the thein, by with out, and are, this to in a a a and of; and afterwards as'}]


In [29]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline
import math

# Define the model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load the original and pruned models
original_model = GPT2LMHeadModel.from_pretrained(model_name)
pruned_model = GPT2LMHeadModel.from_pretrained(model_name)

# Set models to evaluation mode
original_model.eval()
pruned_model.eval()

def generate_text(model, tokenizer, prompt, max_length=50):
    """
    Generate text using the model with a given prompt.

    Args:
        model: The model to use for generation.
        tokenizer: The tokenizer to use with the model.
        prompt: The input text to base the generation on.
        max_length: The maximum length of the generated text.

    Returns:
        Generated text.
    """
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    result = generator(prompt, max_length=max_length)
    return result[0]['generated_text']

def calculate_perplexity(model, tokenizer, prompt):
    """
    Calculate perplexity of the model given a prompt.

    Args:
        model: The model to evaluate.
        tokenizer: The tokenizer to use with the model.
        prompt: The input text to evaluate.

    Returns:
        Perplexity score.
    """
    encodings = tokenizer(prompt, return_tensors='pt')
    input_ids = encodings.input_ids
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        perplexity = math.exp(loss.item())
    return perplexity

def run_text_generation_evaluation(model, tokenizer, prompts, max_length=50):
    """
    Evaluate the model's text generation capabilities with a set of prompts.

    Args:
        model: The model to use for generation.
        tokenizer: The tokenizer to use with the model.
        prompts: List of prompts to generate text for.
        max_length: The maximum length of the generated text.

    Returns:
        Dictionary with prompts and their generated texts.
    """
    results = {}
    for prompt in prompts:
        generated_text = generate_text(model, tokenizer, prompt, max_length)
        perplexity = calculate_perplexity(model, tokenizer, prompt)
        results[prompt] = {
            'generated_text': generated_text,
            'perplexity': perplexity
        }
    return results

# Example prompts for evaluation
prompts = [
    "Once upon a time in a distant land",
    "In the near future, technology",
    "The stock market surged today",
    "Artificial intelligence is transforming",
]

# Run evaluation for both original and pruned models
print("Evaluating Original Model:")
generation_results_original = run_text_generation_evaluation(original_model, tokenizer, prompts, max_length=50)

print("\nEvaluating Pruned Model:")
generation_results_pruned = run_text_generation_evaluation(pruned_model, tokenizer, prompts, max_length=50)

# Print results for original model
print("\nText Generation Results for Original Model:")
for prompt, result in generation_results_original.items():
    print(f"Prompt: {prompt}")
    print(f"Generated Text: {result['generated_text']}")
    print(f"Perplexity: {result['perplexity']}")
    print()

# Print results for pruned model
print("\nText Generation Results for Pruned Model:")
for prompt, result in generation_results_pruned.items():
    print(f"Prompt: {prompt}")
    print(f"Generated Text: {result['generated_text']}")
    print(f"Perplexity: {result['perplexity']}")
    print()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Evaluating Original Model:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Evaluating Pruned Model:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Text Generation Results for Original Model:
Prompt: Once upon a time in a distant land
Generated Text: Once upon a time in a distant land, the gods and goddesses were summoned together for a banquet in Aeneid to have the pleasure of drinking wine with their hosts. And the banquet began with the banquet of a little girl named Grecia
Perplexity: 32.54282968895235

Prompt: In the near future, technology
Generated Text: In the near future, technology for a humanized face might be needed for the next generation of augmented reality. One way of doing this would be to make the face as beautiful as possible as well as make the eyes as human-like, such that
Perplexity: 21.24105993908553

Prompt: The stock market surged today
Generated Text: The stock market surged today after the Fed's most recent report released on Friday. The value of the benchmark 10-year Treasury bond, first pegged to the Federal Reserve, rose by $5.9 billion in early trading.

The increase coincides
Perplexity: 278.460117

In [30]:
# Model inference time

import time

def measure_inference_time(generator, prompts, max_length=50):
    """
    Measure the average inference time for generating text with given prompts.

    Args:
        generator: The pipeline used for text generation.
        prompts: List of prompts to generate text for.
        max_length: The maximum length of the generated text.

    Returns:
        Average inference time in seconds.
    """
    total_time = 0
    for prompt in prompts:
        start_time = time.time()
        _ = generator(prompt, max_length=max_length)
        end_time = time.time()
        total_time += (end_time - start_time)
    average_time = total_time / len(prompts)
    return average_time

# Define the model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load the original model
original_model = GPT2LMHeadModel.from_pretrained(model_name)
original_model.eval()
original_generator = pipeline("text-generation", model=original_model, tokenizer=tokenizer)

# Measure inference time for the original model
original_inference_time = measure_inference_time(original_generator, prompts)
print(f"Average Inference Time for Original Model: {original_inference_time:.4f} seconds")

# Load the pruned model
pruned_model = GPT2LMHeadModel.from_pretrained(model_name)
pruned_model.eval()
pruned_generator = pipeline("text-generation", model=pruned_model, tokenizer=tokenizer)

# Measure inference time for the pruned model
pruned_inference_time = measure_inference_time(pruned_generator, prompts)
print(f"Average Inference Time for Pruned Model: {pruned_inference_time:.4f} seconds")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Average Inference Time for Original Model: 2.8854 seconds


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Average Inference Time for Pruned Model: 2.9150 seconds


In [31]:
# Memory usage

import tracemalloc

def measure_memory_usage_tracemalloc(generator, prompts, max_length=50):
    """
    Measure the memory usage during text generation using tracemalloc.

    Args:
        generator: The pipeline used for text generation.
        prompts: List of prompts to generate text for.
        max_length: The maximum length of the generated text.

    Returns:
        Maximum memory usage in MB.
    """
    tracemalloc.start()

    # Generate text
    for prompt in prompts:
        _ = generator(prompt, max_length=max_length)

    snapshot = tracemalloc.take_snapshot()
    top_stats = snapshot.statistics('lineno')

    print("Top 10 lines with highest memory usage:")
    for stat in top_stats[:10]:
        print(stat)

    # Calculate memory usage
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return peak / 1024 / 1024  # Convert to MB

# Measure memory usage for the original model
original_memory_usage_tracemalloc = measure_memory_usage_tracemalloc(original_generator, prompts)
print(f"Memory Usage for Original Model: {original_memory_usage_tracemalloc:.2f} MB")

# Measure memory usage for the pruned model
pruned_memory_usage_tracemalloc = measure_memory_usage_tracemalloc(pruned_generator, prompts)
print(f"Memory Usage for Pruned Model: {pruned_memory_usage_tracemalloc:.2f} MB")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Top 10 lines with highest memory usage:
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541: size=20.9 KiB, count=230, average=93 B
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py:2573: size=15.0 KiB, count=175, average=88 B
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532: size=7824 B, count=49, average=160 B
/usr/local/lib/python3.10/dist-packages/transformers/models/gpt2/modeling_gpt2.py:634: size=6136 B, count=104, average=59 B
/usr/local/lib/python3.10/dist-packages/google/colab/_variable_inspector.py:28: size=4632 B, count=1, average=4632 B
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1322: size=4256 B, count=66, average=64 B
/usr/local/lib/python3.10/dist-packages/transformers/models/gpt2/modeling_gpt2.py:633: size=3363 B, count=57, average=59 B
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:677: size=2183 B, count=37, average=59 B
/usr/local/lib/python3.10/dist

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Top 10 lines with highest memory usage:
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541: size=19.3 KiB, count=210, average=94 B
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py:2573: size=15.1 KiB, count=176, average=88 B
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532: size=7520 B, count=45, average=167 B
/usr/local/lib/python3.10/dist-packages/transformers/models/gpt2/modeling_gpt2.py:634: size=5192 B, count=88, average=59 B
/usr/local/lib/python3.10/dist-packages/google/colab/_variable_inspector.py:28: size=4632 B, count=1, average=4632 B
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1322: size=3039 B, count=47, average=65 B
/usr/local/lib/python3.10/dist-packages/transformers/models/gpt2/modeling_gpt2.py:633: size=2124 B, count=36, average=59 B
/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:2702: size=1829 B, count=31, average=59 B
/usr/lib/python3.10/json/encod