# подключение модели


In [27]:
# подключение модели
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")


input_ids = tokenizer("Eifel tower is located in", return_tensors="pt").to("cuda")
# ------------------
start_time = time.time()
output = model.generate(**input_ids, cache_implementation="static")
end_time = time.time()

print(tokenizer.decode(output[0], skip_special_tokens=True))


# ------------------
elapsed_time = end_time - start_time
print('Elapsed time: ', elapsed_time)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
W0711 08:12:04.869000 14023 torch/_dynamo/convert_frame.py:906] [0/8] torch._dynamo hit config.cache_size_limit (8)
W0711 08:12:04.869000 14023 torch/_dynamo/convert_frame.py:906] [0/8]    function: 'forward' (/usr/local/lib/python3.11/dist-packages/transformers/models/gpt2/modeling_gpt2.py:1148)
W0711 08:12:04.869000 14023 torch/_dynamo/convert_frame.py:906] [0/8]    last reason: 0/6: Cache line invalidated because L['past_key_values'].key_cache[11] got deallocated
W0711 08:12:04.869000 14023 torch/_dynamo/convert_frame.py:906] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0711 08:12:04.869000 14023 torch/_dynamo/convert_frame.py:906] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.


RecompileLimitExceeded: cache_size_limit reached

# pruning


In [24]:
# pruning
import os
import torch
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# ----------------------------
# Configuration
# ----------------------------
MODEL_NAME = "gpt2"
OUTPUT_DIR = "./pruned_gpt2"


# ----------------------------
# Helper Functions
# ----------------------------

PRUNE_PERCENTILE = 20

def get_total_size(model):
  #  size
    total_size_bytes = sum(p.element_size() * p.nelement() for p in model.parameters() if p is not None)
    return total_size_bytes / (1024 ** 3)

def is_linear_layer(name, module):
    """Check if module is a linear layer and should be pruned."""
    return isinstance(module, torch.nn.Linear) and 'wpe' not in name and 'wte' not in name and 'ln' not in name

def count_nonzero_params(model):
    """Count number of non-zero parameters."""
    return sum(p.abs().gt(0).sum().item() for p in model.parameters() if p is not None)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME).to(device)

original_size = get_total_size(model)
print(f"Original size: {original_size:.2f} GB")

total_params = sum(p.numel() for p in model.parameters() if p is not None)
print(f"Total parameters: {total_params:,}")

with torch.no_grad():
    for name, module in model.named_modules():
        if is_linear_layer(name, module):
            weight = module.weight.data
            threshold = np.percentile(torch.abs(weight).cpu().numpy(), PRUNE_PERCENTILE)
            mask = torch.abs(weight) < threshold
            module.weight.data[mask] = 0.0

nonzero_after_prune = count_nonzero_params(model)
sparsity = (1 - nonzero_after_prune / total_params) * 100
print(f"True sparsity after pruning: {sparsity:.2f}%")

pruned_size = get_total_size(model)
print(f"Pruned size (in memory): {pruned_size:.2f} GB")

os.makedirs(OUTPUT_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model and tokenizer saved to {OUTPUT_DIR}")


Original size: 0.46 GB
Total parameters: 124,439,808
True sparsity after pruning: 6.20%
Pruned size (in memory): 0.46 GB
Model and tokenizer saved to ./pruned_gpt2


# test with pruned model 1.0

In [29]:
# test with pruned model
import torch
import time
from transformers import GPT2LMHeadModel, GPT2Tokenizer


PRUNED_MODEL_DIR = "./pruned_gpt2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


tokenizer = GPT2Tokenizer.from_pretrained(PRUNED_MODEL_DIR)
model = GPT2LMHeadModel.from_pretrained(PRUNED_MODEL_DIR).to(DEVICE)
model.eval()

def generate_text(prompt, max_length=50, top_k=50, do_sample=True):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            top_k=top_k,
            do_sample=do_sample,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


test_prompt = "Eifel tower is located in"

# ---time--- #

start_time = time.time()
generated_text = generate_text(test_prompt)
end_time = time.time()
# ---------------------

print(generated_text)

elapsed_time = end_time - start_time
print('Elapsed time: ', elapsed_time)


Eifel tower is located in a circular valley. It is located on the northwest of the town. Once you have passed the town, you can start walking towards the west side of town. You can walk across one of three ways: 1)
Elapsed time:  1.0449903011322021
