In [15]:
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

In [16]:
DEFAULT_BASE_MODEL = "../MODELS/gpt2-medium-local/"            # local base model folder (written with save_pretrained)
MY_MODEL_DIR = "../MODELS/Project_Mozart_gpt2-medium"        # where LoRA adapters live
CACHE_DIR = "../MODELS/"   
# Pick the device with the same rule the loader's `device` default uses, so that
# tensors created in later cells end up on the same device as the model even on
# a machine without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [19]:
def load_model_and_tokenizer(
    my_model_dir: str = MY_MODEL_DIR,
    base_model_dir: str = DEFAULT_BASE_MODEL,
    cashed_dir: str = CACHE_DIR,
    device: str =  "cuda" if torch.cuda.is_available() else "cpu",
):
    """Load the base causal LM, attach the LoRA adapter, and return both with the tokenizer.

    Args:
        my_model_dir: Directory the tokenizer and the PEFT adapter are loaded from
            (both loads use ``local_files_only=True``).
        base_model_dir: Directory (or model identifier) of the base causal LM.
        cashed_dir: Passed as ``cache_dir`` to the tokenizer and base-model loaders.
            The spelling is kept because it is part of the public signature.
        device: Device the base model and the adapted model are moved to. The
            default is computed once, when the function is defined.

    Returns:
        ``(model, tokenizer)`` where ``model`` is the ``PeftModel`` in eval mode.
    """
    # 1. Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=my_model_dir,
        local_files_only=True,
        cache_dir=cashed_dir,
    )

    # Half precision is only a safe choice on CUDA. When the device falls back to
    # CPU, float16 kernels are missing or very slow in many PyTorch builds, so
    # load full-precision weights there instead.
    weights_dtype = torch.float16 if str(device).startswith("cuda") else torch.float32

    # 2. Load model
    base_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=base_model_dir,
        cache_dir=cashed_dir,
        dtype=weights_dtype,
    ).to(device)

    # Resize embeddings if tokenizer has added tokens. This must happen before the
    # adapter is attached so the base model's vocabulary size matches the tokenizer.
    base_model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

    # 3. Apply LoRA
    model = PeftModel.from_pretrained(
        base_model,
        my_model_dir,
        local_files_only=True,
    ).to(device)

    # Inference only: switch off dropout and other train-time behaviour.
    model.eval()
    return model, tokenizer

In [20]:
# Load the LoRA-adapted model and its tokenizer using the default paths and
# device declared in load_model_and_tokenizer's signature.
model, tokenizer = load_model_and_tokenizer()

In [24]:
prompt = "generate a piece that is around 1 minute long"

# 1. Tokenize input
# Move the tensors to the device the model actually sits on, rather than a
# separately configured constant, so the two can never disagree.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# 2. Generate text
outputs = model.generate(
    **inputs,
    max_length=50,        # total length (input + generated)
    do_sample=True,       # sampling instead of greedy
    temperature=0.7,      # randomness
    top_p=0.9,            # nucleus sampling
    # Name the padding token explicitly instead of relying on generate()'s
    # implicit fallback to eos_token_id, which emits a warning on every call.
    pad_token_id=tokenizer.eos_token_id,
)

# 3. Decode
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


generate a piece that is around 1 minute long. If you're interested in this instrument, check out the Piano Bozak: http://www.bozak-1.org/index.html
