In [7]:
from pathlib import Path
import wandb

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

## Save Model and Tokenizer

In [3]:
# Load the OPT-125M causal LM by its hub id; it is used below to try out saving and reloading.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

In [4]:
# Display the device the freshly loaded model is on.
model.device

device(type='cpu')

Note: calling `save_pretrained` on the model saves the model only; no tokenizer is saved alongside it.

In [11]:
# Relative directory the model is written to, and later reloaded from.
save_name = "models/opt_test"

In [14]:
# Save the model to `save_name`, requesting safetensors serialization.
model.save_pretrained(save_name, safe_serialization=True)

In [15]:
# Round-trip check: reload the model from the directory it was just saved to.
model = AutoModelForCausalLM.from_pretrained(save_name)

In [20]:
# Display where the reloaded model's config says it was loaded from.
model.config.name_or_path

'models/opt_test'

## From Artifact

In [2]:
# W&B artifact reference of the model to load, passed to load_model_from_artifact below.
model_at = 'reviewco/alpaca_ft/1adjrp3l_meta-llama_Llama-2-7b-hf:v0'

In [4]:
def model_class(model_path):
    """Pick the loader class for a downloaded model directory.

    Args:
        model_path: directory holding the model files, as a `str` or a
            `pathlib.Path`.

    Returns:
        `AutoPeftModelForCausalLM` if any entry directly inside `model_path`
        has "adapter" in its name, otherwise `AutoModelForCausalLM`.
    """
    # Coerce so plain string paths work too; `str` has no `.glob`.
    model_path = Path(model_path)
    # NOTE(review): AutoPeftModelForCausalLM is not imported in the visible
    # cells; confirm it is in scope (presumably `from peft import ...`),
    # otherwise this branch raises NameError.
    if any(model_path.glob("*adapter*")):
        return AutoPeftModelForCausalLM
    return AutoModelForCausalLM

In [16]:
def load_model_from_artifact(model_at, **model_kwargs):
    """Load a model and its tokenizer from a W&B model artifact.

    The artifact is fetched through the active run (`wandb.use_artifact`) when
    there is one, otherwise through the public API. The loader class is chosen
    by `model_class` from the downloaded files.

    If the tokenizer cannot be loaded from the artifact directory, a
    pretrained one is loaded from the HF hub instead, using the `model_id`
    found in the artifact metadata or, failing that, in the config of the run
    that logged the artifact.

    Args:
        model_at: W&B artifact name of the model.
        **model_kwargs: forwarded unchanged to the model's `from_pretrained`.

    Returns:
        A `(model, tokenizer)` tuple. The tokenizer's `pad_token` is set to
        its `eos_token`.

    Raises:
        ValueError: if the tokenizer fallback is needed but no `model_id` is
            recorded on the artifact or its producing run.
    """
    if wandb.run:
        artifact = wandb.use_artifact(model_at, type="model")
    else:
        artifact = wandb.Api().artifact(model_at, type="model")
    artifact_dir = Path(artifact.download())

    model = model_class(artifact_dir).from_pretrained(artifact_dir, **model_kwargs)

    try:
        tokenizer = AutoTokenizer.from_pretrained(artifact_dir)
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit are not mistaken for a missing tokenizer.
        model_id = artifact.metadata.get("model_id")
        if model_id is None:
            producer_run = artifact.logged_by()
            if producer_run is not None:
                model_id = producer_run.config.get("model_id")
        if model_id is None:
            raise ValueError(
                f"Could not load a tokenizer from {artifact_dir} and no 'model_id' "
                "was found in the artifact metadata or its producing run's config."
            ) from err
        print(f"Tokenizer not found.\nLoading a pretrained tokenizer from the HF-hub: {model_id}")
        tokenizer = AutoTokenizer.from_pretrained(model_id)

    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

In [17]:
# Try the loader on the artifact; device_map="cpu" is forwarded to from_pretrained via **model_kwargs.
load_model_from_artifact(model_at, device_map="cpu")

[34m[1mwandb[0m: Downloading large artifact 1adjrp3l_meta-llama_Llama-2-7b-hf:v0, 12852.56MB. 5 files... 
[34m[1mwandb[0m:   5 of 5 files downloaded.  
Done. 0:0:18.0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Tokenizer not found.
Loading a pretrained tokenizer from the HF-hub: meta-llama/Llama-2-7b-hf


Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


(LlamaForCausalLM(
   (model): LlamaModel(
     (embed_tokens): Embedding(32000, 4096)
     (layers): ModuleList(
       (0-31): 32 x LlamaDecoderLayer(
         (self_attn): LlamaAttention(
           (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
           (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
           (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
           (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
           (rotary_emb): LlamaRotaryEmbedding()
         )
         (mlp): LlamaMLP(
           (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
           (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
           (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
           (act_fn): SiLUActivation()
         )
         (input_layernorm): LlamaRMSNorm()
         (post_attention_layernorm): LlamaRMSNorm()
       )
     )
     (norm): Ll

## Eval

In [5]:
from functools import partial
from time import perf_counter
from types import SimpleNamespace

import wandb
import torch
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

from llm_recipes.utils import parse_args, load_model_from_artifact
from llm_recipes.data import get_alpaca_split, create_alpaca_prompt

# W&B project the eval run is logged to, and the artifact references of the
# dataset and the model under evaluation.
WANDB_PROJECT = "alpaca_ft"
DATASET_AT = 'capecape/alpaca_ft/alpaca_gpt4_splitted:v4'
MODEL_AT = 'reviewco/alpaca_ft/1adjrp3l_meta-llama_Llama-2-7b-hf:v0'

# Maps the short dtype names usable in `config.torch_dtype` to torch dtypes.
TORCH_DTYPES = {"bf16":torch.bfloat16, "fp16":torch.float16, "fp32":torch.float}

# Every setting of the eval run in one place; the whole namespace is also
# passed to wandb.init as the run config.
config = SimpleNamespace(
    wandb_project=WANDB_PROJECT,
    dataset_at=DATASET_AT,
    model_at=MODEL_AT,
    use_cache=True,  # forwarded to the model loader as a from_pretrained kwarg
    device_map="auto",
    torch_dtype="bf16",  # must be a key of TORCH_DTYPES
    num_eval_samples=10,  # how many eval examples are run through the model
)

def _generate(prompt, model, tokenizer, gen_config):
    tokenized_prompt = tokenizer(prompt, return_tensors='pt')['input_ids'].to(model.device)
    with torch.inference_mode():
        t0 = perf_counter()
        output = model.generate(tokenized_prompt, generation_config=gen_config)
        generation_ids = output[0][len(tokenized_prompt[0]):]
        num_gen_tokens = len(generation_ids)
        time = perf_counter() - t0
        generation = tokenizer.decode(generation_ids, skip_special_tokens=True)
        return dict(generation=generation, generation_ids=generation_ids, time=time, num_gen_tokens=num_gen_tokens)
    
def prompt_table(examples, model, tokenizer, gen_config):
    """Run every example through the model and collect the results in a `wandb.Table`.

    Args:
        examples: iterable of mappings with a "prompt" and an "output" key;
            "output" is the reference answer and is stored as the label.
        model, tokenizer, gen_config: passed straight to `_generate`.

    Returns:
        A `wandb.Table` with one row per example. Columns are "prompt",
        "label", the keys of the dict returned by `_generate`, then the keys of
        `gen_config.to_dict()`. If `examples` is empty the table has no rows
        and no `_generate` columns, since no generation ran.
    """
    # The generation settings are the same for every row; compute them once.
    gen_params = gen_config.to_dict()
    gen_keys, gen_values = list(gen_params.keys()), list(gen_params.values())

    table = None
    for example in tqdm(examples, leave=False):
        prompt, label = example["prompt"], example["output"]
        output = _generate(prompt, model, tokenizer, gen_config)
        if table is None:
            # Take the column names from the first real result rather than
            # paying for a throw-away generation just to read its keys.
            table = wandb.Table(columns=["prompt", "label"] + list(output.keys()) + gen_keys)
        table.add_data(prompt, label, *output.values(), *gen_values)

    if table is None:
        table = wandb.Table(columns=["prompt", "label"] + gen_keys)
    return table
    

In [7]:
# Command-line overrides are disabled in the notebook; `config` is used as defined above.
# parse_args(config)
wandb.init(project=config.wandb_project, job_type="eval", config=config)

# Only the second element of the split is kept, as the evaluation set.
_, eval_dataset = get_alpaca_split(dataset_at=config.dataset_at)

# NOTE(review): this three-value unpacking relies on the load_model_from_artifact
# imported from llm_recipes.utils in the import cell. The function of the same
# name defined earlier in this notebook returns only (model, tokenizer), so this
# cell fails if that definition is the one in scope.
model, tokenizer, artifact_dir = load_model_from_artifact(
    config.model_at, 
    use_cache=config.use_cache,
    device_map=config.device_map,
    torch_dtype=TORCH_DTYPES[config.torch_dtype],
)
# Trailing `;` suppresses the model repr in the cell output.
model.eval();

[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   2 of 2 files downloaded.  


{'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nDevelop a script that prints out the Fibonacci sequence.\n\n### Response:\n', 'output': 'Here is a Python script that prints out the Fibonacci sequence:\n\n```\n# number of elements in the sequence\nn = int(input("Enter the number of elements in the sequence: "))\n\n# initial two terms\na = 0\nb = 1\n\n# define a loop to generate the sequence\nif n <= 0:\n    print("Invalid input. The number of elements must be greater than 0.")\n\nelif n == 1:\n    print(a)\n\nelse:\n    print(a, b, end=" ")  # first two elements of the sequence\n    for i in range(3, n+1):\n        c = a + b\n        print(c, end=" ")\n        a = b\n        b = c\n```\n\nTo use, enter the number of elements you want in the sequence when prompted. The script will print out the sequence up to the specified number of elements.'}


[34m[1mwandb[0m: Downloading large artifact 1adjrp3l_meta-llama_Llama-2-7b-hf:v0, 12852.56MB. 5 files... 
[34m[1mwandb[0m:   5 of 5 files downloaded.  
Done. 0:0:18.0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Tokenizer not found.
Loading a pretrained tokenizer from the HF-hub: meta-llama/Llama-2-7b-hf


In [12]:
# Generation settings: read the generation config from the downloaded artifact
# directory, with max_new_tokens=256 passed on top.
gen_config = GenerationConfig.from_pretrained(artifact_dir, max_new_tokens=256)
config.gen_config = gen_config  # attach to config so it is included in the wandb config update below
wandb.config.update(config)

In [28]:
# Check the sampling temperature that came with the loaded generation config.
gen_config.temperature

0.6

In [13]:
print("Running inference on your model!")
# Take the first `num_eval_samples` entries of the eval set.
# NOTE(review): prompt_table iterates these and reads example["prompt"] / example["output"],
# so the slice is assumed to yield a sequence of dicts — confirm against get_alpaca_split.
eval_samples = eval_dataset[:config.num_eval_samples]
table = prompt_table(eval_samples, model=model, tokenizer=tokenizer, gen_config=gen_config)

Running inference on your model!


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Log the predictions table to the active run under the key "eval_predictions".
# NOTE(review): this cell has no execution count while the following wandb.finish() cell does,
# so the table may not have been logged in the recorded session — confirm.
wandb.log({"eval_predictions":table})

In [14]:
# Close the eval run started by wandb.init above.
wandb.finish()

In [15]:
import pandas as pd
# Copy the table's rows and column names into a DataFrame for local inspection.
df = pd.DataFrame(data=table.data, columns=table.columns)

## GPT3.5 results

In [28]:
from types import SimpleNamespace
import wandb
import pandas as pd

In [29]:
# Settings recorded as the config of the W&B run that logs the GPT-3.5 results.
# NOTE(review): presumably the settings the CSV results were generated with — the generation
# itself is not part of this notebook, confirm.
gpt35_config = SimpleNamespace(
    max_tokens=256,
    system_prompt="You are a helpful assistant.",
    temperature=1,
)

In [30]:
# Same project name as in the Eval section; redefined so this section can run on its own.
WANDB_PROJECT = "alpaca_ft"
# W&B entity the GPT-3.5 run is logged under.
ENTITY = "reviewco"

In [31]:
# Read the GPT-3.5 results from a CSV expected in the current working directory.
gpt35_df = pd.read_csv("gpt35_test_results.csv")

In [32]:
# Preview the first rows of the loaded results.
gpt35_df.head()

Unnamed: 0,prompt,generation
0,"Below is an instruction that describes a task,...",Based on the elevated levels of white blood ce...
1,"Below is an instruction that describes a task,...",Estamos emocionados de trabajar contigo en est...
2,Below is an instruction that describes a task....,Here are five potential threats to digital sec...
3,Below is an instruction that describes a task....,One potential area for improvement for my favo...
4,Below is an instruction that describes a task....,The purpose behind drug testing in professiona...


In [33]:
# Wrap the DataFrame in a wandb.Table so it can be logged to a run.
gpt35_table = wandb.Table(dataframe=gpt35_df)

In [34]:
# Log the table in a run of its own, tagged "gpt-3.5"; used as a context manager so the
# run is closed when the block exits.
with wandb.init(project=WANDB_PROJECT, entity=ENTITY, job_type="inference", tags=["gpt-3.5"], config=gpt35_config):
    wandb.log({"gpt35_results":gpt35_table})

[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m ([33mreviewco[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [35]:
import torch

In [36]:
# Scratch tensor for checking `.tolist()` below. Placed on the GPU when one is
# available; falls back to the CPU so the cell also runs on machines without CUDA.
t = torch.tensor([1,2,3], device="cuda" if torch.cuda.is_available() else "cpu")

In [42]:
# Convert the tensor to a plain Python list.
t.tolist()

[1, 2, 3]

## OPT

In [1]:
from transformers import AutoModelForCausalLM

In [2]:
# Hub id of the OPT checkpoint inspected in this section.
model_id = "facebook/opt-350m"

In [4]:
# Load the checkpoint; note this rebinds `model`, replacing the one used in earlier sections.
model = AutoModelForCausalLM.from_pretrained(model_id)

In [5]:
# Display the module tree to look up submodule names.
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

In [49]:
def _find_mod(model, module_name="layers"):
    for name, mod in model.named_modules():
        if name.endswith(module_name):
            return mod

In [50]:
# Look up the token-embedding submodule by name.
ls = _find_mod(model, "embed_tokens")

In [51]:
# Display the submodule that was found.
ls

Embedding(50272, 512, padding_idx=1)