In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModel,
    BitsAndBytesConfig,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


[2023-12-01 16:18:10,826] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
import torch
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoTokenizer

In [3]:
import deepspeed
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Load the 4-bit quantised base model, attach the fine-tuned PEFT adapter stored
# in the selected checkpoint directory, and merge the adapter into the base
# weights so that generation runs on a plain causal-LM module.

base_dir = "llama-linear-layers-all-conv-Nov-30-2"
checkpoint_number = "checkpoint-700"
checkpoint_path = f"{base_dir}/{checkpoint_number}"

model_name = "NousResearch/Llama-2-7b-chat-hf"

compute_dtype = torch.float16

# 4-bit NF4 weights with double quantisation; matmuls are computed in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# use_cache=False is a training-time setting (it is usually paired with gradient
# checkpointing). Left in the model config for inference it can switch off the
# key/value cache during generate(), so every new token re-encodes the whole
# sequence. This notebook only generates, so the cache is kept enabled.
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    use_cache=True,
    device_map="auto",
)

# Wrap the base model with the adapter weights from the checkpoint ...
adapter_model = PeftModel.from_pretrained(original_model, checkpoint_path)

# ... then fold the adapter into the base layers and drop the PEFT wrapper.
# `peft_model` is the merged model that the rest of the notebook uses.
peft_model = adapter_model.merge_and_unload()

# Alternative loading path that was tried and disabled:
#   AutoPeftModelForCausalLM.from_pretrained(
#       checkpoint_path, low_cpu_mem_usage=True,
#       torch_dtype=torch.float16, load_in_4bit=True)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:12<00:00,  6.01s/it]


In [4]:
# Display the module tree of the merged model as a sanity check of what was loaded.
peft_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


In [5]:
# Disabled experiment: the DeepSpeed inference setup below is wrapped in a bare
# string literal, so none of it executes. Because the string is the last
# expression of the cell, the notebook only echoes its text as the cell output.
'''config = {
    #"kernel_inject": True,
    #"tensor_parallel": {"tp_size": 4},
    "dtype": torch.float16 #"fp16",
    #"enable_cuda_graph": False
}

ds_engine = deepspeed.init_inference(peft_model, config=config) #config=config
deepspeed_peft_model = ds_engine.module'''

'config = {\n    #"kernel_inject": True,\n    #"tensor_parallel": {"tp_size": 4},\n    "dtype": torch.float16 #"fp16",\n    #"enable_cuda_graph": False\n}\n\nds_engine = deepspeed.init_inference(peft_model, config=config) #config=config\ndeepspeed_peft_model = ds_engine.module'

In [6]:
#deepspeed_peft_model

In [7]:
# Load the tokenizer from the adapter checkpoint directory (checkpoint_path).
# The base model id (model_name) was the alternative considered here.
# NOTE(review): this presumes the checkpoint directory contains tokenizer files - confirm.
restored_tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

In [8]:
import pandas as pd
import time

# Read the tab-separated evaluation set. The "conversations" column is the one
# consumed by the generation cell further down.
eval_csv_path = "../data/eval_dataset.csv"
eval_conv = pd.read_csv(eval_csv_path, sep="\t")

# Bare last expression so the notebook renders the frame.
eval_conv

Unnamed: 0,conversations,tensor_size
0,\n[INST]\ni am trying to use the infinium huma...,179
1,\n[INST]\ni wanted to run genes to transcript ...,103
2,"\n[INST]\nhi dears, i processed my pseudomonas...",219
3,"\n[INST]\ndear all, i have cutadapt raw output...",167
4,"\n[INST]\nhi all, i am currently analysing dat...",572
...,...,...
183,"\n[INST]\nhello, . i am working with candida g...",192
184,\n[INST]\ni have been trying to create a galax...,108
185,\n[INST]\ni am trying to follow this transcrip...,179
186,\n[INST]\nhello. two questions does anyone kno...,267


In [9]:
# Generate answers for the first few evaluation conversations with the merged
# model, print each one next to its reference answer, and save all results as a
# tab-separated file.
# NOTE: generation samples (do_sample=True) and no seed is set here, so the
# generated answers differ between runs.

start_marker = '[INST]'
end_marker = '[/INST]'
end_tag = ""  # optional trailing tag to drop from the reference answer; empty = none
max_examples = 6  # number of rows to generate for; None runs the whole evaluation set


def split_conversation(conversation, start=start_marker, end=end_marker, tail=end_tag):
    """Split one conversation string into (instruction, prompt, reference_answer).

    Args:
        conversation: text holding ``start`` + instruction + ``end`` + answer.
        start: marker that opens the instruction.
        end: marker that closes the instruction.
        tail: optional tag to remove from the end of the answer ("" removes nothing).

    Returns:
        A tuple ``(instruction, prompt, reference_answer)``. ``instruction`` is the
        text between the markers, ``prompt`` is the same span including both
        markers, and ``reference_answer`` is everything after the ``end`` marker.
        All three are whitespace-stripped.

    Raises:
        ValueError: if ``conversation`` is not a string or a marker is missing.
            Slicing with the -1 that ``str.find`` returns would otherwise yield a
            wrong prompt and answer without any error.
    """
    if not isinstance(conversation, str):
        raise ValueError("conversation is not a string: {!r}".format(conversation))
    start_index = conversation.find(start)
    if start_index == -1:
        raise ValueError("start marker {!r} not found".format(start))
    # Look for the closing marker only after the opening one.
    end_index = conversation.find(end, start_index + len(start))
    if end_index == -1:
        raise ValueError("end marker {!r} not found".format(end))

    instruction = conversation[start_index + len(start):end_index].strip()
    prompt = conversation[start_index:end_index + len(end)].strip()
    # Keep the whole answer: trailing whitespace is handled by strip() and the
    # optional tag is removed only when it is actually present, so the last
    # character of the answer is never cut off.
    answer = conversation[end_index + len(end):].strip()
    if tail and answer.endswith(tail):
        answer = answer[:-len(tail)].strip()
    return instruction, prompt, answer


predictions = []
original_instructions = []
ground_truth_answer = []
extracted_answers = []

s_time = time.time()

# Limit by row count instead of comparing index labels to a magic number.
eval_rows = eval_conv if max_examples is None else eval_conv.head(max_examples)

for example_number, (ri, row) in enumerate(eval_rows.iterrows(), start=1):
    try:
        instruction, prompt, original_answer = split_conversation(row["conversations"])
    except ValueError as err:
        print("skipping row {}: {}".format(ri, err))
        continue

    print("encoding prompt number {}...".format(example_number))
    input_ids = restored_tokenizer.encode(prompt, return_tensors="pt").to('cuda')
    print("generating response number {} ...".format(example_number))
    outputs = peft_model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
    )

    # The decoded sequence contains the prompt followed by the generated text.
    pred = restored_tokenizer.decode(outputs[0])
    marker_index = pred.find(end_marker)
    if marker_index != -1:
        extracted_pred = pred[marker_index + len(end_marker):].strip()
    else:
        # The marker did not survive decoding: fall back to decoding only the
        # tokens that follow the prompt.
        new_tokens = outputs[0][input_ids.shape[-1]:]
        extracted_pred = restored_tokenizer.decode(new_tokens).strip()

    # Append to all four lists together so they stay aligned row by row.
    original_instructions.append(instruction)
    ground_truth_answer.append(original_answer)
    predictions.append(pred)
    extracted_answers.append(extracted_pred)

    print("Prompt: \n")
    print(prompt)
    print()
    print("Ground truth answer: \n")
    print(original_answer)
    print()
    print("Generated answer: \n")
    print(extracted_pred)
    print()
    print("====================")

output_file_name = "generated_answers_peft_{}_{}".format(base_dir, checkpoint_number)

pred_dataframe = pd.DataFrame(
    {
        "Instructions": original_instructions,
        "Ground truth answers": ground_truth_answer,
        "Predicted answers": extracted_answers,
        "Full generated answers": predictions,
    }
)

# index=False is the documented way to omit the row index from the file.
pred_dataframe.to_csv("../data/{}.csv".format(output_file_name), sep="\t", index=False)

e_time = time.time()

print("Finished generation in {} seconds".format(e_time - s_time))

encoding prompt number 1...
generating response number 1 ...




Prompt: 

[INST]
i am trying to use the infinium human methylation beadchip workflow in galaxy with idat files from an illumina epic chip. it is based on minfi, which i usually run in rstudio but wanted to give it a try. it is ok with the idat files ant the phenotype table, but then i am asked for a gtf genome table (in the example wgencodehaibmethyl450gm12878sitesrep1). what should this genome table be for an epic chip?

[/INST]

Ground truth answer: 

welcome, @ettoremeccia instructions are in the tutorial associated with the workflow in this section: galaxy training network galaxy training: infinium human methylation beadchip dna methylation is an epigenetic mechanism used by higher..

Generated answer: 

The Infinium HumanMethylation Chip workflow in Galaxy uses the UCSC hg19 genome assembly as a reference, so you will need to create a custom genome table for your Epic chip data.

The gtf genome table is a tab-delimited text file that contains information about the genomic location