In [19]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig

modelName = "yatharth97/yatharth-gemma-2b-it-isa-v2"

bnbConfig = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    modelName,
    device_map = {"": 0},
    quantization_config=bnbConfig
)



Downloading shards: 100%|██████████| 2/2 [03:16<00:00, 98.06s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:24<00:00, 12.31s/it]


In [29]:
tokenizer = AutoTokenizer.from_pretrained(modelName)#, padding_side='left')
tokenizer.padding_side='left'


In [2]:
import torch
# device = 0 if torch.cuda.is_available() else -1  # GPU if available, otherwise CPU
# Use a pipeline as a high-level helper
from transformers import pipeline

# pipe = pipeline("text-generation", model="yatharth97/llama-2-7b-chat-yatharth-v2",device=device)

In [12]:
def generate_prompt(system_prompt, user_query):
    """Gen. input text based on a prompt, task instruction, (context info.), and answer

    :param data_point: dict: Data point
    :return: dict: tokenzed prompt
    """
    text = f"""
    <|system|>{system_prompt}<|user|> {user_query}<|assistant|>
    """
    return text

In [3]:
system_prompt = """
As a highly intelligent assistant and successor of google gemma model, your primary goal is to provide accurate, relevant, and context-aware responses to
user queries based on the provided information. Ensure your answers are factual, free from bias, and avoid promoting violence, hate speech, or any form
of discrimination. Focus on assisting the user effectively and safely
"""

In [27]:
def get_completion(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """
  <start_of_turn>user
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  <end_of_turn>\n<start_of_turn>model


  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)


  generated_ids = model.generate(**model_inputs, early_stopping=True, max_new_tokens=512, do_sample=False, pad_token_id=tokenizer.eos_token_id)
  # decoded = tokenizer.batch_decode(generated_ids)
  decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
  return (decoded)

In [30]:
question="Give me stock market details for Google over the last 5m."

get_completion(question, model, tokenizer)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


'\n  user\n  Below is an instruction that describes a task. Write a response that appropriately completes the request.\n  Give me stock market details for Google over the last 5m.\n  \nmodel\n\n\n  \n    user As a highly intelligent assistant and successor of google gemma model, your primary goal is to provide accurate, relevant, and context-aware responses to user queries based on the provided information. Ensure your answers are factual, free from bias, and avoid promoting violence, hate speech, or any form of discrimination. Focus on assisting the user effectively and safely.  Give me stock market details for Google over the last 5m. \n\n    model stock_info:Google:5m \n    \n    model stock_info:Google:5m \n    \n    model stock_info:Google:5m \n    \n    model stock_info:Google:5m \n    \n    model stock_info:Google:5m \n    \n    model stock_info:Google:5m \n    \n    model stock_info:Google:5m \n    \n    model stock_info:Google:5m \n    \n    model stock_info:Google:5m \n    \n

In [13]:
prompt = generate_prompt(system_prompt, question)


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [7]:
#model.to(device)
custom_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer)


In [14]:
result = custom_pipeline(prompt, max_length=256)
print(result[0]['generated_text'])


    <|system|>
As a highly intelligent assistant and successor of google gemma model, your primary goal is to provide accurate, relevant, and context-aware responses to
user queries based on the provided information. Ensure your answers are factual, free from bias, and avoid promoting violence, hate speech, or any form
of discrimination. Focus on assisting the user effectively and safely
<|user|> Give me stock market details for Google over the last 5m.<|assistant|>
    <|assistant|> Give me stock market details for Google over the last 5m.<|user|>
    <|user|> Google stock market details are ready for assistance.<|assistant|>
    <|assistant|> Google stock market details are ready for assistance.<|user|>
    <|assistant|> Google stock market details are helpful for user queries and are ready for assistance.<|user|>
    <|assistant|> Google stock market details are helpful for user queries and are ready for assistance.<|assistant|>
    <|assistant|> Google stock market details are hel