## Test Main function

In [None]:
from llmtrack import get_llm

llm = get_llm("openai/gpt-4o-mii")
print(llm.generate("generate a positve word"))


## Test groq API

In [1]:
import os
from groq import Groq
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
completion = client.chat.completions.create(
        messages=[
        {
            "role": "user",
            "content": "Explain the importance of fast language models",
        }
    ],
    model="llama3-8b-8192",
    temperature= 0.7,
    max_tokens=20,
    top_p=0.9,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
    n=1,
)

In [8]:
try:
    print(completion.choices[0].message.content)
except:
    print("Error")
    
try:
    usage = completion.usage.to_dict()
    print(usage)
    assert "prompt_tokens" in usage
    assert "completion_tokens" in usage
    assert "total_tokens" in usage
except:
    print("Error")

Fast language models, also known as efficient language models or accelerated language models, have gained significant attention in
{'completion_tokens': 20, 'prompt_tokens': 18, 'total_tokens': 38, 'completion_time': 0.016666667, 'prompt_time': 0.006427718, 'queue_time': 0.20464392, 'total_time': 0.023094385}


## Test Huggingface Transformers

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import time
from threading import Thread

device ='mps'
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", padding_side="left")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", device_map=device)
model=model.eval()
tokenizer.pad_token=tokenizer.eos_token
model.generation_config.pad_token_id=tokenizer.eos_token_id



  from .autonotebook import tqdm as notebook_tqdm


In [50]:
verb_time = True
user_input= "who are you?"
chats=[[{"role": "system", "content": "You are a helpful assistant"}, 
      {"role": "user", "content": user_input}]]

# tokenization
if verb_time:
    t0_1=time.time()
encoded_input=tokenizer.apply_chat_template(chats*2, return_tensors="pt" ,
                                        add_generation_prompt=True,
                                        padding=True,
                                        return_dict=True).to(device)
# print(encoded_input['input_ids'])
print(tokenizer.batch_decode(encoded_input['input_ids'])[0])
# print(tokenizer.batch_decode(encoded_input['input_ids'])[1])


if verb_time:
    t0_2=time.time()
    print("Tokenization time:",t0_2 - t0_1)


tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   2705,   5020,    220,   2366,     19,    271,   2675,    527,
            264,  11190,  18328, 128009, 128006,    882, 128007,    271,  14965,
            527,    499,     30, 128009, 128006,  78191, 128007,    271],
        [128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   2705,   5020,    220,   2366,     19,    271,   2675,    527,
            264,  11190,  18328, 128009, 128006,    882, 128007,    271,  14965,
            527,    499,     30, 128009, 128006,  78191, 128007,    271]],
       device='mps:0')
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 06 Oct 2024

You are a helpful assistant<|eot_id|><|start_header_id|>

In [55]:
# inference
generation_config=dict(           
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        max_new_tokens =50,
        max_length=50,
        num_return_sequences=1,  
        do_sample=False,
        temperature=0.9,
        top_p=0.7,
        top_k=40,
        num_beams=1,)
if verb_time:
    t1=time.time()

generation_output = model.generate(
    encoded_input["input_ids"], 
    attention_mask=encoded_input["attention_mask"],
    output_scores= False,
    return_dict_in_generate=True,
    **generation_config,
)

if verb_time:
    t2=time.time()
    print ("Inference time:", t2-t1)


Both `max_new_tokens` (=50) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Inference time: 1.9465217590332031


In [58]:
print(generation_output.keys())
# print(generation_output.scores[0].shape)
print('All: ')
print(tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True) )

print("Only LLM Generation: ")
decoded_texts = [ tokenizer.decode(generation_output.sequences[i][len(encoded_input["input_ids"][i]):], skip_special_tokens=True) for i in range(len(generation_output.sequences))]
print(decoded_texts)



odict_keys(['sequences', 'past_key_values'])
All: 
['system\n\nCutting Knowledge Date: December 2023\nToday Date: 06 Oct 2024\n\nYou are a helpful assistantuser\n\nwho are you?assistant\n\nI\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."', 'system\n\nCutting Knowledge Date: December 2023\nToday Date: 06 Oct 2024\n\nYou are a helpful assistantuser\n\nwho are you?assistant\n\nI\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."']
Only LLM Generation: 
['I\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."', 'I\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."']


In [5]:
lst = [0,1,2,]
lst[0:1]

[0]

In [20]:
# model = HFModel(model_name= "meta-llama/Llama-3.2-1B-Instruct", )

model._ssgenerate('who are you?', verb_time=True, num_return_sequences=2) 

Both `max_new_tokens` (=2048) and `max_length`(=2048) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Tokenization time: 0.0013570785522460938


Both `max_new_tokens` (=2048) and `max_length`(=2048) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Inference time: 1.641625165939331
Tokenization time: 0.0009062290191650391
Inference time: 19.768184900283813


GenerateOutput(text=['I\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."', "I'm an artificial intelligence assistant, and I'm here to provide information, answer questions, and help with tasks to the best of my abilities. I'm a large language model, which means I was trained on a massive dataset of text from the internet, books, and other sources.\n\nI don't have a personal identity or emotions like humans do, but I'm designed to be helpful and assist with a wide range of topics and requests. My main goal is to provide accurate and reliable information, and to help users like you get the information they need.\n\nSome of the things I can do include:\n\n* Answering questions on a wide range of topics, from science and history to entertainment and culture\n* Providing definitions and explanations for words and phrases\n* Offering grammar and spelling checks to help with writing and communication\n* Generating text on a given topic or pr