In [1]:
from dotenv import load_dotenv

# Load key/value pairs from a .env file into the process environment.
# NOTE(review): presumably this is where the provider API keys come from — confirm.
load_dotenv()

# One shared prompt, expressed as a single ("user", text) message tuple, so that
# every provider below is invoked with exactly the same input.
user_message = "Write a limerick about the wonders of GPU computing."
messages = [("user", user_message)]

In [2]:
import os
import time

from langchain_groq import ChatGroq
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_openai import ChatOpenAI
from langchain_together import ChatTogether

# One chat client per hosting provider, each pointed at a Llama 3.3 70B Instruct
# deployment so that the same prompt can be compared across back ends.
# NOTE(review): the NVIDIA/Groq/Together clients are given no explicit key here;
# presumably each reads its own environment variable — confirm.
nim_model = ChatNVIDIA(model="meta/llama-3.3-70b-instruct")
# NOTE(review): "specdec" presumably denotes Groq's speculative-decoding variant — confirm.
groq_model_fast = ChatGroq(model="llama-3.3-70b-specdec")
groq_model = ChatGroq(model="llama-3.3-70b-versatile")
together_model = ChatTogether(model="meta-llama/Llama-3.3-70B-Instruct-Turbo")
# OpenRouter is reached through the OpenAI-compatible client by overriding base_url.
openrouter_model = ChatOpenAI(
    model="meta-llama/llama-3.3-70b-instruct",
    base_url="https://openrouter.ai/api/v1",
    # Index os.environ instead of os.getenv(): a missing variable must raise
    # KeyError right here. With api_key=None the OpenAI client would otherwise
    # look for OPENAI_API_KEY and could send that unrelated key to openrouter.ai.
    api_key=os.environ["OPENROUTER_API_KEY"],
)



In [20]:
from langchain_ollama import ChatOllama

# Chat client for the "llama3.3:70b" model tag served through Ollama.
# NOTE(review): presumably this talks to a locally running Ollama server — confirm.
ollama_model = ChatOllama(model="llama3.3:70b")

In [3]:
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch


# Build an in-process Llama 3.3 70B chat model: tokenizer + causal LM wrapped in a
# transformers text-generation pipeline, then adapted to LangChain's chat interface.
# NOTE(review): the checkpoint name suggests a pre-quantized bitsandbytes 4-bit
# export — confirm the quantization settings actually applied at load time.
model_id = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"
# `tokenizer` is also read as a notebook-level global by count_tokens().
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    # Let the loader decide weight placement across the available devices.
    device_map="auto",
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Return only the newly generated text, not the prompt followed by the completion.
    return_full_text=False,
)
# Wrap the raw pipeline as a LangChain LLM, then as a chat model so it can be
# called with the same `messages` list as the hosted clients.
hf_llm = HuggingFacePipeline(pipeline=pipe)
huggingface_model = ChatHuggingFace(llm=hf_llm)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


In [6]:
def count_tokens(response):
    """Return the number of completion (generated) tokens for a chat response.

    Sources are tried in this order:

    1. ``response.response_metadata["token_usage"]["completion_tokens"]`` —
       the count reported by the provider.
    2. ``response.usage_metadata["output_tokens"]`` — covers responses that
       carry usage information but no ``token_usage`` entry.
    3. Re-tokenizing ``response.content`` with the notebook-level ``tokenizer``
       — for responses that report no usage at all.

    Args:
        response: A chat message exposing ``content`` and ``response_metadata``
            (a dict), and optionally ``usage_metadata`` (a dict or ``None``).

    Returns:
        int: The completion token count.

    Raises:
        NameError: If no usage information is present and the global
            ``tokenizer`` has not been defined yet.
    """
    # `or {}` also guards against an explicit `"token_usage": None` entry.
    token_usage = response.response_metadata.get("token_usage") or {}
    completion_tokens = token_usage.get("completion_tokens")
    if completion_tokens is not None:
        return completion_tokens

    usage_metadata = getattr(response, "usage_metadata", None) or {}
    output_tokens = usage_metadata.get("output_tokens")
    if output_tokens is not None:
        return output_tokens

    # encode() adds the tokenizer's special tokens by default (e.g. a
    # begin-of-text marker); they are not part of the generated completion,
    # so exclude them from the count.
    return len(tokenizer.encode(response.content, add_special_tokens=False))

In [11]:
# Send the shared prompt to the NVIDIA client and show only the generated text.
# `response` is rebound by every provider cell, so later cells see whichever ran last.
response = nim_model.invoke(messages)
print(response.content)

There once was a GPU so fine,
Whose parallel processing was divine,
It crunched with great zest,
Through its cores, it did quest,
For solutions to problems of design.


In [12]:
# Dump the whole message object (content, response_metadata, usage_metadata) held
# in `response` — the NVIDIA reply when run straight after the cell above.
print(response)

content='There once was a GPU so fine,\nWhose parallel processing was divine,\nIt crunched with great zest,\nThrough its cores, it did quest,\nFor solutions to problems of design.' additional_kwargs={} response_metadata={'role': 'assistant', 'content': 'There once was a GPU so fine,\nWhose parallel processing was divine,\nIt crunched with great zest,\nThrough its cores, it did quest,\nFor solutions to problems of design.', 'token_usage': {'prompt_tokens': 22, 'total_tokens': 59, 'completion_tokens': 37}, 'finish_reason': 'stop', 'model_name': 'meta/llama-3.3-70b-instruct'} id='run-71e9f704-c32e-4bf9-b2f1-159cc66aacb7-0' usage_metadata={'input_tokens': 22, 'output_tokens': 37, 'total_tokens': 59} role='assistant'


In [13]:
# Send the shared prompt to the Groq "specdec" model and show only the generated text.
# Rebinds the shared `response` variable.
response = groq_model_fast.invoke(messages)
print(response.content)

There once was a GPU so fine,
Whose computing powers did shine.
It processed with pace,
And a parallel face,
Making complex tasks truly divine.


In [14]:
# Dump the whole message object held in `response` to inspect its metadata —
# the Groq "specdec" reply when run straight after the cell above.
print(response)

content='There once was a GPU so fine,\nWhose computing powers did shine.\nIt processed with pace,\nAnd a parallel face,\nMaking complex tasks truly divine.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 46, 'total_tokens': 78, 'completion_time': 0.028891941, 'prompt_time': 0.005572687, 'queue_time': 0.014289119999999999, 'total_time': 0.034464628}, 'model_name': 'llama-3.3-70b-specdec', 'system_fingerprint': 'fp_9eb2d06c09', 'finish_reason': 'stop', 'logprobs': None} id='run-b610ae88-11f4-4478-af11-e5c2ef138e05-0' usage_metadata={'input_tokens': 46, 'output_tokens': 32, 'total_tokens': 78}


In [15]:
# Send the shared prompt to the Groq "versatile" model and show only the generated text.
# Rebinds the shared `response` variable.
response = groq_model.invoke(messages)
print(response.content)

There once were GPUs so fine,
Whose parallel processing did shine.
They crunched with great speed,
Through data with ease and deed,
And made complex tasks truly divine.


In [16]:
# Dump the whole message object held in `response` to inspect its metadata —
# the Groq "versatile" reply when run straight after the cell above.
print(response)

content='There once were GPUs so fine,\nWhose parallel processing did shine.\nThey crunched with great speed,\nThrough data with ease and deed,\nAnd made complex tasks truly divine.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 36, 'prompt_tokens': 47, 'total_tokens': 83, 'completion_time': 0.130909091, 'prompt_time': 0.008215492, 'queue_time': 0.013799417999999999, 'total_time': 0.139124583}, 'model_name': 'llama-3.3-70b-versatile', 'system_fingerprint': 'fp_c0cfa69934', 'finish_reason': 'stop', 'logprobs': None} id='run-7bb45ead-bd14-4ce4-8385-1b148ac737c3-0' usage_metadata={'input_tokens': 47, 'output_tokens': 36, 'total_tokens': 83}


In [17]:
# Send the shared prompt to the Together client and show only the generated text.
# Rebinds the shared `response` variable.
response = together_model.invoke(messages)
print(response.content)

There once was a GPU so fine,
Whose computing powers did shine.
It processed with speed,
And its parallel deed,
 Made complex tasks truly divine.


In [18]:
# Dump the whole message object held in `response` to inspect its metadata —
# the Together reply when run straight after the cell above.
print(response)

content='There once was a GPU so fine,\nWhose computing powers did shine.\nIt processed with speed,\nAnd its parallel deed,\n Made complex tasks truly divine.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 47, 'total_tokens': 79, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/Llama-3.3-70B-Instruct-Turbo', 'system_fingerprint': None, 'finish_reason': 'eos', 'logprobs': None} id='run-6be594c7-cb6c-4277-9417-84c03813695e-0' usage_metadata={'input_tokens': 47, 'output_tokens': 32, 'total_tokens': 79, 'input_token_details': {}, 'output_token_details': {}}


In [27]:
# Send the shared prompt through the OpenRouter-configured client and show only
# the generated text. Rebinds the shared `response` variable.
response = openrouter_model.invoke(messages)
print(response.content)

There once was a GPU so fine,
Whose computing powers did shine.
It processed with speed,
And its cores did proceed,
To solve complex tasks in no time!


In [28]:
# Dump the whole message object held in `response` to inspect its metadata —
# the OpenRouter reply when run straight after the cell above.
print(response)

content='There once was a GPU so fine,\nWhose computing powers did shine.\nIt processed with speed,\nAnd its cores did proceed,\nTo solve complex tasks in no time!' additional_kwargs={'refusal': ''} response_metadata={'token_usage': {'completion_tokens': 34, 'prompt_tokens': 22, 'total_tokens': 56, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/llama-3.3-70b-instruct', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-9a0086a5-dafa-46e8-bfdf-605270fc720a-0' usage_metadata={'input_tokens': 22, 'output_tokens': 34, 'total_tokens': 56, 'input_token_details': {}, 'output_token_details': {}}


In [23]:
# Release PyTorch's cached-but-unused GPU memory before calling Ollama.
# NOTE(review): presumably this makes room for the Ollama-served model on the
# same GPU as the in-process Hugging Face model — confirm.
torch.cuda.empty_cache()
# Send the shared prompt to the Ollama client and show only the generated text.
response = ollama_model.invoke(messages)
print(response.content)

There once was a GPU so fine,
Whose parallel processing did shine.
It crunched with great pace,
And a wonderful face,
And made computations divine.


In [24]:
# Dump the whole message object held in `response` — the Ollama reply when run
# straight after the cell above. In the recorded run its response_metadata had
# no "token_usage" key; the counts appear under usage_metadata instead.
print(response)

content='There once was a GPU so fine,\nWhose parallel processing did shine.\nIt crunched with great pace,\nAnd a wonderful face,\nAnd made computations divine.' additional_kwargs={} response_metadata={'model': 'llama3.3:70b', 'created_at': '2024-12-13T08:55:46.60147788Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 2811016880, 'load_duration': 40834609, 'prompt_eval_count': 22, 'prompt_eval_duration': 87273000, 'eval_count': 33, 'eval_duration': 2640163000} id='run-d08c1855-1daa-4e40-9006-f7e150e0f2d2-0' usage_metadata={'input_tokens': 22, 'output_tokens': 33, 'total_tokens': 55}


In [25]:
# Release PyTorch's cached-but-unused GPU memory before the local generation run.
torch.cuda.empty_cache()
# Time the call with perf_counter(): it is monotonic and high-resolution, whereas
# time.time() follows the wall clock and can jump if the system clock is adjusted.
start_time = time.perf_counter()
response = huggingface_model.invoke(messages)
end_time = time.perf_counter()
print(f"Time taken: {end_time - start_time:.2f} seconds")
print(response.content)

Time taken: 16.20 seconds
There once was a GPU so fine,
Whose computing powers did truly shine.
It processed with zest,
And its speeds were the best,
And its parallel tasks did align.


In [26]:
# Dump the whole message object held in `response` — the local Hugging Face reply
# when run straight after the cell above. In the recorded run response_metadata
# was empty, so no token counts were reported for this back end.
print(response)

content='There once was a GPU so fine,\nWhose computing powers did truly shine.\nIt processed with zest,\nAnd its speeds were the best,\nAnd its parallel tasks did align.' additional_kwargs={} response_metadata={} id='run-c6c15db5-d113-4f40-88d9-5b72c1e00fce-0'
