In [2]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so that
# values such as OPENROUTER_API_KEY (read later via os.getenv) are available.
load_dotenv()

# Long-form prompt used for the throughput benchmark; a long answer gives a
# more stable tokens-per-second figure than a one-line reply.
# NOTE(review): the prompt contains a typo ("involed"). It is left untouched
# because this exact string is sent to every model and written to the "input"
# column of the results CSV; changing it would break comparability of rows.
user_message = """
Write a biography of Elon Musk across different time periods and fields he involed the most significantly.
"""

# Single-turn conversation expressed as a list of (role, content) tuples.
messages = [("user", user_message)]

In [3]:
import os
import time

from langchain_groq import ChatGroq
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_openai import ChatOpenAI
from langchain_together import ChatTogether

# Cloud models: one chat client per hosted provider, each configured with that
# provider's identifier for a Llama 3.3 70B model so the runs are comparable.
nim_model = ChatNVIDIA(model="meta/llama-3.3-70b-instruct")
# Two Groq deployments are benchmarked side by side: "specdec" (presumably the
# speculative-decoding variant, hence the *_fast name) and "versatile".
groq_model_fast = ChatGroq(model="llama-3.3-70b-specdec")
groq_model = ChatGroq(model="llama-3.3-70b-versatile")
together_model = ChatTogether(model="meta-llama/Llama-3.3-70B-Instruct-Turbo")
# OpenRouter is reached through the OpenAI client by overriding base_url.
# NOTE(review): os.getenv returns None when OPENROUTER_API_KEY is unset, so a
# missing key is not detected here -- confirm how ChatOpenAI handles api_key=None.
openrouter_model = ChatOpenAI(
    model="meta-llama/llama-3.3-70b-instruct",
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)



In [3]:
import pandas as pd


def append_to_csv(new_row, csv_path="llm_comparison.csv"):
    """Append one benchmark result to the results CSV, creating it if needed.

    The whole file is re-read and re-written on every call so that a row
    carrying columns the file does not have yet is aligned by column name
    (missing cells become NaN) instead of corrupting the layout.

    Args:
        new_row: Mapping of column name to value for a single result row.
        csv_path: Destination CSV file. Defaults to "llm_comparison.csv" in
            the current working directory.
    """
    import os  # local import keeps this cell independent of other cells

    row_df = pd.DataFrame([new_row])

    # A missing or zero-byte file means "no rows yet"; pd.read_csv would raise
    # FileNotFoundError / EmptyDataError on it, breaking the very first run.
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        df = pd.concat([pd.read_csv(csv_path), row_df], ignore_index=True)
    else:
        df = row_df

    df.to_csv(csv_path, index=False)

In [4]:
# Benchmark every cloud provider with the same prompt and append one CSV row
# per run. Each entry is (provider label, chat model, uses_server_timing).
#
# Groq reports its own generation time as token_usage["completion_time"]; the
# other providers return no timing fields, so their runs are timed client-side
# around invoke().
# NOTE: the client-side figure includes network latency, queueing and prompt
# processing, while Groq's figure covers generation only -- keep that in mind
# when comparing the "speed" column across providers.
cloud_runs = [
    ("NVIDIA", nim_model, False),
    ("Groq", groq_model_fast, True),
    ("Groq", groq_model, True),
    ("Together", together_model, False),
    ("OpenRouter", openrouter_model, False),
]

for provider, chat_model, uses_server_timing in cloud_runs:
    # perf_counter is monotonic, so the interval cannot be skewed by system
    # clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    response = chat_model.invoke(messages)
    end_time = time.perf_counter()

    token_usage = response.response_metadata["token_usage"]
    completion_tokens = token_usage["completion_tokens"]
    if uses_server_timing:
        completion_time = token_usage["completion_time"]
    else:
        completion_time = end_time - start_time

    new_row = {
        "provider": provider,
        "model": response.response_metadata["model_name"],
        "type": "cloud",
        "input": user_message,
        "output": response.content,
        "completion_tokens": completion_tokens,
        "completion_time": completion_time,
        # Throughput in generated tokens per second.
        "speed": completion_tokens / completion_time,
    }
    append_to_csv(new_row)

In [None]:
import torch
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from langchain_ollama import ChatOllama
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Local models: the same Llama 3.3 70B family, run on this machine instead of a hosted API.
# Ollama client; presumably requires a running Ollama server with the
# "llama3.3:70b" tag already pulled -- confirm before running.
ollama_model = ChatOllama(model="llama3.3:70b")


# Hugging Face checkpoint; judging by its name it is a pre-quantized 4-bit
# bitsandbytes build -- TODO confirm.
model_id = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    # Let the library decide device placement instead of pinning a device.
    device_map="auto",
)
# NOTE(review): no generation length (e.g. max_new_tokens) is configured here,
# so output length depends on library/model defaults -- confirm it is long
# enough for the biography prompt.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Return only the newly generated text, not the prompt echoed back.
    return_full_text=False,
)
# Wrap the raw pipeline so it exposes the same chat .invoke() interface as the
# cloud clients.
hf_llm = HuggingFacePipeline(pipeline=pipe)
huggingface_model = ChatHuggingFace(llm=hf_llm)

In [None]:
# --- Local benchmark: Ollama -------------------------------------------------
torch.cuda.empty_cache()
response = ollama_model.invoke(messages)
# Ollama reports its own counters, so no client-side timer is needed here.
completion_tokens = response.response_metadata["eval_count"]
# Durations are reported in nanoseconds; convert to seconds.
# NOTE(review): total_duration appears to cover the whole request (model load
# and prompt evaluation included), not just generation -- kept as-is so new
# rows stay comparable with rows already recorded in the CSV.
completion_time = response.response_metadata["total_duration"] / 10**9
new_row = {
    "provider": "Ollama",
    "model": response.response_metadata["model"],
    "type": "local",
    "input": user_message,
    "output": response.content,
    "completion_tokens": completion_tokens,
    "completion_time": completion_time,
    "speed": completion_tokens / completion_time,
}
append_to_csv(new_row)

# --- Local benchmark: Hugging Face pipeline ----------------------------------
torch.cuda.empty_cache()
# The pipeline returns no usage metadata, so time the call client-side with a
# monotonic clock and count tokens by re-tokenizing the reply.
start_time = time.perf_counter()
response = huggingface_model.invoke(messages)
end_time = time.perf_counter()
# add_special_tokens=False: encode() would otherwise add special tokens (such
# as a begin-of-text marker) that the model never generated, inflating the
# token count and therefore the reported speed.
completion_tokens = len(tokenizer.encode(response.content, add_special_tokens=False))
completion_time = end_time - start_time
new_row = {
    "provider": "HuggingFace",
    "model": model_id,
    "type": "local",
    "input": user_message,
    "output": response.content,
    "completion_tokens": completion_tokens,
    "completion_time": completion_time,
    "speed": completion_tokens / completion_time,
}
append_to_csv(new_row)

In [4]:
from dotenv import load_dotenv

# Reload .env (repeats the call made in the first cell).
load_dotenv()

# Short prompt for quick smoke tests of each provider.
# NOTE: this rebinds user_message/messages; any benchmark cell re-run after
# this point will send "Hello" instead of the long biography prompt.
user_message = "Hello"
messages = [("user", user_message)]

In [5]:
# Smoke test: send the current prompt to the NVIDIA-hosted model and show only the reply text.
response = nim_model.invoke(messages)
print(response.content)

Hello! How can I assist you today?


In [6]:
# Dump the full message to see which metadata the provider returns
# (the benchmark reads response_metadata["token_usage"] and ["model_name"]).
print(response)

content='Hello! How can I assist you today?' additional_kwargs={} response_metadata={'role': 'assistant', 'content': 'Hello! How can I assist you today?', 'token_usage': {'prompt_tokens': 11, 'total_tokens': 20, 'completion_tokens': 9}, 'finish_reason': 'stop', 'model_name': 'meta/llama-3.3-70b-instruct'} id='run-c96d0c1d-8b48-4ad9-9d64-4907f217abdb-0' usage_metadata={'input_tokens': 11, 'output_tokens': 9, 'total_tokens': 20} role='assistant'


In [7]:
# Smoke test: Groq "specdec" model, reply text only.
response = groq_model_fast.invoke(messages)
print(response.content)

Hello. How can I assist you today?


In [8]:
# Dump the full message; the benchmark reads token_usage["completion_time"] from Groq responses.
print(response)

content='Hello. How can I assist you today?' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 35, 'total_tokens': 45, 'completion_time': 0.003883119, 'prompt_time': 0.004241874, 'queue_time': 0.044123721, 'total_time': 0.008124993}, 'model_name': 'llama-3.3-70b-specdec', 'system_fingerprint': 'fp_9eb2d06c09', 'finish_reason': 'stop', 'logprobs': None} id='run-4f87a7a8-ebde-4dbd-8d54-6215d088920a-0' usage_metadata={'input_tokens': 35, 'output_tokens': 10, 'total_tokens': 45}


In [9]:
# Smoke test: Groq "versatile" model, reply text only.
response = groq_model.invoke(messages)
print(response.content)

Hello. It's nice to meet you. Is there something I can help you with or would you like to chat?


In [10]:
# Dump the full message to inspect the token_usage fields for this Groq model.
print(response)

content="Hello. It's nice to meet you. Is there something I can help you with or would you like to chat?" additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 36, 'total_tokens': 61, 'completion_time': 0.090909091, 'prompt_time': 0.00715968, 'queue_time': 0.01075391, 'total_time': 0.098068771}, 'model_name': 'llama-3.3-70b-versatile', 'system_fingerprint': 'fp_fcc3b74982', 'finish_reason': 'stop', 'logprobs': None} id='run-e2a1ab68-e090-4052-9f81-fa4cc74d8a9c-0' usage_metadata={'input_tokens': 36, 'output_tokens': 25, 'total_tokens': 61}


In [11]:
# Smoke test: Together-hosted model, reply text only.
response = together_model.invoke(messages)
print(response.content)

Hello. It's nice to meet you. Is there something I can help you with or would you like to chat?


In [12]:
# Dump the full message to see which usage/metadata fields Together returns.
print(response)

content="Hello. It's nice to meet you. Is there something I can help you with or would you like to chat?" additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 36, 'total_tokens': 61, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/Llama-3.3-70B-Instruct-Turbo', 'system_fingerprint': None, 'finish_reason': 'eos', 'logprobs': None} id='run-5ac38395-b451-4b37-aef0-3ab8f2bdd7c8-0' usage_metadata={'input_tokens': 36, 'output_tokens': 25, 'total_tokens': 61, 'input_token_details': {}, 'output_token_details': {}}


In [13]:
# Smoke test: OpenRouter-hosted model, reply text only.
response = openrouter_model.invoke(messages)
print(response.content)

Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?


In [14]:
# Dump the full message to see which usage/metadata fields OpenRouter returns.
print(response)

content="Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?" additional_kwargs={'refusal': ''} response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 11, 'total_tokens': 35, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/llama-3.3-70b-instruct', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-c993e354-bee3-4758-9c05-184814d5025e-0' usage_metadata={'input_tokens': 11, 'output_tokens': 24, 'total_tokens': 35, 'input_token_details': {}, 'output_token_details': {}}


In [None]:
# Smoke test: local Ollama model, reply text only.
# Release cached CUDA memory held by PyTorch in this process before the call.
torch.cuda.empty_cache()
response = ollama_model.invoke(messages)
print(response.content)

In [None]:
# Dump the full message; the benchmark reads eval_count, total_duration and
# model from response_metadata for Ollama responses.
print(response)

In [None]:
# Smoke test: local Hugging Face pipeline, timed client-side.
# Release cached CUDA memory held by PyTorch in this process before the call.
torch.cuda.empty_cache()
start_time = time.time()
response = huggingface_model.invoke(messages)
end_time = time.time()
# Wall-clock duration of the whole invoke() call, in seconds.
print(f"Time taken: {end_time - start_time:.2f} seconds")
print(response.content)

In [None]:
# Dump the full message returned by the Hugging Face chat wrapper.
print(response)