In [1]:
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment; the OpenRouter key is read later via os.getenv.
load_dotenv()

# Prompt used for every benchmarked model. The leading and trailing newlines
# are part of the recorded input.
# NOTE(review): the prompt contains the typo "involed"; it is kept verbatim
# because changing it would alter the benchmark input stored in the CSV.
user_message = (
    "\n"
    "Write a biography of Elon Musk across different time periods "
    "and fields he involed the most significantly."
    "\n"
)

# A single user turn, expressed as a (role, content) tuple.
messages = [("user", user_message)]

In [2]:
import os
import time

from langchain_groq import ChatGroq
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_openai import ChatOpenAI
from langchain_together import ChatTogether

# Cloud models: one chat client per hosted provider being compared.
nim_model = ChatNVIDIA(model="meta/llama-3.3-70b-instruct")
# Groq is measured with two different model ids.
groq_model_fast = ChatGroq(model="llama-3.3-70b-specdec")
groq_model = ChatGroq(model="llama-3.3-70b-versatile")
together_model = ChatTogether(model="meta-llama/Llama-3.3-70B-Instruct-Turbo")
# OpenRouter is reached through the OpenAI client, so the base URL and the
# API key (taken from the environment) are passed explicitly.
openrouter_model = ChatOpenAI(
    api_key=os.environ.get("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
    model="meta-llama/llama-3.3-70b-instruct",
)



In [3]:
import pandas as pd


def append_to_csv(new_row):
    """Append one result row to ``llm_comparison.csv``.

    The existing file is read, ``new_row`` is added as the last row and the
    whole table is written back without the index column. If the file does
    not exist yet, or exists but is completely empty, it is created with
    ``new_row`` as its only row instead of raising.

    Args:
        new_row: Mapping of column name to value for the row to add.
    """
    csv_path = "llm_comparison.csv"
    table = pd.DataFrame([new_row])
    try:
        existing = pd.read_csv(csv_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run (or a zero-byte file): there is nothing to append to,
        # so the new row alone becomes the table.
        existing = None
    if existing is not None:
        table = pd.concat([existing, table], ignore_index=True)
    table.to_csv(csv_path, index=False)

In [4]:
def _record_cloud_benchmark(provider, model, use_reported_time=False):
    """Run the benchmark prompt against one cloud model and log the result.

    Sends the notebook-level ``messages`` to ``model``, reads the completion
    token count from the response metadata, and appends one row to the
    comparison CSV via ``append_to_csv``.

    Args:
        provider: Label stored in the ``provider`` column.
        model: Chat model object exposing ``invoke(messages)``.
        use_reported_time: When true, ``completion_time`` is taken from the
            response's ``token_usage`` metadata; otherwise the locally
            measured duration of the ``invoke`` call is used.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    response = model.invoke(messages)
    elapsed = time.perf_counter() - started

    usage = response.response_metadata["token_usage"]
    completion_tokens = usage["completion_tokens"]
    completion_time = usage["completion_time"] if use_reported_time else elapsed

    append_to_csv(
        {
            "provider": provider,
            "model": response.response_metadata["model_name"],
            "type": "cloud",
            "input": user_message,
            "output": response.content,
            "completion_tokens": completion_tokens,
            "completion_time": completion_time,
            "speed": completion_tokens / completion_time,
        }
    )


# NOTE(review): the Groq rows use the completion time reported in the response
# metadata, while the other providers are timed locally around invoke() and
# therefore include request overhead — keep this in mind when comparing the
# "speed" column.
_record_cloud_benchmark("NVIDIA", nim_model)
_record_cloud_benchmark("Groq", groq_model_fast, use_reported_time=True)
_record_cloud_benchmark("Groq", groq_model, use_reported_time=True)
_record_cloud_benchmark("Together", together_model)
_record_cloud_benchmark("OpenRouter", openrouter_model)

In [None]:
import torch
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from langchain_ollama import ChatOllama
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Local models
# Ollama-served model, addressed by its Ollama tag.
ollama_model = ChatOllama(model="llama3.3:70b")


# Hugging Face checkpoint loaded in-process and wrapped as a LangChain chat
# model: tokenizer + causal LM -> text-generation pipeline -> chat wrapper.
# NOTE(review): judging by the repo name this is a bitsandbytes 4-bit
# checkpoint — confirm the environment has the matching quantization backend.
model_id = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# return_full_text=False makes the pipeline return only the newly generated
# text rather than the prompt followed by the generation.
pipe = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    return_full_text=False,
)
hf_llm = HuggingFacePipeline(pipeline=pipe)
huggingface_model = ChatHuggingFace(llm=hf_llm)

In [None]:
# Benchmark the Ollama-served model and log one row to the comparison CSV.
# Free cached, unused CUDA memory held by PyTorch before the run.
torch.cuda.empty_cache()
response = ollama_model.invoke(messages)
# Token count and duration both come from the response metadata, so no local
# timing is taken around invoke().
completion_tokens = response.response_metadata["eval_count"]
# The reported duration is divided by 1e9 to convert it to seconds.
# NOTE(review): total_duration presumably covers the whole request (not only
# token generation) — confirm this is the intended basis for "speed".
completion_time = response.response_metadata["total_duration"] / 10**9
new_row = {
    "provider": "Ollama",
    "model": response.response_metadata["model"],
    "type": "local",
    "input": user_message,
    "output": response.content,
    "completion_tokens": completion_tokens,
    "completion_time": completion_time,
    "speed": completion_tokens / completion_time,
}
append_to_csv(new_row)

# Benchmark the in-process Hugging Face model and log one row to the CSV.
# Free cached, unused CUDA memory held by PyTorch before the run.
torch.cuda.empty_cache()
# perf_counter is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()
response = huggingface_model.invoke(messages)
end_time = time.perf_counter()
# The response carries no token usage here, so the output text is
# re-tokenized to count tokens. Special tokens (e.g. a leading BOS) are
# excluded because they were not part of the generated text.
completion_tokens = len(tokenizer.encode(response.content, add_special_tokens=False))
completion_time = end_time - start_time
new_row = {
    "provider": "HuggingFace",
    "model": model_id,
    "type": "local",
    "input": user_message,
    "output": response.content,
    "completion_tokens": completion_tokens,
    "completion_time": completion_time,
    "speed": completion_tokens / completion_time,
}
append_to_csv(new_row)

In [2]:
from dotenv import load_dotenv

# Reload variables from the .env file (if one is found).
load_dotenv()

# Replace the benchmark prompt with a minimal one; the cells below reuse
# `messages` to send this short prompt to each model.
user_message = "Hello"
messages = [("user", user_message)]

In [None]:
# Send the current prompt to the NVIDIA-hosted model and show the reply text.
response = nim_model.invoke(messages)
print(response.content)

In [None]:
# Print the whole response object from the most recent invoke, not just its text.
print(response)

In [None]:
# Send the current prompt to the Groq "specdec" model and show the reply text.
response = groq_model_fast.invoke(messages)
print(response.content)

In [None]:
# Print the whole response object from the most recent invoke, not just its text.
print(response)

In [None]:
# Send the current prompt to the Groq "versatile" model and show the reply text.
response = groq_model.invoke(messages)
print(response.content)

In [None]:
# Print the whole response object from the most recent invoke, not just its text.
print(response)

In [None]:
# Send the current prompt to the Together-hosted model and show the reply text.
response = together_model.invoke(messages)
print(response.content)

In [None]:
# Print the whole response object from the most recent invoke, not just its text.
print(response)

In [None]:
# Send the current prompt to the model served via OpenRouter and show the reply text.
response = openrouter_model.invoke(messages)
print(response.content)

In [None]:
# Print the whole response object from the most recent invoke, not just its text.
print(response)

In [None]:
# Free cached, unused CUDA memory held by PyTorch, then send the current
# prompt to the Ollama-served model and show the reply text.
torch.cuda.empty_cache()
response = ollama_model.invoke(messages)
print(response.content)

In [None]:
# Print the whole response object from the most recent invoke, not just its text.
print(response)

In [None]:
# Send the current prompt to the in-process Hugging Face model, reporting how
# long the call took and the reply text.
# Free cached, unused CUDA memory held by PyTorch before the run.
torch.cuda.empty_cache()
# perf_counter is a monotonic clock meant for measuring elapsed time, so the
# reported duration is unaffected by system clock adjustments.
start_time = time.perf_counter()
response = huggingface_model.invoke(messages)
end_time = time.perf_counter()
print(f"Time taken: {end_time - start_time:.2f} seconds")
print(response.content)

In [None]:
# Print the whole response object from the most recent invoke, not just its text.
print(response)