### PROD MODELS
distil-whisper-large-v3-en, gemma2-9b-it, llama-3.3-70b-versatile, llama-3.1-8b-instant, llama-guard-3-8b, llama3-70b-8192, llama3-8b-8192
mixtral-8x7b-32768, whisper-large-v3,whisper-large-v3-turbo
### PREVIEW MODELS
deepseek-r1-distill-llama-70b, llama-3.3-70b-specdec, llama-3.2-1b-preview, llama-3.2-3b-preview, llama-3.2-11b-vision-preview, llama-3.2-90b-vision-preview

In [1]:

import os
from dotenv import load_dotenv
load_dotenv()

os.environ["GROQ_API_KEY"]=os.getenv("GROQ_API_KEY")


In [16]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192")
llmnocache = ChatGroq(model="llama3-8b-8192")

In [18]:
# <!-- ruff: noqa: F821 -->
from langchain_core.globals import set_llm_cache

# LLM Cache 
### It can save you money by reducing the number of API calls you make to the LLM provider, 
## In Memory Cache



In [19]:
%%time
from langchain_core.caches import InMemoryCache
inmemory_cache = InMemoryCache()
set_llm_cache(inmemory_cache)

# The first time, it is not yet in cache, so it should take longer
llm.invoke("Tell me a joke")

CPU times: user 19.9 ms, sys: 8.56 ms, total: 28.4 ms
Wall time: 737 ms


AIMessage(content="Why couldn't the bicycle stand up by itself?\n\nBecause it was two-tired!", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 14, 'total_tokens': 32, 'completion_time': 0.015, 'prompt_time': 0.005034346, 'queue_time': 0.348458214, 'total_time': 0.020034346}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_a97cfe35ae', 'finish_reason': 'stop', 'logprobs': None}, id='run-ca4664f8-6b70-46bd-ad87-aa713889b5b6-0', usage_metadata={'input_tokens': 14, 'output_tokens': 18, 'total_tokens': 32})

In [24]:
%%time
# The second time it is, so it goes faster
llm.invoke("Tell me a joke")

CPU times: user 3.11 ms, sys: 926 μs, total: 4.04 ms
Wall time: 3.28 ms


AIMessage(content="Why couldn't the bicycle stand up by itself?\n\nBecause it was two-tired!", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 14, 'total_tokens': 32, 'completion_time': 0.015, 'prompt_time': 0.005034346, 'queue_time': 0.348458214, 'total_time': 0.020034346}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_a97cfe35ae', 'finish_reason': 'stop', 'logprobs': None}, id='run-ca4664f8-6b70-46bd-ad87-aa713889b5b6-0', usage_metadata={'input_tokens': 14, 'output_tokens': 18, 'total_tokens': 32})

In [25]:
%%time
# The second time it is, so it goes faster
llm.invoke("Tell me a joke")

CPU times: user 3.06 ms, sys: 894 μs, total: 3.96 ms
Wall time: 3.39 ms


AIMessage(content="Why couldn't the bicycle stand up by itself?\n\nBecause it was two-tired!", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 14, 'total_tokens': 32, 'completion_time': 0.015, 'prompt_time': 0.005034346, 'queue_time': 0.348458214, 'total_time': 0.020034346}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_a97cfe35ae', 'finish_reason': 'stop', 'logprobs': None}, id='run-ca4664f8-6b70-46bd-ad87-aa713889b5b6-0', usage_metadata={'input_tokens': 14, 'output_tokens': 18, 'total_tokens': 32})

# SQLite Cache

In [26]:
!rm .langchain.db

rm: .langchain.db: No such file or directory


In [27]:
# We can do the same thing with a SQLite cache
from langchain_community.cache import SQLiteCache

set_llm_cache(SQLiteCache(database_path=".langchain.db"))

In [28]:
%%time
# The first time, it is not yet in cache, so it should take longer
llm.invoke("Tell me a joke")

CPU times: user 37.9 ms, sys: 8.27 ms, total: 46.2 ms
Wall time: 511 ms


AIMessage(content="Here's one:\n\nWhy couldn't the bicycle stand up by itself?\n\n(wait for it...)\n\nBecause it was two-tired!\n\nHope that made you smile!", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 14, 'total_tokens': 46, 'completion_time': 0.026666667, 'prompt_time': 0.002846753, 'queue_time': 0.031804906, 'total_time': 0.02951342}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_6a6771ae9c', 'finish_reason': 'stop', 'logprobs': None}, id='run-c390c906-18f1-4ad0-9be0-80c580b9807f-0', usage_metadata={'input_tokens': 14, 'output_tokens': 32, 'total_tokens': 46})

In [29]:
%%time
# The second time it is, so it goes faster
llm.invoke("Tell me a joke")

CPU times: user 5.74 ms, sys: 1.3 ms, total: 7.04 ms
Wall time: 6.65 ms


AIMessage(content="Here's one:\n\nWhy couldn't the bicycle stand up by itself?\n\n(wait for it...)\n\nBecause it was two-tired!\n\nHope that made you smile!", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 14, 'total_tokens': 46, 'completion_time': 0.026666667, 'prompt_time': 0.002846753, 'queue_time': 0.031804906, 'total_time': 0.02951342}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_6a6771ae9c', 'finish_reason': 'stop', 'logprobs': None}, id='run-c390c906-18f1-4ad0-9be0-80c580b9807f-0', usage_metadata={'input_tokens': 14, 'output_tokens': 32, 'total_tokens': 46})