In [7]:
from langchain_core.rate_limiters import InMemoryRateLimiter

# Shared limiter used to throttle outgoing model requests.
rate_limiter = InMemoryRateLimiter(
    max_bucket_size=10,  # bursts are capped at 10 requests
    check_every_n_seconds=0.1,  # poll interval: 0.1 s (100 ms)
    requests_per_second=0.1,  # sustained rate: one request per 10 seconds
)


In [8]:
from langchain_ollama import ChatOllama

# Chat model served through Ollama; every call goes through `rate_limiter`.
llm = ChatOllama(
    rate_limiter=rate_limiter,  # throttle requests with the limiter built earlier
    temperature=0,  # zero temperature for deterministic responses
    model="llama3.2:latest",
)


In [None]:
import time

# Test rate limiting by invoking the model several times in a row and
# reporting how long each call takes. The measured time covers the whole
# `invoke` call, so it includes any wait imposed by the rate limiter.
for _ in range(5):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() is wall-clock time and can jump if the
    # system clock is adjusted while the call is in flight.
    start_time = time.perf_counter()  # Record start time
    response = llm.invoke("Hello, how are you?")  # Send a query
    end_time = time.perf_counter()  # Record end time

    # Print the response and the time taken
    print(f"Response: {response.content}")
    print(f"Time elapsed: {end_time - start_time:.2f} seconds")
