In [1]:
import os
import re
import sys
import time
import json
import numpy as np

from openai import OpenAI

In [2]:
# System prompt shared by the chat completion and the assistant.
# It is intentionally just a single newline (i.e. effectively no instructions).
assistant_instruction = "\n"

In [3]:
# Question sent as the user turn; the leading and trailing newlines are part of the value.
user_query = "\nExplain the importance of low latency LLMs\n"

In [4]:
# Credentials must not live in the notebook source: read the key from the
# OPENAI_API_KEY environment variable instead of pasting it here.
# Falls back to an empty string (the previous literal value) when unset.
GPT_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# Model identifier used for both the chat completion and the assistant.
GPT_MODEL = "gpt-4o-mini"

# Sampling temperature for generation requests.
TEMPERATURE = 1.0

In [5]:
# Build the OpenAI API client, authenticating with GPT_API_KEY from the configuration cell.
client = OpenAI(api_key=GPT_API_KEY)

In [6]:
# Plain (non-retrieval) chat completion: system prompt + user question.
# TEMPERATURE is forwarded explicitly so the value set in the configuration
# cell actually controls sampling instead of being silently ignored.
completion = client.chat.completions.create(
    model=GPT_MODEL,
    temperature=TEMPERATURE,
    messages=[
        {"role": "system", "content": assistant_instruction},
        {"role": "user", "content": user_query},
    ],
)

# Show the first choice's message object (its repr includes the generated content).
print(completion.choices[0].message)

ChatCompletionMessage(content="Low latency in large language models (LLMs) refers to the ability to generate responses quickly, with minimal delay between a user's input and the model's output. Here are several key reasons why low latency LLMs are important:\n\n1. **Real-time Applications**: In scenarios such as conversational AI, customer support, and virtual assistance, users expect immediate responses. Low latency ensures that interactions feel natural and fluid, enhancing user satisfaction and engagement.\n\n2. **User Experience**: Quick response times improve the overall user experience. Delays can lead to frustration, drop-offs, or negative perceptions of the technology. Smooth interactions encourage continued use.\n\n3. **Interactive Environments**: Low latency is crucial in interactive environments like games, interactive storytelling, and virtual reality. LLMs need to respond promptly to maintain immersion and engagement.\n\n4. **Scalability**: Low latency LLMs can handle a hi

In [7]:
# Register an assistant on the configured model with the file_search tool enabled.
assistant = client.beta.assistants.create(
    model=GPT_MODEL,
    name="ucds-test-2",
    instructions=assistant_instruction,
    tools=[
        {"type": "file_search"},
    ],
)

In [8]:
# Create a vector store named "Financial Statements"; its id is used below for the
# file upload and for wiring the store into the assistant's file_search tool.
vector_store = client.vector_stores.create(name="Financial Statements")

In [9]:
# Local documents to index into the vector store.
file_paths = ["rag_files/March_26.pdf"]

# Open the files one by one inside the try block so that every handle that was
# successfully opened is closed again, even if a later open() or the upload fails.
file_streams = []
try:
    for path in file_paths:
        file_streams.append(open(path, "rb"))

    # Upload all files to the vector store and wait for the batch to finish processing.
    file_batch = client.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )
finally:
    # The handles are only needed for the upload; release them instead of leaking them.
    for stream in file_streams:
        stream.close()

In [10]:
# Point the assistant's file_search tool at the vector store created above.
file_search_resources = {"file_search": {"vector_store_ids": [vector_store.id]}}
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources=file_search_resources,
    tools=[{"type": "file_search"}],
)

In [11]:
# Open a new conversation thread; the message and run created below are attached to it via thread.id.
thread = client.beta.threads.create()

In [12]:
# Post the user's question onto the thread.
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=user_query,
)

# Start a run of the assistant on the thread and wait for the polling helper to return.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id,
    assistant_id=assistant.id,
)

# Collect only the messages produced by this run.
messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))

# A run that did not produce a reply (e.g. it failed or expired) leaves this list
# empty; fail here with the run's status instead of a bare IndexError on
# messages[0] in the next cell.
if not messages:
    raise RuntimeError(
        f"Run {run.id} ended with status {run.status!r} and produced no messages "
        f"(last_error={run.last_error!r})"
    )

In [13]:
# Text payload of the first content part of the first message in `messages`.
message_content = messages[0].content[0].text
annotations = message_content.annotations

# Swap each annotation's inline marker text for a numbered placeholder such as "[0]".
for position, note in enumerate(annotations):
    placeholder = f"[{position}]"
    message_content.value = message_content.value.replace(note.text, placeholder)

In [14]:
# Print the Text object (annotations + value) after the placeholder substitution above.
print(message_content)

Text(annotations=[], value="Low latency in large language models (LLMs) is crucial for several reasons:\n\n1. **User Experience**: In applications such as chatbots, virtual assistants, and live customer support, low latency is essential to provide real-time interactions. Users expect immediate responses, and high latency can lead to frustration and disengagement.\n\n2. **Productivity Enhancement**: In environments where LLMs are used to augment human decision-making or creative processes, such as content generation or data analysis, low latency can significantly enhance productivity. Faster responses allow users to iterate more quickly on ideas and decisions.\n\n3. **Real-Time Applications**: Applications requiring real-time data processing, such as gaming, financial trading platforms, and live transcription services, benefit from low latency to ensure timely information delivery. Delays can result in missed opportunities or inaccuracies in time-sensitive scenarios.\n\n4. **Interactive