In [1]:
!pip install chromadb openai requests mistralai

import os
import requests
import chromadb
from mistralai import Mistral
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import uuid

# 1. Read the Mistral API key from the environment.
# Secrets must not be embedded in source: anything committed or shared with
# the notebook is readable by everyone who sees the file. Export
# MISTRAL_API_KEY in the shell / notebook secrets before running this cell.
# NOTE(review): any key that was ever hard-coded in this file should be
# treated as leaked and rotated.
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    # Fail early with an actionable message instead of letting the first
    # API call fail with an opaque authentication error.
    raise RuntimeError(
        "MISTRAL_API_KEY is not set. Export it in the environment before "
        "running this script."
    )
model = "mistral-large-latest"

# 2. Initialize Mistral client
client = Mistral(api_key=api_key)

# 3. URLs for the text files from GitHub repository
base_url = "https://raw.githubusercontent.com/deep-stack/langchain-assignment-dataset/main/stories/"
files = [
    "a-mother.txt",
    "sorrow.txt",
    "the-lantern-keepers.txt",
    "the-poor-relations-story.txt",
    "the-schoolmistress.txt",
]

# 4. Download and concatenate content from all text files.
# Downloads are best-effort: a file that cannot be fetched is reported and
# skipped, and the remaining files are still concatenated. Every downloaded
# text is followed by a single newline so consecutive stories do not run
# together.
downloaded_texts = []
for file in files:
    download_url = f"{base_url}{file}"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(download_url, timeout=30)
    except requests.RequestException as exc:
        # Treat transport errors like a non-200 status: report and move on.
        print(f'Failed to download {file}. Error: {exc}')
        continue
    if response.status_code == 200:
        downloaded_texts.append(response.text + "\n")
    else:
        print(f'Failed to download {file}. Status code: {response.status_code}')

# Join once instead of growing a string with += inside the loop.
file_content = "".join(downloaded_texts)

# 5. Initialize Chroma client and obtain the collection
client_chroma = chromadb.Client()

# Reuse the collection if it already exists, otherwise create it.
# get_or_create_collection does this in one call. Testing
# `"name" in client_chroma.list_collections()` is not reliable because,
# depending on the chromadb release, list_collections() yields Collection
# objects rather than plain name strings, so the membership test can be
# False for an existing collection and create_collection() would then raise.
collection111 = client_chroma.get_or_create_collection("documents_collection111")

# 6. Fit a TF-IDF vectorizer and turn the text into a dense embedding.
# The corpus handed to the vectorizer is a one-element list: the whole
# concatenated text is treated as a single document, so `embeddings` has
# exactly one row.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([file_content])
embeddings = tfidf_matrix.toarray()

# 7. Random UUID used as the identifier of the stored document
document_id = str(uuid.uuid4())

# 8. Store the text, its metadata and its embedding in the Chroma collection
document_record = {
    "ids": [document_id],
    "documents": [file_content],
    "metadatas": [{"source": "file_content"}],
    "embeddings": embeddings,
}
collection111.add(**document_record)

# 9. Define the user query (character name) and embed it with the same
# fitted vectorizer that produced the stored document embeddings.
query = "Jon Snow"
query_embedding = vectorizer.transform([query]).toarray()

# 10. Perform similarity search in Chroma
results = collection111.query(
    query_embeddings=query_embedding,
    n_results=1
)

# 11. Check if any relevant document is found.
# Chroma groups results per query: results['documents'] is a list with one
# inner list of documents for each query embedding. Only one query is sent,
# so the hits live in the first inner list. Checking the outer list for
# emptiness would never detect "no hits", and taking the outer [0] would
# yield a list instead of the document text.
documents_per_query = results.get('documents') or [[]]
matched_documents = documents_per_query[0]
if not matched_documents:
    print(f"No relevant documents found for character: {query}")
    relevant_document = "No relevant information found."
else:
    # 12. Extract the most relevant document content (a plain string)
    relevant_document = matched_documents[0]

# 13. Build the prompt: retrieved context first, then the character name and
# the list of JSON keys the model is asked to return.
prompt = (
    "Using the following information, answer the query:\n"
    f"{relevant_document}\n\n"
    f"Character name: {query}\n"
    "Return a JSON object with the following keys: "
    "name, storyTitle, summary, relations, characterType."
)

# 14. Send a single user message to Mistral, requesting a JSON object reply
messages = [{"role": "user", "content": prompt}]
json_response_format = {"type": "json_object"}

chat_response = client.chat.complete(
    model=model,
    messages=messages,
    response_format=json_response_format,
)

# 15. Print the first choice's content, or a notice when it is empty
response_content = chat_response.choices[0].message.content
print(response_content or f"No response generated for character: {query}")

Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting mistralai
  Downloading mistralai-1.2.5-py3-none-any.whl.metadata (27 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=