In [1]:
!pip install chromadb
!pip install torch
!pip install transformers
!pip install sentence-transformers



In [2]:
import chromadb
from chromadb.utils import embedding_functions
import torch
from sentence_transformers import SentenceTransformer
import json

In [3]:
def parse_conversations(file_path):
    """Read a transcript file and return the question/answer pair on each line.

    A line yields a pair only when it contains ``"User:"`` followed, later on
    the same line, by ``"Assistant:"``. The question is the text between the
    first ``"User:"`` and the first ``"Assistant:"`` after it; the answer is
    everything after that ``"Assistant:"``. Both are stripped of surrounding
    whitespace. Lines missing either marker (including blank lines) are skipped.

    Args:
        file_path: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts in file order.
    """
    conversations = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Anything before the first "User:" marker is discarded.
            _, user_marker, remainder = line.partition("User:")
            if not user_marker:
                continue

            # Only the first "Assistant:" splits the line; later occurrences
            # stay part of the answer text.
            question, assistant_marker, answer = remainder.partition("Assistant:")
            if not assistant_marker:
                continue

            conversations.append({
                "question": question.strip(),
                "answer": answer.strip()
            })
    return conversations

In [4]:
# Parse the transcript file and show how many pairs were found.
conversations = parse_conversations("conversations.txt")
print(f"Number of conversations parsed: {len(conversations)}")

# Fail with a descriptive message instead of a bare IndexError on
# conversations[0] when the file produced no question/answer pairs.
if not conversations:
    raise ValueError(
        "No 'User: ... Assistant: ...' pairs were found in conversations.txt"
    )

print("\nFirst conversation sample:")
print(json.dumps(conversations[0], indent=2))

Number of conversations parsed: 511081

First conversation sample:
{
  "question": "Which of the following risk types are considered in your organization's climate-related risk assessments?",
  "answer": "In our organization, we utilize a standardized enterprise risk management (ERM) process to assess risks at the company level, which includes risks associated with climate change. Every business unit, region, and corporate function participates in this formal process to measure risks using standard evaluation criteria. Through this process, we typically identify 10-15 risks each year that are assigned ERM priorities to manage these risks under the supervision of the Senior Leadership Committee (SLC) and CEO. The EVP, EBS, and Sustainability, who is a member of the SLC, is responsible for climate-related issues. The SLC meets quarterly on major business issues, including Environment, Health, and Safety, with a subset focused on climate change. Our businesses address climate change issue

In [5]:
class BGEEmbeddingFunction():
    """Callable that turns text into normalized SentenceTransformer embeddings.

    Instances are passed to Chroma as the collection's ``embedding_function``
    and are invoked with the texts to embed.

    Args:
        model_name: SentenceTransformer model identifier to load. Defaults to
            ``"BAAI/bge-large-en"``, the model this notebook was written for.
        device: Device string handed to SentenceTransformer. When ``None``
            (the default) the device is chosen automatically: ``"cuda"`` if
            available, otherwise ``"mps"`` if available, otherwise ``"cpu"``.
    """

    def __init__(self, model_name="BAAI/bge-large-en", device=None):
        if device is None:
            if torch.cuda.is_available():
                device = "cuda"
            else:
                # Look the MPS backend up defensively: PyTorch builds that
                # predate it have no `torch.backends.mps` attribute, and a
                # plain attribute access would raise instead of using the CPU.
                mps_backend = getattr(torch.backends, "mps", None)
                if mps_backend is not None and mps_backend.is_available():
                    device = "mps"
                else:
                    device = "cpu"
        print(f"Using device: {device}")
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    # NOTE(review): the parameter is deliberately still called `input` (it
    # shadows the builtin) — presumably required by Chroma's embedding-function
    # call convention; confirm before renaming.
    def __call__(self, input: list[str]) -> list[list[float]]:
        """Embed one string or a list of strings.

        Args:
            input: The texts to embed. A single string is treated as a
                one-element list.

        Returns:
            One embedding (a list of floats) per input text, produced with
            ``normalize_embeddings=True``. An empty input returns ``[]``
            without calling the model.
        """
        if isinstance(input, str):
            input = [input]
        if len(input) == 0:
            return []
        embeddings = self.model.encode(input, normalize_embeddings=True)
        return embeddings.tolist()

In [6]:
def init_embedding_model():
    """Build and return a fresh ``BGEEmbeddingFunction`` with its defaults."""
    embedder = BGEEmbeddingFunction()
    return embedder

In [7]:
# Build the embedding function once for the whole notebook; constructing it
# loads the SentenceTransformer model and prints the selected device.
embedding_function = init_embedding_model()

Using device: mps


In [8]:
from chromadb.config import Settings

def init_chroma(embedding_function, path="./chroma_db", collection_name="conversations"):
    """Open a persistent Chroma client and return the conversations collection.

    Args:
        embedding_function: Callable attached to the collection as its
            ``embedding_function``.
        path: Directory used for on-disk storage. Defaults to
            ``"./chroma_db"``, the location this notebook has always used.
        collection_name: Name of the collection to fetch, created if it does
            not exist yet. Defaults to ``"conversations"``.

    Returns:
        The collection object returned by ``get_or_create_collection``.
    """
    # Create a persistent client. `persist_directory` is derived from `path`
    # so the two settings can never point at different directories.
    # NOTE(review): allow_reset=True is carried over unchanged; nothing in
    # this notebook appears to reset the client — confirm it is still wanted.
    client = chromadb.PersistentClient(
        path=path,
        settings=Settings(
            anonymized_telemetry=False,
            allow_reset=True,
            persist_directory=path,
        )
    )

    # Create or get collection
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_function,
        metadata={"description": "Conversation QA pairs"}
    )

    return collection

In [9]:
# Open (or create) the "conversations" collection stored under ./chroma_db,
# attaching the embedding function built in the earlier cell.
collection = init_chroma(embedding_function)

In [10]:
def add_to_chroma(collection, conversations, batch_size=100):
    """Add question/answer pairs to a collection in fixed-size batches.

    Each conversation becomes one document of the form
    ``"Question: <question>\\nAnswer: <answer>"`` with the id ``conv_<index>``
    (its position in ``conversations``) and the question stored as metadata.
    A batch whose ``collection.add`` call raises is reported and skipped so the
    remaining batches are still attempted.

    Args:
        collection: Object exposing ``add(ids=..., documents=..., metadatas=...)``.
        conversations: Sequence of dicts with ``"question"`` and ``"answer"`` keys.
        batch_size: Maximum number of conversations per ``add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the range below empty and silently add
        # nothing; zero would fail with a less helpful message.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Ceiling division: `len // batch_size + 1` over-counts by one whenever the
    # number of conversations is an exact multiple of batch_size.
    total_batches = (len(conversations) + batch_size - 1) // batch_size

    batch_starts = range(0, len(conversations), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = conversations[start:start + batch_size]

        ids = [f"conv_{j}" for j in range(start, start + len(batch))]
        documents = [f"Question: {conv['question']}\nAnswer: {conv['answer']}" for conv in batch]
        metadatas = [{"question": conv['question']} for conv in batch]

        try:
            collection.add(
                ids=ids,
                documents=documents,
                metadatas=metadatas
            )
        except Exception as e:
            # Best effort: report the failed batch and carry on with the rest.
            print(f"Error adding batch starting at index {start}: {e}")
        else:
            print(f"Added batch {batch_number} of {total_batches}")

In [11]:
# Parse the transcript file into question/answer dicts and report the count.
# NOTE(review): this reads and parses the same "conversations.txt" that an
# earlier cell already loaded into `conversations`; the second pass looks
# redundant — confirm whether it is needed.
print("Parsing conversations...")
conversations = parse_conversations("conversations.txt")
print(f"Parsed {len(conversations)} conversations")

Parsing conversations...
Parsed 511081 conversations


In [None]:
# Insert every parsed conversation into the collection, using
# add_to_chroma's default batch size; progress is printed once per batch.
print("Adding data to ChromaDB...")
add_to_chroma(collection, conversations)

In [None]:
def query_example(collection, query_text, n_results=5):
    """Run a single-text query against ``collection``.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=...)``.
        query_text: The text to search for; it is sent as a one-element list.
        n_results: Number of results to request (default 5).

    Returns:
        Whatever ``collection.query`` returns, unmodified.
    """
    return collection.query(query_texts=[query_text], n_results=n_results)

In [None]:
# Smoke-test retrieval: request a single result for a sample question and
# pretty-print the raw query result as JSON.
query = "Please explain the level of inclusion of the planning process."
results = query_example(collection, query, 1)
print("\nExample query results:")
print(json.dumps(results, indent=2))