In [2]:
# Import neccesary libraries
import os
from dotenv import load_dotenv
import chromadb
from openai import OpenAI
from chromadb.utils import embedding_functions

In [3]:
# Load environment variables from .env file
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")

In [4]:
# load the OpenAI embedding function
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_key, model_name="text-embedding-ada-002")

In [5]:
# Initialize the Chroma client with persistence
chroma_client = chromadb.PersistentClient(path="chroma_persistent_storage")
collection_name = "document_qa_collection"
collection = chroma_client.get_or_create_collection(
    name=collection_name, embedding_function=openai_ef)

In [6]:
client = OpenAI(api_key=openai_key)

In [60]:
# # Be sure client is running
# resp = client.chat.completions.create(
#     model="gpt-3.5-turbo",
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {
#             "role": "user",
#             "content": "Who is Jesus",
#         },
#     ],
# )

# print(resp.choices[0].message.content)

In [7]:
# Function to load documents from a directory
def load_documents_from_directory(directory_path):
    print("==== Loading documents from directory ====")
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            with open(
                os.path.join(directory_path, filename), "r", encoding="utf-8"
            ) as file:
                documents.append({"id": filename, "text": file.read()})
    return documents

In [8]:
# Function to split text into chunks
def split_text(text, chunk_size=1000, chunk_overlap=20):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

In [9]:
# Load documents from the directory
directory_path = os.getenv("20casedocs")
documents = load_documents_from_directory(directory_path)

print(f"Loaded {len(documents)} documents")

==== Loading documents from directory ====
Loaded 20 documents


In [10]:
# Split documents into chunks
chunked_documents = []
for doc in documents:
    chunks = split_text(doc["text"])
    #print("==== Splitting docs into chunks ====")
    for i, chunk in enumerate(chunks):
        chunked_documents.append({"id": f"{doc['id']}_chunk{i+1}", "text": chunk})

print(f"Split documents into {len(chunked_documents)} chunks")

Split documents into 716 chunks


In [11]:
# Function to generate embeddings using OpenAI API
def get_openai_embedding(text):
    response = client.embeddings.create(input=text, model="text-embedding-ada-002")
    embedding = response.data[0].embedding
    print("==== Generating embeddings... ====")
    return embedding

In [12]:
# Generate embeddings for the document chunks
for doc in chunked_documents:
    print("==== Generating embeddings... ====")
    doc["embedding"] = get_openai_embedding(doc["text"])

print(doc["embedding"])   

==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embeddings... ====
==== Generating embe

In [13]:
# Upsert documents with embeddings into Chroma
for doc in chunked_documents:
    print("==== Inserting chunks into db;;; ====")
    collection.upsert(
        ids=[doc["id"]], documents=[doc["text"]], embeddings=[doc["embedding"]])

==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserting chunks into db;;; ====
==== Inserti

In [14]:
# Function to query documents
def query_documents(question, n_results=1):
    # query_embedding = get_openai_embedding(question)
    results = collection.query(query_texts=question, n_results=n_results)

    # Extract the relevant chunks
    relevant_chunks = [doc for sublist in results["documents"] for doc in sublist]
    print("==== Returning relevant chunks ====")
    return relevant_chunks

In [16]:
def generate_response(question, relevant_chunks):
    context = "\n\n".join(relevant_chunks)
    prompt = (
        "You are an intelligent legal assistant designed to provide preliminary insights on legal cases. "
        "Your task is to:\n"
        "1. Analyze the user's legal query or scenario and understand the key points of their situation.\n"
        "2. Search the knowledge base for case law documents that are most similar to the user's situation. Stick to only cases that are found in the knowledge base. Don't use any of your pretrained data.\n"
        "3. Identify the relevant case(s) and summarize the following:\n"
        "   - The key facts of the case.\n"
        "   - The legal issues involved.\n"
        "   - The decision or conclusion of the case.\n"
        "4. Present this information in a clear, conversational, and easy-to-understand way, avoiding complex legal jargon.\n"
        "5. If multiple cases are relevant, provide an overview of each and highlight the most applicable one.\n\n"
        "User Query Example:\n[Insert query or scenario description]\n\n"
        "System Response Example:\n"
        "I understand your situation. Here is a case that is similar to yours:\n"
        "1. **Case Name**: [Name of the case]\n"
        "2. **What Happened**: [Brief explanation of the key events and facts of the case]\n"
        "3. **Legal Issue**: [Description of the primary legal question(s) considered in the case]\n"
        "4. **What Was Decided**: [The court's decision and reasoning]\n"
        "5. **How This Relates to Your Case**: [Highlight the similarities and what you can learn from the case]\n\n"
        "Context:\n" + context + "\n\nQuestion:\n" + question
    )

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": prompt,
            },
            {
                "role": "user",
                "content": question,
            },
        ],
    )

    answer = response.choices[0].message
    return answer

In [17]:
question = "I belive I was wrongfully terminated from my job. What can I do?"
if not question:
    raise ValueError("Environment variable 'legal_query' is not set or is empty.")
relevant_chunks = query_documents(question)
answer = generate_response(question, relevant_chunks)

print(answer)

==== Returning relevant chunks ====
ChatCompletionMessage(content="I understand your situation. Here is a case that may be relevant to your scenario:\n\n**Case Name**: Gray v. Powers\n**What Happened**: In the Gray v. Powers case, an employee, Gray, filed a lawsuit against their employer, Powers, alleging wrongful termination. Gray claimed that they were fired without just cause and in violation of their employment contract.\n**Legal Issue**: The primary legal issue involved determining whether Gray's termination was wrongful and if Powers breached the terms of the employment contract.\n**What Was Decided**: The court found that Gray's termination was indeed wrongful as Powers failed to provide valid reasons for the dismissal and did not adhere to the terms of the employment contract. As a result, Powers was held liable for wrongful termination.\n**How This Relates to Your Case**: If you believe you were wrongfully terminated from your job, similar to Gray in this case, you may have le

In [None]:
# # Load the query from a text file
# try:
#     with open("query.txt", "r") as file:
#         question = file.read().strip()  # Read the file and remove any leading/trailing whitespace
# except FileNotFoundError:
#     raise FileNotFoundError("The query.txt file is not found. Please ensure it exists in the correct directory.")

# # Validate that the query is not empty
# if not question:
#     raise ValueError("The query file is empty. Please provide a valid query in the query.txt file.")

# # Process the query
# relevant_chunks = query_documents(question)
# answer = generate_response(question, relevant_chunks)

# # Print the answer
# print(answer)
