In [60]:
import os
import glob
import matplotlib.pyplot as plt

from dotenv import load_dotenv
from openai import OpenAI

import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter)

import umap
import umap.umap_ as umap


import os
from collections import Counter
import tiktoken

import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [61]:
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_key)

In [62]:
folder_path = os.getenv("20casedocs")
all_texts = []

for filepath in glob.glob(folder_path):
    with open(filepath, "r", encoding="utf-8") as f:
        text = f.read().strip()
        if text:
            all_texts.append({"text": text, "source": os.path.basename(filepath)})

IsADirectoryError: [Errno 21] Is a directory: '/Users/temit/Documents/Project/coding/AILA_2019_dataset_size20/small_casedocs'

In [5]:


# Initialize OpenAI tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")  # Correct for text-embedding-ada-002

# Single optimized splitter configuration
optimized_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n\n", "\n\n", "\n", ". ", " ", ""],  # Prioritize document structure
    chunk_size=8100,  # 8191 max - 91 safety buffer
    chunk_overlap=500,  # Generous overlap for context preservation
    length_function=lambda text: len(tokenizer.encode(text)),  # Exact token count
    is_separator_regex=False
)

chunked_texts = []
for doc_index, entry in enumerate(all_texts):
    # Single-pass splitting with token awareness
    chunks = optimized_splitter.split_text(entry["text"])
    
    # Add chunks with metadata
    for chunk_index, chunk in enumerate(chunks):
        chunked_texts.append({
            "id": f"doc_{doc_index}_chunk_{chunk_index}",
            "text": chunk,
            "metadata": {
                "source": entry["source"],
                "token_count": len(tokenizer.encode(chunk))  # Optional but useful
            }
        })

print(f"Total chunked segments: {len(chunked_texts)}")
print(f"Max tokens in any chunk: {max(len(tokenizer.encode(chunk['text'])) for chunk in chunked_texts)}")

Total chunked segments: 36
Max tokens in any chunk: 8016


In [6]:
chunked_texts[0:2]

[{'id': 'doc_0_chunk_0',
  'text': 'West Bengal State Electricity Board and Others v Desh Bandhu Ghosh and Others\nSupreme Court of India\n\n26 February 1985\nCivil Appeal No. 562 of 1985\nThe Judgment was delivered by : O. Chinnappa Reddy, J.\nSpecial leave granted.\n1.  The West Bengal State Electricity Board is the principal appellant in this appeal by special leave which we have just now granted. The first respondent, a permanent employee of the West Bengal State Electricity Board, filed the writ petition out of which the appeal arises in the Calcutta High Court to quash an order dated march 22, 1984 of the Secretary, West Bengal State Electricity Board terminating his services as Deputy Secretary with immediate effect on payment of three month\'s salary in lieu of three month\'s notice. The order gave no reasons for terminating the services of the respondent and there was nothing in the order which could possibly be said to attach any stigma to the respondent. Apparently the order

In [9]:
# Initialize OpenAI embedding function
embedding_function = OpenAIEmbeddingFunction(
    api_key=openai_key,
    model_name="text-embedding-ada-002"
)

# Initialize Chroma client/collection
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection(
    name="case-docs-collection",
    embedding_function=embedding_function  # Attach the OpenAI embedder
)

# Prepare data for Chroma (same as before)
ids = [item["id"] for item in chunked_texts]
documents = [item["text"] for item in chunked_texts]
metadatas = [item["metadata"] for item in chunked_texts]

# Add to Chroma collection
chroma_collection.add(
    ids=ids,
    documents=documents,
    metadatas=metadatas
)

print("Number of documents in collection:", chroma_collection.count())

Number of documents in collection: 36


START OF RAG SYSTEM

In [11]:
def augment_query_generated(user_query, model="gpt-3.5-turbo"):
    system_prompt = """You are a helpful expert research assistant. 
    Provide a plausible example answer to the user's query as if you found it in a case document."""
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )
    content = response.choices[0].message.content
    return content



In [52]:
original_query = "I believe I was wrongfully terminated from my job. What can I do?"
hypothetical_answer = augment_query_generated(original_query)

joint_query = f"{original_query} {hypothetical_answer}"

In [53]:
# Now query the Chroma collection
results = chroma_collection.query(
    query_texts=[joint_query],
    n_results=3,
    include=["documents", "embeddings", "metadatas"]
)
retrieved_documents = results["documents"][0]
retrieved_metadata = results["metadatas"][0]

In [54]:
retrieved_metadata

[{'source': 'C14.txt', 'token_count': 7798},
 {'source': 'C14.txt', 'token_count': 7941},
 {'source': 'C9.txt', 'token_count': 1354}]

In [None]:
# Retrieve all embeddings from the collection
all_data = chroma_collection.get(include=["embeddings", "metadatas"])
all_embeddings = all_data["embeddings"]

# Fit UMAP on all embeddings
umap_transform = umap.UMAP(random_state=0, transform_seed=0).fit(all_embeddings)
projected_dataset_embeddings = umap_transform.transform(all_embeddings)

# For the retrieved embeddings
retrieved_embeddings = results["embeddings"][0]
original_query_embedding = embedding_function([original_query])
augmented_query_embedding = embedding_function([joint_query])

projected_original_query_embedding = umap_transform.transform(original_query_embedding)
projected_augmented_query_embedding = umap_transform.transform(augmented_query_embedding)
projected_retrieved_embeddings = umap_transform.transform(retrieved_embeddings)

In [None]:
plt.figure(figsize=(50, 50))

# Plot entire dataset in gray
plt.scatter(
    projected_dataset_embeddings[:, 0],
    projected_dataset_embeddings[:, 1],
    s=10, 
    color="gray", 
    alpha=0.5, 
    label="All Data Chunks"
)

# Plot retrieved docs in green circles
plt.scatter(
    projected_retrieved_embeddings[:, 0],
    projected_retrieved_embeddings[:, 1],
    s=100,
    facecolors="none",
    edgecolors="green",
    label="Retrieved Chunks"
)

# Plot original and augmented query points
plt.scatter(
    projected_original_query_embedding[:, 0],
    projected_original_query_embedding[:, 1],
    s=150,
    marker="X",
    color="red",
    label="Original Query"
)
plt.scatter(
    projected_augmented_query_embedding[:, 0],
    projected_augmented_query_embedding[:, 1],
    s=150,
    marker="X",
    color="orange",
    label="Augmented Query"
)

plt.gca().set_aspect("equal", "datalim")
plt.title(f"UMAP Projection for Query: {original_query}")
plt.axis("off")
plt.legend()
plt.show()

In [55]:
def get_most_frequent_source_context(metadata_list, docs_directory):
    """Identify the most frequent source file and return its full content as context"""
    # Count source occurrences
    source_counts = Counter([item['source'] for item in metadata_list])
    most_common_source = source_counts.most_common(1)[0][0]
    
    # Get full document path
    doc_path = os.path.join(docs_directory, most_common_source)
    
    # Read the entire file
    with open(doc_path, 'r', encoding='utf-8') as f:
        full_text = f.read()
    
    # Initialize tokenizer
    tokenizer = tiktoken.get_encoding("cl100k_base")
    
    # Split if exceeds token limit (with safe buffer)
    max_tokens = 8191
    safety_buffer = 91
    tokens = tokenizer.encode(full_text)
    
    if len(tokens) > max_tokens:
        # Split with maximum possible chunk size and overlap
        chunk_size = max_tokens - safety_buffer
        overlap = 500
        
        chunks = []
        for i in range(0, len(tokens), chunk_size - overlap):
            chunk = tokens[i:i + chunk_size]
            chunks.append(tokenizer.decode(chunk))

        # NEW: Automatically use half the chunks (rounded down)
        half_chunks = len(chunks) // 2
        if half_chunks < 1:  # Ensure at least 1 chunk
            half_chunks = 1
            
        selected_chunks = chunks[:half_chunks]
        selected_tokens = sum(len(tokenizer.encode(c)) for c in selected_chunks)

        return {
            "source": most_common_source,
            "content": selected_chunks,
            "chunk_count": half_chunks,
            "total_tokens": selected_tokens,
            "original_chunk_count": len(chunks),  # For reference
            "original_total_tokens": len(tokens)   # For reference
        }
    else:
        return {
            "source": most_common_source,
            "content": [full_text],
            "chunk_count": 1,
            "total_tokens": len(tokens)
        }
    
# Example usage
metadata = retrieved_metadata  # Your metadata list
docs_directory = folder_path  # Update this path

# Usage remains the same
context = get_most_frequent_source_context(metadata, folder_path)

# The joined context will automatically use half the chunks
final_context = "\n\n[CONTEXT BREAK]\n\n".join(context['content'])

print(f"Using {context['source']} as primary context")
print(f"Document split into {context['chunk_count']} chunks")
print(f"Total tokens: {context['total_tokens']}")

Using C14.txt as primary context
Document split into 3 chunks
Total tokens: 24300


In [56]:
context

{'source': 'C14.txt',
 'content': ['Central Inland Water Transport Corporation Limited and Another v Brojo Nath Ganguly and Another\nSupreme Court of India\n\n6 April 1986\nC.A. No. 4412 and 4413 of 1985\nThe Judgment was delivered by : D. P. Madon, J.\n1. These Appeals by Special Leave granted by this Court raise two questions of considerable importance to Government companies and their employees including their officers. These questions are:\n1) Whether a Government company as defined in s. 617 of the Companies Act, 1956, is "the State" within the meaning of Art. 12 of the Constitution?\n2) Whether an unconscionable term in a contract of employment is void u/s. 23 of the Indian Contract Act, 1872, as being opposed to public policy and, when such a term is contained in a contract of employment entered into with a Government company, is also void as infringing Art. 14 of the Constitution in case a Government company is "the State" under Art. 12 of the Constitution?\n2. Although the rec

In [57]:
final_context

'Central Inland Water Transport Corporation Limited and Another v Brojo Nath Ganguly and Another\nSupreme Court of India\n\n6 April 1986\nC.A. No. 4412 and 4413 of 1985\nThe Judgment was delivered by : D. P. Madon, J.\n1. These Appeals by Special Leave granted by this Court raise two questions of considerable importance to Government companies and their employees including their officers. These questions are:\n1) Whether a Government company as defined in s. 617 of the Companies Act, 1956, is "the State" within the meaning of Art. 12 of the Constitution?\n2) Whether an unconscionable term in a contract of employment is void u/s. 23 of the Indian Contract Act, 1872, as being opposed to public policy and, when such a term is contained in a contract of employment entered into with a Government company, is also void as infringing Art. 14 of the Constitution in case a Government company is "the State" under Art. 12 of the Constitution?\n2. Although the record of these Appeals is voluminous,

In [58]:
def generate_response(question, context_data):
    # Unpack the context data
    source_file = context_data['source']
    context_chunks = context_data['content']
    
    # Create document-aware context string
    context_str = f"FULL TEXT OF {source_file}:\n\n" + "\n\n[CONTEXT BREAK]\n\n".join(context_chunks)
    
    # Enhanced prompt with document structure awareness
    prompt = f"""You are a senior legal analyst helping assess cases from the full text of {source_file}. 
Always use THIS EXACT structure:

1. **Empathetic Opening**: Start with "I understand how [adjective] this [specific aspect]" using context
2. **Case Analysis**: Use EXACTLY these headers/format:
   ------------------------------------------------
   **Case Context**: [2-3 sentences from full text]
   **Key Precedent**: [Specific legal rule from document]
   ------------------------------------------------
   **Simplified Breakdown**:
   - [Header 1]: • [Relevant detail from full text]
   - [Header 2]: • [Document-specific procedure]
   ------------------------------------------------
   **Actionable Path**:
   1. [Step matching document outcomes]
   2. [Step using document's legal framework]

**Full Document Context**:
{context_str}

**RULES**:
- Use "you/your" when explaining implications
- Bold headers but NO other markdown
- If context breaks exist, treat as single document
- Never invent details - say "The document shows..." when uncertain"""

    response = client.chat.completions.create(
        model="gpt-4-turbo",  # Recommended for better document comprehension
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": question},
        ],
        temperature=0.3,  # Increased accuracy for legal text
        # max_tokens=1500
    )

    return response.choices[0].message.content

# Usage with previous context data
final_response = generate_response(original_query, context)
print(final_response)

I understand how distressing wrongful termination can be. Let's assess your situation based on the legal framework provided in the document.

------------------------------------------------
**Case Context**: The Central Inland Water Transport Corporation Limited, a government company, was involved in a legal dispute where employees challenged their termination under Rule 9(i) of the Corporation's Service Rules, which allowed termination without cause but with compensation.

**Key Precedent**: The Supreme Court held that such termination clauses are void if they are unconscionable and violate Article 14 of the Constitution, which ensures equality before the law.
------------------------------------------------

**Simplified Breakdown**:
- **Rule Analysis**: • Rule 9(i) allowed termination without cause, which was challenged as being unconscionable.
- **Legal Outcome**: • The Supreme Court struck down such clauses when used by entities considered as "the State", ensuring protection unde