In [None]:
import os
# Set USE_TF=0 up front, ahead of the later cells that import `transformers`.
os.environ["USE_TF"] = "0"
# This disables TensorFlow usage in transformers and related libraries.


In [None]:
#pip install docling langchain langchain-docling langchain-huggingface langchain-community sentence-transformers chromadb sqlalchemy psycopg2-binary spacy fastapi uvicorn gradio

In [None]:
#pip install --upgrade "transformers>=4.40"


In [None]:
#!python -m spacy download en_core_web_sm
#%pip uninstall numpy -y
#%pip install "numpy<2"

In [None]:
import torch
import numpy as np

# Environment report: CUDA visibility first, then the versions of the two
# numeric libraries imported above. `gpu_available` stays a module-level name.
gpu_available = torch.cuda.is_available()

print(
    f"GPU Available: {gpu_available}",
    f"PyTorch version: {torch.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

In [None]:
import torch
# One line per fact about the PyTorch/CUDA setup; the device name is only
# queried when CUDA is actually available.
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"CUDA version: {torch.version.cuda}",
    f"GPU Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}",
    f"Number of GPUs: {torch.cuda.device_count()}",
    sep="\n",
)

In [None]:
import warnings


# --- Suppress warnings ---
# Install a blanket "ignore" filter, then replace `warnings.warn` itself with a
# no-op so that direct calls to it are dropped as well.
warnings.filterwarnings("ignore")


def warn(*args, **kwargs):
    """Stand-in for `warnings.warn`: accept any arguments and do nothing."""
    return None


warnings.warn = warn

In [None]:
#detect whether a pdf is text or image

from pypdf import PdfReader

def pdf_is_image_heavy(path, min_chars=100):
    """Heuristically decide whether a PDF is mostly images rather than text.

    Pages are read in order and their extractable text is accumulated. The
    PDF counts as image-heavy when, ignoring leading/trailing whitespace,
    fewer than ``min_chars`` characters can be extracted in total.

    Args:
        path: Location of the PDF; passed straight to ``PdfReader``.
        min_chars: Amount of extractable text at which the PDF is treated as
            text-based. Defaults to 100, the cut-off this heuristic has
            always used.

    Returns:
        True when the PDF yields fewer than ``min_chars`` characters of text,
        False otherwise.
    """
    reader = PdfReader(path)
    text = ""
    for pg in reader.pages:
        # extract_text() may return None; treat that as "no text on this page".
        text += pg.extract_text() or ""
        # Further pages can only lengthen the stripped text, so once the
        # threshold is reached the answer is known and the remaining pages
        # need not be parsed.
        if len(text.strip()) >= min_chars:
            return False
    return len(text.strip()) < min_chars  # heuristic: little extractable text -> image-heavy

# Bare last expression: the cell displays the heuristic's result for the sample contract.
pdf_is_image_heavy("my_contract.pdf")

In [None]:
from docling.document_converter import DocumentConverter
# Initialize converter
converter = DocumentConverter()

# Convert a PDF
# (same sample file that the DoclingLoader cell below loads again)
result = converter.convert("my_contract.pdf")

# Extract the DoclingDocument (structured representation)
doc = result.document  

# Show raw structured output
# NOTE(review): this prints the object's text representation in full, which may
# be very long for a real contract - consider previewing a slice instead.
print(doc)


In [None]:
from langchain_docling import DoclingLoader

# Load the sample contract through the LangChain Docling loader. `docs` is
# reused by the chunking cell further down.
loader = DoclingLoader("my_contract.pdf")
docs = loader.load()

# Preview one loaded document. Index 6 remains the preferred sample, but it is
# clamped to the last available document so that a short PDF no longer raises
# IndexError here.
PREVIEW_INDEX = 6
if docs:
    print(docs[min(PREVIEW_INDEX, len(docs) - 1)].page_content)
else:
    print("No documents were loaded from my_contract.pdf")


In [None]:
import re
import spacy

# NOTE(review): `nlp` is loaded here but not referenced by any cell visible in
# this notebook; rule_based_extract below relies on `re` only.
nlp = spacy.load("en_core_web_sm")

#rule based (regex, spacy) extraction of entities
def rule_based_extract(text):
    """Extract contract fields from plain text with regular expressions.

    Two fields are attempted:

    * ``effective_date`` - a date shaped like ``January 1, 2024`` that follows
      the words "effective date" (optionally separated by ``:`` or ``-``).
    * ``party_names`` - the two names in a sentence shaped like
      "This agreement ... between X and Y.".

    Args:
        text: Plain text of the document.

    Returns:
        A ``(results, confidence)`` tuple. ``results`` maps each field that was
        found to its value (``party_names`` is a two-item list); fields that
        were not found are absent. ``confidence`` always contains both field
        names, set to ``"high"`` on a match and ``"low"`` otherwise.
    """
    results = {}
    confidence = {}

    # Example: effective date
    # Whitespace inside the date is matched flexibly (`\s+`, `,\s*`) because
    # text extracted from PDFs often carries doubled or missing spaces.
    date_match = re.search(
        r"(?:effective\s+date\s*[:\-]?\s*)(\w+\s+\d{1,2},\s*\d{4})",
        text,
        re.IGNORECASE,
    )
    if date_match:
        results["effective_date"] = date_match.group(1)
        confidence["effective_date"] = "high"
    else:
        confidence["effective_date"] = "low"

    # Example: party names (look for 'between X and Y')
    # `.*?` is deliberately lazy: a greedy `.*` skips ahead to the LAST
    # "between" on the line, so a later clause such as "disputes between the
    # parties and their agents." would be reported as the party names.
    parties_match = re.search(
        r"This agreement.*?between\s+(.+?)\s+and\s+(.+?)\.",
        text,
        re.IGNORECASE,
    )
    if parties_match:
        results["party_names"] = [parties_match.group(1), parties_match.group(2)]
        confidence["party_names"] = "high"
    else:
        confidence["party_names"] = "low"

    return results, confidence




In [None]:
from transformers import pipeline

# Load the NER pipeline with a legal-friendly model
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
    framework="pt",
)


def huggingface_fallback(text):
    """Run the module-level `ner` pipeline on `text` and return its output as-is.

    No post-processing is applied; reshape the entities here if a different
    output format is needed.
    """
    return ner(text)

In [None]:
#%pip install langchain-openai

#llm fallback extraction of entities
from langchain_openai import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
from langchain.prompts import PromptTemplate
import json

# Chat prompt used by llm_fallback: a fixed system instruction plus a human
# message whose only template variable is `{doc_text}`.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a legal document parser. Extract fields in JSON."),
    ("human", "Document text:\n\n{doc_text}\n\nReturn JSON with keys: party_names, effective_date, termination_date, jurisdiction, signatories.")
])

def llm_fallback(text):
    """Ask the chat model to extract contract fields and parse its JSON reply.

    The reply is parsed as JSON directly. If that fails and the reply is a
    string, the span from the first ``{`` to the last ``}`` is parsed instead,
    which recovers replies wrapped in a Markdown code fence or surrounded by
    explanatory prose.

    Args:
        text: Document text substituted into the prompt's ``{doc_text}`` slot.

    Returns:
        The parsed JSON value (normally a dict of extracted fields), or
        ``{"error": "Failed to parse JSON"}`` when no JSON could be parsed.
    """
    # NOTE(review): `llm` is read from module scope but is not assigned in any
    # cell visible in this notebook - it must be defined (e.g. a ChatOpenAI
    # instance) before this function is called, otherwise this raises NameError.
    chain = prompt | llm
    response = chain.invoke({"doc_text": text})

    raw = getattr(response, "content", None)
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        # Not valid JSON as-is; fall through to the recovery attempt below.
        pass

    if isinstance(raw, str):
        start, end = raw.find("{"), raw.rfind("}")
        if start != -1 and end > start:
            try:
                return json.loads(raw[start:end + 1])
            except ValueError:
                pass

    return {"error": "Failed to parse JSON"}
    

In [None]:
#hybrid controller
def hybrid_extraction(text, threshold="high"):
    """Combine rule-based extraction with the LLM fallback.

    The rules run first. Only when at least one field comes back with "low"
    confidence is the LLM consulted, and its output is then used to fill in
    fields the rules did not produce. Values the rules did extract are never
    overwritten by LLM output.

    Args:
        text: Document text to extract fields from.
        threshold: Accepted for interface compatibility; the function does not
            currently consult it.

    Returns:
        A dict of extracted fields. If the LLM fallback reported a failure,
        its ``"error"`` entry is carried into the dict, as before.
    """
    results, confidence = rule_based_extract(text)

    # If any key has low confidence, use LLM fallback
    if not any(level == "low" for level in confidence.values()):
        return results

    llm_results = llm_fallback(text)
    # Valid JSON need not be an object; anything other than a dict cannot be
    # merged field by field, so the rule-based results are returned unchanged.
    if not isinstance(llm_results, dict):
        return results

    for field, value in llm_results.items():
        # `results` only holds fields the rules matched, so skipping existing
        # keys keeps every rule-based value and lets the LLM fill the gaps.
        if field not in results:
            results[field] = value

    return results

In [None]:
#chunking documents
from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_docs(docs, chunk_size=1000, chunk_overlap=100):
    """Split documents into overlapping chunks for embedding.

    Args:
        docs: Documents to split; passed to the splitter's
            ``split_documents``.
        chunk_size: Maximum chunk length handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000.
        chunk_overlap: Amount of content shared between neighbouring chunks.
            Defaults to 100.

    Returns:
        Whatever ``split_documents`` returns for ``docs``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,  # context overlap between neighbouring chunks
    )
    return splitter.split_documents(docs)

In [None]:
# Chunk the documents loaded by the DoclingLoader cell and compare the counts
# before and after splitting.
chunks = chunk_docs(docs)

print(f"Original docs: {len(docs)}")
print(f"Chunked docs: {len(chunks)}")

# Show first chunk
# (first 200 characters of its text, then its metadata; indexing [0] assumes
# that at least one chunk was produced)
print(chunks[0].page_content[:200])
print(chunks[0].metadata)

In [None]:
# Embedding model built on Hugging Face transformers' AutoModel, to avoid sentence-transformers dependency issues
from transformers import AutoTokenizer, AutoModel
import torch

class HFCustomEmbeddings:
    """Text embeddings computed directly with ``transformers``' AutoModel.

    Exposes ``embed_documents`` and ``embed_query`` so that an instance can be
    passed wherever LangChain expects an embeddings object (e.g. a vector
    store's ``embedding`` argument). Both return plain Python lists of floats;
    ``embed_text`` returns the underlying numpy vector.
    """

    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2"):
        """Load the tokenizer and model, and move the model to CUDA if available.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def embed_text(self, text: str):
        """Embed a single string.

        The text is tokenized (with truncation), run through the model without
        gradient tracking, and the last hidden state is averaged over dim 1.

        Returns:
            The first row of the pooled output as a numpy array on the CPU.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings[0].cpu().numpy()

    # LangChain-facing methods. They return lists of floats rather than numpy
    # arrays, matching the types LangChain's embeddings interface documents.
    def embed_documents(self, texts):
        """Embed each text in ``texts``; returns one list of floats per text."""
        return [self.embed_text(t).tolist() for t in texts]

    def embed_query(self, text):
        """Embed a single query string; returns a list of floats."""
        return self.embed_text(text).tolist()




In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

# Suppose chunks is your Docling split output
# Metadata is filtered before indexing (presumably to drop value types the
# vector store cannot hold - confirm against the LangChain docs).
filtered_chunks = filter_complex_metadata(chunks)


# Same model name as the HFCustomEmbeddings default, spelled out explicitly.
emb_model = HFCustomEmbeddings("sentence-transformers/all-mpnet-base-v2")

# NOTE(review): with persist_directory set, re-running this cell likely adds
# the same chunks to the on-disk store again - verify, and clear ./chroma_db
# between runs if duplicates show up in search results.
vectorstore = Chroma.from_documents(
        documents=filtered_chunks,
        embedding=emb_model,
        persist_directory="./chroma_db" #save locally
)

In [None]:
# Testing similarity search with a sample query
query = "What are the deposit amounts?"
# Fetch the 3 stored chunks closest to the query.
results = vectorstore.similarity_search(query, k=3)

# Preview each hit: first 200 characters of text, then its metadata.
for r in results:
    print("----")
    print(r.page_content[:200])
    print(r.metadata)


In [None]:
# Adding a local FLAN-T5 LLM and a RetrievalQA chain on top of the vector store

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Load FLAN-T5
# The tokenizer and the seq2seq model are both loaded from the same checkpoint name.
model_name = "google/flan-t5-large"  #t5-large/t5-tglobal-base ideal if GPU is available
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# HF pipeline
# No device is passed here, so placement is left to the pipeline's default.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)

# Wrap in LangChain
# `local_llm` is also used by process_pdf() in the Gradio cell.
local_llm = HuggingFacePipeline(pipeline=pipe)

# Retriever from your vector store
# k=3: three chunks are retrieved per question.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Build RetrievalQA chain
# NOTE: the Gradio cell below rebinds `qa_chain` to None until a PDF is processed there.
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    retriever=retriever,
    chain_type="stuff"  # simplest: just concatenate retrieved chunks
)



In [None]:
#gradio ui for prototyping
import gradio as gr

# Chain used by the UI handlers. Reset to None here (replacing the chain built
# in the previous cell); process_pdf() assigns it once a PDF has been processed.
qa_chain = None

def process_pdf(pdf_file):
    """Index an uploaded PDF and point the global `qa_chain` at it.

    Steps: load the PDF with DoclingLoader, filter complex metadata, chunk,
    embed into a Chroma vector store, and build a RetrievalQA chain (k=3,
    "stuff" chain type) on top of `local_llm`.

    Args:
        pdf_file: The value Gradio passes for the file input. Either an object
            exposing the file path as ``.name`` or a plain path string is
            accepted; ``None`` means nothing was uploaded.

    Returns:
        A status message for display in the UI.
    """
    global qa_chain

    # The button can be clicked before any file is chosen, in which case the
    # input value is None and there is nothing to process.
    if pdf_file is None:
        return "Please upload a PDF first."

    # Use `.name` when the upload object has one; otherwise treat the value
    # itself as the path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # load
    loader = DoclingLoader(pdf_path)
    docs = loader.load()

    # filter metadata
    filtered_docs = filter_complex_metadata(docs)

    # chunk
    chunks = chunk_docs(filtered_docs)
    if not chunks:
        # Nothing to embed; leave any previously built chain untouched.
        return "No text could be extracted from this PDF."

    # embed + vectorstore
    # NOTE(review): the embedding model is re-created on every upload, and the
    # store is built without a persist directory - confirm whether successive
    # uploads end up sharing one in-memory collection.
    emb_model = HFCustomEmbeddings()
    vectorstore = Chroma.from_documents(chunks, emb_model)

    # QA chain
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    qa_chain = RetrievalQA.from_chain_type(llm=local_llm, retriever=retriever, chain_type="stuff")
    return "PDF processed. You can now ask questions."
def answer_query(query):
    """Answer `query` with the active QA chain.

    Returns the chain's answer, or a prompt to upload a PDF when the global
    `qa_chain` has not been set yet.
    """
    if qa_chain is not None:
        return qa_chain.run(query)
    return "Please upload and process a PDF first."


    


In [None]:
# Gradio UI: upload and process a PDF, then ask questions about it.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Legal Document Q&A")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Process Document")
    status = gr.Textbox(label="Status")
    query = gr.Textbox(label="Ask a question")
    answer = gr.Textbox(label="Answer")

    upload_btn.click(fn=process_pdf, inputs=pdf_input, outputs=status)
    # The question handler defined in the previous cell is `answer_query`;
    # the name `ask_question` used here before does not exist and raised
    # NameError while the UI was being built.
    query.submit(fn=answer_query, inputs=query, outputs=answer)

demo.launch()