## Installations

In [None]:
!pip install langchain openai faiss-cpu sentence-transformers gradio PyPDF2 spacy
!python -m spacy download en_core_web_sm
!pip install -U langchain-community

## Importing the Libraries

In [2]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import requests
import gradio as gr

## Set Hugging Face API key

In [3]:
# Placeholder value: substitute a real token locally and avoid committing it.
# NOTE(review): no code visible in this notebook reads this constant; confirm
# whether it is still needed.
HUGGING_FACE_API_KEY = "huggingFaceApiKey"

## Global variables


In [4]:
# FAISS index built from the most recent upload (assigned in
# upload_and_process); stays None until a document has been processed.
vectorstore = None
# Accumulates every chunk produced by preprocess_documents across uploads;
# ask_question_with_filters filters this list by chunk metadata.
stored_documents = []

## Utility Functions


In [5]:
def process_file(file):
    """Extract plain text from an uploaded ``.pdf`` or ``.txt`` file.

    Args:
        file: The uploaded file. Either an object with a ``name`` attribute
            (and optionally a ``read()`` method) or a plain path string.
            ``None`` is treated as "nothing uploaded".
            NOTE(review): the exact object Gradio passes depends on the
            Gradio version; both shapes are handled here.

    Returns:
        The extracted text, or ``None`` when no file was given or the
        extension is neither ``.pdf`` nor ``.txt``.

    Raises:
        UnicodeDecodeError: If a ``.txt`` upload is not valid UTF-8.
    """
    if file is None:
        return None

    # Path-like uploads may not carry ``name``; fall back to their string form.
    path = str(getattr(file, "name", None) or file)
    lowered = path.lower()  # match ".PDF" / ".TXT" as well

    if lowered.endswith(".pdf"):
        from PyPDF2 import PdfReader

        pdf_reader = PdfReader(file)
        # Defensive: substitute "" if a page yields no text so join() cannot
        # fail on a non-string value.
        return "\n".join((page.extract_text() or "") for page in pdf_reader.pages)

    if lowered.endswith(".txt"):
        if hasattr(file, "read"):
            raw = file.read()
        else:
            # Only a path is available: read the bytes from disk ourselves.
            with open(path, "rb") as handle:
                raw = handle.read()
        # A handle opened in text mode already returns str.
        if isinstance(raw, (bytes, bytearray)):
            return raw.decode("utf-8")
        return raw

    return None


def preprocess_documents(file_text, metadata=None, chunk_size=1000, chunk_overlap=100):
    """Split text into chunks and record them in ``stored_documents``.

    Args:
        file_text: Full text of the document to split.
        metadata: Optional dict attached to the source ``Document``; an empty
            dict is used when omitted or falsy.
        chunk_size: Value passed to the splitter as ``chunk_size``
            (default 1000, the previously hard-coded value).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``
            (default 100, the previously hard-coded value).

    Returns:
        The list of chunks produced by the splitter. The same chunks are also
        appended to the module-level ``stored_documents`` list.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    document = Document(page_content=file_text, metadata=metadata or {})
    chunks = text_splitter.split_documents([document])
    # extend() mutates the module-level list in place, so no ``global``
    # declaration is required.
    stored_documents.extend(chunks)
    return chunks


# Summarization pipelines keyed by model name, so each model is constructed
# once per session instead of on every question.
_SUMMARIZER_CACHE = {}


def summarize_text_with_hugging_face(text, model_name="facebook/bart-large-cnn",
                                     max_length=200, min_length=100):
    """Summarize ``text`` with a Hugging Face summarization pipeline.

    Args:
        text: Text to summarize.
        model_name: Model identifier passed to ``pipeline``.
        max_length: Passed to the pipeline call as ``max_length``.
        min_length: Passed to the pipeline call as ``min_length``.

    Returns:
        The ``summary_text`` of the first pipeline result, or ``""`` when
        ``text`` is empty or whitespace-only (no model is loaded in that case).
    """
    # Nothing to summarize: skip the model load entirely.
    if not text or not text.strip():
        return ""

    summarizer = _SUMMARIZER_CACHE.get(model_name)
    if summarizer is None:
        # Imported lazily so the dependency is only needed when summarizing.
        from transformers import pipeline

        summarizer = pipeline("summarization", model=model_name)
        _SUMMARIZER_CACHE[model_name] = summarizer

    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        truncation=True,
    )
    return summary[0]['summary_text']

def create_vectorstore(documents):
    """Build a new FAISS index over ``documents``.

    A fresh index is constructed on every call using the
    ``sentence-transformers/all-mpnet-base-v2`` embedding model; nothing is
    merged into a previously built index.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
    )
    index = FAISS.from_documents(documents, embedding_model)
    return index

## Gradio App Functions


In [6]:
def upload_and_process(file, jurisdiction, date):
    """Handle a file upload: extract text, tag it, chunk it, and index it.

    Args:
        file: The uploaded file as delivered by the UI, or ``None`` when
            nothing was selected.
        jurisdiction: Stored under the ``"jurisdiction"`` metadata key.
        date: Stored under the ``"date"`` metadata key.

    Returns:
        A status string for the UI: a success message, or an explanation of
        why the upload could not be processed.

    Side effects:
        On success, rebinds the module-level ``vectorstore`` to an index built
        from this upload's chunks (``preprocess_documents`` also records the
        chunks globally).
    """
    global vectorstore

    # Clicking the button without choosing a file delivers None.
    if file is None:
        return "Please select a file to upload."

    file_text = process_file(file)
    # process_file signals an unsupported extension with None.
    if file_text is None:
        return "Unsupported file type. Please upload a .pdf or .txt file."
    if not file_text.strip():
        return "No text could be extracted from the document."

    metadata = {"jurisdiction": jurisdiction, "date": date}
    documents = preprocess_documents(file_text, metadata)
    # Guard against indexing an empty chunk list.
    if not documents:
        return "No text could be extracted from the document."

    vectorstore = create_vectorstore(documents)
    return "Document uploaded and processed successfully!"


def ask_question_with_filters(query, jurisdiction, date, chat_history=None):
    """Answer ``query`` from stored chunks, optionally filtered by metadata.

    Args:
        query: The user's question.
        jurisdiction: If truthy, only chunks whose ``"jurisdiction"`` metadata
            equals this value are searched.
        date: If truthy, only chunks whose ``"date"`` metadata equals this
            value are searched.
        chat_history: List of ``(query, answer)`` tuples to append to. It is
            mutated in place; a new list is created when omitted.

    Returns:
        ``(chat_history, chat_history)``: the same list twice, matching the
        two outputs (chatbot display and state) wired up in the UI.
    """
    # Use a fresh list per call; a mutable default would be shared between
    # calls and leak earlier conversations.
    if chat_history is None:
        chat_history = []

    # A blank question would otherwise retrieve and summarize arbitrary chunks.
    if not query or not query.strip():
        chat_history.append((query, "Please enter a question."))
        return chat_history, chat_history

    # Filter documents by jurisdiction and date (an empty filter matches all).
    filtered_documents = [
        doc for doc in stored_documents
        if (not jurisdiction or doc.metadata.get("jurisdiction") == jurisdiction)
        and (not date or doc.metadata.get("date") == date)
    ]

    if not filtered_documents:
        chat_history.append((query, "No matching documents found."))
        return chat_history, chat_history

    # The global index supplies the embedding function below; without it the
    # temporary index cannot be built.
    if vectorstore is None:
        chat_history.append(
            (query, "The document index is not ready. Please upload a document first.")
        )
        return chat_history, chat_history

    # Create a temporary vectorstore for the filtered documents only.
    temp_vectorstore = FAISS.from_documents(filtered_documents, vectorstore.embedding_function)
    retriever = temp_vectorstore.as_retriever(search_kwargs={"k": 3})

    # Retrieve relevant documents and merge their text for summarization.
    relevant_documents = retriever.get_relevant_documents(query)
    combined_text = " ".join(doc.page_content for doc in relevant_documents)

    if not combined_text.strip():
        chat_history.append((query, "No relevant passages found."))
        return chat_history, chat_history

    summarized_answer = summarize_text_with_hugging_face(combined_text)

    # Format chatbot response.
    chat_history.append((query, summarized_answer))
    return chat_history, chat_history

## Gradio UI

In [None]:
# Build the Gradio interface: an upload row followed by a question/answer area.
# Components are laid out in the order they are created.
with gr.Blocks() as app:
    gr.Markdown("# Legal Document Assistant")

    # Upload controls: file picker, the metadata values recorded with the
    # upload, the trigger button, and a read-only status box.
    with gr.Row():
        file_input = gr.File(label="Upload Legal Document")
        jurisdiction_input = gr.Textbox(label="Jurisdiction (optional)", placeholder="e.g., US, EU")
        date_input = gr.Textbox(label="Date (optional)", placeholder="e.g., 2022-11-20")
        upload_button = gr.Button("Upload and Process")
        upload_output = gr.Textbox(label="Upload Status", interactive=False)

    # Clicking the button calls upload_and_process and shows its returned
    # status string in the status box.
    upload_button.click(
        upload_and_process,
        inputs=[file_input, jurisdiction_input, date_input],
        outputs=upload_output
    )

    # Question area: chat display, the question box, and two optional filters
    # that are compared against the metadata recorded at upload time.
    chatbot = gr.Chatbot()
    query_input = gr.Textbox(label="Ask a Legal Question")
    filter_jurisdiction = gr.Textbox(label="Filter by Jurisdiction (optional)")
    filter_date = gr.Textbox(label="Filter by Date (optional)")
    # Conversation history passed into and returned from the handler.
    state = gr.State([])
    ask_button = gr.Button("Ask")

    # The handler returns the history twice: once for the chatbot display and
    # once to update the stored state.
    ask_button.click(
        ask_question_with_filters,
        inputs=[query_input, filter_jurisdiction, filter_date, state],
        outputs=[chatbot, state]
    )

# Start the app once the layout has been defined.
app.launch(debug=True)