In [None]:
#Automated Multistage Legal Extraction Pipeline (LLM + LangChain)
import os, uuid
import pandas as pd
from dotenv import load_dotenv
import re

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.pydantic_v1 import BaseModel, Field

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# os.getenv returns None when the variable is unset; no validation is done here.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Embedding client. NOTE(review): not referenced again in this section —
# presumably consumed by create_vectorstore, which is defined elsewhere; confirm.
embedding_function = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)
# Shared chat model used by the classify_*_with_gpt helpers below.
llm = ChatOpenAI(model="gpt-4o", api_key=OPENAI_API_KEY)

# Splitter applied to the loaded PDF pages in main(): chunk_size=1200 with an
# overlap of 100 (characters, assuming the splitter's default length function),
# trying paragraph breaks first, then line breaks, then spaces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=100,
    separators=["\n\n", "\n", " "]
)

# Schema of the header fields extracted from one communication.
# Every field is a required string; no defaults are declared.
# NOTE(review): the code that instantiates this model is not in this section —
# presumably it is the structured output of the main extraction chain; confirm.
# (Comments are used instead of a class docstring so the model's generated
# schema description is left unchanged.)
class ExtractedInfo(BaseModel):
    communication_submitted_by: str  # "Communication submitted by:" entry
    alleged_victims: str  # "Alleged victim(s):" entry
    state_party: str  # "State Party:" entry
    date_of_communication: str  # "Date of communication:" entry
    date_of_adoption_of_views: str  # "Date of adoption of Views/decision:" entry
    subject_matter: str  # "Subject matter:" entry
    articles_of_convention: str  # overwritten by a regex match in main()
    articles_of_optional_protocol: str  # overwritten by a regex match in main()
    nationality_of_victim: str  # overwritten by classify_nationality_with_gpt in main()


# --- Prompts ---
# Template for the main header-field extraction. It has two placeholders:
# {context} (retrieved document text) and {question} (the instruction).
# The prompt lists six fields and asks for verbatim extraction.
# NOTE(review): main_prompt_template is not used in this section — presumably
# it is consumed by build_main_chain_with_first_page, defined elsewhere; confirm.
MAIN_PROMPT = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
Extract answers exactly as they are written in the document. 
Do NOT paraphrase, shorten, or summarize any part of the response.

Extract the following fields from the context:
- communication_submitted_by (from the field labelled "Communication submitted by:" — do not confuse this with "Alleged victims" or similar entries. Include the name of the legal representative if mentioned.)
- alleged_victims (from "Alleged victim(s):" or "Alleged victim:")
- state_party (from "State Party:")
- date_of_communication (from "Date of communication:")
- date_of_adoption_of_views (from either "Date of adoption of Views:" or "Date of adoption of decision:")
- subject_matter (from "Subject matter:")

{context}

---

Answer the following instruction: {question}
"""
main_prompt_template = ChatPromptTemplate.from_template(MAIN_PROMPT)

# --- Violation Prompt ---
# Template asking for the violation outcome ("violation" / "no violation" /
# "N/A") and the list of violated articles; single placeholder {context}.
# NOTE(review): neither VIOLATION_PROMPT nor violation_prompt_template is
# referenced elsewhere in this section — classify_violation_with_gpt builds its
# own f-string prompt. Confirm whether this template is still needed.
VIOLATION_PROMPT = """
You are an assistant extracting legal decision outcomes from UN human rights communications.

Your task is to determine the *violation outcome* and the *articles violated*.

Extract the following two fields:

1. Violation Outcome:
- "violation" — if the Committee found that a violation of any article occurred.
- "no violation" — if the case proceeded to merits and no violation was found.
- "N/A" — if the case was entirely inadmissible or discontinued before the merits.

2. Articles Violated:
- List the exact articles (e.g., "article 3", "article 24") that were found to be violated.
- If none or not applicable, return "N/A".

Here is the text:
\"\"\"{context}\"\"\"    
"""
violation_prompt_template = ChatPromptTemplate.from_template(VIOLATION_PROMPT)

# --- GPT Violation Call ---
def extract_violation_section(text):
    """
    Return the part of the document where the decision on violation is made.

    The section starts at the first start keyword that occurs in the text
    ("Consideration of the Merits", "Findings", "Conclusions", "Decision") and
    runs up to the first end keyword found after that point
    ("Recommendations", "Follow-up", "Implementation"). Matching is
    case-insensitive. Keyword priority follows list order: the first keyword
    in the list that occurs anywhere wins, even if a later keyword appears
    earlier in the document.

    Args:
        text: Full text of the document.

    Returns:
        The extracted section (start heading included, end heading excluded).
        If no end keyword follows the start, the section runs to the end of
        the text. If no start keyword is present, an empty string.
    """
    start_keywords = [
        "Consideration of the Merits",
        "Findings",
        "Conclusions",
        "Decision"
    ]
    end_keywords = [
        "Recommendations",
        "Follow-up",
        "Implementation"
    ]

    # Lower-case once instead of once per keyword.
    lowered = text.lower()

    # str.find signals "not found" with -1 (never None), so -1 is the sentinel.
    start_idx = next(
        (idx for idx in (lowered.find(kw.lower()) for kw in start_keywords) if idx != -1),
        -1,
    )
    if start_idx == -1:
        return ""

    end_idx = next(
        (idx for idx in (lowered.find(kw.lower(), start_idx) for kw in end_keywords) if idx != -1),
        -1,
    )
    if end_idx == -1:
        # No closing heading: keep everything after the start rather than
        # slicing with -1, which would silently drop the last character.
        return text[start_idx:]

    return text[start_idx:end_idx]


def classify_violation_with_gpt(text):
    """
    Uses GPT to extract the final determination of violation from the document.

    Only the section returned by extract_violation_section(text) is sent to the
    model. The reply is parsed by label ("Violation Outcome:" /
    "Articles Violated:") rather than by line position, so blank lines, extra
    preamble or markdown decoration around the labels do not shift the fields.
    If the reply contains neither label, the first two non-blank lines are used
    as outcome and articles respectively.

    Args:
        text: Full text of the document.

    Returns:
        Tuple (violation_outcome, articles_violated). Either element is "N/A"
        when the section is missing/blank or the value could not be parsed.
    """
    violation_section = extract_violation_section(text)

    if not violation_section.strip():
        return "N/A", "N/A"

    prompt = f"""
    You are a legal assistant specializing in extracting human rights violations.
    Based on the following section, determine the *violation outcome* and list the *articles violated*.

    - "violation" if the Committee explicitly finds a violation.
    - "no violation" if the Committee explicitly finds there was no violation.
    - "N/A" if it is unclear or not stated.

    Also, list the exact articles if there was a violation, otherwise respond with "N/A".

    Here is the text:
    \"\"\"{violation_section}\"\"\"       
    Respond with:
    Violation Outcome: <violation / no violation / N/A>
    Articles Violated: <list of articles or N/A>
    """
    response = llm.predict(prompt)

    # Keep only non-blank lines so an empty line between the two answers
    # cannot be mistaken for a field.
    lines = [ln.strip() for ln in (response or "").splitlines() if ln.strip()]

    labelled = {}
    for ln in lines:
        label, sep, value = ln.partition(":")
        # Tolerate list bullets / markdown emphasis around the label.
        key = label.strip(" *#-\t").lower()
        if sep and key in ("violation outcome", "articles violated"):
            # First occurrence wins if the model repeats a label.
            labelled.setdefault(key, value.strip(" *\t"))

    if labelled:
        violation_outcome = labelled.get("violation outcome", "")
        articles_violated = labelled.get("articles violated", "")
    else:
        # Unlabelled reply: fall back to positional parsing.
        violation_outcome = lines[0] if lines else ""
        articles_violated = lines[1] if len(lines) > 1 else ""

    return violation_outcome or "N/A", articles_violated or "N/A"


# --- Admissibility Extraction ---
def extract_admissibility_section(text):
    """
    Return the admissibility section of the document, or None if absent.

    The section starts at the first start keyword that occurs in the text
    ("Consideration of Admissibility", then plain "Admissibility") and runs up
    to the first end keyword found after that point ("Consideration of the
    Merits", "Merits", "Substantive Issues"). Matching is case-insensitive and
    keyword priority follows list order, not position in the document.

    Args:
        text: Full text of the document.

    Returns:
        The extracted section (start heading included, end heading excluded).
        If no end keyword follows the start, the section runs to the end of
        the text. If no start keyword is present, None.
    """
    # Matching is case-insensitive, so one spelling per heading is enough.
    start_keywords = ["Consideration of Admissibility", "Admissibility"]
    end_keywords = ["Consideration of the Merits", "Merits", "Substantive Issues"]

    # Lower-case once instead of once per keyword.
    lowered = text.lower()

    # str.find signals "not found" with -1 (never None), so -1 is the sentinel.
    start_idx = next(
        (idx for idx in (lowered.find(kw.lower()) for kw in start_keywords) if idx != -1),
        -1,
    )
    if start_idx == -1:
        return None

    end_idx = next(
        (idx for idx in (lowered.find(kw.lower(), start_idx) for kw in end_keywords) if idx != -1),
        -1,
    )
    if end_idx == -1:
        # No merits heading follows: keep the rest of the text instead of
        # slicing with -1, which would silently drop the last character.
        return text[start_idx:]

    return text[start_idx:end_idx]


# --- Nationality Prompt ---
# Template asking for the victim's nationality, answering "Unknown" when it
# cannot be determined; single placeholder {context}.
# NOTE(review): neither NATIONALITY_PROMPT nor nationality_prompt_template is
# referenced elsewhere in this section — classify_nationality_with_gpt builds
# its own f-string prompt. Confirm whether this template is still needed.
NATIONALITY_PROMPT = """
You are an assistant extracting legal decision outcomes from UN human rights communications.

Your task is to determine the *nationality of the victim*.

Extract the following field:

1. Nationality of the Victim:
- Look for direct mentions such as "born in", "citizen of", "national of", "of nationality", or "residing in".
- Search for terms like "originated from," "of [Country] descent," or "originally from."
- If the text mentions parents' nationalities, use that as a clue.
- If multiple nationalities are mentioned, list them.
- If you cannot determine it, respond with "Unknown".

Here is the text:
\"\"\"{context}\"\"\"    
"""
nationality_prompt_template = ChatPromptTemplate.from_template(NATIONALITY_PROMPT)

def classify_nationality_with_gpt(text):
    """
    Ask the shared chat model for the victim's nationality.

    The whole `text` is embedded in the prompt. Only the first line of the
    reply is considered: the "Nationality of the Victim: " prefix is removed
    and the remainder is stripped.

    Returns:
        The extracted nationality, or "Unknown" when the reply is empty or its
        first line is blank after removing the prefix.
    """
    prompt = f"""
    You are a legal assistant specializing in extracting nationality information.
    Based on the following text, determine the *nationality of the victim*.

    Respond with:
    Nationality of the Victim: <nationality or 'Unknown'>

    Clues to look for:
    - born in
    - citizen of
    - national of
    - of nationality
    - residing in
    - originated from
    - of [Country] descent
    - or any direct mention of country names

    Here is the text:
    \"\"\"{text}\"\"\"
    """
    response = llm.predict(prompt)

    # Nothing came back: use the default answer.
    if not response:
        return "Unknown"

    # Only the first line of the (stripped) reply carries the answer.
    first_line = response.strip().split("\n", 1)[0]
    candidate = first_line.replace("Nationality of the Victim: ", "").strip()
    return candidate if candidate else "Unknown"


import re
import pandas as pd

# Template asking whether the communication was brought by the victim's parents
# or guardians ("Yes" / "No" / "Unclear"); single placeholder {context}.
# NOTE(review): neither PARENT_PROMPT nor parent_prompt_template is referenced
# elsewhere in this section — classify_parent_involvement_with_gpt builds its
# own f-string prompt. Confirm whether this template is still needed.
PARENT_PROMPT = """
You are an assistant specializing in legal document analysis.
Your task is to determine if the communication is **brought by the parents or guardians** of the victim.

Instructions:
- Respond with **"Yes"** if the text clearly indicates it was submitted by parents or legal guardians.
- Respond with **"No"** if there is no indication of parents or guardians being the submitters.
- If it is ambiguous or unclear, respond with **"Unclear"**.

Here is the text:
\"\"\"{context}\"\"\"    
"""
parent_prompt_template = ChatPromptTemplate.from_template(PARENT_PROMPT)

def classify_parent_involvement_with_gpt(text):
    """
    Ask the shared chat model whether the claim was brought by parents/guardians.

    The whole `text` is embedded in the prompt. Only the first line of the
    reply is considered: the "Parent Involvement: " prefix is removed and the
    remainder is stripped.

    Returns:
        The model's answer (expected "Yes", "No" or "Unclear"), or "Unclear"
        when the reply is empty or its first line is blank after removing the
        prefix.
    """
    prompt = f"""
    You are a legal assistant specializing in determining if the claim was brought by parents or legal guardians.
    Based on the following text, determine if the claim is **brought by parents**.

    Respond with:
    Parent Involvement: <Yes / No / Unclear>

    Here is the text:
    \"\"\"{text}\"\"\"    
    """
    response = llm.predict(prompt)

    # Nothing came back: use the default answer.
    if not response:
        return "Unclear"

    # Only the first line of the (stripped) reply carries the answer.
    first_line = response.strip().split("\n", 1)[0]
    answer = first_line.replace("Parent Involvement: ", "").strip()
    return answer if answer else "Unclear"


def extract_parent_involvement(text):
    """
    Decide whether the claim was brought by the victim's parents or guardians.

    A fixed list of phrases is searched case-insensitively first; any hit
    short-circuits to "Yes". Only when none of them occurs is the text handed
    to classify_parent_involvement_with_gpt, whose answer is returned as-is.
    """
    explicit_phrases = (
        r"brought by the parents",
        r"filed by the parents",
        r"submitted by the parents",
        r"parents of the victim",
        r"on behalf of their child",
        r"on behalf of the minor",
        r"parents claim",
        r"submitted by the mother",
        r"submitted by the father",
        r"the mother submitted on behalf",
        r"the father submitted on behalf",
        r"guardian of the child",
    )

    # Cheap textual check before spending an LLM call.
    if any(re.search(phrase, text, re.IGNORECASE) for phrase in explicit_phrases):
        return "Yes"

    return classify_parent_involvement_with_gpt(text)


# --- Main Runner ---
def main(pdf_paths):
    """
    Run the extraction pipeline over each PDF and display the combined table.

    For every path: load and split the PDF, build a vector store and the main
    extraction chain, then enrich the chain's result with admissibility,
    violation, nationality, article lists (regex), parent involvement and the
    file name. The rows are collected into a DataFrame that is shown with
    IPython's display(); nothing is returned.

    Relies on create_vectorstore, build_main_chain_with_first_page and
    classify_admissibility_with_gpt, which are not defined in this section.
    """
    results = []

    for pdf_path in pdf_paths:
        print(f"\n--- Processing: {pdf_path} ---")
        pages = PyPDFLoader(pdf_path).load()
        chunks = text_splitter.split_documents(pages)

        # One vector-store directory per document, named after the PDF.
        store_dir = f"vectorstores/{os.path.basename(pdf_path).replace('.pdf', '')}"
        vectorstore = create_vectorstore(chunks, store_dir)

        extraction_chain = build_main_chain_with_first_page(pages, vectorstore)
        record = extraction_chain.invoke("Extract main fields from the document").dict()

        full_text = "\n".join(page.page_content for page in pages)
        admissibility_text = extract_admissibility_section(full_text)

        if not admissibility_text:
            print("❌ Admissibility Section Not Found.")
            record["case_outcome"] = "not found"
            record["reason_for_outcome"] = "N/A"
        else:
            print("✅ Admissibility Section Found.")
            outcome, reason = classify_admissibility_with_gpt(admissibility_text)
            record["case_outcome"] = outcome
            record["reason_for_outcome"] = reason

        violation_outcome, articles_violated = classify_violation_with_gpt(full_text)
        record["violation_outcome"] = violation_outcome
        record["articles_violated"] = articles_violated

        record["nationality_of_victim"] = classify_nationality_with_gpt(full_text)

        # Article lists are read straight from their labelled header lines.
        header_patterns = (
            ("articles_of_convention", r"Articles of the Convention:\s*(.*)"),
            ("articles_of_optional_protocol", r"Articles of the Optional Protocol:\s*(.*)"),
        )
        for field, pattern in header_patterns:
            found = re.search(pattern, full_text, re.IGNORECASE)
            record[field] = found.group(1).strip() if found else "N/A"

        print("🔍 Checking for Parent Involvement...")
        parent_involvement = extract_parent_involvement(full_text)
        record["claim_brought_by_parents"] = parent_involvement
        print(f"✅ Parent Involvement: {parent_involvement}")

        record["filename"] = os.path.basename(pdf_path)
        results.append(record)

    df = pd.DataFrame(results)
    # Imported here, after all documents are processed, as in the notebook.
    from IPython.display import display
    display(df)

# --- Run ---
# Input documents for this run: absolute, machine-specific Windows paths
# (raw strings so the backslashes are taken literally).
pdf_files = [
    r"C:\Users\vyach\Documents\UN Law Research\PDF Reader\RAG_LLM\data\Switzerland_CRC_C_98_D_153_2021.pdf",
    r"C:\Users\vyach\Documents\UN Law Research\PDF Reader\RAG_LLM\data\Belgium_CRC_C_98_D_143_2021.pdf",
]
# Executes the whole pipeline as soon as this cell/module is run.
main(pdf_files)


In [None]:
#Classification of the subject matter of UN CRC
import os
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# os.getenv returns None when the variable is unset; no validation is done here.
api_key = os.getenv("OPENAI_API_KEY")
# OpenAI client used by classify_subject_matter below.
client = OpenAI(api_key=api_key)

# Input table; the code below reads its "subject_matter" column.
df = pd.read_excel("Final_Data_UN_CRC.xlsx", sheet_name="final_extraction_results")

def classify_subject_matter(subject_text):
    """
    Classify a complaint's subject matter into thematic categories via GPT.

    Args:
        subject_text: Cell value from the "subject_matter" column. Missing
            values (NaN/None) and blank strings are not sent to the model.
            Non-string values are converted with str() instead of crashing
            on .strip().

    Returns:
        "N/A" for missing/blank input; otherwise the model's reply (requested
        as a semicolon-separated list of categories), stripped; or "Error" if
        the request or response handling raised.
    """
    if pd.isna(subject_text):
        return "N/A"

    # Excel cells may hold numbers or other non-string values; str() is a
    # no-op for real strings, so the prompt text is unchanged for them.
    subject_text = str(subject_text)
    if not subject_text.strip():
        return "N/A"

    prompt = f"""
You are a legal assistant categorizing human rights complaint topics for the UN Committee on the Rights of the Child.

Classify the following complaint into one or more of the following categories:
- Access to education
- Immigration detention
- Family separation
- Statelessness/nationality
- Violence/abuse
- Health
- Other

Subject matter:
\"\"\"{subject_text}\"\"\"

Respond only with a semicolon-separated list of categories.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a legal analyst."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberately broad: one failed row must not abort the whole batch;
        # the row is marked "Error" so it can be retried later.
        print("❌ Error:", e)
        return "Error"

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# One classification call per row of the "subject_matter" column.
df["subject_matter_category"] = df["subject_matter"].progress_apply(classify_subject_matter)

# Write the input table plus the new category column to a separate workbook.
output_file = "UN_CRC_with_Thematic_Categories.xlsx"
df.to_excel(output_file, index=False)
print(f"✅ File saved to: {output_file}")


In [None]:
#Classification of the reasons for inadmissibility in UN CRC

import os
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# os.getenv returns None when the variable is unset; no validation is done here.
api_key = os.getenv("OPENAI_API_KEY")
# OpenAI client used by classify_rejection_reason below.
client = OpenAI(api_key=api_key)

# Input table; the code below reads its "reason_for_outcome" and
# "case_outcome" columns.
input_file = "Final_Data_UN_CRC.xlsx"
df = pd.read_excel(input_file, sheet_name="final_extraction_results")

def classify_rejection_reason(row):
    """
    Classify the reason for inadmissibility of one case via GPT.

    Args:
        row: Mapping-like row providing "reason_for_outcome" and
            "case_outcome". A non-string reason is converted with str()
            instead of crashing on .strip().

    Returns:
        "N/A" if the reason is missing (NaN/None) or blank;
        "Not applicable" if the outcome (trimmed, case-insensitive) is not
        "inadmissible"; otherwise the model's reply (requested as a single
        category label), stripped; or "Error" if the request or response
        handling raised.
    """
    reason_text = row["reason_for_outcome"]
    outcome = str(row["case_outcome"]).strip().lower()

    if pd.isna(reason_text):
        return "N/A"

    # Excel cells may hold numbers or other non-string values; str() is a
    # no-op for real strings, so the prompt text is unchanged for them.
    reason_text = str(reason_text)
    if not reason_text.strip():
        return "N/A"

    if outcome != "inadmissible":
        return "Not applicable"

    prompt = f"""
You are a legal assistant categorizing the *reason for inadmissibility* in cases brought before the UN Committee on the Rights of the Child.

Classify the reason below into ONE of the following categories:
- Lack of jurisdiction
- Failure to exhaust domestic remedies
- Insufficient substantiation
- Incompatibility with the Convention
- Duplication (already considered by another international body)
- Mootness or best interests of the child
- Other procedural reason

Reason:
\"\"\"{reason_text}\"\"\"

Respond only with one category label from the list above.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a legal analyst."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberately broad: one failed row must not abort the whole batch;
        # the row is marked "Error" so it can be retried later.
        print("❌ Error:", e)
        return "Error"

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# Row-wise (axis=1) so the classifier sees both the reason and the outcome.
df["inadmissibility_category"] = df.progress_apply(classify_rejection_reason, axis=1)

# Write the input table plus the new category column to a separate workbook.
output_file = "UN_CRC_with_Rejection_Categories.xlsx"
df.to_excel(output_file, index=False)
print(f"File saved to: {output_file}")
