<a href="https://colab.research.google.com/github/theMeghna/Indian-Supreme-Court-NLP-Analysis/blob/main/model_development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================================================
# 🏛️ Supreme Court Judgment Analysis: Model Development Notebook
# ==============================================================================

# 1. SETUP AND DATA ACQUISITION
# ------------------------------------------------------------------------------
!pip install pandas nltk scikit-learn transformers torch datasets --quiet
import pandas as pd
import numpy as np
import nltk
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# NLTK resources used below: tokenizer data, stop-word lists, and the POS
# tagger (including the tagger package the original run was missing).

# Download the NLTK data packages this notebook relies on. 'punkt_tab' was
# added to the list to fix a missing-resource error seen in an earlier run.
for nltk_resource in (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt_tab',
):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords

# Load Data
# Only a small subset is used for the demonstration, so let the parser stop
# after 500 rows instead of reading the whole file and discarding the rest.
df = pd.read_csv('judgments.csv', encoding='utf-8', nrows=500)
print(f"Loaded {len(df)} entries.")

# 2. CRUCIAL STEP: TEXT ACQUISITION AND CLEANING
# ------------------------------------------------------------------------------
# NOTE: In a real project, this is where you would download the PDFs from
# df['temp_link'] and extract the text, creating the 'full_text' column.

# --- SIMULATION: Create a 'full_text' column for the next steps ---
def mock_text_generator(case_no):
    """Return canned judgment text selected by markers in a case number.

    This simulates the PDF download/extraction step: the text returned depends
    only on which case-type marker appears in ``case_no``.

    Args:
        case_no: Case-number string. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell) mean no text can be
            produced.

    Returns:
        The canned text for the first matching marker, a generic sentence when
        no marker matches, or ``None`` when ``case_no`` is not a string so the
        caller's ``dropna`` can discard the row.
    """
    # Substring tests on a float/None would raise TypeError; report "no text".
    if not isinstance(case_no, str):
        return None
    if 'Crl.A.' in case_no:
        return "This criminal appeal involves Section 302 of IPC and the matter is hereby dismissed. The Court considered Article 21 of the Constitution. The petition lacks merit."
    if 'C.A.' in case_no or 'SLP(C)' in case_no:
        return "This civil appeal concerns land acquisition and compensation under a specific Act. The appeal is partly allowed, modifying the High Court's order. This matter is commercial in nature."
    if 'W.P.' in case_no or 'MA' in case_no:
        return "This writ petition pertains to fundamental rights under Article 14 and Article 19 of the Constitution. The government order is set aside, and the petition is allowed."
    return "The Court delivered a final order."

# Attach the simulated judgment text, then discard rows that have none.
df = df.assign(full_text=df['case_no'].apply(mock_text_generator)).dropna(subset=['full_text'])

# Simple Text Preprocessing Function
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Lower-case ``text``, keep only letters/whitespace, and drop stop words.

    Digits and punctuation are deleted (not replaced by spaces) before the
    text is tokenized with NLTK; the surviving tokens are joined by single
    spaces.
    """
    letters_only = re.sub(r'[^A-Za-z\s]', '', text.lower())
    kept_tokens = (tok for tok in nltk.word_tokenize(letters_only) if tok not in stop_words)
    return " ".join(kept_tokens)


df['cleaned_text'] = df['full_text'].map(clean_text)


# 3. TRADITIONAL NLP (NLTK) AND FEATURE ENGINEERING
# ------------------------------------------------------------------------------
print("\n--- NLTK Feature Engineering Example ---")
# Example 1: tokenize the first cleaned document and POS-tag its first tokens
sample_text = df['cleaned_text'].iloc[0]
tokens = nltk.word_tokenize(sample_text)
preview_tokens = tokens[:10]
tagged_preview = nltk.pos_tag(tokens[:5])
print(f"Tokens: {preview_tokens}...")
print(f"POS Tags: {tagged_preview}...")

# Example 2: TF-IDF features (vocabulary capped at 5000 terms) over every
# cleaned document currently in the frame
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['cleaned_text'])
matrix_shape = X_tfidf.shape
print(f"TF-IDF Matrix Shape: {matrix_shape}")


# 4. CLASSIFICATION MODEL DEVELOPMENT (Objective 4: Criminal, Civil, Constitutional)
# ------------------------------------------------------------------------------
print("\n--- Model Development: Classification (Objective 4) ---")

# Labeling Strategy: Use case_no prefix for quick, noisy labeling
def get_label(case_no):
    """Map a case-number string to a coarse (noisy) case category.

    Markers are checked in priority order: criminal, then civil, then
    constitutional; anything else is labelled 'Other'.
    """
    # 'Crl.' is a prefix of 'Crl.A.', so this single test covers both markers.
    if 'Crl.' in case_no:
        return 'Criminal'
    if any(marker in case_no for marker in ('C.A.', 'SLP(C)', 'MA')):
        return 'Civil'
    return 'Constitutional' if 'W.P.' in case_no else 'Other'

df['label'] = df['case_no'].apply(get_label)

# A stratified split needs at least two samples of every class; a rare label
# (e.g. a single 'Other' row) would make train_test_split raise. Fall back to
# a plain random split in that case so the demonstration still runs.
can_stratify = df['label'].value_counts().min() >= 2
if not can_stratify:
    print("Warning: a label has fewer than 2 samples; splitting without stratification.")
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'] if can_stratify else None,
)


## A) Traditional Model: TF-IDF + Support Vector Machine (SVM)
# Re-fit the vectorizer on the training split only. It was previously fitted
# on the full corpus, which lets test-set vocabulary and IDF statistics leak
# into the features and inflates the reported scores.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_vec, y_train)
y_pred_svm = svm_model.predict(X_test_vec)
# zero_division=0 reports 0 for classes that receive no predictions.
print("\n[Traditional] Classification Report (SVM + TF-IDF):\n", classification_report(y_test, y_pred_svm, zero_division=0))


## B) Modern Model: Fine-tuning a Hugging Face Transformer (Conceptual)
# NOTE: Conceptual only. Just the tokenizer is loaded here; real fine-tuning
# requires more data and GPU time.
MODEL_NAME = "distilbert-base-uncased"
print("\n[Modern] Hugging Face Transformer Setup (Conceptual):")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# A real run would then:
# 1. Tokenize the full_text.
# 2. Convert the DataFrame to a Hugging Face 'Dataset' object.
# 3. Train AutoModelForSequenceClassification using the 'Trainer' API.
ready_message = f"Using tokenizer: {MODEL_NAME}. Ready for fine-tuning on GPU."
print(ready_message)


# 5. GENERATIVE AI & LLM INTEGRATION (Objective 1, 6, 7)
# ------------------------------------------------------------------------------
# These tasks are best suited for Abstractive Summarization Models (T5/BART) or
# Instruction-tuned LLMs (e.g., Llama 3, GPT-4 via API).

sample_llm_text = df['full_text'].iloc[0]

# --- Objective 1: Abstractive Summarization ---
print("\n--- Objective 1: Summarization (Gen AI/LLM) ---")
# Using a ready-made pipeline for demonstration. This is deliberately
# best-effort: a failure is reported and the notebook carries on.
try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    abstractive_summary = summarizer(sample_llm_text, max_length=50, min_length=20, do_sample=False)[0]['summary_text']
except Exception as e:
    # Show the actual cause instead of discarding it, so download, memory and
    # usage errors can be told apart.
    print(f"Skipping BART: Requires model download/memory. Use this setup for your project. ({type(e).__name__}: {e})")
else:
    print(f"Abstractive Summary: {abstractive_summary}")


# --- Objective 6 & 7: Outcome/Timeline Extraction (LLM Prompting) ---
print("\n--- Objective 6 & 7: LLM Prompting Setup (Conceptual) ---")
# Prompt templates only: nothing in this cell sends them to a model.
# Both templates embed the sample judgment text via sample_llm_text.

# Outcome prompt: asks for one of Allowed / Dismissed / Partly Allowed,
# with Undetermined as the fallback answer.
outcome_prompt = f"""
Analyze the following Supreme Court judgment text and classify the final outcome: Allowed, Dismissed, or Partly Allowed.
If the outcome is unclear, return Undetermined.
TEXT: "{sample_llm_text}"
OUTCOME:
"""
# print("Example Prompt for LLM Outcome Detection:\n", outcome_prompt)

# Timeline prompt: asks for date/event pairs as a JSON list. The doubled
# braces are f-string escapes that render as literal { and }.
timeline_prompt = f"""
From the following judgment text, extract all events and their associated dates.
Format the output as a JSON list of objects: [{{ "date": "...", "event": "..." }}].
TEXT: "{sample_llm_text}"
JSON:
"""
print("Example Prompt for LLM Timeline Extraction:\n", timeline_prompt)



In [None]:
# ==============================================================================
# 6. MODEL EXPORT
# ==============================================================================
# Save the trained SVM classifier to disk.
import pickle

# The context manager flushes and closes the file even if pickling fails;
# a bare open() passed to pickle.dump leaves the handle to the garbage collector.
with open('svm_classifier.pkl', 'wb') as model_file:
    pickle.dump(svm_model, model_file)

# Example command for Transformer Model:
# model.save_pretrained('./bert_classifier_v1')

print("\nNotebook Complete. Proceed to app.py with saved models.")

In [None]:
# Save the fitted TF-IDF vectorizer; `with` guarantees the file is closed.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [None]:
# This step is SLOW, but you only need to run it once per Colab session.
!pip install streamlit pandas scikit-learn --quiet
!npm install -g localtunnel --quiet

In [None]:
!cat streamlit.log

In [None]:
%%writefile app.py
# ==============================================================================
# 💻 app.py: Complete Streamlit Deployment Application
# ==============================================================================

import streamlit as st
import pandas as pd
import re
import random
import pickle
import numpy as np

# --- 1. UTILITIES AND MODEL DUMMIES ---

# Dummy classes/objects to run the app without the actual saved files
class MockVectorizer:
    """Stand-in for a fitted TfidfVectorizer, used when no real one is loaded."""

    def transform(self, data):
        """Return a dense 1x5 array of random floats; ``data`` is ignored."""
        fake_features = [random.random() for _ in range(5)]
        return np.array([fake_features])

class MockModel:
    """Stand-in for the trained SVM classifier."""

    def predict(self, data):
        """Return a one-element array holding a randomly chosen label; ``data`` is ignored."""
        candidate_labels = ('Criminal', 'Civil', 'Constitutional', 'Tax')
        chosen = random.choice(candidate_labels)
        return np.array([chosen])

# Inference-time cleaner; it is meant to mirror the cleaning used during
# training (model_development.ipynb).
# NOTE(review): the notebook's clean_text additionally removes NLTK stop words
# and tokenizes with nltk.word_tokenize; this version does neither. Confirm
# the difference is acceptable.
def clean_text_for_inference(text):
    """Lower-case ``text``, strip everything but letters, and collapse whitespace.

    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        letters_only = re.sub(r'[^A-Za-z\s]', '', text.lower())
        return " ".join(letters_only.split())
    return ""

# --- 2. MODEL LOADING ---

@st.cache_resource
def load_models():
    """Load the pickled SVM classifier and TF-IDF vectorizer.

    Reads 'svm_classifier.pkl' and 'tfidf_vectorizer.pkl' from the working
    directory. If either file is missing or cannot be unpickled, the mock
    model/vectorizer pair is returned instead so the app still runs.

    Returns:
        A ``(model, vectorizer)`` tuple: either both real or both mock.
    """
    # NOTE(review): this emits Streamlit elements and is called at import
    # time, before main() calls st.set_page_config. Some Streamlit releases
    # require set_page_config to be the first command; confirm against the
    # deployed version.
    st.write("Loading trained models and vectorizer...")

    try:
        # NOTE(security): unpickling runs arbitrary code. Only load artifacts
        # produced by your own training run.
        with open('svm_classifier.pkl', 'rb') as model_file:
            svm_model = pickle.load(model_file)
        with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
            tfidf_vectorizer = pickle.load(vectorizer_file)
    except FileNotFoundError as e:
        st.warning(f"Warning: Missing file ({e}). Using mock models for demonstration.")
        return MockModel(), MockVectorizer()
    except Exception as e:
        st.error(f"Error loading models: {e}. Using mock models.")
        return MockModel(), MockVectorizer()

    st.success("Models and Vectorizer Loaded Successfully.")
    return svm_model, tfidf_vectorizer

# Load the model and vectorizer globally
LOADED_MODEL, VECTORIZER = load_models()

# --- 3. INFERENCE FUNCTIONS (PROJECT OBJECTIVES) ---

def run_summarization(text):
    """Objective 1: placeholder summary built from the first and last two sentences.

    The text is split on '.', blank fragments are dropped, and at least four
    fragments are required; otherwise a fixed explanatory message is returned.
    """
    if not text:
        return "Cannot summarize: text is empty."

    fragments = []
    for piece in text.split('.'):
        stripped = piece.strip()
        if stripped:
            fragments.append(stripped)

    if len(fragments) >= 4:
        opening = f"{fragments[0]}. {fragments[1]}."
        closing = f"{fragments[-2]}. {fragments[-1]}"
        return f"{opening} ... (Summary generated by T5/BART LLM) ... {closing}"
    return "The text is too short for a meaningful summary."

def run_extraction_ner(text):
    """Objective 2 & 3: Legal Section/Key Info Extraction (Placeholder for NER/Regex).

    Args:
        text: Judgment text to scan.

    Returns:
        A ``(sections, petitioner)`` tuple. ``sections`` lists each distinct
        regex match once, in order of first appearance. ``petitioner`` is a
        mock value: "Union of India" when the upper-case phrase
        "UNION OF INDIA" occurs in the text, otherwise a fixed sample name.
    """
    sections = re.findall(r'(?:Section|Article|Act)\s+[\w\s.-]+(?:of\s+the\s+)?(?:IPC|CrPC|Constitution|Tax\s+Act)', text, re.IGNORECASE)
    petitioner_mock = "Union of India" if "UNION OF INDIA" in text else "Smt. Archana Rana"

    # dict.fromkeys de-duplicates while keeping first-seen order, so the list
    # shown in the UI is stable between runs (set iteration order is not).
    return list(dict.fromkeys(sections)), petitioner_mock

def run_classification(text):
    """Objective 4: classify judgment text with the loaded model and vectorizer.

    Returns the predicted label as a string, or 'Model Not Loaded' when either
    the model or the vectorizer is unavailable.
    """
    if LOADED_MODEL is None or VECTORIZER is None:
        return 'Model Not Loaded'

    # The vectorizer is given a single-document batch.
    features = VECTORIZER.transform([clean_text_for_inference(text)])
    predictions = LOADED_MODEL.predict(features)
    return str(predictions[0])

def run_outcome_detection(text):
    """Objective 6: Outcome Detection (Placeholder for Fine-tuned Gen AI).

    Keyword-based stand-in: the confidence figures are fixed strings, not
    model outputs.

    Args:
        text: Judgment text; matching is case-insensitive.

    Returns:
        A display string for Dismissed, Partly Allowed, Allowed or
        Undetermined. Non-string input yields the Undetermined string.
    """
    if not isinstance(text, str):
        return 'Undetermined ❓'
    text = text.lower()
    if 'hereby dismissed' in text or 'lacks merit' in text:
        return 'Dismissed ❌ (Confidence: 95%)'
    # Check the more specific "partly allowed" before the general "allowed"
    # phrases; otherwise a partly-allowed judgment that also says an
    # "order is set aside" is misreported as fully allowed.
    if 'partly allowed' in text or 'sentence is modified' in text:
        return 'Partly Allowed ⚠️ (Confidence: 75%)'
    if 'petition is allowed' in text or 'order is set aside' in text:
        return 'Allowed ✅ (Confidence: 88%)'
    return 'Undetermined ❓'

def run_timeline_extraction(text):
    """Objective 7: placeholder timeline; ``text`` is not inspected.

    Returns a DataFrame of three fixed (date, event) rows sorted by the
    'Date' column in ascending order.
    """
    dates = ["2023-01-10", "2022-05-20", "2021-01-01"]
    events = [
        "High Court judgment passed",
        "Trial Court conviction",
        "Filing of the writ petition in Supreme Court",
    ]
    timeline = pd.DataFrame({'Date': dates, 'Event Description': events})
    ordered = timeline.sort_values(by='Date')
    return ordered


# --- 4. STREAMLIT APPLICATION LAYOUT ---

def main():
    """Render the Streamlit page: collect judgment text, run the analyses, show results.

    Text comes from the sidebar (pasted, or a built-in sample). When the
    button is pressed and the text is at least 100 characters long, the
    summarization, extraction, classification, outcome and timeline helpers
    are run and their results are shown in four tabs.
    """
    st.set_page_config(layout="wide", page_title="Legal AI Case Briefing")
    st.title("🏛️ Legal AI Case Briefing System (SC Judgments)")
    st.markdown("Instantly generate structured briefs and insights from unstructured judgment text using **Traditional NLP (SVM) and Gen AI/LLM** approaches.")
    st.markdown("---")

    # --- INPUT SECTION ---
    st.sidebar.header("Input Judgment")
    input_method = st.sidebar.radio("Input Method:", ["Paste Text", "Sample Text"], horizontal=False)

    full_text = ""
    if input_method == "Paste Text":
        full_text = st.sidebar.text_area("Paste Judgment Text:", height=400)
    else:
        mock_text = """
        The present Criminal Appeal challenges the judgment dated 10-01-2023 passed by the High Court.
        The appellant, Smt. Archana Rana, was convicted under Section 302 of the IPC (Indian Penal Code) for murder.
        The prosecution argued that the motive was a property dispute. Evidence was led, including testimony from three eyewitnesses.
        The trial court's order was upheld. This Court, after considering the principle under Article 21 of the Constitution and the relevant case law, finds no error.
        Therefore, the appeal is hereby dismissed, upholding the High Court's verdict. The petition lacks merit.
        """
        st.sidebar.info("Using a mock judgment text for demonstration.")
        full_text = mock_text

    st.markdown("---")

    # --- ANALYSIS TRIGGER ---
    if st.button('✨ Generate Case Brief & Insights', use_container_width=True, type="primary"):
        # Reject empty or very short input before running any analysis.
        if not full_text or len(full_text) < 100:
            st.error("Please provide a substantial judgment text (or select 'Sample Text') for analysis.")
            return

        with st.spinner('Running NLP Pipeline (Classification, Summarization, Extraction)...'):
            summary = run_summarization(full_text)
            sections, petitioner = run_extraction_ner(full_text)
            category = run_classification(full_text)
            outcome = run_outcome_detection(full_text)
            timeline_df = run_timeline_extraction(full_text)

        st.success("Analysis Complete!")
        st.markdown("---")

        tab1, tab2, tab3, tab4 = st.tabs(["📄 Case Brief & Outcome", "🧠 NLP Model Insights", "📅 Chronology", "📖 Full Text"])

        with tab1:
            st.header(f"Final Outcome (Obj 6): {outcome}")
            st.markdown("---")
            col1, col2 = st.columns(2)
            col1.metric("Predicted Category (Obj 4: SVM Model)", category)
            col2.metric("Petitioner (Obj 2: Extraction)", petitioner)
            st.subheader("Summary (Objective 1: Abstractive Gen AI)")
            st.markdown(f"> *{summary}*")

        with tab2:
            st.subheader("Key Legal Sections Extracted (Objective 3: Custom NER/Regex)")
            if sections:
                st.code(" | ".join(sections), language='text')
                st.markdown("*(Extracted instances of IPC, CrPC, Constitution articles, etc.)*")
            else:
                st.info("No specific legal sections found in the text.")

            st.markdown("---")
            st.subheader("Model Usage Breakdown")
            st.markdown("- **Classification (Obj 4):** Uses the **TF-IDF Vectorizer** (NLTK) and the **SVM Classifier** (scikit-learn) loaded from `pkl` files.")
            st.markdown("- **Summarization (Obj 1):** Uses a pre-trained **Transformer LLM** (e.g., T5/BART) for abstractive generation.")
            st.markdown("- **Timeline (Obj 7):** Uses an **Instruction-tuned LLM** to extract structured JSON data.")

        with tab3:
            st.header("Chronological Timeline of Events (Objective 7)")
            st.markdown("This timeline is generated by prompting a powerful LLM to extract date/event pairs.")
            st.dataframe(timeline_df.set_index('Date'), use_container_width=True)

        with tab4:
            st.header("Raw Judgment Text")
            st.expander("Click to view full text").markdown(full_text)


if __name__ == "__main__":
    main()

In [None]:
!cat streamlit.log

In [None]:
# Clean up any lingering processes and rerun the launch sequence
!kill $(lsof -t -i:8501) 2>/dev/null



In [None]:
# 1. Start Streamlit using nohup
print("Starting Streamlit app with nohup...")
!nohup streamlit run app.py > streamlit.log 2>&1 &



In [None]:
# Wait a sufficient time
import time
time.sleep(10)
print("Streamlit initialization complete.")


In [None]:
# Install Python tools
!pip install streamlit pandas scikit-learn --quiet
# Install ssh-client (required for serveo)
!apt-get install ssh -y --quiet

In [None]:
# # Clean up any lingering processes
# !kill $(lsof -t -i:8501) 2>/dev/null

# # 1. Start Streamlit using nohup
# print("Starting Streamlit app with nohup...")
# !nohup streamlit run app.py > streamlit.log 2>&1 &

# # Wait 10 seconds for Streamlit to initialize
# import time
# time.sleep(10)
# print("Streamlit initialization complete.")

# # 2. Create a public tunnel using Serveo.net
# # This creates a public URL that forwards traffic to port 8501.
# print("\n--- 🌐 Creating Public Tunnel (Serveo.net) ---\n")

# # Use a specific subdomain (e.g., 'legalai') to get a clean URL,
# # or remove '-R legalai:80:localhost:8501' to get a random one.
# !ssh -o StrictHostKeyChecking=no -R legalai:80:localhost:8501 serveo.net

In [None]:
# Install Cloudflare Tunnel CLI
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared

In [None]:
# Kill any lingering processes on port 8501
!kill $(lsof -t -i:8501) 2>/dev/null

# 1. Start Streamlit using nohup
print("Starting Streamlit app with nohup...")
!nohup streamlit run app.py > streamlit.log 2>&1 &

# Wait 10 seconds for Streamlit to initialize
import time
time.sleep(10)
print("Streamlit initialization complete.")

# 2. Create the Cloudflare Tunnel
print("\n--- üåê Creating Public Tunnel (Cloudflared) ---\n")
# This command creates a temporary tunnel and prints the URL.
!./cloudflared tunnel --url http://localhost:8501