In [1]:
from openai import OpenAI
import os
import tiktoken
from tqdm import tqdm
import PyPDF2
from text_cleanup import clean_text
import re
import pandas as pd
from typing import List, Tuple, Optional
from pydantic import BaseModel

In [11]:
OpenAI.api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI()
MODEL = 'gpt-4o-mini'

In [3]:
def read_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in range(len(reader.pages)):
            text += reader.pages[page].extract_text()
                    
        text = re.sub(r'Indian Kanoon - http://indiankanoon.org/doc/\d+/ \d', '', text)
        text = re.sub(r'http://www.judis.nic.in', '', text)
    
        return text

In [4]:
docs = os.listdir('Docs/food-safety')
token_lengths = pd.DataFrame(columns=['doc', 'token_length'])
for doc in tqdm(docs):
    text = read_pdf(f'Docs/food-safety/{doc}')
    encoding = tiktoken.encoding_for_model('gpt-4o-mini')
    token_lengths.loc[len(token_lengths)] = [doc, len(encoding.encode(text))]

100%|██████████| 20/20 [00:05<00:00,  3.96it/s]


In [9]:
token_lengths['token_length'].sum()

329080

In [55]:
from textwrap import dedent
summarization_prompt = '''
    You will be provided with a legal judgement of a food safety-related case.
    Your goal is to summarize the judgement and extract key information following the schema provided.
    Here is a description of the parameters:
    - summary: detailed summary of the judgement.
    - court: name of the court that issued the judgement.
    - petitioners: array of strings containing the names of ALL appellant(s) in the case.
    - respondents: array of strings containing the names of ALL respondent(s) in the case.
    - judges: array of strings containing the names of the judge(s) in the case.
    - date: date of the judgement, as a string in the format "DD-MM-YYYY".
    - org: array of strings containing the names of all organizations, companies, or government entities mentioned in the case, if any.
    - gpe: array of strings containing the names of all geographical locations mentioned in the case, if any.
    - provisions: array of strings containing the provisions of ALL statutes cited in the judgement. Provide these in the format "Section x of y". Make sure you get all the sections and provisions mentioned in the judgement.
    - statutes: array of strings containing the names of ALL acts or laws cited in the judgement. 
    - precedents: array of strings containing the names of ALL precedents cited in the judgement. 
    - key_facts: key facts about the case. 
    - type_of_case: the type of case, e.g. bail application, civil appeal, criminal appeal, public interest litigation, etc.
    - decision: the decision of the court in the case, if provided. If there is a verdict, respond with 'in favour of appellant' or 'against apellant'. If not, leave this field empty. 
'''

class ArticleSummary(BaseModel):
    summary: str
    court: str
    petitioners: list[str]
    respondents: list[str]
    judges: list[str]
    date: str
    org: list[str]
    gpe: list[str]
    provisions: list[str]
    statutes: list[str]
    precedents: list[str]
    key_facts: str
    type_of_case: str
    decision: str

def get_article_summary(text: str):
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        temperature=0.0,
        messages=[
            {"role": "system", "content": dedent(summarization_prompt)},
            {"role": "user", "content": text}
        ],
        response_format=ArticleSummary,
    )

    return completion.choices[0].message.parsed

In [53]:
def print_summary(summary):
    print(f"Summary: {summary.summary}")
    print(f"Court: {summary.court}")
    print(f"Petitioners: {', '.join(summary.petitioners)}")
    print(f"Respondents: {', '.join(summary.respondents)}")
    print(f"Judges: {', '.join(summary.judges)}")
    print(f"Date: {summary.date}")
    print(f"Organizations: {', '.join(summary.org)}")
    print(f"Locations: {', '.join(summary.gpe)}")
    print(f"Provisions: {', '.join(summary.provisions)}")
    print(f"Statutes: {', '.join(summary.statutes)}")
    print(f"Precedents: {', '.join(summary.precedents)}")
    print(f"Key Facts: {summary.key_facts}")
    print(f"Decision: {summary.decision}")
    
def save_output_to_df(file_name, summary, df):
    df.loc[len(df)] = [file_name, summary.summary, summary.court, summary.petitioners, summary.respondents, summary.judges, summary.date, summary.org, summary.gpe, summary.provisions, summary.statutes, summary.precedents, summary.key_facts, summary.type_of_case, summary.decision]
    return df

In [15]:
def read_single_pdf(path):
    reader = PyPDF2.PdfReader(path)
    text = ''
    for page in range(len(reader.pages)):
        text += reader.pages[page].extract_text()
    
    text = re.sub(r'Indian Kanoon - http://indiankanoon.org/doc/\d+/ \d', '', text)
    text = re.sub(r'http://www.judis.nic.in', '', text)
    
    return text

In [56]:
files = os.listdir('Docs/food-safety')
path = r"C:\Users\suyog\Desktop\monsoon_24\capstone-legal-docs-analysis\Docs\food-safety\\"
out_df = pd.DataFrame(columns=['file_name','summary', 'court', 'petitioners', 'respondents', 'judges', 'date', 'org', 'gpe', 'provisions', 'statutes', 'precedents', 'key_facts', 'type_of_case', 'decision'])


for file_path in tqdm(files):
    text = read_single_pdf(path + file_path)
    out = get_article_summary(text)
    out_df = save_output_to_df(file_path, out, out_df)

out_df.to_csv('food_safety.csv', index=True)

100%|██████████| 20/20 [03:06<00:00,  9.31s/it]


PermissionError: [Errno 13] Permission denied: 'food_safety.csv'

In [57]:
out_df.to_csv('food_safety.csv', index=True)