In [1]:
from openai import OpenAI
import os
import tiktoken
from tqdm import tqdm
import PyPDF2
import re
import pandas as pd
from typing import List, Tuple, Optional
from pydantic import BaseModel
from textwrap import dedent

In [2]:
OpenAI.api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI()
MODEL = 'gpt-4o-mini'

In [3]:
def read_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in range(len(reader.pages)):
            text += reader.pages[page].extract_text()
                    
        text = re.sub(r'Indian Kanoon - http://indiankanoon.org/doc/\d+/ \d', '', text)
        text = re.sub(r'http://www.judis.nic.in', '', text)
    
        return text

In [4]:
docs = os.listdir('Docs/food-safety')
token_lengths = pd.DataFrame(columns=['doc', 'token_length'])
for doc in tqdm(docs):
    text = read_pdf(f'Docs/food-safety/{doc}')
    encoding = tiktoken.encoding_for_model('gpt-4o-mini')
    token_lengths.loc[len(token_lengths)] = [doc, len(encoding.encode(text))]

  0%|          | 0/197 [00:00<?, ?it/s]

100%|██████████| 197/197 [00:12<00:00, 15.51it/s]


In [5]:
token_lengths['token_length'].sum()

1148944

In [6]:
summarization_prompt = '''
    You will be provided with a legal judgement of a food safety-related case.
    Your goal is to summarize the judgement and extract key information following the schema provided.
    Here is a description of the parameters:
    - summary: detailed summary of the judgement.
    - court: name of the court that issued the judgement.
    - petitioners: array of strings containing the names of ALL appellant(s) in the case.
    - respondents: array of strings containing the names of ALL respondent(s) in the case.
    - judges: array of strings containing the names of the judge(s) in the case.
    - date: date of the judgement, as a string in the format "DD-MM-YYYY".
    - org: array of strings containing the names of all organizations, companies, or government entities mentioned in the case, if any.
    - gpe: array of strings containing the names of all geographical locations mentioned in the case, if any.
    - provisions: array of strings containing the provisions of ALL statutes cited in the judgement. Provide these in the format "Section x of y". Make sure you get all the sections and provisions mentioned in the judgement.
    - statutes: array of strings containing the names of ALL acts or laws cited in the judgement. 
    - precedents: array of strings containing the names of ALL precedents cited in the judgement. 
    - key_facts: key facts about the case. 
    - type_of_case: the type of case, e.g. bail application, civil appeal, criminal appeal, public interest litigation, etc.
    - decision: the decision of the court in the case, if provided. If there is a verdict, respond with 'in favour of appellant' or 'against apellant'. If not, leave this field empty. 
'''



def get_article_summary(text: str):
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        temperature=0.0,
        messages=[
            {"role": "system", "content": dedent(summarization_prompt)},
            {"role": "user", "content": text}
        ],
        response_format=ArticleSummary,
    )

    return completion.choices[0].message.parsed

In [7]:
summarization_prompt = '''
    You will be provided with a legal judgement of a food safety-related case.
    Your goal is to provide a detailed summary of the judgement.
    Here is a description of the parameters:
    - summary: a detailed summary of the judgement. Include all key points and arguments made in the judgement, along with the final decision of the court. Also include the names of the key parties involved in the case.
'''

class ArticleSummary(BaseModel):
    summary: str

def get_summary(text: str):
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        temperature=0.0,
        messages=[
            {"role": "system", "content": dedent(summarization_prompt)},
            {"role": "user", "content": text}
        ],
        response_format=ArticleSummary,
    )

    return completion.choices[0].message.parsed


In [8]:
extraction_prompt = '''
    You will be provided with a legal judgement of a food safety-related case.
    Your goal is to extract key information following the schema provided.
    Here is a description of the parameters:
    - court: name of the court that issued the judgement.
    - petitioners: array of strings containing the names of ALL appellant(s) in the case.
    - respondents: array of strings containing the names of ALL respondent(s) in the case.
    - judges: array of strings containing the names of the judge(s) in the case.
    - date: date of the judgement, as a string in the format "DD-MM-YYYY".
    - org: array of strings containing the names of all organizations, companies, or government entities mentioned in the case, if any.
    - gpe: array of strings containing the names of all geographical locations mentioned in the case, if any.
    - provisions: array of strings containing the provisions of ALL statutes cited in the judgement. Provide these in the format "Section x of y".
    - statutes: array of strings containing the names of ALL acts or laws cited in the judgement.
    - precedents: array of strings containing the names of ALL precedents cited in the judgement.
    - key_facts: key facts about the case.
    - type_of_case: the type of case, e.g. bail application, civil appeal, criminal appeal, public interest litigation, etc.
    - decision: the decision of the court in the case, if provided. If there is a verdict, respond with 'in favour of appellant' or 'against appellant'. If not, leave this field empty.
'''

class EntityExtraction(BaseModel):
    court: str
    petitioners: list[str]
    respondents: list[str]
    judges: list[str]
    date: str
    org: list[str]
    gpe: list[str]
    provisions: list[str]
    statutes: list[str]
    precedents: list[str]
    key_facts: str
    type_of_case: str
    decision: str

def get_key_info(text: str):
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        temperature=0.0,
        messages=[
            {"role": "system", "content": dedent(extraction_prompt)},
            {"role": "user", "content": text}
        ],
        response_format=EntityExtraction,
    )

    return completion.choices[0].message.parsed


In [9]:
def print_entities(extracted):
    print(f"Court: {extracted.court}")
    print(f"Petitioners: {', '.join(extracted.petitioners)}")
    print(f"Respondents: {', '.join(extracted.respondents)}")
    print(f"Judges: {', '.join(extracted.judges)}")
    print(f"Date: {extracted.date}")
    print(f"Organizations: {', '.join(extracted.org)}")
    print(f"Locations: {', '.join(extracted.gpe)}")
    print(f"Provisions: {', '.join(extracted.provisions)}")
    print(f"Statutes: {', '.join(extracted.statutes)}")
    print(f"Precedents: {', '.join(extracted.precedents)}")
    print(f"Key Facts: {extracted.key_facts}")
    print(f"Decision: {extracted.decision}")
    
def save_output_to_df(file_name, summary, extracted, df):
    df.loc[len(df)] = [file_name, summary.summary, extracted.court, extracted.petitioners, extracted.respondents, extracted.judges, extracted.date, extracted.org, extracted.gpe, extracted.provisions, extracted.statutes, extracted.precedents, extracted.key_facts, extracted.type_of_case, extracted.decision]
    return df

In [10]:
def read_single_pdf(path):
    reader = PyPDF2.PdfReader(path)
    text = ''
    for page in range(len(reader.pages)):
        text += reader.pages[page].extract_text()
    
    text = re.sub(r'Indian Kanoon - http://indiankanoon.org/doc/\d+/ \d', '', text)
    text = re.sub(r'http://www.judis.nic.in', '', text)
    
    return text

In [24]:
done = pd.read_csv('food_safety.csv', index_col=0)
completed = done.file_name.tolist()
completed

['Abhijeet_Suryakant_Maske_And_Anr_vs_The_State_Of_Maharashtra_on_1_March_2022.PDF',
 'Amluya_Anand_vs_State_on_8_March_2022.PDF',
 'Arulmurugan_Starch_Industries_vs_Food_Safety_And_Standards_Authority_Of_on_19_January_2022.PDF',
 'Arun_Raosaheb_Khot_vs_State_Of_Maharashtra_on_31_January_2022.PDF',
 'Asha_Singh_vs_The_State_Of_Bihar_on_24_March_2022.PDF',
 'Avinash_Balkrishna_Bhamre_vs_State_Of_Maharashtra_on_8_March_2022.PDF',
 'Ayesha_Hajeera_Almas_vs_Chief_Secretary_on_15_March_2022.PDF',
 'A_R_Starch_Products_vs_The_Commissioner_Of_Food_Safety_on_3_January_2022.PDF',
 'Badam_Sampath_Kumar_vs_State_Of_Odisha_Opp_Party_on_8_February_2022.PDF',
 'Baswaraj_Vishwanath_Agre_vs_The_State_Of_Maharashtra_on_3_March_2022.PDF',
 'Bharat_Sukhdev_Dhobale_vs_The_State_Of_Maharashtra_on_21_February_2022.PDF',
 'Bijaya_Kumar_Kamani_vs_State_Of_Odisha_Opposite_Party_on_1_February_2022.PDF',
 'Binod_Kumar_Agrawal_vs_State_Of_Odisha_Opp_Party_on_9_February_2022.PDF',
 'Bose_Mathew_vs_The_State_Of_Ker

In [25]:
out_df = done
out_df

Unnamed: 0,file_name,summary,court,petitioners,respondents,judges,date,org,gpe,provisions,statutes,precedents,key_facts,type_of_case,decision
0,Abhijeet_Suryakant_Maske_And_Anr_vs_The_State_...,In the case of Abhijeet Suryakant Maske and An...,HIGH COURT OF JUDICATURE AT BOMBAY,"['Abhijeet Suryakant Maske', 'Dilip Ramchandra...",['The State of Maharashtra'],['C.V. Bhadang'],01-03-2022,[],"['Pune', 'Maharashtra']","['Section 328 of IPC', 'Section 272 of IPC', '...","['Food Safety and Standards Act, 2006']","['Joseph Kurian & Anr. Vs. State of Kerala', '...",The Applicants are seeking anticipatory bail i...,Anticipatory Bail Application,in favour of appellant
1,Amluya_Anand_vs_State_on_8_March_2022.PDF,**Case Summary: Amluya Anand vs State (8 March...,HIGH COURT OF JUDICATURE FOR RAJASTHAN AT JODHPUR,['Amluya Anand'],"['State, Through Vinod Sharma, Food Safety Off...",['Sandeep Mehta'],08-03-2022,"['Amul Dairy', 'Food Safety Officer', 'Chief M...","['Mujjafarpur', 'Gurgaon', 'Nagaur', 'Devgarh ...","['Section 468 of Cr.P.C.', 'Section 77 of the ...","['Food Safety and Standards Act, 2006']",['Sri.V.V.S.S.R. Prakash Rao vs. The State of ...,The petitioner sought to quash orders taking c...,Criminal Miscellaneous Petition,in favour of appellant
2,Arulmurugan_Starch_Industries_vs_Food_Safety_A...,**Case Summary: Arulmurugan Starch Industries ...,High Court of Judicature at Madras,['Arulmurugan Starch Industries'],['Food Safety and Standards Authority of India...,['Krishnan Ramasamy'],19-01-2022,['Food Safety and Standards Authority of India...,"['Salem District', 'New Delhi', 'Chennai']",['Article 226 of the Constitution of India'],"['Food Safety and Standards Act, 2006']",[],The petitioner filed a writ petition challengi...,Writ Petition,The court directed the Commissioner of Food Sa...
3,Arun_Raosaheb_Khot_vs_State_Of_Maharashtra_on_...,In the case of Arun Raosaheb Khot vs. State of...,High Court of Judicature at Bombay,['Arun Raosaheb Khot'],['The State of Maharashtra'],['C.V. Bhadang'],31-01-2022,[],"['Pune', 'Maharashtra']","['Section 272 of IPC', 'Section 273 of IPC', '...","['Food Safety and Standards Act, 2006']",['Joseph Kurian & Anr. Vs. State of Kerala'],The applicant is seeking anticipatory bail in ...,Anticipatory Bail Application,in favour of appellant
4,Asha_Singh_vs_The_State_Of_Bihar_on_24_March_2...,In the case of Asha Singh vs The State of Biha...,HIGH COURT OF JUDICATURE AT PATNA,['Asha Singh'],"['The State of Bihar', 'THE UNION OF INDIA THR...",['Ashutosh Kumar'],24-03-2022,"['Tata Institute of Social Sciences', 'SAKHI']","['Bihar', 'Patna', 'East Champaran', 'Motihari']","['Section 341 of IPC', 'Section 342 of IPC', '...","['Indian Penal Code', 'Protection of Children ...",[],"The petitioner, Asha Singh, is seeking anticip...",Criminal Miscellaneous,against appellant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,M_P_Paramesh_vs_The_Managing_Director_on_18_Ja...,**Case Summary: M.P. Paramesh vs The Managing ...,XXIV Additional Small Causes Court & Motor Acc...,['M.P.Paramesh'],"['The Managing Director, KSRTC']",['Sri.R.Mahesha'],18-01-2022,['KSRTC'],"['Bengaluru', 'Madenahalli', 'Lakkur', 'Sompur...","['Section 166 of Motor Vehicles Act, 1989']","['Motor Vehicles Act, 1989']",[],M.P.Paramesh sustained grievous injuries in a ...,Motor Vehicle Accident Claim,in favour of appellant
101,M_Sowmiya_Sundari_vs_The_State_Of_Tamil_Nadu_o...,**Case Summary: M.Sowmiya Sundari vs The State...,High Court of Judicature at Madras,['M.Sowmiya Sundari'],"['The State of Tamil Nadu', 'The Commissioner ...",['N.Sathish Kumar'],21-09-2023,"['Health and Family Welfare Department', 'Food...","['Tamil Nadu', 'Chennai']","['Section 36 of Food Safety Standards Act, 200...","['Food Safety and Standards Act, 2006', 'Tamil...",[],M.Sowmiya Sundari was appointed as a designate...,Writ Petition,in favour of appellant
102,M_S_Beml_Limited_vs_M_S_Veer_Engineering_Works...,**Case Summary: M/S.Beml Limited vs M/S Veer E...,"LXXXIX Addl.City Civil & Sessions Judge, Benga...",['M/s.BEML Limited'],"['M/s Veer Engineering Works', ""Hon'ble Mr.Jus...",['Sri.S.J.Krishna'],03-01-2022,['M/s. Steel Authority of India Limited (SAIL)'],"['Bengaluru', 'Uttar Pradesh', 'Raipur']",['Section 34 of Arbitration & Conciliation Act...,"['Arbitration & Conciliation Act, 1996', 'Mini...","['Tarapore & Co. v. State of M.P.', 'MMTC Ltd....","The plaintiff, M/s.BEML Limited, filed an arbi...",Arbitration suit,against appellant
103,M_S_Hindustan_Unilever_Limited_vs_The_State_Of...,**Case Summary: M/S Hindustan Unilever Limited...,HIGH COURT FOR THE STATE OF TELANGANA,['M/s. Hindustan Unilever Limited'],"['The State of Andhra Pradesh', 'Public Prosec...",['Dr. JUSTICE CHILLAKUR SUMALATHA'],22-02-2022,[],"['Andhra Pradesh', 'Warangal', 'Hyderabad']","['Section 482 of Cr.P.C', 'Section 50 of Legal...","['Legal Metrology Act, 2009', 'Food Safety and...",['DHARAMPAL SATYAPAL LIMITED vs. DEPUTY COMMIS...,"The petitioner, M/s. Hindustan Unilever Limite...",Criminal Petition,in favour of appellant


In [26]:
files = os.listdir('Docs/food-safety')
path = r"C:\Users\suyog\Desktop\monsoon_24\capstone-legal-docs-analysis\Docs\food-safety\\"
out_df = done


for file_path in tqdm(files):
    if file_path not in completed:
        try:
            text = read_single_pdf(path + file_path)
            summary = get_summary(text)
            extracted = get_key_info(text)
            out_df = save_output_to_df(file_path, summary, extracted, out_df)
            out_df.to_csv('food_safety_2.csv', index=True)           
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue



 54%|█████▍    | 106/197 [00:02<00:02, 39.55it/s]

Error processing M_S_Lg_Electronics_India_Pvt_Ltd_vs_The_State_Of_Tamil_Nadu_on_31_March_2022.PDF: Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 156397 tokens (including 31 in the response_format schemas.). Please reduce the length of the messages or schemas.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


100%|██████████| 197/197 [11:43<00:00,  3.57s/it]


In [13]:
out_df.to_csv('food_safety.csv', index=True)