In [1]:
from openai import OpenAI
import os
import tiktoken
from tqdm import tqdm
import PyPDF2
import re
import pandas as pd
from typing import List, Tuple, Optional
from pydantic import BaseModel
from textwrap import dedent

In [2]:
OpenAI.api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI()
MODEL = 'gpt-4o-mini'
DIR = 'C:/Users/suyog/Desktop/monsoon_24/capstone-legal-docs-analysis/Docs/fs_cases/'

In [3]:
def read_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in range(len(reader.pages)):
            text += reader.pages[page].extract_text()
                    
        text = re.sub(r'Indian Kanoon - http://indiankanoon.org/doc/\d+/ \d', '', text)
        text = re.sub(r'http://www.judis.nic.in', '', text)
    
        return text

In [19]:
df = pd.read_csv('food_safety_newdocs.csv')
docs = df['file_name'].to_list()
docs
# token_lengths = pd.DataFrame(columns=['doc', 'token_length'])
# for doc in tqdm(docs):
#     text = read_pdf(DIR + doc)
#     encoding = tiktoken.encoding_for_model('gpt-4o-mini')
#     token_lengths.loc[len(token_lengths)] = [doc, len(encoding.encode(text))]

['8_vs_Wood_Craft_Products_1995_on_21_February_2015.PDF',
 'Adab_vs_State_Of_U_P_And_4_Others_on_9_May_2023.PDF',
 'Aditya_Birla_Retail_Ltd_Ludhiana_vs_Food_Safety_Officer_Ludhiana_on_14_March_2018.PDF',
 'Agro_Tech_Foods_Limited_vs_The_State_Of_Rajasthan_on_27_May_2019.PDF',
 'Akuthota_Narasimha_vs_The_State_Of_Telangana_on_22_June_2021.PDF',
 'Alpeshbhai_Babubhai_Pathodiya_vs_State_Of_Gujarat_3_on_24_November_2017.PDF',
 'Alpeshbhai_Babubhai_Pathodiya_vs_State_Of_Gujarat_on_4_December_2018.PDF',
 'Amar_Kumar_Khabare_Amar_Khabare_vs_The_State_Of_Jharkhand_on_1_May_2019.PDF',
 'Amir_Ahamad_vs_Commissioner_Of_Food_Safety_And_on_22_July_2022.PDF',
 'Amit_vs_The_State_Of_Madhya_Pradesh_on_21_April_2018.PDF',
 'Anant_Sharma_Anr_vs_The_Abjudicating_Officer_Ors_on_12_March_2018.PDF',
 'Anil_Kumar_Mishra_vs_Addl_Distt_Session_Judge_Vii_on_19_December_2019.PDF',
 'Anil_Yadav_Proprietor_Of_Ms_Anil_Sweets_vs_State_Of_Jharkhand_And_Ors_on_25_April_2016.PDF',
 'Anup_Kumar_vs_Food_Safety_Appellate_

In [5]:
def read_single_pdf(path):
    reader = PyPDF2.PdfReader(path)
    text = ''
    for page in range(len(reader.pages)):
        text += reader.pages[page].extract_text()
    
    text = re.sub(r'Indian Kanoon - http://indiankanoon.org/doc/\d+/ \d', '', text)
    text = re.sub(r'http://www.judis.nic.in', '', text)
    
    return text

In [16]:
questions_prompt = '''
    You will be provided with a court judgment related to a food safety issue. Your task is to answer questions based strictly and concisely based on the provided text. 
    Each response must be in plaintext, avoiding any extra content, commentary, or interpretations beyond what is explicitly stated in the judgment. Use only the information provided. 
    If the text does not contain the required information, respond with "Not specified."
    Here is a description of the parameters:
    - answer: the answer to the question asked.
    '''

class QuestionAnswer(BaseModel):
    answer: str

def question_answering(text: str):
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        temperature=0.0,
        messages=[
            {"role": "system", "content": dedent(questions_prompt)},
            {"role": "user", "content": text}
        ],
        response_format=QuestionAnswer,
    )

    return completion.choices[0].message.parsed


In [None]:
questions = pd.read_csv(r'C:\Users\suyog\Desktop\monsoon_24\capstone-legal-docs-analysis\sheets\food_safety_questions.csv')
questions = questions[:12]
questions

Unnamed: 0,question_id,question
0,1.1,What food product was involved in this case?
1,1.2,What type of food product is discussed in this...
2,1.3,Which food item is central to this case?
3,2.1,"Is the petitioner individual, government entit..."
4,2.2,What classification best describes the petitio...
5,2.3,What kind of entity is the petitioner-organiza...
6,3.1,"Is the respondent individual, government entit..."
7,3.2,What classification best describes the respond...
8,3.3,What kind of entity is the respondent-organiza...
9,4.1,What crime was associated with the food item?


In [15]:
out_df = pd.DataFrame(columns=['doc', 'question_id', 'question', 'answer'])

In [21]:
errs = {}
for doc in tqdm(docs):
    try:
        text = read_single_pdf(DIR + doc)
        for i in range(len(questions)):
            text_with_question = text + '\nQuestion:\n' + questions['question'][i]
            answer = question_answering(text_with_question)
            out_df.loc[len(out_df)] = [doc, questions['question_id'][i], questions['question'][i], answer.answer]
            out_df.to_csv(r'C:\Users\suyog\Desktop\monsoon_24\capstone-legal-docs-analysis\sheets\food_safety_answers.csv', index=False)
    except Exception as e:
        print(doc, e)
        errs[doc] = e
        continue
    

100%|██████████| 249/249 [58:03<00:00, 13.99s/it]  


In [95]:
for doc in errs.keys():
    #remove the doc from out_df
    if doc in out_df['doc'].values:
        print(f'Removing {doc} from out_df')
        out_df = out_df[out_df['doc'] != doc]

Removing Potta_Suresh_vs_The_State_Of_Andhra_Pradesh_on_22_March_2022.PDF from out_df


In [None]:
for doc in errs.keys():
    try:
        text = read_single_pdf(DIR + doc)
        if 'Lg_Electronics' in doc:
            print(doc)
            # sample 125k tokens from the text
            text = text[:125000]
        for i in range(len(questions)):
            text_with_question = text + '\nQuestion:\n' + questions['question'][i]
            answer = question_answering(text_with_question)
            print(answer.answer)
            out_df.loc[len(out_df)] = [doc, questions['question_id'][i], questions['question'][i], answer.answer]
            out_df.to_csv(r'C:\Users\suyog\Desktop\monsoon_24\capstone-legal-docs-analysis\sheets\food_safety_answers.csv', index=False)
    except Exception as e:
        print(doc, e)
        errs[doc] = e
        continue

M_S_Lg_Electronics_India_Pvt_Ltd_vs_The_State_Of_Tamil_Nadu_on_31_March_2022.PDF
NA
NA
NA
3. organization
organization
organization
government entity
government entity
government entity
NA
NA
NA
NA
NA
NA
tobacco products
tobacco products
tobacco products
1. individual
individual
2. individual
2. government entity
3. government entity
3. government entity
NA
NA
NA
1. the petitioner
the petitioner
1. the petitioner
NA
NA
NA
NA
organization
NA
organization
organization
NA
NA
NA
NA
1. the petitioner
1. the petitioner
1. the petitioner
NA
NA
NA
individual
individual
individual
2. government entity
3. government entity
3. government entity
NA
NA
NA
2. the respondent
2. the respondent
2. the respondent
Goa Gutkha
Goa Gutkha products
Goa Gutkha
individual
individual
individual
2. government entity
3. government entity
3. government entity
Goa Gutkha products which are banned
The unlawful act involved the recovery of banned Goa Gutkha products.
The crime involved the possession of banned Goa Gu

In [40]:
out_df_selected = out_df[::3]
out_df_selected[900:930]

Unnamed: 0,doc,question_id,question,answer
2700,Ramakant_Gupta_vs_The_State_Of_Chhattisgarh_on...,4.1,What crime was associated with the food item?,The petitioner was associated with selling adu...
2703,Ramanand_Sankhla_vs_The_State_Of_Madhya_Prades...,1.1,What food product was involved in this case?,Not specified.
2706,Ramanand_Sankhla_vs_The_State_Of_Madhya_Prades...,2.1,"Is the petitioner individual, government entit...",Individual
2709,Ramanand_Sankhla_vs_The_State_Of_Madhya_Prades...,3.1,"Is the respondent individual, government entit...",Government entity
2712,Ramanand_Sankhla_vs_The_State_Of_Madhya_Prades...,4.1,What crime was associated with the food item?,Not specified.
2715,Raman_Yadav_vs_The_State_Of_Madhya_Pradesh_on_...,1.1,What food product was involved in this case?,Not specified.
2718,Raman_Yadav_vs_The_State_Of_Madhya_Pradesh_on_...,2.1,"Is the petitioner individual, government entit...",Individual
2721,Raman_Yadav_vs_The_State_Of_Madhya_Pradesh_on_...,3.1,"Is the respondent individual, government entit...",Government entity
2724,Raman_Yadav_vs_The_State_Of_Madhya_Pradesh_on_...,4.1,What crime was associated with the food item?,Not specified.
2727,Raman_Yadav_vs_The_State_Of_Madhya_Pradesh_on_...,1.1,What food product was involved in this case?,Not specified.


In [137]:
# remove numbers from answer
out_df_cleaned = out_df.copy()
out_df_cleaned['answer'] = out_df['answer'].apply(lambda x: re.sub(r'\d+\. ', '', x))
out_df_cleaned

Unnamed: 0,doc,question_id,question,answer
0,Abhijeet_Suryakant_Maske_And_Anr_vs_The_State_...,1.1,What food product was involved in this case?,
1,Abhijeet_Suryakant_Maske_And_Anr_vs_The_State_...,1.2,What type of food product is discussed in this...,
2,Abhijeet_Suryakant_Maske_And_Anr_vs_The_State_...,1.3,Which food item is central to this case?,
3,Abhijeet_Suryakant_Maske_And_Anr_vs_The_State_...,2.1,"Is the petitioner 1. individual, 2. government...",individual
4,Abhijeet_Suryakant_Maske_And_Anr_vs_The_State_...,2.2,What classification best describes the petitio...,individual
...,...,...,...,...
3022,Pritam_Kumar_Mittal_And_Ors_vs_Saurabh_Tanwar_...,4.2,What unlawful act was committed involving the ...,Defendant no.1 was running his food business i...
3023,Pritam_Kumar_Mittal_And_Ors_vs_Saurabh_Tanwar_...,4.3,What crime was committed in relation to the fo...,Running a restaurant business without obtainin...
3024,Pritam_Kumar_Mittal_And_Ors_vs_Saurabh_Tanwar_...,5.1,Was the outcome of the case in favour of 1. th...,the petitioner
3025,Pritam_Kumar_Mittal_And_Ors_vs_Saurabh_Tanwar_...,5.2,"Whose side did the judgment favor, 1. the peti...",the petitioner


In [138]:
tobacco_terms = ['tobacco', 'guthka','gutka', 'pan', 'cigarette', 'smoking', 'hookah']
tobacco_out = out_df_cleaned[out_df_cleaned['answer'].str.contains('|'.join(tobacco_terms), case=False)]

In [None]:
#replace any answers for questions with question_id 1.1, 1.2, or 1.3,  which include tobacco_terms with 'tobacco/pan/guthka'
for answer in 

In [136]:
list(out_df_cleaned['answer'])

['NA',
 'NA',
 'NA',
 'individual',
 'individual',
 'individual',
 'government entity',
 'government entity',
 'government entity',
 'Crime No. 53 of 2022 registered with Baramati Police Station, Dist. Pune under Section 328, 272, 273, 188 of IPC and Section 26(2)(iv), 27, 30(2), 31 and 59 of the Food Safety and Standards Act, 2006.',
 'NA',
 'NA',
 'the petitioner',
 'the petitioner',
 'the petitioner',
 'mixed milk',
 'mixed milk',
 'mixed milk',
 'individual',
 'individual',
 'individual',
 'government entity',
 'government entity',
 'government entity',
 'unsafe food',
 'The sample of mixed milk was found to be substandard and unsafe.',
 'The sample of mixed milk was found to be substandard and unsafe.',
 'the petitioner',
 'the petitioner',
 'the petitioner',
 'Sago',
 'Sago',
 'Sago',
 'organization',
 'organization',
 '1',
 'government entity',
 'government entity',
 'government entity',
 'NA',
 'NA',
 'NA',
 'the petitioner',
 'the petitioner',
 'the petitioner',
 'NA',
 'NA',
