**Goal 1**

In [15]:
import pymupdf
import pytesseract
import re
from PIL import Image
from io import BytesIO

# Names of the two judgments processed in this notebook. Each name is
# interpolated into the path "cases/<name>.pdf" by the per-case cells below.
case1 = "Lee Hui Chin v Chubb [2024] SGHC 69"
case2 = "PP v Ong Tze Boon [2016] SGDC 95"

def extract_text_from_pdf(file):
    """Return the text of a PDF, falling back to OCR when it has no usable text.

    Args:
        file: Location of the PDF; passed unchanged to ``pymupdf.open``.

    Returns:
        The concatenated ``page.get_text()`` output of every page. When that
        output is empty or consists only of whitespace, the result of
        ``extract_text_with_ocr(file)`` is returned instead.
    """
    # Get text if there is text in the PDF
    with pymupdf.open(file) as doc:
        text = "".join(page.get_text() for page in doc)
    # Otherwise, use OCR to get text. A whitespace-only result carries no
    # content, so it is treated the same as an empty one.
    if not text.strip():
        text = extract_text_with_ocr(file)
    return text

def extract_text_with_ocr(file):
    """OCR every page of a PDF and return the normalised text.

    Each page is rendered at 300 dpi, encoded as PNG bytes, opened as a PIL
    image and handed to ``pytesseract.image_to_string``. The per-page results
    are concatenated in page order and then passed through
    ``pre_preprocess_ocr_text``.

    Args:
        file: Location of the PDF; passed unchanged to ``pymupdf.open``.

    Returns:
        The value returned by ``pre_preprocess_ocr_text`` for the combined
        OCR output.
    """
    def ocr_page(page):
        # Render the page to an in-memory PNG and run Tesseract on it.
        png_bytes = page.get_pixmap(dpi=300).tobytes("png")
        return pytesseract.image_to_string(Image.open(BytesIO(png_bytes)))

    with pymupdf.open(file) as doc:
        raw_text = "".join([ocr_page(page) for page in doc])
    # Reshape the OCR output so it resembles what the non-OCR path returns
    return pre_preprocess_ocr_text(raw_text)

def pre_preprocess_ocr_text(text):
    """Reformat OCR output line by line.

    For every line of ``text`` (split on newlines):

    * if the part before the first space is numeric, it is emitted on its
      own line and the remainder of the line follows on the next line;
    * lines that are empty (after that split) are dropped;
    * after the first emitted line that ends with ":", the two lines
      "Introduction" and "1" are inserted once.

    Args:
        text: Raw OCR text.

    Returns:
        The reformatted lines joined with newlines.
    """
    output = []
    header_pending = True  # the "Introduction" / "1" pair is inserted only once
    for raw in text.split("\n"):
        leading, _, remainder = raw.partition(" ")
        # Put paragraph numbers on their own separate line
        if leading.isnumeric():
            output.append(leading)
            body = remainder
        else:
            body = raw
        if not body:
            continue
        output.append(body)
        # Insert the missing header and paragraph number after the first
        # line that ends with a colon
        if header_pending and body.endswith(":"):
            output.extend(["Introduction", "1"])
            header_pending = False
    return "\n".join(output)

def clean_lines(text):
    """Split extracted text into lines, dropping "Version No ..." blocks.

    A line that fully matches the "Version No N: D Mon YYYY (HH:MM hrs)"
    pattern (case-insensitive) is discarded together with the three lines
    that follow it. In every remaining line, each "[nete: N]" marker is
    replaced by a full stop.

    Args:
        text: Text of a judgment, lines separated by newlines.

    Returns:
        list of str: the remaining lines, in their original order.
    """
    version_name_pattern = re.compile(r"version\s+no\s+\d+:\s+\d{1,2}\s+[A-Za-z]{3}\s+\d{4}\s+\(\d{2}:\d{2}\s+hrs\)", re.IGNORECASE)
    # NOTE(review): "nete" presumably mirrors how OCR renders "[note: N]"
    # footnote markers - confirm against the OCR output before changing it.
    note_pattern = re.compile(r"\[nete:\s*\d+\]", re.IGNORECASE)
    lines = text.split("\n")
    cleaned_lines = []
    # Walk by index instead of repeatedly popping the head of the list,
    # which is quadratic in the number of lines.
    index = 0
    while index < len(lines):
        line = lines[index]
        if version_name_pattern.fullmatch(line):
            # Skip the version line and the three lines after it, then
            # re-check the next line so back-to-back blocks are both removed.
            index += 4
            continue
        cleaned_lines.append(note_pattern.sub(".", line))
        index += 1
    return cleaned_lines

def extract_paragraphs(cleaned_lines):
    """Group cleaned lines into paragraphs, one string per paragraph.

    Scanning starts being meaningful after the first line that ends with
    ":". From then on, a line made up only of numeric characters opens a
    new paragraph. When the line just before such a number does not end
    with ".", it is left out of the paragraph being closed. The trailing
    lines counted by ``find_number_of_lines_to_be_removed_from_behind`` are
    not scanned. Blank and whitespace-only lines are skipped.

    Args:
        cleaned_lines: Sequence of text lines.

    Returns:
        list of str: each paragraph's lines (number line first) joined with
        single spaces. The first collected chunk - whatever lies between the
        ":" line and the first paragraph number - is dropped.
    """
    paragraphs = []
    current = []
    started = False
    previous = ""  # last non-blank line seen, stripped
    lines_to_remove = find_number_of_lines_to_be_removed_from_behind(cleaned_lines)

    for line in cleaned_lines[:len(cleaned_lines) - lines_to_remove]:
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no text and would break the suffix checks.
            continue
        if started and line.isnumeric():
            if previous.endswith("."):
                paragraphs.append(" ".join(current))
            else:
                # The line before the number did not close a sentence, so it
                # is not counted as part of this paragraph.
                paragraphs.append(" ".join(current[:-1]))
            current = []
        current.append(line)
        if not started and stripped.endswith(":"):
            # Everything up to and including the first ":" line is discarded.
            started = True
            current = []
        previous = stripped

    if current:
        paragraphs.append(" ".join(current))

    return paragraphs[1:]

def find_number_of_lines_to_be_removed_from_behind(cleaned_lines):
    """Count the trailing lines that follow the second-to-last sentence end.

    The lines are walked from the end. The first line found that ends with
    "." (ignoring surrounding whitespace) is counted and skipped over; the
    walk stops at the next such line, which is not counted. Blank and
    whitespace-only lines never end a sentence and are simply counted.

    Args:
        cleaned_lines: Sequence of text lines.

    Returns:
        int: number of lines, counted from the end, that come after the
        second-to-last line ending in ".". Equals ``len(cleaned_lines)``
        when fewer than two lines end in ".".
    """
    seen_sentence_end = False
    for lines_to_remove, line in enumerate(reversed(cleaned_lines)):
        # endswith() is safe on empty strings, unlike indexing with [-1].
        if line.strip().endswith("."):
            if seen_sentence_end:
                return lines_to_remove
            seen_sentence_end = True
    return len(cleaned_lines)

**Case 1**

In [16]:
# Case 1: read the PDF text, clean its lines, then group them into paragraphs.
# With the extract_paragraphs defined under Goal 1, each paragraph is a string.
c1_text = extract_text_from_pdf(f"cases/{case1}.pdf")
c1_cleaned_lines = clean_lines(c1_text)
c1_paragraph = extract_paragraphs(c1_cleaned_lines)

# Print one paragraph per output line.
for para in c1_paragraph:
    print(para)

1 The applicant, Mdm Lee Hui Chin, applied under s 10 of the Arbitration  Act 2001 (2020 Rev Ed) (“AA”) to extend the time fixed by the terms of an  arbitration agreement to refer disputes to arbitration. I granted the application  for the reasons set out below.
2 The applicant was the policyholder of two insurance policies (the  “Policies”) taken out with the respondent, Chubb Insurance Singapore Limited.  The insured person under the Policies was the applicant’s spouse (the  “Deceased”). The Policies provided for Accidental Death Benefits (“ADB”),  which was payable in the event that death occurs as a result of an accidental  injury. The Policies also provided that any dispute was to be referred to  arbitration, and that such arbitration must be commenced three months from the  day such parties were unable to settle the dispute. On 2 April 2021, the Deceased  fell while riding his bicycle and was found unconscious in an uncovered drain.  He was brought to Ng Teng Fong General Hospita

**Case 2**

In [17]:
# Case 2: read the PDF text, clean its lines, then group them into paragraphs.
# With the extract_paragraphs defined under Goal 1, each paragraph is a string.
c2_text = extract_text_from_pdf(f"cases/{case2}.pdf")
c2_cleaned_lines = clean_lines(c2_text)
c2_paragraph = extract_paragraphs(c2_cleaned_lines)

# Print one paragraph per output line.
for para in c2_paragraph:
    print(para)

1 The Accused, 48 years old, pleaded guilty to a charge under section 65(b) of the Road Traffic Act (RTA), Chapter 276, in that he: “did drive motorcar SJP 8123Z, at the junction of River Valley Road by Hoot Kiam Road, Singapore, without reasonable consideration for other persons using the road, to wit, by failing to conform to the red light signal whilst going straight along River Valley Road by Zion Road, and resulting in a collision with motor taxi SHC 2261P, which was travelling from [his] left to right along Zion Road........”.
2 This offence under section 65(b), RTA is punishable with a fine not exceeding $1,000 or to imprisonment for a term not exceeding 6 months or to both.
3 I fined the Accused a sum of $800 (in default 5 days’ imprisonment) and imposed a disqualification of 3 months on all classes of licences on him. The Accused is dissatisfied with the order of disqualification and is appealing against it.
4 This accident happened on 13 April 2014, at about 6:19 am.
5 When t

**Goal 2**

In [18]:
def extract_paragraphs(cleaned_lines):
    """Group cleaned lines into paragraphs as ``{"para_num", "text"}`` dicts.

    Scanning starts being meaningful after the first line that ends with
    ":". From then on, a line made up only of numeric characters opens a
    new paragraph. When the line just before such a number does not end
    with ".", it is left out of the text of the paragraph being closed. The
    trailing lines counted by
    ``find_number_of_lines_to_be_removed_from_behind`` are not scanned.
    Blank and whitespace-only lines are skipped.

    Args:
        cleaned_lines: Sequence of text lines.

    Returns:
        list of dict: one dict per paragraph with ``"para_num"`` (int, only
        present when the paragraph's first line converts to an int) and
        ``"text"`` (the remaining lines joined with single spaces). The first
        collected chunk - whatever lies between the ":" line and the first
        paragraph number - is dropped.
    """
    def make_paragraph(chunk, body_lines):
        paragraph = {}
        # isdecimal() (not isnumeric()) so that int() can never fail here;
        # the check on `chunk` covers a number directly after the ":" line.
        if chunk and chunk[0].isdecimal():
            paragraph["para_num"] = int(chunk[0])
        paragraph["text"] = " ".join(body_lines)
        return paragraph

    paragraphs = []
    current = []
    started = False
    previous = ""  # last non-blank line seen, stripped
    lines_to_remove = find_number_of_lines_to_be_removed_from_behind(cleaned_lines)

    for line in cleaned_lines[:len(cleaned_lines) - lines_to_remove]:
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no text and would break the suffix checks.
            continue
        if started and line.isnumeric():
            if previous.endswith("."):
                body_lines = current[1:]
            else:
                # The line before the number did not close a sentence, so it
                # is not counted as part of this paragraph's text.
                body_lines = current[1:-1]
            paragraphs.append(make_paragraph(current, body_lines))
            current = []
        current.append(line)
        if not started and stripped.endswith(":"):
            # Everything up to and including the first ":" line is discarded.
            started = True
            current = []
        previous = stripped

    if current:
        paragraphs.append(make_paragraph(current, current[1:]))

    return paragraphs[1:]

**Case 1**

In [19]:
# Case 1: read the PDF text, clean its lines, then group them into paragraphs.
# With the extract_paragraphs defined under Goal 2, each paragraph is a dict
# holding "para_num" and "text".
c1_text = extract_text_from_pdf(f"cases/{case1}.pdf")
c1_cleaned_lines = clean_lines(c1_text)
c1_paragraph = extract_paragraphs(c1_cleaned_lines)

# Print one paragraph dict per output line.
for para in c1_paragraph:
    print(para)

{'para_num': 1, 'text': 'The applicant, Mdm Lee Hui Chin, applied under s 10 of the Arbitration  Act 2001 (2020 Rev Ed) (“AA”) to extend the time fixed by the terms of an  arbitration agreement to refer disputes to arbitration. I granted the application  for the reasons set out below.'}
{'para_num': 2, 'text': 'The applicant was the policyholder of two insurance policies (the  “Policies”) taken out with the respondent, Chubb Insurance Singapore Limited.  The insured person under the Policies was the applicant’s spouse (the  “Deceased”). The Policies provided for Accidental Death Benefits (“ADB”),  which was payable in the event that death occurs as a result of an accidental  injury. The Policies also provided that any dispute was to be referred to  arbitration, and that such arbitration must be commenced three months from the  day such parties were unable to settle the dispute. On 2 April 2021, the Deceased  fell while riding his bicycle and was found unconscious in an uncovered drain.

**Case 2**

In [20]:
# Case 2: read the PDF text, clean its lines, then group them into paragraphs.
# With the extract_paragraphs defined under Goal 2, each paragraph is a dict
# holding "para_num" and "text".
c2_text = extract_text_from_pdf(f"cases/{case2}.pdf")
c2_cleaned_lines = clean_lines(c2_text)
c2_paragraph = extract_paragraphs(c2_cleaned_lines)

# Print one paragraph dict per output line.
for para in c2_paragraph:
    print(para)

{'para_num': 1, 'text': 'The Accused, 48 years old, pleaded guilty to a charge under section 65(b) of the Road Traffic Act (RTA), Chapter 276, in that he: “did drive motorcar SJP 8123Z, at the junction of River Valley Road by Hoot Kiam Road, Singapore, without reasonable consideration for other persons using the road, to wit, by failing to conform to the red light signal whilst going straight along River Valley Road by Zion Road, and resulting in a collision with motor taxi SHC 2261P, which was travelling from [his] left to right along Zion Road........”.'}
{'para_num': 2, 'text': 'This offence under section 65(b), RTA is punishable with a fine not exceeding $1,000 or to imprisonment for a term not exceeding 6 months or to both.'}
{'para_num': 3, 'text': 'I fined the Accused a sum of $800 (in default 5 days’ imprisonment) and imposed a disqualification of 3 months on all classes of licences on him. The Accused is dissatisfied with the order of disqualification and is appealing against 

**Goal 3**

In [21]:
def extract_paragraphs(cleaned_lines):
    """Group cleaned lines into ``{"header", "para_num", "text"}`` dicts.

    Scanning starts being meaningful after the first line that ends with
    ":". From then on, a line made up only of numeric characters opens a
    new paragraph. When the line just before such a number does not end
    with ".", that line is taken as the header for the paragraphs that
    follow and is left out of the text of the paragraph being closed. The
    trailing lines counted by
    ``find_number_of_lines_to_be_removed_from_behind`` are not scanned.
    Blank and whitespace-only lines are skipped.

    Args:
        cleaned_lines: Sequence of text lines.

    Returns:
        list of dict: one dict per paragraph with ``"header"`` (the most
        recent header line, ``""`` before any was seen), ``"para_num"`` (int,
        only present when the paragraph's first line converts to an int) and
        ``"text"`` (the remaining lines joined with single spaces). The first
        collected chunk - whatever lies between the ":" line and the first
        paragraph number - is dropped.
    """
    def make_paragraph(heading, chunk, body_lines):
        paragraph = {"header": heading}
        # isdecimal() (not isnumeric()) so that int() can never fail here;
        # the check on `chunk` covers a number directly after the ":" line.
        if chunk and chunk[0].isdecimal():
            paragraph["para_num"] = int(chunk[0])
        paragraph["text"] = " ".join(body_lines)
        return paragraph

    paragraphs = []
    current = []
    header = ""
    started = False
    previous = ""  # last non-blank line seen, stripped
    lines_to_remove = find_number_of_lines_to_be_removed_from_behind(cleaned_lines)

    for line in cleaned_lines[:len(cleaned_lines) - lines_to_remove]:
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no text and would break the suffix checks.
            continue
        if started and line.isnumeric():
            # The paragraph being closed keeps the header that was in force
            # while it was collected; a newly found header applies afterwards.
            closing_header = header
            if previous.endswith("."):
                body_lines = current[1:]
            else:
                body_lines = current[1:-1]
                if current:
                    header = current[-1]
            paragraphs.append(make_paragraph(closing_header, current, body_lines))
            current = []
        current.append(line)
        if not started and stripped.endswith(":"):
            # Everything up to and including the first ":" line is discarded.
            started = True
            current = []
        previous = stripped

    if current:
        paragraphs.append(make_paragraph(header, current, current[1:]))

    return paragraphs[1:]

**Case 1**

In [22]:
# Case 1: read the PDF text, clean its lines, then group them into paragraphs.
# With the extract_paragraphs defined under Goal 3, each paragraph is a dict
# holding "header", "para_num" and "text".
c1_text = extract_text_from_pdf(f"cases/{case1}.pdf")
c1_cleaned_lines = clean_lines(c1_text)
c1_paragraph = extract_paragraphs(c1_cleaned_lines)

# Print one paragraph dict per output line.
for para in c1_paragraph:
    print(para)

{'header': 'Introduction ', 'para_num': 1, 'text': 'The applicant, Mdm Lee Hui Chin, applied under s 10 of the Arbitration  Act 2001 (2020 Rev Ed) (“AA”) to extend the time fixed by the terms of an  arbitration agreement to refer disputes to arbitration. I granted the application  for the reasons set out below.'}
{'header': 'Facts', 'para_num': 2, 'text': 'The applicant was the policyholder of two insurance policies (the  “Policies”) taken out with the respondent, Chubb Insurance Singapore Limited.  The insured person under the Policies was the applicant’s spouse (the  “Deceased”). The Policies provided for Accidental Death Benefits (“ADB”),  which was payable in the event that death occurs as a result of an accidental  injury. The Policies also provided that any dispute was to be referred to  arbitration, and that such arbitration must be commenced three months from the  day such parties were unable to settle the dispute. On 2 April 2021, the Deceased  fell while riding his bicycle an

**Case 2**

In [23]:
# Case 2: read the PDF text, clean its lines, then group them into paragraphs.
# With the extract_paragraphs defined under Goal 3, each paragraph is a dict
# holding "header", "para_num" and "text".
c2_text = extract_text_from_pdf(f"cases/{case2}.pdf")
c2_cleaned_lines = clean_lines(c2_text)
c2_paragraph = extract_paragraphs(c2_cleaned_lines)

# Print one paragraph dict per output line.
for para in c2_paragraph:
    print(para)

{'header': 'Introduction', 'para_num': 1, 'text': 'The Accused, 48 years old, pleaded guilty to a charge under section 65(b) of the Road Traffic Act (RTA), Chapter 276, in that he: “did drive motorcar SJP 8123Z, at the junction of River Valley Road by Hoot Kiam Road, Singapore, without reasonable consideration for other persons using the road, to wit, by failing to conform to the red light signal whilst going straight along River Valley Road by Zion Road, and resulting in a collision with motor taxi SHC 2261P, which was travelling from [his] left to right along Zion Road........”.'}
{'header': 'Introduction', 'para_num': 2, 'text': 'This offence under section 65(b), RTA is punishable with a fine not exceeding $1,000 or to imprisonment for a term not exceeding 6 months or to both.'}
{'header': 'Introduction', 'para_num': 3, 'text': 'I fined the Accused a sum of $800 (in default 5 days’ imprisonment) and imposed a disqualification of 3 months on all classes of licences on him. The Accuse