In [1]:
# needed packages
import fitz
import pandas as pd
import sys
from collections import Counter

In [2]:
def flags_decomposer(flags):
    """Describe a font-flags bit field in words.

    Bits 0, 1 and 4 add "superscript", "italic" and "bold" when set.
    Bit 2 chooses "serifed" over "sans"; bit 3 chooses "monospaced" over
    "proportional". The words are returned in bit order, joined by ", ".
    """
    def is_set(bit):
        # True when the given bit position is switched on in ``flags``.
        return bool(flags & (1 << bit))

    words = []
    if is_set(0):
        words.append("superscript")
    if is_set(1):
        words.append("italic")
    words.append("serifed" if is_set(2) else "sans")
    words.append("monospaced" if is_set(3) else "proportional")
    if is_set(4):
        words.append("bold")
    return ", ".join(words)

In [3]:
def _horizontal_rules(page):
    """Return (start, end) point pairs for the horizontal strokes drawn on ``page``."""
    rules = []
    for path in page.get_drawings():
        for item in path["items"]:
            kind = item[0]
            if kind == "l":  # a line segment: keep it only when it is horizontal
                start, end = item[1], item[2]
                if start.y == end.y:
                    rules.append((start, end))
            elif kind == "re":  # a rectangle: wide ones at most 2 units tall count as rules
                rect = item[1]
                if rect.width > rect.height and rect.height <= 2:
                    rules.append((rect.tl, rect.tr))  # use the top-left / top-right corners
    return rules


def _describe_span(span, rules):
    """Build the style key of one text span, tagging it for every rule that underlines it."""
    style = "Font: '%s' (%s), size %g, color #%06x" % (
        span["font"],  # font name
        flags_decomposer(span["flags"]),  # readable font flags
        span["size"],  # font size
        span["color"],  # font color
    )
    box = fitz.Rect(span["bbox"])
    for start, end in rules:
        # A rule whose two ends lie within 4 units of the span's bottom corners
        # is treated as an underline ("underlined" is appended once per match).
        if abs(box.bl - start) <= 4 and abs(box.br - end) <= 4:
            style = " ".join([style, "underlined"])
    return style


def _read_lines(doc):
    """Extract every text line of ``doc`` as a list of (text, size, color, style) records.

    ``size`` is the span size rounded the same way the style key prints it
    (``%g``), so size comparisons and style keys always agree with each other.
    Underlines are detected against the rules of the page the span is on.
    """
    lines = []
    for page in doc:
        rules = _horizontal_rules(page)
        for block in page.get_text("dict", flags=11)["blocks"]:
            for line in block["lines"]:
                lines.append([
                    (
                        span["text"],
                        float("%g" % span["size"]),
                        "#%06x" % span["color"],
                        _describe_span(span, rules),
                    )
                    for span in line["spans"]
                ])
    return lines


def get_narrative(pdf):
    """Extract the body text of the "Opinion" section from a court-case PDF.

    The text lines of the document are numbered from 1 across all pages, then:

    1. The most frequent span style is taken as the paragraph style; its size
       and color are the paragraph size and color.
    2. The opinion starts at the last line whose spans of at least paragraph
       size spell exactly ``"Opinion"``.
    3. From that line on, a line of 1-6 words is a sub-header when it is all
       upper case or contains a span in a "header-like" style. A style is
       header-like unless it is the paragraph style, has a different color,
       is smaller than paragraph size, or is bold + italic + underlined at
       paragraph size.
    4. The narrative ends just before the last-recorded sub-header containing
       "conclusion" (case-insensitive), or at the end of the document.
    5. Within that range, the text of all spans at exactly paragraph size is
       concatenated per line and the lines are joined with single spaces.

    Parameters
    ----------
    pdf : path of the PDF file, passed to ``fitz.open``.

    Returns
    -------
    str
        The narrative with leading and trailing whitespace removed.

    Raises
    ------
    IndexError
        If no text spans can be extracted from the document.
    KeyError
        If no line reading exactly ``"Opinion"`` is found.
    """
    doc = fitz.open(pdf)
    try:
        # Parse the document once; every later step works on this in-memory list.
        lines = _read_lines(doc)
    finally:
        doc.close()  # the document is released even if extraction fails

    # Styles ordered by frequency (ties keep first-seen order).
    style_list = Counter(
        style for line in lines for _, _, _, style in line
    ).most_common()
    if not style_list:
        raise IndexError("no text spans could be extracted from %r" % (pdf,))

    # Size and color belonging to each style key (identical for all spans sharing a key).
    style_traits = {}
    for line in lines:
        for _, size, color, style in line:
            style_traits.setdefault(style, (size, color))

    p_font = style_list[0][0]
    p_size, p_color = style_traits[p_font]

    # Start of the opinion: the last line whose large-enough spans read "Opinion".
    opinion_loc = None
    for number, line in enumerate(lines, start=1):
        heading = "".join(text for text, size, _, _ in line if size >= p_size)
        if heading == "Opinion":
            opinion_loc = number
    if opinion_loc is None:
        raise KeyError("no 'Opinion' heading found in %r" % (pdf,))

    # Styles that may mark a sub-header (everything not ruled out below).
    master = set()
    for style, _ in style_list:
        size, color = style_traits[style]
        vocab_like = (
            "bold" in style
            and "underlined" in style
            and "italic" in style
            and size == p_size
        )
        if style == p_font or color != p_color or vocab_like or size < p_size:
            continue
        master.add(style)

    # Short lines inside the opinion that look like sub-headers, keyed by their text.
    opinion_subheaders = {}
    for number, line in enumerate(lines, start=1):
        if number < opinion_loc:
            continue
        text = "".join(span_text for span_text, _, _, _ in line)
        if 0 < len(text.split()) < 7:
            if text.isupper() or any(style in master for _, _, _, style in line):
                opinion_subheaders[text] = number

    # Without a conclusion sub-header the narrative runs to the end of the document.
    conclusion_loc = len(lines) + 1
    for heading, number in opinion_subheaders.items():
        if "conclusion" in heading.lower():
            conclusion_loc = number

    # Lines opinion_loc .. conclusion_loc - 1 (1-based), paragraph-sized spans only.
    body = [
        "".join(text for text, size, _, _ in line if size == p_size)
        for line in lines[opinion_loc - 1:conclusion_loc - 1]
    ]
    return " ".join(body).strip()

In [10]:
#### CHANGE FOR YOURSELF: parent folder of your case files.
# Later cells read PDFs from f"{mypath}/Court Case PDFs" and write text files
# to f"{mypath}/Court Case TXTs", so both subfolders live under this path.
mypath = "C:/Users/jacqu/Downloads/Court Case PDFs"

In [11]:
### Generate a df with your current cases
import os
import glob

# Every PDF directly inside "<mypath>/Court Case PDFs". The list is sorted because
# glob returns matches in arbitrary order and the row order should be reproducible.
pdf_files = sorted(glob.glob("%s/Court Case PDFs/*.pdf" % mypath))

# Collect one record per PDF, then build the DataFrame in a single step
# instead of pre-allocating it and filling it cell by cell.
case_rows = []
for file in pdf_files:
    if "50cases" in file:
        # Paths containing "50cases": the title is the first path component after "50cases/".
        title = file.split("50cases/")[-1].split("/")[0]
    else:
        # Otherwise the title is the file name without its extension.
        title = os.path.splitext(os.path.basename(file))[0]

    narrative = get_narrative(file)
    case_rows.append({"CaseName": title, "Narrative": narrative})

# One row per PDF, indexed 0..n-1, with the same two columns as before.
df = pd.DataFrame(case_rows, columns=["CaseName", "Narrative"])

# SERENE

Use this code to add your cases to the CSV that contains all the cases.

In [None]:
# Location of the CSV that holds the cases collected so far; point this at your copy of the file.
drive_csv = "/tatev_jacqui_grace.csv"
# Load it into a DataFrame so it can be combined with `df`.
df2 = pd.read_csv(drive_csv)

In [None]:
## download the combined csv on google drive and combine with your csv for a new combined df
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the row
# labels of `df` and `df2` (both numbered from 0) would repeat in the combined
# frame, and dropping duplicates would leave gaps in the numbering.
combined_df = pd.concat([df, df2], ignore_index=True).drop_duplicates(ignore_index=True)
combined_df.head()

In [12]:
# this function takes the pdf, extracted narrative, folder where your pdfs are stored, and folder where you want to store txt files
# it processes each pdf file in your input folder to extract the narrative, and it saves it as a .txt file in your output folder
# make sure to change path to input and output folders at the bottom

def save_narrative_to_txt(pdf_path, narrative, output_folder, input_folder=None):
    """Write ``narrative`` to ``<output_folder>/<PDF base name>.txt`` as UTF-8.

    Parameters
    ----------
    pdf_path : path of the source PDF; only its file name (minus extension)
        is used, to name the text file.
    narrative : text to write.
    output_folder : folder that receives the ``.txt`` file. It is created
        (including parents) when it does not exist yet.
    input_folder : unused; kept so existing four-argument calls keep working.

    An existing file with the same name is overwritten.
    """
    if output_folder:
        # An empty string means "current directory", which needs no creation.
        os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    txt_path = os.path.join(output_folder, f"{base_name}.txt")

    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(narrative)

def process_pdfs(input_folder, output_folder):
    """Extract the narrative of every PDF in ``input_folder`` and save it as text.

    Each ``*.pdf`` directly inside ``input_folder`` is passed to
    ``get_narrative`` and the result is written by ``save_narrative_to_txt``
    to ``output_folder`` under the PDF's base name. Files are handled in
    sorted path order so repeated runs process them in the same sequence.

    Returns ``None``; the narratives are only written to disk.
    """
    for pdf_path in sorted(glob.glob(os.path.join(input_folder, "*.pdf"))):
        narrative = get_narrative(pdf_path)

        # Save the narrative as a text file, overwriting any existing file of the same name.
        save_narrative_to_txt(pdf_path, narrative, output_folder, input_folder)

# Example usage:
# input_folder = "/Users/tatevgomtsyan/MSDS/Capstone/First12"
# output_folder = "/Users/tatevgomtsyan/MSDS/Capstone/NarrativeTexts"
# process_pdfs(input_folder, output_folder)

In [14]:
# Folder that holds the PDFs to process: the "Court Case PDFs" subfolder of `mypath`.
input_folder = f"{mypath}/Court Case PDFs"
#### CHANGE FOR YOURSELF
# Folder that receives one .txt narrative per PDF.
output_folder = f"{mypath}/Court Case TXTs"
# Extract the narrative of every PDF in input_folder and write it to output_folder.
process_pdfs(input_folder, output_folder)

### Making combined csv
1. Download all cases and put them into a folder (called Court Case PDFs).
2. Create a df with the cases. The if statement takes care of PDF repeats, under the assumption that if someone uploaded the same case twice, the file would be renamed to have ({number}) at the end.

ex. C.S. Wyndham Hotels vs. C.S. Wyndham Hotels(1)

3. Test to make sure the df was converted to CSV correctly.