A pipeline to read briefs from pdf, preprocess them, extract the arguments from the table of contents, and split the brief into sections

In [1]:
%pip install pycryptodome pypdf2 fuzzywuzzy openai

Collecting pycryptodome
  Downloading pycryptodome-3.22.0-cp37-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting openai
  Downloading openai-1.66.5-py3-none-any.whl.metadata (24 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.9.0-cp39-cp39-win_amd64.whl.metadata (5.3 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.27.2 (from pydantic<3,>=1.9.0->openai)
  Downloading pydantic_core-2.27.2-cp39-cp39-win_amd64.whl.metadata (6.7 kB)
Downloading pycryptodome-3.22.0-cp37-abi3-wi

In [2]:
import os
import PyPDF2
import pandas as pd
from Crypto.Cipher import AES
from PyPDF2.errors import PdfReadError

def extract_with_pypdf2(pdf_path):
  try:
    # Open the PDF file in binary mode
    with open(pdf_path, 'rb') as file:
        # Initialize a PDF reader object
        pdf_reader = PyPDF2.PdfReader(file)

        # Initialize a variable to hold the text of the entire document
        full_text = ""

        if pdf_reader.is_encrypted:
            # Attempt to decrypt it with an empty password
            try:
                pdf_reader.decrypt('')
            except Exception as e:
                return f"Failed to decrypt PDF: {e}"

        # Iterate through each page in the PDF
        for page in pdf_reader.pages:
            # Extract text from the page
            page_text = page.extract_text()

            # Append the page text to the full text of the document
            if page_text:  # Checking if page_text is not None
                full_text += page_text + "\n"

    return full_text
  except (PdfReadError, TypeError )as e:
    print(f"PDF read error in {pdf_path}: {e}")
    return None

In [3]:

data = []  # List to store file path and text data
directory = "../data/new_briefs"
files = os.listdir(directory)
for filename in files:
  if filename.endswith(".pdf"):
    file_path = os.path.join(directory, filename)
    text = extract_with_pypdf2(file_path)
    if text is not None:  # Check if text extraction was successful
      data.append({'filename': filename, 'text': text})
pdf_df = pd.DataFrame(data)


In [4]:
pdf_df = pd.DataFrame(data)

In [5]:
print(len(data))

27


In [6]:
print(pdf_df.head)

<bound method NDFrame.head of                       filename  \
0   Docket23-1275_Brief001.pdf   
1   Docket23-1275_Brief002.pdf   
2   Docket23-1275_Brief003.pdf   
3   Docket23-1275_Brief004.pdf   
4    Docket24-249_Brief001.pdf   
5    Docket24-249_Brief002.pdf   
6    Docket24-275_Brief001.pdf   
7    Docket24-304_Brief001.pdf   
8    Docket24-304_Brief002.pdf   
9    Docket24-304_Brief003.pdf   
10   Docket24-316_Brief001.pdf   
11   Docket24-316_Brief002.pdf   
12   Docket24-316_Brief003.pdf   
13   Docket24-316_Brief004.pdf   
14   Docket24-316_Brief005.pdf   
15   Docket24-354_Brief001.pdf   
16   Docket24-362_Brief001.pdf   
17   Docket24-362_Brief002.pdf   
18   Docket24-362_Brief003.pdf   
19   Docket24-394_Brief001.pdf   
20   Docket24-394_Brief002.pdf   
21   Docket24-394_Brief003.pdf   
22   Docket24-416_Brief001.pdf   
23   Docket24-416_Brief002.pdf   
24     Docket24-7_Brief001.pdf   
25     Docket24-7_Brief002.pdf   
26     Docket24-7_Brief003.pdf   

                 

In [7]:
pdf_df.to_csv('../data/new_dataset/new_extracted_briefs_ckpt.csv', index=False)

In [8]:
import pandas as pd
toc_df = pd.read_csv('../data/new_dataset/new_extracted_briefs_ckpt.csv')

In [9]:
import pandas as pd
import re

# Define the function to extract the table of contents and the rest of the content
def extract_toc_and_rest(content):
    toc_pattern = r"T\s*A\s*B\s*L\s*E\s*O\s*F\s*C\s*O\s*N\s*T\s*E\s*N\s*T\s*S\s*"
    toa_pattern = r"T\s*A\s*B\s*L\s*E\s*O\s*F\s*A\s*U\s*T\s*H\s*O\s*R\s*I\s*T\s*I\s*E\s*S\s*(?![ .]{2,})"
    # toa_pattern = r"TABLE OF AUTHORITIES"
    conclusion_pattern = r"(c\s*o\s*n\s*c\s*l\s*u\s*s\s*i\s*o\s*n)"

    def extract_text(start_pattern, end_pattern, content, toa=True):
        start_indices = [m.start() for m in re.finditer(start_pattern, content)]
        end_indices = [m.start() for m in re.finditer(end_pattern, content)]

         # Check if there is at least one start index and one end index
        if not start_indices or not end_indices:
            return None, None

        # Use the first start index
        start_index = start_indices[0]

        if toa:
          # Use the last end index, ensuring it is after the start index
          end_index = next((i for i in reversed(end_indices) if i > start_index), None)
        else:
          end_index = end_indices[0] # in this case you are using Conclusion, so you want first instance
        if end_index is not None:
            return content[start_index:end_index], end_index
        else:
            return content[start_index:], len(content)

    toc, toc_end_index = extract_text(toc_pattern, toa_pattern, content)

    if toc is None or len(toc.strip()) <= len('TABLE OF CONTENTS PagePage'):
        toc, toc_end_index = extract_text(toc_pattern, conclusion_pattern, content, toa=False)

    rest_of_content = content[toc_end_index:] if toc_end_index is not None else None

    return toc, rest_of_content

In [10]:
# Remove briefs that where text is null bc of pdf reading issues or they are very short
# usually because they are not actually briefs or were read improperly
old_len = len(toc_df)
toc_df = toc_df[(toc_df['text'].notnull()) & (toc_df['text'].str.len() >= 15000)]
toc_df = toc_df.reset_index(drop=True)

print(f"Dropped {old_len - len(toc_df)} rows of empty or very short text")

Dropped 0 rows of empty or very short text


In [12]:
import re

# Function to split brief text into the TOC and Content by finding and slicing off everything after Conclusion
def split_text(text):
    conclusion_pattern = r'(?:c\s*o\s*n\s*c\s*l\s*u\s*s\s*i\s*o\s*n)'
    conclusion_match = re.search(conclusion_pattern, text, flags=re.MULTILINE | re.IGNORECASE)
    if conclusion_match:
        # If a match is found, slice after conclusion
        # print("Here is the match for conclusion")
        # print(conclusion_match)
        toc_text = text[:conclusion_match.start()]
        content_text = text[conclusion_match.end():]
        return toc_text, content_text
    else:
        # toa_match = re.search(r'^Table of Authorities\b', text, flags=re.MULTILINE | re.IGNORECASE)
        toa_pattern = r"T\s*A\s*B\s*L\s*E\s*O\s*F\s*A\s*U\s*T\s*H\s*O\s*R\s*I\s*T\s*I\s*E\s*S\s*\.\s*\.\s*"
        toa_match = re.search(toa_pattern, text, flags=re.MULTILINE | re.IGNORECASE)
        if not toa_match:
          # If still no match, look for any line containing "Authorities"
          toa_match = re.search(r'^.*?\bAuthorities\b.*$', text, flags=re.MULTILINE | re.IGNORECASE)
    
    
    if toa_match:
        # If a match for "Table of Authorities" is found, split the text at that point
        toc_text = text[:toa_match.start()]
        content_text = text[toa_match.end():]
        return toc_text, content_text
    return None, None

In [13]:
# Apply the extract_toc_and_rest function to the 'text' field and store the results in new columns
toc_df[['toc', 'content']] = toc_df.apply(lambda row: pd.Series(split_text(row['text'])), axis=1)

# Now, toc_df contains all the original fields, plus the 'toc' and 'content' columns with the extracted data
print("toc_df updated with 'toc' and 'content' columns.")

toc_df updated with 'toc' and 'content' columns.


In [14]:
def extract_docket_number(filename):
    match = re.search(r'Docket(\d+-\d+)_', filename)
    if match:
        return match.group(1)
    else:
        return None

toc_df[['docket_num']] = toc_df.apply(lambda row: pd.Series(extract_docket_number(row['filename'])), axis=1)

# Now, toc_df contains all the original fields, plus the 'toc' and 'content' columns with the extracted data
print("toc_df updated with docket number column.")

toc_df updated with docket number column.


In [15]:
toc_df['court'] = 'SCOTUS'

In [16]:
print(toc_df.head)

<bound method NDFrame.head of                       filename  \
0   Docket23-1275_Brief001.pdf   
1   Docket23-1275_Brief002.pdf   
2   Docket23-1275_Brief003.pdf   
3   Docket23-1275_Brief004.pdf   
4    Docket24-249_Brief001.pdf   
5    Docket24-249_Brief002.pdf   
6    Docket24-275_Brief001.pdf   
7    Docket24-304_Brief001.pdf   
8    Docket24-304_Brief002.pdf   
9    Docket24-304_Brief003.pdf   
10   Docket24-316_Brief001.pdf   
11   Docket24-316_Brief002.pdf   
12   Docket24-316_Brief003.pdf   
13   Docket24-316_Brief004.pdf   
14   Docket24-316_Brief005.pdf   
15   Docket24-354_Brief001.pdf   
16   Docket24-362_Brief001.pdf   
17   Docket24-362_Brief002.pdf   
18   Docket24-362_Brief003.pdf   
19   Docket24-394_Brief001.pdf   
20   Docket24-394_Brief002.pdf   
21   Docket24-394_Brief003.pdf   
22   Docket24-416_Brief001.pdf   
23   Docket24-416_Brief002.pdf   
24     Docket24-7_Brief001.pdf   
25     Docket24-7_Brief002.pdf   
26     Docket24-7_Brief003.pdf   

                 

In [73]:
print(toc_df.iloc[6]["toc"])

 
No. 24-275 
IN THE 
Supreme Court of the United States  
   
 
DONTE PARRISH , 
Petitioner,  
v. 
UNITED STATES OF AMERICA , 
Respondent.  
   
On Writ of  Certiorari  to the  
United States Court of Appeals  
for the  Fourth  Circuit  
   
BRIEF FOR PETITIONER  
   
Amanda R. Parker  
Sarah Welch  
Samuel V. Lioi  
JONES DAY  
North Point  
901 Lakeside Ave.  
Cleveland, OH 44114  
 
Daniel C. Loesing  
JONES DAY  
325 John H. McConnell 
Blvd., Suite 600  
Columbus, OH 43215  Amanda K. Rice  
   Counsel of Record  
JONES  DAY  
150 W. Jefferson Ave.  
Suite 2100  
Detroit, MI 48226   
(313) 733- 3939  
 arice@jonesday.com  
 
 
Counsel for Petitioner  
 i  
 
QUESTION PRESENTED  
Ordinarily, litigants must file a notice of appeal 
within 30 or 60 days of an adverse judgment.  
28 U.S.C. §  2107(a) –(b).  Under 28 U.S.C. §  2107(c) 
and Fed. R. App. P. 4(a)(6), however, district courts 
can reopen an expired appeal period when a party did not receive timely notice of the judgment.  T

In [62]:
print(toc_df.iloc[25]["content"])

 .................................................................. 49  
   
 
iv 
TABLE OF AUTHORITIES 
 
Page 
 
 CASES  
Already, LLC v. Nike, Inc.  
568 U.S. 85 (2013) ................................................ 45 
Bennett v. Plenert  
1993 WL 669429  
(D. Or. Nov. 18, 1993)  .......................................... 30 
Bennett v. Spear 
520 U.S. 154 ( 1997) ...................... 28,  29, 30, 31, 33 
California v. Texas  
593 U.S. 659 (2021) ............................ 34, 39,  40, 41 
Chamber of Com. of U.S. v. EPA  
642 F.3d 192 (D.C. Cir. 2011)  .............................. 18 
Clapper v. Amnesty Int’l USA 
568 U.S. 398 ( 2013) ........................................ 26, 32 
CBS, Inc. v. United States  
316 U.S. 407 ( 1942) ........................................ 30, 31 
Competitive Enter. Inst. v. FCC  
970 F.3d 372 (D.C. Cir. 2020)  .............................. 36 
Competitive Enter. Inst. v. NHTSA  
901 F.2d 107 (D.C. Cir. 1990)  ..............................

In [78]:
toc_df['arguments'] = toc_df['toc']

In [79]:
buf_df = toc_df[['toc', 'arguments']].copy()
buf_df.to_csv('../data/new_dataset/manual_extract.csv', index=False)

In [68]:
toc_df.to_csv('../data/new_dataset/new_extracted_briefs_ckpt_2.csv', index=False)

In [17]:
toc_df = pd.read_csv('../data/new_dataset/new_extracted_briefs_ckpt_2.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/new_dataset/extracted_briefs_ckpt_2.csv'

In [19]:
# Count number of unique cases
unique_ids = list(toc_df['docket_num'].unique())
print(f"Number of cases: {len(unique_ids)}")

Number of cases: 10


In [20]:
# Tokenize each entry and count tokens
toc_df['token_count'] = toc_df['text'].apply(lambda x: len(x.split()) if pd.notnull(x) else 0)

# Calculate the average number of tokens
average_tokens = toc_df['token_count'].mean()

print("Average number of tokens per entry:", average_tokens)

Average number of tokens per entry: 12516.444444444445


Preprocess the toc to extract only the argument headers

In [74]:
import re
def clean_table_of_contents(toc_text):
    # Stage 1: Find the arguments
    # Remove standalone page numbers without a period. Must do this before removing periods below.
    toc_text = re.sub(r'^\s*\d+\s*$', '', toc_text, flags=re.MULTILINE)
    # Attempt to find the start of the arguments to extract. This will likely fail on some number of cases
    pattern = r'(Arguments?|Reasons?\s+for).*\n?'
    matches = re.search(pattern, toc_text, re.MULTILINE | re.IGNORECASE)
    # print("Here are the matches")
    # print(matches)
    if not matches:
        # print("oopsie no matches")
        return None
    toc_text = toc_text[matches.end():]
    # print(f"After matching on Argument, the ToC looks like: \n{toc_text}")
    # Stage 2: Use re.search to find "CONCLUSION" on a line by itself, case-insensitive

    conclusion_match = re.search(r'CONCLUSIONS?\b', toc_text, flags=re.MULTILINE | re.IGNORECASE) #remove ^ character?
    if conclusion_match:
        # If a match is found, slice after conclusion
        # print("Here is the match for conclusion")
        # print(conclusion_match)
        toc_text = toc_text[:conclusion_match.start()]
        # print(f"After matching on Conclusion, the ToC looks like: \n{toc_text}")

    # Now split on the periods
    split_text = re.split(r'\.\s*\.\s*\.\s*.*$', toc_text, flags=re.MULTILINE)
    # Removing empty strings and None elements that might result from capturing groups in the split

    # Finally, iterate through each line and apply other preprocessing steps, mainly removing periods and roman numerals
    processed_text = []
    for index, text in enumerate(split_text):
      # text = re.sub(r'\.{2,}', ' ', text)  # Replace periods
      text = re.sub(r'\.\s*\.\s*\.\s*.*$', '', text, flags=re.MULTILINE) # Find any sequence of three periods, with any amount of space after them, and remove the rest of the line
      # text = re.sub(r'(\.\s){2,}.*$', '', text, flags=re.MULTILINE)

      # Remove lowercase Roman numerals at the end of lines, ensuring they're not part of section titles
      text = re.sub(r'^\s*(i{1,3}|iv|vi{0,3}|ix|xi{0,3}|xii{0,3}|xiii|xiv|xv)\s*[\.\s]*$', '', text, flags=re.MULTILINE)
      # Remove spaces before newlines
      text = re.sub(r'[ \t]+$', '', text, flags=re.MULTILINE)
      text = re.sub(r'\n+', ' ', text)
      text = text.strip()

      # Special condition for the first item in the list
      if index == 0:
        # Remove "Argument" followed by any punctuation or space at the start of the line
        text = re.sub(r'^Argument[\s.,;:!?-]*', '', text, flags=re.IGNORECASE)
      # Look for section indicators to remove any extra tokens at start of the line
      pattern = r'(I\.|II\.|III\.|IV\.|V\.|VI\.|VII\.|VIII\.|IX\.|X\.|1\.|2\.|3\.|4\.|5\.|6\.|7\.|8\.|9\.|10\.|A\.|B\.|C\.|D\.|E\.|F\.|G\.|H\.|I\.|J\.)'
      match = re.search(pattern, text)

      if match:
        text = text[match.start():]

      if text: # Ensure non-empty, non-whitespace only sections are kept
        processed_text.append(text)

    split_text = [s for s in split_text if s and s.strip()]
    return processed_text

In [None]:
print(toc_df.head)

<bound method NDFrame.head of                         filename  \
0     Docket20-5279_Brief007.pdf   
1     Docket20-5279_Brief008.pdf   
2     Docket20-5279_Brief009.pdf   
3     Docket20-5279_Brief010.pdf   
4      Docket20-828_Brief001.pdf   
...                          ...   
3974  Docket16-1027_Brief009.pdf   
3975  Docket16-1027_Brief010.pdf   
3976   Docket17-387_Brief001.pdf   
3977   Docket17-387_Brief002.pdf   
3978   Docket17-387_Brief003.pdf   

                                                   text  \
0     No. 20-5279  \n \nIN THE \nSupreme Court of th...   
1     No. 20-5279 \nIN THE \nSupreme Court of the Un...   
2      \n No. 20-5279  \nIn the Supreme Court of the...   
3      \n \n \n \n \n \nNo. 20-5279 \n \n In the Sup...   
...                                                 ...   
3974  No. 16-1027\nIn the Supreme Court of the Unite...   
3976   \n \nNo. 17 -387 \n \n \nIN THE \nSUPREME COU...   
3977   \n No. 17-387 \nIn the Supreme Court of the U...   
3978  

In [75]:
toc_df['arguments'] = toc_df['toc'].apply(clean_table_of_contents)

In [77]:
buf_df = toc_df[['toc', 'arguments']].copy()
buf_df.to_csv('../data/new_dataset/manual_extract.csv', index=False)

In [107]:
import ast

arg_df = pd.read_csv('../data/new_dataset/manual_extract01.csv')
arg_df['arg_list'] = arg_df['arg_list'].apply(ast.literal_eval)

In [110]:
arg_df["arguments"] = ""

In [111]:
arg_df.head()

Unnamed: 0,toc,arg_list,arguments
0,\n No. 23-1275 \nIn the Supreme Court of the...,[I. The any-qualified-provider provision does ...,
1,NO. 23-1275 \nIN THE \nSupreme Court of the Un...,[I. The any-qualified-provider provision does ...,
2,No. 23-1275 \nIN THE \nSupreme Court of the ...,[I. PPSAT IS AN EXEMPLARY PROVIDER OFFERING V...,
3,\n \n \nNo. 23-1275 \nIn the Supreme Court of...,[The Free-Choice-Of-Provider Provision Unambig...,
4,\n No. 24-249 \n \nIN THE \nSupreme Court of...,[I. Congress Enacted The IDEA To Supplement Th...,


In [112]:
import json

def convert_args(row):
    arg_list = row['arg_list']
    print(arg_list)
    args = ""
    for arg in arg_list:
        args += f"{arg}\n"

    row['arguments'] = args

    return row

In [113]:
print(convert_args(arg_df.iloc[0]))

['I. The any-qualified-provider provision does not create individual rights enforceable under 42 U.S.C.  1983', 'A. Spending Clause statutes must unambiguously confer individual rights to be privately enforceable under Section 1983', 'B. The any-qualified-provider provision does not unambiguously confer individual federal rights', 'C. Finding a privately enforceable individual right in this case would create line-drawing problems', 'D. Other enforcement mechanisms protect beneficiaries', 'II. The court of appeals erred in finding an individual federal right']
toc           \n No. 23-1275  \nIn the Supreme Court of the...
arg_list     [I. The any-qualified-provider provision does ...
arguments    I. The any-qualified-provider provision does n...
Name: 0, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['arguments'] = args


In [118]:
arg_df = arg_df.apply(lambda row: pd.Series(convert_args(row)), axis=1)

['I. The any-qualified-provider provision does not create individual rights enforceable under 42 U.S.C.  1983', 'A. Spending Clause statutes must unambiguously confer individual rights to be privately enforceable under Section 1983', 'B. The any-qualified-provider provision does not unambiguously confer individual federal rights', 'C. Finding a privately enforceable individual right in this case would create line-drawing problems', 'D. Other enforcement mechanisms protect beneficiaries', 'II. The court of appeals erred in finding an individual federal right']
['I. The any-qualified-provider provision does not create a private right', 'A. Only clear rights-creating terms create a private right in spending laws', 'B. The any-qualified-provider provision lacks clear rights-creating language', 'C. The Court should retain its bright lines about what qualifies as clear rights-creating language', '1. Talevski  proves Congress can clearly confer a private right by explicitly labeling a benefit

In [119]:
print(arg_df.iloc[0]['arguments'])

I. The any-qualified-provider provision does not create individual rights enforceable under 42 U.S.C.  1983
A. Spending Clause statutes must unambiguously confer individual rights to be privately enforceable under Section 1983
B. The any-qualified-provider provision does not unambiguously confer individual federal rights
C. Finding a privately enforceable individual right in this case would create line-drawing problems
D. Other enforcement mechanisms protect beneficiaries
II. The court of appeals erred in finding an individual federal right



In [120]:
# clean_df = pd.read_csv('../data/new_dataset/manual_extract.csv')

In [121]:
toc_df['arguments'] = arg_df['arguments']

In [122]:
toc_df.head()

Unnamed: 0,filename,text,toc,content,docket_num,court,token_count,arguments
0,Docket23-1275_Brief001.pdf,\n No. 23-1275 \nIn the Supreme Court of the...,\n No. 23-1275 \nIn the Supreme Court of the...,................................ ..............,23-1275,SCOTUS,11528,I. The any-qualified-provider provision does n...
1,Docket23-1275_Brief002.pdf,NO. 23-1275 \nIN THE \nSupreme Court of the Un...,NO. 23-1275 \nIN THE \nSupreme Court of the Un...,................................................,23-1275,SCOTUS,14888,I. The any-qualified-provider provision does n...
2,Docket23-1275_Brief003.pdf,No. 23-1275 \nIN THE \nSupreme Court of the ...,No. 23-1275 \nIN THE \nSupreme Court of the ...,................................ ..............,23-1275,SCOTUS,12633,I. PPSAT IS AN EXEMPLARY PROVIDER OFFERING VI...
3,Docket23-1275_Brief004.pdf,\n \n \nNo. 23-1275 \nIn the Supreme Court of...,\n \n \nNo. 23-1275 \nIn the Supreme Court of...,................................................,23-1275,SCOTUS,16883,The Free-Choice-Of-Provider Provision Unambigu...
4,Docket24-249_Brief001.pdf,\n No. 24-249 \n \nIN THE \nSupreme Court of...,\n No. 24-249 \n \nIN THE \nSupreme Court of...,................................................,24-249,SCOTUS,4409,I. Congress Enacted The IDEA To Supplement The...


In [123]:
toc_df.to_csv("../data/new_dataset/new_preprocessed.csv", index=False)
print("Data saved successfully")

Data saved successfully
