In [1]:
%pip install requests beautifulsoup4 selenium webdriver-manager pandas

Note: you may need to restart the kernel to use updated packages.


Docket numbers were retrieved manually from the list maintained by findlaw.com. Only cases decided in 2018 or later have their documents hosted on the SCOTUS website (supremecourt.gov). Briefs from the 2012 term onward are stored on scotusblog.com, although some of them are in a very messy format (e.g. an image of a scanned document). Earlier briefs used to be hosted by the ABA, but they have since been taken down.

In [2]:
# Path to the manually compiled docket list. The parsing cell below expects
# "<year> YYYY" marker lines and entry lines containing "No. <docket number>".
docket_nos_path = "./docket_nos_valid.txt"

In [3]:
import pandas as pd
import re

# Line formats expected in the docket list file, compiled once up front.
# A "<year> YYYY" line marks the year in which the decisions that follow were
# released (not the year the cases were heard).
YEAR_LINE_RE = re.compile(r'^<year>\s+(\d{4})$')
# Every other line is searched for a docket number such as "No. 17-71".
DOCKET_NO_RE = re.compile(r'No\.\s+([\w-]+)')

current_year = None  # stays None for entries that appear before the first year marker
rows = []

# An explicit encoding keeps the parse independent of the platform default;
# 'utf-8-sig' also tolerates a leading BOM, and errors='replace' stops a stray
# non-UTF-8 byte from aborting the run (both patterns only rely on ASCII text).
with open(docket_nos_path, 'r', encoding='utf-8-sig', errors='replace') as file:
    for line in file:
        line = line.strip()

        # Year marker: remember it for the entries that follow.
        year_match = YEAR_LINE_RE.match(line)
        if year_match:
            current_year = year_match.group(1)
            continue

        # Regular entry: record the docket number under the current year.
        match = DOCKET_NO_RE.search(line)
        if match:
            rows.append({'docket_number': match.group(1), 'year': current_year})

# Build the frame in one step instead of concatenating onto an empty frame,
# which relies on deprecated dtype inference for empty inputs.
df = pd.DataFrame(rows, columns=['docket_number', 'year'])
# url_list is filled in by the scraping cell with one list of PDF URLs per row,
# so it is created explicitly as an object column that can hold a list per cell.
df['url_list'] = pd.Series(float('nan'), index=df.index, dtype=object)

print(df.head())


  docket_number  year url_list
0         17-71  2018      NaN
1       17-1676  2018      NaN
2       18-5181  2018      NaN
3        17-587  2018      NaN
4       17-7894  2018      NaN


In [4]:
# Sanity check: number of docket entries parsed from the list file
# (one row per matching line, so repeated docket numbers are counted each time).
print(len(df))

412


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re

# Set up a single Chrome WebDriver, run headless (no browser window).
options = Options()
options.add_argument("--headless")
# The value returned by ChromeDriverManager().install() is handed to Service.
service = Service(ChromeDriverManager().install())
# Module-level driver: process_docket_page() reads it as a global, and the
# scraping cell calls driver.quit() once every docket has been visited.
driver = webdriver.Chrome(service=service, options=options)

# Docket-entry rows worth scraping: the text must mention a brief or reply and
# must not mention a motion or supplemental letter. The pattern is anchored with
# ^ and '.' does not match newlines, so only the first line of the row text is
# examined. Compiled once here instead of once per table row.
BRIEF_ROW_PATTERN = re.compile(r'^(?!.*\b(motion|supplemental letter)\b).*\b(brief of|reply of|brief amicus curiae)\b', re.IGNORECASE)

# XPath, relative to a matching row, of the second cell of the row immediately
# after it; that cell is where the entry's document links are looked up.
LINKS_CELL_XPATH = "./following-sibling::tr[1]/td[2]"


# Function to process a single docket page
def process_docket_page(docket_number):
    """Collect the "Main Document" link URLs for brief entries on one docket page.

    Uses the module-level ``driver`` to load the docket page for
    ``docket_number``, scans every table row, and for each row whose text
    matches ``BRIEF_ROW_PATTERN`` looks in the second cell of the following row
    for links whose text is "main document" (case-insensitive, surrounding
    whitespace ignored).

    Args:
        docket_number: Docket identifier interpolated into the page URL,
            e.g. "17-71".

    Returns:
        list: The ``href`` values of the matching links, in page order. The
        list is empty when the page cannot be loaded, the rows cannot be
        fetched, or nothing matches. Errors are printed, never raised.
    """
    url = f"https://www.supremecourt.gov//docket/docketfiles/html/public/{docket_number}.html"
    pdf_urls = []
    try:
        driver.get(url)
    except Exception as e:
        print(f"Error loading page {url}: {str(e)}")
        return pdf_urls

    try:
        rows = driver.find_elements(By.TAG_NAME, 'tr')
    except Exception as e:
        print(f"Error finding table rows: {str(e)}")
        return pdf_urls

    for row in rows:
        # Best-effort per row: one bad row must not lose the rest of the page.
        try:
            if not BRIEF_ROW_PATTERN.search(row.text):
                continue

            # find_elements yields an empty list (rather than raising) when the
            # matching row has no following row or that row has no second cell,
            # so a missing links cell is skipped silently instead of being
            # reported as an error.
            for links_cell in row.find_elements(By.XPATH, LINKS_CELL_XPATH):
                for link in links_cell.find_elements(By.TAG_NAME, 'a'):
                    if link.text.strip().lower() != 'main document':
                        continue
                    href = link.get_attribute('href')
                    # Skip anchors without an href so callers never receive None.
                    if href:
                        pdf_urls.append(href)
        except Exception as e:
            print(f"Error processing row: {str(e)}")
            continue

    return pdf_urls


In [None]:
import time
import random

# List of docket numbers to process
docket_numbers = df['docket_number'].tolist()

# Maps docket number -> list of brief PDF URLs found on that docket's page.
# Keyed by docket number, so a number that appears on several rows of df ends
# up as a single entry here.
docket_pdf_urls = {}

# try/finally guarantees the headless browser is shut down even if the loop is
# interrupted or a page raises; otherwise the Chrome process would be leaked.
try:
    # Iterate over the DataFrame rows directly so each result can be written
    # back into the matching row's url_list cell by index.
    for idx, row in df.iterrows():
        docket_number = row['docket_number']
        pdf_urls = process_docket_page(docket_number)
        docket_pdf_urls[docket_number] = pdf_urls
        df.at[idx, 'url_list'] = pdf_urls

        time.sleep(random.uniform(1, 2))  # Hopefully avoid rate limits
finally:
    driver.quit()

In [6]:
# Number of distinct docket numbers scraped. This can be smaller than len(df):
# docket_pdf_urls is keyed by docket number, so repeated numbers collapse.
print(len(docket_pdf_urls))

410


In [None]:
import os

import requests

# Browser-like User-Agent sent with every PDF request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}

# Directory that receives the downloaded briefs; created up front so the first
# open() below cannot fail on a missing folder.
PDF_DIR = '../data/scotus_pdfs'
os.makedirs(PDF_DIR, exist_ok=True)

# (connect, read) timeouts in seconds, so one stalled server response cannot
# hang the whole download run.
REQUEST_TIMEOUT = (10, 60)

num_briefs = 0  # total number of PDFs written across all dockets

for docket_number, urls_list in docket_pdf_urls.items():
    num = 1  # per-docket sequence number for the file name; advances only on success
    for url in urls_list:
        try:
            # The whole body is needed at once, so no stream=True: a plain
            # request reads the content and releases the connection itself.
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # A network failure on one brief should not abort the remaining downloads.
            print(f"Failed to download PDF for docket number {docket_number}. Error: {e}")
        else:
            if response.status_code == 200:
                with open(f'{PDF_DIR}/Docket{docket_number}_Brief{num:03}.pdf', 'wb') as f:
                    f.write(response.content)

                num += 1
                num_briefs += 1
            else:
                print(f"Failed to download PDF for docket number {docket_number}. Status code: {response.status_code}")

        # Pause after every attempt, successful or not, to stay polite to the server.
        time.sleep(random.uniform(1, 2))


In [8]:
# Report how many PDFs the download cell wrote (num_briefs counts successful saves only).
print(f"You scraped {num_briefs} briefs.")

You scraped 4377 briefs.


In [None]:
# Persist the docket table (docket_number, year, url_list) as JSON,
# using the 'records' orientation: one object per DataFrame row.
df.to_json('../data/scotus_briefs.json', orient='records')