# Scraping Wikipedia Tables of SCOTUS Opinions from 1999-2019

In [1]:
# Webscraping
import requests
import re
from bs4 import BeautifulSoup

# Analysis
import numpy as np
import pandas as pd
import random

## Initial Scrape

In [2]:
# First URL: the 1999 term page, used to develop and check the scraper
url = 'https://en.wikipedia.org/wiki/1999_term_opinions_of_the_Supreme_Court_of_the_United_States'
response = requests.get(url, timeout=30)  # bound the wait so a stalled connection cannot hang the notebook
response.raise_for_status()  # fail here on a 4xx/5xx instead of parsing an error page later
website_url = response.text  # NOTE: despite the name, this holds the page's HTML text, not a URL

In [25]:
# Instantiate BeautifulSoup object.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed, and different parsers can build different
# trees. 'lxml' is the parser the multi-term scraping loop in this notebook uses.
soup = BeautifulSoup(website_url, 'lxml')

In [None]:
# Scratch note (kept as a comment because a bare XPath is not valid Python and
# would stop a Restart & Run All): XPath of a sample table cell, presumably
# copied from the browser inspector while exploring the page — TODO confirm or delete.
# //*[@id="mw-content-text"]/div/table[2]/tbody/tr[1]/td[5]

In [4]:
# Find table rows helper function
def row_data(soup):
    """Locate the opinions table in a parsed page and describe its layout.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one term page.

    Returns
    -------
    rows : iterable
        ``next_siblings`` of the table's first ``<tr>``, returned unfiltered.
        With BeautifulSoup this is a generator, so it can be consumed only once.
    justices : list of str
        Stripped ``<th>`` labels, excluding the fixed non-justice column names,
        de-duplicated while keeping first-seen order.

    Raises
    ------
    ValueError
        If no ``<table class="wikitable sortable">`` is found, or the table
        contains no ``<tr>``.
    """
    # Find Supreme Court opinions 'table'
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        raise ValueError("No table with class 'wikitable sortable' found in page")

    # Column labels that are not justices
    not_justices = {'#', 'Case name and citation', 'Argued', 'Decided'}

    # Strip each header label once, drop the non-justice columns, and remove
    # repeated labels; dict.fromkeys keeps the first occurrence of each name.
    labels = (header.text.strip() for header in table.find_all('th'))
    justices = list(dict.fromkeys(label for label in labels if label not in not_justices))

    # Everything after the first row is handed to the caller as data
    first_row = table.find('tr')
    if first_row is None:
        raise ValueError('Opinions table contains no rows')

    return first_row.next_siblings, justices

In [5]:
# Scrape opinions helper function
def scrape_opinions(rows, justices):
    """Collect case names and per-justice vote codes from the table rows.

    Parameters
    ----------
    rows : iterable
        Nodes following the table's first row, as returned by ``row_data``.
        Only items at even positions (0, 2, 4, ...) are read.
    justices : list of str
        Justice names; only its length is used, to validate each row.

    Returns
    -------
    j_votes : list of list of str
        One list of vote codes per case row, in row order.
    cases : list of str
        Stripped text of the second ``<td>`` of each row that has one.

    Raises
    ------
    AssertionError
        If a row's vote count differs from ``len(justices)``, or the number
        of case names differs from the number of vote lists.
    KeyError
        If a vote cell has no ``data-sort-value`` attribute.
    """
    # Greedy on purpose-unchanged from the original: keeps the text before the
    # LAST '<' on the first line of the value.
    # NOTE(review): confirm values never contain more than one '<'.
    markup_prefix = re.compile(r'^(.*)?(?=<)')

    j_votes = []
    cases = []
    for i, row in enumerate(rows):
        # Odd positions are skipped; presumably they are the whitespace nodes
        # between <tr> elements — TODO confirm against the parsed tree.
        if i % 2 != 0:
            continue

        votes = []
        for m, datum in enumerate(row.find_all('td')):
            if m == 1:
                cases.append(datum.text.strip())
                continue
            if m in (0, 2, 3):
                # The other three leading cells carry no vote
                continue

            # Vote cells are recognised by a style that starts with 'padding'.
            # A cell with no style attribute is treated as "not a vote cell"
            # instead of raising KeyError; if that leaves the row short, the
            # length assertion below reports it.
            style = datum.get('style') or ''
            if not style.startswith('padding'):
                continue

            vote = datum['data-sort-value']
            prefix = markup_prefix.match(vote)
            if prefix is not None:
                vote = prefix[0].strip()
            votes.append(vote)

        if votes != []:  # Some rows are not cases
            assert len(votes) == len(justices), 'Number of votes/non-votes different than number of justices'
            j_votes.append(votes)

    assert len(cases) == len(j_votes), 'Number of cases different than justice opinions'

    return j_votes, cases

In [6]:
# Complete scrape function
def get_data(soup):
    """Scrape one parsed term page into a case -> votes mapping.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one term page.

    Returns
    -------
    votes_by_case : dict
        Maps each case name to its list of vote codes (ex. {'case': [votes]}).
        If a case name repeats, the later vote list replaces the earlier one.
    justices : list of str
        Justice names as returned by ``row_data``.
    """
    table_rows, justices = row_data(soup)
    opinions, case_names = scrape_opinions(table_rows, justices)
    # Names and vote lists are paired by position
    votes_by_case = {name: codes for name, codes in zip(case_names, opinions)}
    return votes_by_case, justices

### Note on opinion codes:
- 1: Court opinion
- 2: Concurrence
- 3: Concurrence and dissent
- 4: Dissent
- J prefix: Joined opinion or dissent
- X suffix: Joined in part
- X: No vote

In [7]:
# Scrape the parsed page into {case name: [vote codes]} plus the list of justices
votes_by_case, justices = get_data(soup)

In [8]:
# Spot-check one randomly chosen case so it can be compared by eye with the
# Wikipedia table (unseeded, so a different case appears on each run)
key, value = random.choice(list(votes_by_case.items()))
for label, shown in (('Case:', key), ('Justices:', justices), ('Votes:', value)):
    print(label, shown)

Case: Dickerson v. United States, 530 U.S. 428
Justices: ['Rehnquist', 'Stevens', "O'Connor", 'Scalia', 'Kennedy', 'Souter', 'Thomas', 'Ginsburg', 'Breyer']
Votes: ['1', 'J1', 'J1', '4', 'J1', 'J1', 'J4', 'J1', 'J1']


### To build from dictionary or DataFrame?
I decided to build from a DataFrame instead of a dictionary. Generally it is best (less memory) to build from a dictionary; however, we will have at most about 190 rows (9 justices × 21 terms, if every justice turned over each year) and roughly 2,100 columns (if 100 cases were decided each term), so the extra cost is negligible. By building from a DataFrame, I am able to join each term's opinions on the justice index, which makes merging in each new term easier (especially since this will be a sparse matrix).

In [9]:
# Create DataFrame: each case's vote list becomes a column, then the rows are
# labelled with the justices (so rows are justices and columns are cases)
df = pd.DataFrame(votes_by_case)
df.index = justices

In [10]:
# Display the single-term frame: one row per justice, one column per case
df

Unnamed: 0,"Brancato v. Gunn, 528 U.S. 1","Antonelli v. Caridine, 528 U.S. 3","Judd v. United States Dist. Court for Western Dist. of Tex., 528 U.S. 5","Dempsey v. Martin, 528 U.S. 7","Prunty v. Brooks, 528 U.S. 9","Flippo v. West Virginia, 528 U.S. 11","In re Bauer, 528 U.S. 16","Texas v. Lesage, 528 U.S. 18","Fiore v. White, 528 U.S. 23","Los Angeles Police Dept. v. United Reporting Publishing Corp., 528 U.S. 32",...,"Crosby v. National Foreign Trade Council, 530 U.S. 363","Arizona v. California, 530 U.S. 392","Dickerson v. United States, 530 U.S. 428","Apprendi v. New Jersey, 530 U.S. 466","California Democratic Party v. Jones, 530 U.S. 567","Mobil Oil Exploration & Producing Southeast, Inc. v. United States, 530 U.S. 604","Boy Scouts of America v. Dale, 530 U.S. 640","Hill v. Colorado, 530 U.S. 703","Mitchell v. Helms, 530 U.S. 793","Stenberg v. Carhart, 530 U.S. 914"
Rehnquist,J1,J1,J1,J1,J1,J1,J1,J1,J1,1,...,J1,3,1,J4J4,J1,J1,1,J1,J1X,4J4J4
Stevens,4,4,4,4,4,J1,4,J1,J1,4,...,J1,J1,J1,1,4,4,4,1,J4,2J1J2
O'Connor,J1,J1,J1,J1,J1,J1,J1,J1,J1,J1J2,...,J1,J3,J1,4,J1,J1,J1,J1J2,2,2J1
Scalia,J1,J1,J1,J1,J1,J1,J1,J1,J1,2J1,...,2,J1,4,2J1J2X,1,J1,J1,4,J1X,4J4
Kennedy,J1,J1,J1,J1,J1,J1,J1,J1,J1,J4,...,J1,J1,J1,J4,2J1,J1,J1,4,J1X,4
Souter,J1,J1,J1,J1,J1,J1,J1,J1,J1,J1J2,...,1,J1,J1,J1,J1,J1,4J4,2J1,4,J1
Thomas,J1,J1,J1,J1,J1,J1,J1,J1,J1,J1J2,...,J2,J3,J4,2J1,J1,J1,J1,J4,1X,4
Ginsburg,J1,J1,J1,J1,J1,J1,J1,J1,J1,2J1,...,J1,1,J1,J1,J4,J1,J4J4,J1J2,J4,2J1J2
Breyer,J1,J1,J1,J1,J1,J1,J1,J1,1,J1J2,...,J1,J1,J1,4J4,J1,1,J4J4,J1J2,J2,1


## Scraping the rest of the SCOTUS terms

In [11]:
# Get all urls function
def make_urls(base_url, list_of_var):
    """Build one URL per value by formatting it into ``base_url``.

    Parameters
    ----------
    base_url : str
        Template passed to ``str.format`` with a single positional argument.
    list_of_var : iterable
        Values to substitute, one URL each.

    Returns
    -------
    list of str
        Formatted URLs, in the same order as ``list_of_var``.
    """
    return [base_url.format(value) for value in list_of_var]

In [12]:
# URL template shared by every term page, the years to substitute into it
# (2000 through 2019), and the resulting list of pages to scrape
base_url = 'https://en.wikipedia.org/wiki/{}_term_opinions_of_the_Supreme_Court_of_the_United_States'
years = [*range(2000, 2020)]
urls = make_urls(base_url, years)

In [13]:
# Merge new data onto DataFrame by joining DataFrames.
# NOTE: this cell rebinds `df` to the merged result, so it is not idempotent:
# rebuild the single-term `df` before running it a second time.
for url in urls:
    response = requests.get(url, timeout=30)  # bound the wait on each page
    response.raise_for_status()  # stop on a 4xx/5xx instead of parsing an error page
    website_url = response.text
    soup = BeautifulSoup(website_url, 'lxml')
    votes_by_case, justices = get_data(soup)
    new_df = pd.DataFrame.from_dict(votes_by_case)
    new_df.index = justices
    # Outer join on the justice index keeps every justice seen so far; cases
    # from terms a justice is absent from are left as missing values
    df = df.join(new_df, how='outer')

In [16]:
# Display the merged frame; a justice has missing values in the columns of
# any term whose page did not list them
df

Unnamed: 0,"Brancato v. Gunn, 528 U.S. 1","Antonelli v. Caridine, 528 U.S. 3","Judd v. United States Dist. Court for Western Dist. of Tex., 528 U.S. 5","Dempsey v. Martin, 528 U.S. 7","Prunty v. Brooks, 528 U.S. 9","Flippo v. West Virginia, 528 U.S. 11","In re Bauer, 528 U.S. 16","Texas v. Lesage, 528 U.S. 18","Fiore v. White, 528 U.S. 23","Los Angeles Police Dept. v. United Reporting Publishing Corp., 528 U.S. 32",...,"Bostock v. Clayton County, 590 U.S. ___","Andrus v. Texas, 590 U.S. ___","Department of Homeland Security v. Regents of Univ. of Cal., 591 U.S. ___","Liu v. SEC, 591 U.S. ___","Department of Homeland Security v. Thuraissigiam, 591 U.S. ___","Seila Law v. Consumer Financial Protection Bureau, 591 U.S. ___","June Medical Services, LLC v. Russo, 591 U.S. ___","Agency for Int’l Development v. Alliance for Open Society, 591 U.S. ___","Espinoza v. Montana Dept. of Revenue, 591 U.S. ___","Patent and Trademark Office v. Booking.com B. V., 591 U.S. ___"
Alito,,,,,,,,,,,...,4,4,3J3,J1,1,J1,2,j1,2J1,J1
Breyer,J1,J1,J1,J1,J1,J1,J1,J1,1,J1J2,...,J1,J1,J1,J1,2,J3,1,4,4,4
Ginsburg,J1,J1,J1,J1,J1,J1,J1,J1,J1,2J1,...,J1,J1,J1,J1,J2,J3,J1,J4,4,1
Gorsuch,,,,,,,,,,,...,1,J4,J3,J1,J1,J1J3,4J4,J1,2J1J2,J1
Kagan,,,,,,,,,,,...,J1,J1,J1,J1,J4,3,J1,X,J4J4X,J1
Kavanaugh,,,,,,,,,,,...,4,J1,3,J1,J1,J1,4J2,1,J1,J1
Kennedy,J1,J1,J1,J1,J1,J1,J1,J1,J1,J4,...,,,,,,,,,,
O'Connor,J1,J1,J1,J1,J1,J1,J1,J1,J1,J1J2,...,,,,,,,,,,
Rehnquist,J1,J1,J1,J1,J1,J1,J1,J1,J1,1,...,,,,,,,,,,
Roberts,,,,,,,,,,,...,J1,J1,1X,J1,J1,1X,2,J1,1,J1
