In [5]:
import numpy as np
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import datetime
from functools import reduce
import random
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

pd.options.display.max_columns = 100
pd.options.display.max_rows = 1500

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

# Getting links for each volume of scotus cases

In [98]:
page = requests.get('https://supreme.justia.com/cases/federal/us/volume')
soup = BeautifulSoup(page.content, 'html.parser')

list_div = soup.find_all('div', {'class': 'list-fixed-columns'})[0]
list_div_link_spans = list_div.find_all('span')

justia_volume_urls = ['https://supreme.justia.com' + span.find_all('a')[0].get('href') for span in list_div_link_spans][::-1]

# justia_volume_urls

['https://supreme.justia.com/cases/federal/us/1/',
 'https://supreme.justia.com/cases/federal/us/2/',
 'https://supreme.justia.com/cases/federal/us/3/',
 'https://supreme.justia.com/cases/federal/us/4/',
 'https://supreme.justia.com/cases/federal/us/5/',
 'https://supreme.justia.com/cases/federal/us/6/',
 'https://supreme.justia.com/cases/federal/us/7/',
 'https://supreme.justia.com/cases/federal/us/8/',
 'https://supreme.justia.com/cases/federal/us/9/',
 'https://supreme.justia.com/cases/federal/us/10/',
 'https://supreme.justia.com/cases/federal/us/11/',
 'https://supreme.justia.com/cases/federal/us/12/',
 'https://supreme.justia.com/cases/federal/us/13/',
 'https://supreme.justia.com/cases/federal/us/14/',
 'https://supreme.justia.com/cases/federal/us/15/',
 'https://supreme.justia.com/cases/federal/us/16/',
 'https://supreme.justia.com/cases/federal/us/17/',
 'https://supreme.justia.com/cases/federal/us/18/',
 'https://supreme.justia.com/cases/federal/us/19/',
 'https://supreme.jus

# Getting case links and basic info from volume pages

In [1]:
def scrape_justia_volume_page(volume_page_url, data):
    print(volume_page_url)
    page = requests.get(volume_page_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    results = soup.find_all('div', {'class': 'search-result'})
    
    for result in results:
        index = len(data)
        data.loc[index] = np.nan
        
        data.loc[index, 'volume_page_url'] = volume_page_url
        case_name = result.find_all('a', {'class': 'case-name'})[0]
        
        data.loc[index, 'case_name'] = case_name.find_all('strong')[0].get_text()
        case_url = case_name.get('href')
        if str(case_url)[0] == '/':
            case_url = 'https://supreme.justia.com' + case_url
        data.loc[index, 'case_url'] = case_url
                
        columns = result.find_all('strong')[1:]
        for column in columns:
            column_text = column.get_text()[:-1].lower()
            if column_text in data.columns:
                data.loc[index, column_text] = column.next_sibling

In [102]:
justia_vol_page_details = pd.DataFrame(columns=['volume_page_url', 'case_name', 'case_url', 'court', 'citation', 'date'])

for url in justia_volume_urls:
    scrape_justia_volume_page(url, justia_vol_page_details)

justia_vol_page_details

https://supreme.justia.com/cases/federal/us/1/
https://supreme.justia.com/cases/federal/us/2/
https://supreme.justia.com/cases/federal/us/3/
https://supreme.justia.com/cases/federal/us/4/
https://supreme.justia.com/cases/federal/us/5/
https://supreme.justia.com/cases/federal/us/6/
https://supreme.justia.com/cases/federal/us/7/
https://supreme.justia.com/cases/federal/us/8/
https://supreme.justia.com/cases/federal/us/9/
https://supreme.justia.com/cases/federal/us/10/
https://supreme.justia.com/cases/federal/us/11/
https://supreme.justia.com/cases/federal/us/12/
https://supreme.justia.com/cases/federal/us/13/
https://supreme.justia.com/cases/federal/us/14/
https://supreme.justia.com/cases/federal/us/15/
https://supreme.justia.com/cases/federal/us/16/
https://supreme.justia.com/cases/federal/us/17/
https://supreme.justia.com/cases/federal/us/18/
https://supreme.justia.com/cases/federal/us/19/
https://supreme.justia.com/cases/federal/us/20/
https://supreme.justia.com/cases/federal/us/21/
h

https://supreme.justia.com/cases/federal/us/171/
https://supreme.justia.com/cases/federal/us/172/
https://supreme.justia.com/cases/federal/us/173/
https://supreme.justia.com/cases/federal/us/174/
https://supreme.justia.com/cases/federal/us/175/
https://supreme.justia.com/cases/federal/us/176/
https://supreme.justia.com/cases/federal/us/177/
https://supreme.justia.com/cases/federal/us/178/
https://supreme.justia.com/cases/federal/us/179/
https://supreme.justia.com/cases/federal/us/180/
https://supreme.justia.com/cases/federal/us/181/
https://supreme.justia.com/cases/federal/us/182/
https://supreme.justia.com/cases/federal/us/183/
https://supreme.justia.com/cases/federal/us/184/
https://supreme.justia.com/cases/federal/us/185/
https://supreme.justia.com/cases/federal/us/186/
https://supreme.justia.com/cases/federal/us/187/
https://supreme.justia.com/cases/federal/us/188/
https://supreme.justia.com/cases/federal/us/189/
https://supreme.justia.com/cases/federal/us/190/
https://supreme.just

https://supreme.justia.com/cases/federal/us/339/
https://supreme.justia.com/cases/federal/us/340/
https://supreme.justia.com/cases/federal/us/341/
https://supreme.justia.com/cases/federal/us/342/
https://supreme.justia.com/cases/federal/us/343/
https://supreme.justia.com/cases/federal/us/344/
https://supreme.justia.com/cases/federal/us/345/
https://supreme.justia.com/cases/federal/us/346/
https://supreme.justia.com/cases/federal/us/347/
https://supreme.justia.com/cases/federal/us/348/
https://supreme.justia.com/cases/federal/us/349/
https://supreme.justia.com/cases/federal/us/350/
https://supreme.justia.com/cases/federal/us/351/
https://supreme.justia.com/cases/federal/us/352/
https://supreme.justia.com/cases/federal/us/353/
https://supreme.justia.com/cases/federal/us/354/
https://supreme.justia.com/cases/federal/us/355/
https://supreme.justia.com/cases/federal/us/356/
https://supreme.justia.com/cases/federal/us/357/
https://supreme.justia.com/cases/federal/us/358/
https://supreme.just

https://supreme.justia.com/cases/federal/us/507/
https://supreme.justia.com/cases/federal/us/508/
https://supreme.justia.com/cases/federal/us/509/
https://supreme.justia.com/cases/federal/us/510/
https://supreme.justia.com/cases/federal/us/511/
https://supreme.justia.com/cases/federal/us/512/
https://supreme.justia.com/cases/federal/us/513/
https://supreme.justia.com/cases/federal/us/514/
https://supreme.justia.com/cases/federal/us/515/
https://supreme.justia.com/cases/federal/us/516/
https://supreme.justia.com/cases/federal/us/517/
https://supreme.justia.com/cases/federal/us/518/
https://supreme.justia.com/cases/federal/us/519/
https://supreme.justia.com/cases/federal/us/520/
https://supreme.justia.com/cases/federal/us/521/
https://supreme.justia.com/cases/federal/us/522/
https://supreme.justia.com/cases/federal/us/523/
https://supreme.justia.com/cases/federal/us/524/
https://supreme.justia.com/cases/federal/us/525/
https://supreme.justia.com/cases/federal/us/526/
https://supreme.just

Unnamed: 0,volume_page_url,case_name,case_url,court,citation,date
0,https://supreme.justia.com/cases/federal/us/1/,HYAM'S LESSEE v. EDWARDS,https://supreme.justia.com/cases/federal/us/1/1/,US Supreme Court  ...,1 U.S. 1,
1,https://supreme.justia.com/cases/federal/us/1/,BETHEL v. LLOYD,https://supreme.justia.com/cases/federal/us/1/2/,US Supreme Court  ...,1 U.S. 2,
2,https://supreme.justia.com/cases/federal/us/1/,STEVENSON v. PEMBERTON,https://supreme.justia.com/cases/federal/us/1/3/,US Supreme Court  ...,1 U.S. 3,
3,https://supreme.justia.com/cases/federal/us/1/,ASHETON v. ASHETON,https://supreme.justia.com/cases/federal/us/1/4/,US Supreme Court  ...,1 U.S. 4,
4,https://supreme.justia.com/cases/federal/us/1/,KING v. LUKENS,https://supreme.justia.com/cases/federal/us/1/5/,US Supreme Court  ...,1 U.S. 5,
5,https://supreme.justia.com/cases/federal/us/1/,WALLACE v. CHILD AND STYLES,https://supreme.justia.com/cases/federal/us/1/7/,US Supreme Court  ...,1 U.S. 7,
6,https://supreme.justia.com/cases/federal/us/1/,PRICE v. WATKINS,https://supreme.justia.com/cases/federal/us/1/8/,US Supreme Court  ...,1 U.S. 8,
7,https://supreme.justia.com/cases/federal/us/1/,DAVEY v. TURNER,https://supreme.justia.com/cases/federal/us/1/11/,US Supreme Court  ...,1 U.S. 11,
8,https://supreme.justia.com/cases/federal/us/1/,BOEHM AND SHITZ v. ENGLE,https://supreme.justia.com/cases/federal/us/1/15/,US Supreme Court  ...,1 U.S. 15,
9,https://supreme.justia.com/cases/federal/us/1/,RICHE AND RICHARDS v. BROADFIELD,https://supreme.justia.com/cases/federal/us/1/16/,US Supreme Court  ...,1 U.S. 16,


# Scraping individual case pages into justia_case_details

In [133]:
def scrape_justia_case_page(case_page_url, data):
    browser.get(case_page_url)
    html = browser.page_source
    
    index = len(data)
    data.loc[index] = np.nan
    
#     page = requests.get(case_page_url)
#     soup = BeautifulSoup(page.content, 'html.parser')
    soup = BeautifulSoup(html, 'html.parser')
    
    sidebar = soup.find_all('div', {'id': 'primary-sidebar'})[0]
    sidebar_info_block = sidebar.find_all('aside', {'class': 'annotation'})[1]
    
    data.loc[index, 'case_url'] = case_page_url
    columns = sidebar_info_block.find_all('p')
    if 'Download PDF' in columns[0].get_text():
        case_pdf_url = columns[0].find_all('a')[0].get('href')
        data.loc[index, 'pdf_url'] = case_pdf_url
        columns.pop(0)
        
    column_names = []
    for column in columns:
        column_text = column.get_text().lower().replace(' ', '_')
        if column_text in column_names:
            column_text = column_text + '+'
        
        if column_text not in data.columns:
            data[column_text] = np.nan
        
        data.loc[index, column_text] = column.find_next_sibling('span').get_text().replace('\n', '').replace('  ', '')
    
    case_info_main = soup.find_all('div', {'class': 'primary-content'})[0]
    first_info_div = case_info_main.find('div').find('div').find('div')
    
    if first_info_div['class'] != ['tabbed-content']:
        case_summary_annotations = []
        for child in first_info_div.contents:
            if str(type(child)) == "<class 'bs4.element.NavigableString'>":
                continue
            case_summary_annotations.append(child.get_text())
        data.loc[index, 'summary_annotations'] = str(case_summary_annotations)
    
    navbar = case_info_main.find_all('nav')[0]
    navbar_spans = navbar.find_all('span')
    nav_items = []
    for item in navbar_spans:
        nav_items.append(item.get_text())
    data.loc[index, 'nav_items'] = str(nav_items) 
    
    if 'Audio & Media' in nav_items:
        tab_content = soup.find_all('div', {'id': 'tab-audio-and-media'})[0]
        tab_info = tab_content.find_all('table')[0].find_all('a')
        
        for a in tab_info:
            a_info_url = a.get('href')
            a_info_text = a.get_text()
            a_info_type = a.get_text().split(' - ')[0].lower().replace(' ', '_')
            if a_info_type in data.columns:
                data.loc[index, a_info_type] = str([a_info_text, a_info_url])
            else:
                print('missed: ' + str(a_info_type))
        
    case_html = soup.find_all('div', {'id': 'tab-opinion'})[0].encode(u'utf8')
    unique_case_id = data.loc[index, 'official_citation_'] + str(data.loc[index, 'docket_no.'])
    file_name = 'justia_html_' + unique_case_id.replace(' ', '%') + '.html'
    with open('out/justia_case_html/' + file_name, 'w') as file:
        file.write(str(case_html))

In [28]:
# justia_case_details = pd.DataFrame(columns=['case_url', 'first_party', 'second_party', 'official_citation_', 'docket_no.', 
#                            'nav_items', 'oral_argument', 'oral_reargument', 'opinion_announcement', 
#                            'summary_annotations', 'pdf_url'])
# browser = webdriver.Chrome()

# errors = []
# justia_case_urls = justia_vol_page_details['case_url']

# for url in justia_case_urls:
#     try:
#         scrape_justia_case_page(url, justia_case_details)
#     except:
#         errors.append(url)

# browser.close()

# justia_case_details.to_csv('out/justia_case_details.csv')
justia_case_details

Unnamed: 0,case_url,first_party,second_party,official_citation_,docket_no.,nav_items,oral_argument,oral_reargument,opinion_announcement,summary_annotations,pdf_url,argued,decided,advocates,reargued,granted,rehearing_granted,opinion_announced,affirmed_by_an_equally_divided_court,"rehearing_granted,_judgment_vacated,_and_restored_to_the_calendar","rehearing_granted,_judgment_vacated_and_case_restored_for_reargument",question_certified_to_the_supreme_court_of_florida,question_certified,certified_question_to_arizona_supreme_court,juris_postponed
0,https://supreme.justia.com/cases/federal/us/1/1/,,,1 U.S. 1,,['Opinions'],,,,,,,,,,,,,,,,,,,
1,https://supreme.justia.com/cases/federal/us/1/2/,,,1 U.S. 2,,['Opinions'],,,,,,,,,,,,,,,,,,,
2,https://supreme.justia.com/cases/federal/us/1/3/,,,1 U.S. 3,,['Opinions'],,,,,,,,,,,,,,,,,,,
3,https://supreme.justia.com/cases/federal/us/1/4/,,,1 U.S. 4,,['Opinions'],,,,,,,,,,,,,,,,,,,
4,https://supreme.justia.com/cases/federal/us/1/5/,,,1 U.S. 5,,['Opinions'],,,,,,,,,,,,,,,,,,,
5,https://supreme.justia.com/cases/federal/us/1/7/,,,1 U.S. 7,,['Opinions'],,,,,,,,,,,,,,,,,,,
6,https://supreme.justia.com/cases/federal/us/1/8/,,,1 U.S. 8,,['Opinions'],,,,,,,,,,,,,,,,,,,
7,https://supreme.justia.com/cases/federal/us/1/11/,,,1 U.S. 11,,['Opinions'],,,,,,,,,,,,,,,,,,,
8,https://supreme.justia.com/cases/federal/us/1/15/,,,1 U.S. 15,,['Opinions'],,,,,,,,,,,,,,,,,,,
9,https://supreme.justia.com/cases/federal/us/1/16/,,,1 U.S. 16,,['Opinions'],,,,['\n Justia Opinion Summary and Ann...,,,,,,,,,,,,,,,


In [None]:
justia_case_details = pd.read_csv('out/justia_case_details.csv').drop(columns='Unnamed: 0')

### Cleaning up missed page details

The two cases that failed the scraping function are 549 US 1328 and 585 141-orig. The first, Boumediene v. Bush, made it back to the Supreme Court, and the link is broken. So, the information is ultimately recorded in another case, and we can't do anything about it anyway, so we'll leave that one alone (just add the broken link back into the dataframe). The second one is an original jurisdiction case between Texas and New Mexico/Colorado, and it looks like our function actually ended up scraping all the relevant information from that case, so I think that one is fine too. I think the reason the second one broke the function is because it only has a page for oral argument audio, and not case text.

In [143]:
print(errors)
justia_case_details.loc[30726, ['case_url', 'official_citation_']] = ['https://supreme.justia.com/cases/federal/us/549/1328/dissent.html', '549 U.S. 1328']
justia_case_details.loc[justia_case_details['case_url']=='https://supreme.justia.com/cases/federal/us/585/141-orig/']

['https://supreme.justia.com/cases/federal/us/549/1328/dissent.html', 'https://supreme.justia.com/cases/federal/us/585/141-orig/']


Unnamed: 0,case_url,first_party,second_party,official_citation_,docket_no.,nav_items,oral_argument,oral_reargument,opinion_announcement,summary_annotations,pdf_url,argued,decided,advocates,reargued,granted,rehearing_granted,opinion_announced,affirmed_by_an_equally_divided_court,"rehearing_granted,_judgment_vacated,_and_restored_to_the_calendar","rehearing_granted,_judgment_vacated_and_case_restored_for_reargument",question_certified_to_the_supreme_court_of_florida,question_certified,certified_question_to_arizona_supreme_court,juris_postponed
31655,https://supreme.justia.com/cases/federal/us/58...,State of Texas,State of New Mexico and State of Colorado,585 U.S. __ (0),141-orig,['Audio & Media'],"['Oral Argument - January 08, 2018', 'https://...",,"['Opinion Announcement - March 05, 2018', 'htt...",,,"January 7, 2018","March 4, 2018",Scott A. Keller Frederick R. Yarger Ann O'Conn...,,"October 9, 2017",,,,,,,,,


### Clean up columns with just one or two data entries

In [29]:
columns_to_reduce = []
for column in justia_case_details.columns:
    if len(justia_case_details) - justia_case_details[column].isnull().sum() < 5:
        columns_to_reduce.append(column)

dt_columns = ['argued', 'decided', 'reargued', 'granted', 'rehearing_granted', 'opinion_announced', 'affirmed_by_an_equally_divided_court', 'rehearing_granted,_judgment_vacated,_and_restored_to_the_calendar',
              'rehearing_granted,_judgment_vacated_and_case_restored_for_reargument', 'question_certified_to_the_supreme_court_of_florida', 'question_certified', 'certified_question_to_arizona_supreme_court', 'juris_postponed']
for dt_column in dt_columns:
    justia_case_details[dt_column] = pd.to_datetime(justia_case_details[dt_column])
    
justia_case_details[columns_to_reduce] = justia_case_details[columns_to_reduce].fillna('None')
def reduce_columns(data, columns):
    combined_data = []
    for column in columns:
        if data[column] == 'None':
            continue
        combined_data.append([column, data[column]])
    if combined_data == []:
        return np.nan
    else:
        return list(combined_data)

justia_case_details['misc_dt_data'] = justia_case_details.apply(lambda x: reduce_columns(x, columns_to_reduce),axis=1)    
justia_case_details = justia_case_details.drop(columns=columns_to_reduce)
justia_case_details.loc[justia_case_details['misc_dt_data'].isnull()==False]

# df = justia_case_details.copy()
# df.columns
# columns_to_reduce = []
# for column in df.columns:
#     if len(df) - df[column].isnull().sum() > 5:
#         continue
#     columns_to_reduce.append(column)

# dt_columns = ['argued', 'decided', 'reargued', 'granted', 'rehearing_granted', 'opinion_announced', 'affirmed_by_an_equally_divided_court', 'rehearing_granted,_judgment_vacated,_and_restored_to_the_calendar',
#               'rehearing_granted,_judgment_vacated_and_case_restored_for_reargument', 'question_certified_to_the_supreme_court_of_florida', 'question_certified', 'certified_question_to_arizona_supreme_court', 'juris_postponed']
# for dt_column in dt_columns:
#     df[dt_column] = pd.to_datetime(df[dt_column])
    
# df[columns_to_reduce] = df[columns_to_reduce].fillna('None')
# def reduce_columns(df, columns):
#     combined_data = []
#     for column in columns:
#         if df[column] == 'None':
#             continue
#         combined_data.append([column, df[column]])
#     if combined_data == []:
#         return np.nan
#     else:
#         return list(combined_data)

# df['misc_dt_data'] = df.apply(lambda x: reduce_columns(x, columns_to_reduce),axis=1)    
# df = df.drop(columns=columns_to_reduce)
# df.loc[df['misc_dt_data'].isnull()==False]

Unnamed: 0,case_url,first_party,second_party,official_citation_,docket_no.,nav_items,oral_argument,oral_reargument,opinion_announcement,summary_annotations,pdf_url,argued,decided,advocates,reargued,granted,misc_dt_data
21473,https://supreme.justia.com/cases/federal/us/35...,"Curtis Reid, Superintendent of the District of...",Claris Covert,354 U.S. 1,,"['Opinions', 'Audio & Media']","['Oral Argument - May 03, 1956', 'https://www....","['Oral Reargument - February 27, 1957 (Part 2)...",,['\n Justia Opinion Summary and Ann...,,1956-05-02,1957-06-09,Marvin Frankel Lee Rankin,1957-02-26,NaT,"[[rehearing_granted, 1956-11-04 00:00:00]]"
21756,https://supreme.justia.com/cases/federal/us/35...,"William G. Cooper et al., Members of the Board...","John Aaron, et al.",358 U.S. 1,,"['Opinions', 'Audio & Media']","['Oral Argument - September 11, 1958 (Part 2)'...",,"['Opinion Announcement - September 12, 1958', ...",['\n Justia Opinion Summary and Ann...,,1958-09-10,1958-09-11,,NaT,NaT,"[[opinion_announced, 1958-09-28 00:00:00]]"
21804,https://supreme.justia.com/cases/federal/us/35...,Lovander Ladner,United States,358 U.S. 169,,"['Opinions', 'Audio & Media']","['Oral Argument - November 19, 1957', 'https:/...","['Oral Reargument - October 22, 1958 (Part 2)'...",,,,1957-11-18,1958-12-14,Harold Rosenwald Leonard Sand,1958-10-21,NaT,"[[affirmed_by_an_equally_divided_court, 1958-0..."
21872,https://supreme.justia.com/cases/federal/us/35...,Alfonse Bartkus,Illinois,359 U.S. 121,,"['Opinions', 'Audio & Media']","['Oral Argument - November 19, 1957', 'https:/...","['Oral Reargument - October 22, 1958', 'https:...",,,,1957-11-18,1959-03-29,,1958-10-21,NaT,"[[rehearing_granted,_judgment_vacated_and_case..."
22072,https://supreme.justia.com/cases/federal/us/36...,Flora,United States,362 U.S. 145,,"['Opinions', 'Audio & Media']","['Oral Argument - May 20, 1958', 'https://www....","['Oral Reargument - November 12, 1959', 'https...",,,,1958-05-19,1960-03-20,,1959-11-11,NaT,"[[rehearing_granted, 1959-06-21 00:00:00]]"
23178,https://supreme.justia.com/cases/federal/us/37...,Israel Dresner et al.,City of Tallahassee,378 U.S. 539,,"['Opinions', 'Audio & Media']","['Oral Argument - October 23, 1963', 'https://...",,,,,1963-10-22,1964-06-21,,NaT,NaT,[[question_certified_to_the_supreme_court_of_f...
30245,https://supreme.justia.com/cases/federal/us/53...,William Fiore,"Gregory White, Warden, et al.",531 U.S. 225,,"['Opinions', 'Audio & Media']","['Oral Argument - October 12, 1999', 'https://...",,"['Opinion Announcement - November 30, 1999', '...",,https://supreme.justia.com/cases/federal/us/53...,1999-10-11,2001-01-08,,NaT,NaT,"[[question_certified, 1999-11-29 00:00:00]]"
30404,https://supreme.justia.com/cases/federal/us/53...,"Terry L. Stewart, Director, Arizona Department...",Robert Douglas Smith,536 U.S. 856,,['Opinions'],,,,,https://supreme.justia.com/cases/federal/us/53...,NaT,2002-06-27,,NaT,NaT,"[[certified_question_to_arizona_supreme_court,..."
31648,https://supreme.justia.com/cases/federal/us/58...,Beverly R. Gill,William Whitford,585 U.S. __ (2018),16-1161,"['Opinions', 'Briefs and Filings', 'Audio & Me...","['Oral Argument - October 03, 2017', 'https://...",,,['\n Justia Opinion Summary and Ann...,https://supreme.justia.com/cases/federal/us/58...,2017-10-03,2018-06-18,,NaT,NaT,"[[juris_postponed, 2017-06-19 00:00:00]]"


# Downloading pdfs from justia pdf links

In [20]:
def scrape_justia_pdf(data):
    url = data['pdf_url']
    page = requests.get(url, stream = True)
    
    file_name = data['official_citation_'].replace(' ', '_') + '.pdf'

    with open("out/justia_pdfs/" + file_name,"wb") as pdf:
        for chunk in page.iter_content(chunk_size=1024):
            if chunk:
                pdf.write(chunk)

In [22]:
errors = []
for row in justia_case_details.loc[justia_case_details['pdf_url'].isnull()==False].iterrows():
    try:
        scrape_justia_pdf(row[1])
    except:
        errors.append(list([row[0], row[1]['pdf_url']]))
errors

[]

In [30]:
justia_case_details.to_csv('out/justia_case_details.csv')

# Downloading pdfs from supremecourt.gov

In [40]:
page = requests.get('https://www.supremecourt.gov/opinions/boundvolumes.aspx')
soup = BeautifulSoup(page.content, 'html.parser')

volume_pdf_urls = ['https://www.supremecourt.gov/opinions/' + li.find_all('a')[0].get('href') for li in soup.find('div', {'class': 'panel-group'}).find_all('li')]

['https://www.supremecourt.gov/opinions/boundvolumes/569BV.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/568BV.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/567BV.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/566BV.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/565BV.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/564BV.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/563BV.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/562BV.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/561BV.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/560bv.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/559bv.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/558bv.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/557bv.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/556bv.pdf',
 'https://www.supremecourt.gov/opinions/boundvolumes/555bv.pdf',
 'https://www.supremecour

In [77]:
def scrape_supremecourt_volume_pdf(url, folder):
    page = requests.get(url, stream = True)
    file_name = url[-9:].lower().replace('bv', '')

    with open("out/supremecourt.gov/" + folder + file_name, "wb") as pdf:
        for chunk in page.iter_content(chunk_size=1024):
            if chunk:
                pdf.write(chunk)

scotus_volume_errors = []
for volume_pdf_url in volume_pdf_urls:
    try:
        scrape_supremecourt_volume_pdf(volume_pdf_url, "volume_pdfs/")
    except:
        scotus_volume_errors.append(volume_pdf_url)
        
scotus_volume_errors

### Opinions relating to orders

In [74]:
relating_to_orders_urls = ['https://www.supremecourt.gov/opinions/relatingtoorders/' + str(num) for num in list(range(11, 19))]

relating_to_orders = pd.DataFrame(columns=['date', 'docket', 'name', 'name_url', 'revised', 'revised_url', 'j.', 'pt.'])

def scrape_scotus_gov_orders(url, data):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    panels = soup.find_all('div', {'class', 'panel'})
    for panel in panels:
        table = panel.find_all('table')[0]

        for tr in table.find_all('tr'):
            if len(tr.find_all('th')) > 1:
                column_names = []
                for th in tr.find_all('th'):
                    th_text = th.get_text().lower()
                    if th_text not in data.columns:
                        data[th_text] = np.nan
                    column_names.append(th.get_text().lower())
            else:
                index = len(data)
                data.loc[index] = np.nan
                
                for num, td in enumerate(tr.find_all('td')):
                    column_name = column_names[num]
                    td_text = td.get_text()
                    data.loc[index, column_name] = td_text                    
                    if len(td.find_all('a')) > 0:
                        data.loc[index, column_name + '_url'] = 'https://www.supremecourt.gov' + td.find_all('a')[0].get('href')
                    
        
relating_to_orders_errors = []
for relating_to_orders_url in relating_to_orders_urls:
    url = relating_to_orders_url
    try:
        scrape_scotus_gov_orders(url, relating_to_orders)
    except:
        relating_to_orders_errors.append(url)

relating_to_orders

Unnamed: 0,date,docket,name,name_url,revised,revised_url,j.,pt.
0,6/29/12,11-1240,FCC v. CBS Corp.,https://www.supremecourt.gov/opinions/11pdf/11...,,,G,567/2
1,6/29/12,11-1240,FCC v. CBS Corp.,https://www.supremecourt.gov/opinions/11pdf/11...,,,R,567/2
2,6/25/12,11-998,Mount Soledad Memorial Assn. v. Trunk,https://www.supremecourt.gov/opinions/11pdf/11...,,,A,567/2
3,6/18/12,11-7185,Fairey v. Tucker,https://www.supremecourt.gov/opinions/11pdf/11...,,,SS,567/1
4,2/17/12,11A762,"American Tradition Partnership, Inc. v. Bullock",https://www.supremecourt.gov/opinions/11pdf/11...,,,G,565/2
5,1/09/12,10-1548,Cash v. Maxwell,https://www.supremecourt.gov/opinions/11pdf/10...,,,AS,565/1
6,1/09/12,10-1548,Cash v. Maxwell,https://www.supremecourt.gov/opinions/11pdf/10...,,,SS,565/1
7,11/21/11,11A501,Doe v. Reed,https://www.supremecourt.gov/opinions/11pdf/11...,,,A,565/1
8,11/07/11,11-6391,Buck v. Thaler,https://www.supremecourt.gov/opinions/11pdf/11...,,,SS,565/1
9,11/07/11,11-6391,Buck v. Thaler,https://www.supremecourt.gov/opinions/11pdf/11...,,,A,565/1


In [76]:
orders_urls = ['https://www.supremecourt.gov/opinions/slipopinion/' + str(num) for num in list(range(12, 19))]

orders = pd.DataFrame(columns=['r-', 'date', 'docket', 'name', 'name_url', 'revised', 'revised_url', 'j.', 'pt.'])                    
        
orders_errors = []
for orders_url in orders_urls:
    url = orders_url
    try:
        scrape_scotus_gov_orders(url, orders)
    except:
        orders_errors.append(url)

orders

Unnamed: 0,r-,date,docket,name,name_url,revised,revised_url,j.,pt.
0,79,6/26/13,12-307,United States v. Windsor,https://www.supremecourt.gov/opinions/12pdf/12...,,,K,570/2
1,78,6/26/13,12-357,Sekhar v. United States,https://www.supremecourt.gov/opinions/12pdf/12...,,,AS,570/2
2,77,6/26/13,12-144,Hollingsworth v. Perry,https://www.supremecourt.gov/opinions/12pdf/12...,,,R,570/2
3,76,6/25/13,12-399,Adoptive Couple v. Baby Girl,https://www.supremecourt.gov/opinions/12pdf/12...,,,A,570/2
4,75,6/25/13,11-1447,Koontz v. St. Johns River Water Management Dist.,https://www.supremecourt.gov/opinions/12pdf/11...,,,A,570/2
5,74,6/25/13,12-96,Shelby County v. Holder,https://www.supremecourt.gov/opinions/12pdf/12...,,,R,570/2
6,73,6/24/13,12-1084,Ryan v. Schad,https://www.supremecourt.gov/opinions/12pdf/12...,,,PC,570/2
7,72,6/24/13,12-142,Mutual Pharmaceutical Co. v. Bartlett,https://www.supremecourt.gov/opinions/12pdf/12...,,,A,570/2
8,71,6/24/13,11-556,Vance v. Ball State Univ.,https://www.supremecourt.gov/opinions/12pdf/11...,,,A,570/2
9,70,6/24/13,12-418,United States v. Kebodeaux,https://www.supremecourt.gov/opinions/12pdf/12...,,,B,570/2


In [78]:
orders.to_csv('out/supremecourt.gov/orders.csv')
relating_to_orders.to_csv('out/supremecourt.gov/relating_to_orders.csv')

In [None]:
scraping related_to_orders pdfs

In [82]:
def scrape_scotus_docs(data_row, url_type, folder):
    url = data_row[1][url_type]
    page = requests.get(url, stream = True)
    file_name = str(data_row[0]) + '_' + str(data_row[1]['docket']) + '.pdf'

    with open("out/supremecourt.gov/" + folder + file_name.replace(' ', '_'), "wb") as pdf:
        for chunk in page.iter_content(chunk_size=1024):
            if chunk:
                pdf.write(chunk)
                
relating_to_orders_pdfs = relating_to_orders.loc[(relating_to_orders['name_url'].isnull()==False) | (relating_to_orders['revised_url'].isnull()==False)]
orders_pdfs = orders.loc[(orders['name_url'].isnull()==False) | (orders['revised_url'].isnull()==False)]

relating_to_orders_pdfs_errors = []
for data_row in relating_to_orders_pdfs.iterrows():
    try:
        scrape_scotus_docs(data_row, 'name_url', 'relating_to_orders_pdfs/')
        if data_row[1]['revised_url'].isnull()==False:
#             print(data_row[1]['revised_url'])
            scrape_scotus_docs(data_row, 'revised_url', 'revised_relating_to_orders_pdfs/')
    except:
        relating_to_orders_pdfs_errors.append(data_row)

orders_pdfs_errors = []
for data_row in orders_pdfs.iterrows():
    try:
        scrape_scotus_docs(data_row, 'name_url', 'orders_pdfs/')
        if data_row[1]['revised_url'].isnull()==False:
#             print(data_row[1]['revised_url'])
            scrape_scotus_docs(data_row, 'revised_url', 'revised_orders_pdfs/')
    except:
        orders_pdfs_errors.append(data_row)


In [84]:
relating_to_orders_pdfs_errors

[(0, date                                                     6/29/12
  docket                                                   11-1240
  name                                            FCC v. CBS Corp.
  name_url       https://www.supremecourt.gov/opinions/11pdf/11...
  revised                                                      NaN
  revised_url                                                  NaN
  j.                                                             G
  pt.                                                        567/2
  Name: 0, dtype: object),
 (1, date                                                     6/29/12
  docket                                                   11-1240
  name                                            FCC v. CBS Corp.
  name_url       https://www.supremecourt.gov/opinions/11pdf/11...
  revised                                                      NaN
  revised_url                                                  NaN
  j.                         