In [1]:
import numpy as np
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import datetime

## Scraping set of wikipedia case urls and basic case info

#### Two functions, which scrape basic case data from wikipedia index of cases, either in list (li) or table (tr) form

In [3]:
def liCaseInfo(ul, case_list, volume):
    li_list = ul.findAll('li')
    for li in li_list:
        case_name = wiki_url = justia_link = holding = np.nan
        if len(li.select('i')) > 0:
            case_name = li.select('i')[0].get_text().replace('\n', '')
        if len(li.select('a')) > 0:
            url_ref = str(li.select('a')[0].get('href'))
            if 'w/index' not in url_ref:
                if url_ref != None:
                    wiki_url = 'https://www.wikipedia.org' + url_ref

        if len(li.select('b')) > 0:
            b_field = li.select('b')[0]
            if len(b_field.findAll('a')) > 2:
                justia_link = b_field.findAll('a')[2].get('href')
        holding = li.get_text()
        case_list.loc[len(case_list.index)] = [case_name, wiki_url, volume, justia_link, holding]

def trCaseInfo(table, case_list, volume):
    tr_list = table.findAll('tr', {'class': 'vevent'})
    for tr in tr_list:
        case_name = wiki_url = justia_link = holding = np.nan
        case_title = tr.findAll('td', {'class': 'summary'})[0] 
        case_name = case_title.get_text().replace('\n', '')
        
        url_ref = str(case_title.select('a')[0].get('href'))
        if 'w/index' not in url_ref:
            if url_ref != None:
                wiki_url = 'https://www.wikipedia.org' + url_ref
        justia_link = case_title.findNext('td').findAll('a')[0].get('href').replace('\n', '')
        case_list.loc[len(case_list.index)] = [case_name, wiki_url, volume, justia_link, holding]

#### Using those two functions to scrape basic case data and urls from wiki index pages:

In [6]:
scotus_volume_urls = ['https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_' + str(num) for num in range(2, 587)]     
all_scotus_cases = pd.DataFrame(columns=['case_name', 'wiki_link', 'volume', 'justia_link', 'holding'])
volumes_to_fix = []
error_volumes = []

for url in scotus_volume_urls:
    volume = url.split('volume_')[1]
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    set_of_case_tables = soup.findAll('table', {'class': 'wikitable'})
    set_of_case_lists = soup.findAll('ul')
    
    if volume == '2':
        heading_child = soup.findAll('h2')[6]
        li_list = heading_child.findNext('ul')
        liCaseInfo(li_list, all_scotus_cases, volume)
    elif len(set_of_case_tables) > 1:
        table = soup.find('span', {'id': 'Supreme_Court_of_the_United_States'}).parent.find_next_sibling()
        trCaseInfo(table, all_scotus_cases, volume)
    elif len(set_of_case_tables) == 1:
        table = set_of_case_tables[0]
        trCaseInfo(table, all_scotus_cases, volume)
    elif len(set_of_case_lists) > 0:
        li_list = set_of_case_lists[0]
        liCaseInfo(li_list, all_scotus_cases, volume)
    else:
        volumes_to_fix.append(volume)
        continue

# all_scotus_cases.index = range(0, len(all_scotus_cases.index))
# all_scotus_cases = all_scotus_cases.join(wiki_case_list[['topic', 'sub_topic']], on='case_name')
# all_scotus_cases['wiki_link'].loc[(all_scotus_cases['wiki_link'].str.contains('.orgNone') == True)] = np.nan
# all_scotus_cases['justia_link'].loc[(all_scotus_cases['justia_link'].str.contains('/wiki/') == True)] == np.nan
# all_scotus_cases.to_csv('Desktop/all_scotus_cases.csv')
all_scotus_cases

Unnamed: 0,case_name,wiki_link,volume,justia_link,holding
0,"Appointment Of Justices,",https://www.wikipedia.orgNone,2,,"Appointment Of Justices, 2 U.S. (2 Dall.) 399 ..."
1,"Qualification Of Counsellors And Attorneys,",https://www.wikipedia.orgNone,2,,"Qualification Of Counsellors And Attorneys, 2 ..."
2,"West v. Barnes,",https://www.wikipedia.org/wiki/West_v._Barnes,2,,"West v. Barnes, 2 U.S. (2 Dall.) 401 (1791)"
3,"Oswald v. New York,",https://www.wikipedia.org/wiki/Oswald_v._New_York,2,,"Oswald v. New York, 2 U.S. (2 Dall.) 401 (1791)"
4,"Georgia v. Brailsford,",https://www.wikipedia.org/wiki/Georgia_v._Brai...,2,,"Georgia v. Brailsford, 2 U.S. (2 Dall.) 402 (1..."
5,Hayburn's Case,https://www.wikipedia.org/wiki/Hayburn%27s_Case,2,,"Hayburn's Case, 2 U.S. (2 Dall.) 409 (1792)"
6,"Georgia v. Brailsford,",https://www.wikipedia.org/wiki/Georgia_v._Brai...,2,,"Georgia v. Brailsford, 2 U.S. (2 Dall.) 415 (1..."
7,"Chisholm v. Georgia,",https://www.wikipedia.org/wiki/Chisholm_v._Geo...,2,,"Chisholm v. Georgia, 2 U.S. (2 Dall.) 419 (1793)"
8,"Appointment Of Paterson,",https://www.wikipedia.orgNone,2,,"Appointment Of Paterson, 2 U.S. (2 Dall.) 479 ..."
9,Georgia v. Brailsford,https://www.wikipedia.org/wiki/Georgia_v._Brai...,3,http://openjurist.org/3/us/1/,


In [None]:
# all_scotus_cases = pd.DataFrame(columns=['case_name', 'wiki_link', 'volume', 'justia_link', 'holding'])
# page = requests.get(scotus_volume_urls[200])
# soup = BeautifulSoup(page.content, 'html.parser')
# set_of_case_tables = soup.findAll('table', {'class': 'wikitable'})
# set_of_case_lists = soup.findAll('ul')
# intro_sentence = soup.find_all(text=re.compile('This is a list'))[0]

# u_list = intro_sentence.find_next('ul')
# liCaseInfo(u_list, all_scotus_cases, volume)
    
# all_scotus_cases

### Scraping wiki's list of "landmark" cases for "topic" and "sub_topic" information and basic "holding" data

In [14]:
columns = ['case_name', 'topic', 'sub_topic', 'wiki_url', 'justia_link', 'holding']
wiki_case_list = pd.DataFrame(columns=columns)

page = requests.get('https://en.wikipedia.org/wiki/List_of_landmark_court_decisions_in_the_United_States')
soup = BeautifulSoup(page.content, 'html.parser')

list_div = soup.findAll('div', {'class': 'mw-parser-output'})[0]
list_children = list_div.findChildren(recursive=False)

topic = ''
sub_topic = ''

for child in list_children:
    if child.name == 'h2':
        topic = child.get_text().split('[edit]')[0]
        if topic == 'See also':
            break
    if child.name == 'h3':
        sub_topic = child.get_text().split('[edit]')[0]
    if (topic == '') | (sub_topic == ''):
        continue
    if child.name == 'ul':
        li_list = child.findAll('li')
        for li in li_list:
            index = len(wiki_case_list)
            wiki_case_list.loc[index] = np.nan 
            wiki_case_list.loc[index]['topic'] = topic
            wiki_case_list.loc[index]['sub_topic'] = sub_topic
            if len(li.select('i')) > 0:
                wiki_case_list.loc[index]['case_name'] = li.select('i')[0].get_text()
            if len(li.select('a')) > 0:
                url_ref = li.select('a')[0].get('href')
                wiki_case_list.loc[index]['wiki_url'] = 'https://www.wikipedia.org' + url_ref
            if len(li.select('b')) > 0:
                b_field = li.select('b')[0]
                if len(b_field.findAll('a')) > 2:
                    wiki_case_list.loc[index]['justia_link'] = b_field.findAll('a')[2].get('href')
            wiki_case_list.loc[index]['holding'] = li.get_text()
            if 'w/index' in url_ref:
                wiki_case_list.loc[index]['wiki_url'] = np.nan

wiki_case_list

Unnamed: 0,case_name,topic,sub_topic,wiki_url,justia_link,holding
0,Dred Scott v. Sandford,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Dred_Scott_v._S...,https://supreme.justia.com/cases/federal/us/60...,"Dred Scott v. Sandford, 60 U.S. 393 (1857) Peo..."
1,Strauder v. West Virginia,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Strauder_v._Wes...,https://supreme.justia.com/cases/federal/us/10...,"Strauder v. West Virginia, 100 U.S. 303 (1880)..."
2,Civil Rights Cases,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Civil_Rights_Cases,https://supreme.justia.com/cases/federal/us/10...,"Civil Rights Cases, 109 U.S. 3 (1883) Neither ..."
3,Plessy v. Ferguson,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Plessy_v._Ferguson,https://supreme.justia.com/cases/federal/us/16...,"Plessy v. Ferguson, 163 U.S. 537 (1896) Segreg..."
4,New Negro Alliance v. Sanitary Grocery Co.,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/New_Negro_Allia...,https://supreme.justia.com/cases/federal/us/30...,"New Negro Alliance v. Sanitary Grocery Co., 30..."
5,Smith v. Allwright,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Smith_v._Allwright,https://supreme.justia.com/cases/federal/us/32...,"Smith v. Allwright, 321 U.S. 649 (1944) Primar..."
6,Korematsu v. United States,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Korematsu_v._Un...,https://supreme.justia.com/cases/federal/us/32...,"Korematsu v. United States, 323 U.S. 214 (1944..."
7,Morgan v. Virginia,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Irene_Morgan#U....,https://supreme.justia.com/cases/federal/us/32...,"Morgan v. Virginia, 328 U.S. 373 (1946) A Virg..."
8,Shelley v. Kraemer,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Shelley_v._Kraemer,https://supreme.justia.com/cases/federal/us/33...,"Shelley v. Kraemer, 334 U.S. 1 (1948) Courts m..."
9,Henderson v. United States,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Henderson_v._Un...,https://supreme.justia.com/cases/federal/us/33...,"Henderson v. United States, 339 U.S. 816 (1950..."


In [None]:
wiki_case_list.to_csv('Desktop/wiki_case_list.csv')

In [None]:
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 
    'Delaware', 'Florida', 'Georgia', 'Hawaii', "Hawai'i", 'Idaho', 'Illinois', 'Indiana', 
    'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 
    'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 
    'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 
    'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 
    'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', "Washington, D.C.", 'DC', 'D.C.', 
    'Washington DC', 'Washington D.C.', 'West Virginia', 'Wisconsin', 'Wyoming']

In [148]:
scotus_volume_urls = ['https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_' + str(num) for num in range(2, 587)]

volume_urls = ['https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_2',
              'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_21',
              'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_568']
failed_urls = []

columns = ['case_name', 'case_url', 'dt_decided', 'v.', 'state_v_state', 'petitioner/appellant', 'respondent/appellee', 
           'ex_rel', 'ex_parte', 'citation_text', 'citation_url' 'per_curiam']
cases = pd.DataFrame(columns=columns)

for volume_url in scotus_volume_urls[0:20]:
# for volume_url in volume_urls:
    page = requests.get(volume_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    page_body = soup.select('div.mw-parser-output')[0]
    if page_body.contents[0].name == 'p':
        element_after_scotus_header = page_body.find_all('p')[0].next_sibling.next_sibling
#         print(element_after_scotus_header)
    if element_after_scotus_header.name == 'dl':
        element_after_scotus_header = element_after_scotus_header.next_sibling.next_sibling
    if page_body.contents[0].name == 'ul':
        ul = page_body.contents[0]
        scrape_list(ul, cases)
        continue

    scotus_headers = page_body.select('span#Supreme_Court_of_the_United_States')
    if len(scotus_headers) > 0:
        page_scotus_header = scotus_headers[0].parent
        element_after_scotus_header = page_scotus_header.next_sibling.next_sibling

    if element_after_scotus_header.name == 'table':
        table = element_after_scotus_header
        scrape_table(table, cases)
    elif element_after_scotus_header.name == 'ul':
        ul = element_after_scotus_header
        scrape_list(ul, cases)
    else:
        failed_urls.append(volume_url)
        
cases

Unnamed: 0,case_name,case_url,dt_decided,v.,state_v_state,petitioner/appellant,respondent/appellee,ex_rel,ex_parte,citation_text,citation_urlper_curiam
0,,,,,,,,,,,
1,,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,,,,,,,,,,,
5,,,,,,,,,,,
6,Georgia v. Brailsford,www.wikipedia.org/wiki/Georgia_v._Brailsford_(...,,,,,,,,3 U.S. 1,
7,Glass v. The Sloop Betsey,www.wikipedia.org/wiki/Glass_v._The_Sloop_Betsey,,,,,,,,3 U.S. 6,
8,United States v. Hamilton,www.wikipedia.org/wiki/United_States_v._Hamilton,,,,,,,,3 U.S. 17,
9,Bingham v. Cabot (1795),www.wikipedia.org/wiki/Bingham_v._Cabot_(1795),,,,,,,,3 U.S. 19,


In [115]:
failed_urls

['https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_569',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_570',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_571',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_572',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_573',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_574',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_575',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_576',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_577',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_578',
 'https://en.wikipedia.org/wiki/List_of_United_States_Supreme_Court_cases,_volume_579',
 'https://en.wikipedia.org/wiki/

In [118]:
cases.loc[cases['per_curiam_url']==True]

Unnamed: 0,case_name,case_url,year,v.,state_v_state,petitioner/appellant,respondent/appellee,ex_rel,ex_parte,citation_text,per_curiam,per_curiam_url


In [109]:
def scrape_list(ul, data):
    li_list = ul.find_all('li')
    for li in li_list:
        case_title = li.contents[0]
        if case_title.contents[0].name != 'a':
            continue
        else:
            li_string = li.get_text()
            
            index = len(data)
            data.loc[index] = np.nan
            data_row = data.loc[index]
            
            if 'per curiam' in li_string:
                data_row['per_curiam'] = True
                li_string = li_string.split(' (per curiam)')[0]
            
            data_row['dt_decided'] = li_string[len(li_string)-5 : len(li_string)-1]
            li_string = li_string[0: len(li_string)-6]
            
            data_row['citation_text'] = li_string.split(', ')[len(li_string.split(', '))-1]
            data_row['citation_url'] = li.find_all('a')[len(li.find_all('a'))-1]
            li_string = li_string.split(', ' + data_row['citation_text'])[0]
            
            data_row['case_name'] = li_string
            if 'ex rel.' in data_row['case_name']:
                data_row['ex_rel'] = True 
            elif 'Ex parte' in data_row['case_name']:
                data_row['ex_parte'] = True 
            elif ' v. ' in data_row['case_name']:
                data_row['v.'] = True 
                data_row['petitioner/appellant'] = data_row['case_name'].split(' v. ')[0]
                data_row['respondent/appellee'] = data_row['case_name'].split(' v. ')[1]
                if (data_row['petitioner/appellant'] in states) and (data_row['respondent/appellee']):
                    data_row['state_v_state'] = True
                        
            if 'redlink' not in case_title.contents[0].get('href'):
                data_row['case_url'] = 'www.wikipedia.org' + case_title.contents[0].get('href')


In [147]:
def scrape_table(table, data):
    tr_list = table.select('tr.vevent')
    for tr in tr_list:
        index = len(data)
        data.loc[index] = np.nan
        data_row = data.loc[index]
        
        case_title = tr.select('td')[0]
        data_row['case_name'] = case_title.find_all('a')[0].get_text()
        
        if 'redlink' not in case_title.find_all('a')[0].get('href'):
            data_row['case_url'] = 'www.wikipedia.org' + case_title.find_all('a')[0].get('href')
        
        case_citation = tr.select('td')[1]
        data_row['citation_text'] = case_citation.find_all('a')[0].get_text()
        if len(case_citation.find_all('a')) > 0:
            data_row['citation_url'] = case_citation.find_all('a')[0].get('href')
        
        case_date = tr.select('td')[2]
        data_row['dt_decided'] = case_date.get_text()
#         data_row['']
        
        if 'ex rel.' in data_row['case_name']:
            data_row['ex_rel'] = True 
        elif 'Ex parte' in data_row['case_name']:
            data_row['ex_parte'] = True 
        elif ' v. ' in str(data_row['case_name']):
            data_row['v.'] = True 
            data_row['petitioner/appellant'] = data_row['case_name'].split(' v. ')[0]
            data_row['respondent/appellee'] = data_row['case_name'].split(' v. ')[1]
                                
        if 'term_per_curiam_opinions' in str(data_row['case_url']):
            data_row['per_curiam_url'] = True

In [None]:
def info_from_list(volume_url):
    
    page = requests.get(volume_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    page_body = soup.select('div.mw-parser-output')[0]
    
    page_scotus_header = page_body.select('span', id='Supreme_Court_of_the_United_States')
    
    page_uls = page_body.find_all('ul')
    
    
    
    case_list = page_uls[0]
    external_links = page_uls[1]
    
    

In [None]:
def get_case_info_from_wiki_page(url, data, index):
    index = len(data)
#     print(url)
    
    case = data.loc[index]
    
    for column in data.columns:
        data.loc[index] = np.nan
    
    data.loc[index]['wiki_url'] = url

    strings_not_caught = []
    
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', {'class': 'infobox scotus'})
    if len(tables) == 0:
#         exceptions.loc[len(exceptions)] = np.nan
#         exceptions.loc[len(exceptions)]['url'] = url
#         exceptions.loc[len(exceptions)]['exception'] = 'no scotus tables'
#         exceptions.append([url, 'no scotus tables'])
#         raise Exception('no scotus tables')
        data.loc[index]['errors'] == 'no scotus tables'
        return
    table = tables[0]
    tr_list = table.find_all('tr')
    strings_to_ignore = ['\n', '\xa0', '·', ' ']
    tr_string_list = []
    for tr in tr_list:
        string_list = []
        for string in tr.strings:
            if string not in strings_to_ignore:
                string_list.append(string)
        tr_string_list.append(string_list)
    
    plu_num = con_num = dis_num = con_dis_num = ser_num = 1
    data.loc[index]['per_curiam'] = False
    data.loc[index]['case_name'] = tr_string_list[0][0]
    if len(soup.findAll('p')) > 0:
        data.loc[index]['first_paragraph'] = soup.findAll('p')[0].get_text()
    
    previous_heading = np.nan
    for string_list in tr_string_list[2:]:
        if len(string_list) > 0:
            if ('Argued' in string_list[0]) | ('Decided' in string_list[0]):
                for string in string_list:
                    if 'Argued' in string:
                        data.loc[index]['dt_argued'] = string.split('Argued ')[1]
                        continue
                    elif 'Reargued' in string:
                        data.loc[index]['dt_reargued'] = string.split('Reargued ')[1]
                        continue
                    elif 'Decided' in string:
                        data.loc[index]['dt_decided'] = string.split('Decided ')[1]
                        continue
            elif string_list[0] == 'Full case name':
                data.loc[index]['full_case_name'] = string_list[1]
                continue
            elif string_list[0] == 'Citations':
                data.loc[index]['citations'] = string_list[1:]
                continue
            elif (string_list[0] == 'Prior history') | (string_list[0] == 'Procedural history'):
                data.loc[index]['prior_history'] = string_list[1:]
                continue
            elif string_list[0] == 'Subsequent history':
                data.loc[index]['subsequent_history'] = string_list[1:]
                continue
            elif string_list[0] == 'Claim':
                data.loc[index]['claim'] = string_list[1:]
                continue
            elif (string_list[0] == 'Holding') | (string_list[0] == 'Outcome'):
                previous_heading = 'holding'
                continue
            elif previous_heading == 'holding':
                data.loc[index]['holding'] = string_list[0:]
                previous_heading = np.nan
                continue
            elif string_list[0] == 'Questions presented':
                previous_heading = 'questions'
                continue
            elif previous_heading == 'questions':
                data.loc[index]['questions'] = string_list[0:]
                previous_heading = np.nan
                continue
            elif string_list[0] == 'Court membership':
                previous_heading = 'court membership'
                continue
            elif previous_heading == 'court membership':
                data.loc[index]['c_justice'] = string_list[1]
                data.loc[index]['a_justices'] = string_list[3:]
                previous_heading = np.nan
                continue
            elif string_list[0] == 'Case opinions':
                previous_heading = 'case opinions'                
                continue
            elif previous_heading == 'case opinions':
                opinions_list = string_list[0]
                author = opinions_list
                justices = []
                if len(string_list) > 1:
                    justices = string_list[1].split(', joined by ')
                if string_list[0] == 'Per curiam.':
                    data.loc[index]['per_curiam'] = True
                    continue
                elif string_list[0] == 'Majority':
                    data.loc[index]['majority'] = justices
                    continue
                elif string_list[0] == 'Plurality':
                    data.loc[index]['plurality_' + str(plu_num)] = justices
                    plu_num = plu_num + 1
                    continue
                elif string_list[0] == 'Concurrence':
                    data.loc[index]['concur_' + str(con_num)] = justices
                    con_num = con_num + 1
                    continue
                elif string_list[0] == 'Dissent':
                    data.loc[index]['dissent_' + str(dis_num)] = justices
                    dis_num = dis_num + 1
                    continue
                elif string_list[0] == 'Concur/Dissent':
                    data.loc[index]['concur/dissent_' + str(con_dis_num)] = justices
                    con_dis_num = con_dis_num + 1
                    continue
                elif string_list[0] == 'Seriatim opinion':
                    data.loc[index]['seriatim_' + str(ser_num)] = justices
                    ser_num = ser_num + 1
                    continue
                elif ' took no part' in string_list[0]:
                    data.loc[index]['no_part'] = string_list[0].split(' took no part')[0]
                    continue
                continue
                
            elif string_list[0] == 'Laws applied':
                previous_heading = 'laws_applied'
                continue
            elif previous_heading == 'laws_applied':
                data.loc[index]['laws_applied'] = string_list[0]
                previous_heading = np.nan
            elif string_list[0] == 'Docket nos.':
                data.loc[index]['docket_nos'] = string_list[1:]
            elif 'overturned a previous' in string_list[0]:
                previous_heading = 'overturned'
            elif previous_heading == 'overturned':
                data.loc[index]['overturned'] = string_list[0]
                previous_heading = np.nan
            elif 'Superseded' in string_list[0]:
                data.loc[index]['superseded'] = string_list[0]
                previous_heading = np.nan
                
            elif string_list[0] == 'Majority':
                data.loc[index]['majority'] = string_list[1].split(', joined by')
                continue
            elif string_list[0] == 'Plurality':
                data.loc[index]['plurality_' + str(plu_num)] = string_list[1].split(', joined by')
                plu_num = plu_num + 1
                continue
            elif string_list[0] == 'Concurrence':
                data.loc[index]['concurrence_' + str(con_num)] = string_list[1].split(', joined by')
                con_num = con_num + 1
                continue
            elif string_list[0] == 'Dissent':
                data.loc[index]['dissent_' + str(dis_num)] = string_list[1].split(', joined by')
                dis_num = dis_num + 1
                continue
            elif string_list[0] == 'Concur/dissent':
                data.loc[index]['concur/dissent_' + str(con_dis_num)] = string_list[1].split(', joined by')
                con_dis_num = con_dis_num + 1
                continue
            elif string_list[0] == 'Seriatim opinion':
                data.loc[index]['seriatim_' + str(ser_num)] = string_list[1].split(', joined by ')
                ser_num = ser_num + 1
                continue
            elif ' took no part' in string_list[0]:
                data.loc[index]['no_part'] = string_list[0].split(' took no part')[0]
                continue
            elif string_list[0] == 'Per curiam':
                data.loc[index]['per_curiam'] = True
                continue
                
            else:
                strings_not_caught.append(string_list[0])

    if strings_not_caught != []:
        data.loc[index]['strings_not_caught'] = strings_not_caught

In [None]:
columns = ['case_name', 'wiki_url', 'full_case_name', 'dt_argued', 'dt_reargued', 'dt_decided', 'full_case_name',
        'citations', 'prior_history', 'subsequent_history', 'questions', 'claim', 'holding', 'c_justice', 'a_justices', 
        'majority', 'plurality_1', 'plurality_2', 'plurality_3', 'plurality_4', 'plurality_5',
        'dissent_1', 'dissent_2', 'dissent_3', 'dissent_4', 'dissent_5', 'dissent_6',
        'concur_1', 'concur_2', 'concur_3', 'concur_4', 'concur_5', 'concur_6', 
        'concur/dissent_1', 'concur/dissent_2', 'concur/dissent_3', 'concur/dissent_4',
        'seriatim_1', 'seriatim_2', 'seriatim_3', 'seriatim_4', 'seriatim_5', 'seriatim_6', 
        'no_part', 'per_curiam', 'laws_applied', 'docket_nos', 
        'overturned', 'superseded', 'first_paragraph', 'strings_not_caught', 'errors']

# data = pd.DataFrame(columns=columns)
wiki_url_list = all_scotus_cases['wiki_link'].loc[all_scotus_cases['wiki_link'].str.contains('wikipedia') == True]
# wiki_url_list = all_scotus_cases['wiki_link'].loc[all_scotus_cases['wiki_link'].str.contains('wikipedia') == True][0:2000]
# wiki_url_list = all_scotus_cases['wiki_link'].loc[all_scotus_cases['wiki_link'].str.contains('wikipedia') == True][0:200]

data3 = pd.DataFrame(columns=columns)
exceptions = pd.DataFrame(columns=['url', 'exception'])
start_time = datetime.datetime.now()
for wiki_url in wiki_url_list:
#     get_case_info_from_wiki_page3(wiki_url, data3)
    try:
        get_case_info_from_wiki_page3(wiki_url, data3)
    except Exception:
        display(Exception)

end_time = datetime.datetime.now()
time_length = end_time - start_time
print(time_length)
display(exceptions)
data3

In [None]:
# case_names = ['Abbott Laboratories v. Gardner',
#                 'Abbott v. Burke',
#                 'Adamson v. California',
#                 'Adkins v. Childrens Hospital',
#                 'American Insurance Association v. Garamendi',
#                 'American Textile Manufacturers Institute v. Donovan',
#                 'Baker v. Carr',
#                 'Baker v. Carr',
#                 'Barron v. Baltimore',
#                 'Bi-Metallic Investment Co. v. State Board of Equalization of Colorado',
#                 'Board of Curators of the University of Missouri v. Horowitz',
#                 'Board of Trustees of University of Alabama v. Garrett',
#                 'Boumediene v. Bush',
#                 'Bowsher v. Synar',
#                 'Brandeis: Ashwander',
#                 'Brown v. Board of Education',
#                 'Chamber of Commerce v. Whiting',
#                 'Champion v. Ames',
#                 'Charles River Bridge v. Warren Bridge',
#                 'Chevron v. Natural Resource Defense Council, Inc.',
#                 'Chisolm v. Georgia',
#                 'Christopher v. SmithKline Beecham Corp.',
#                 'Citizens to Preserve Overton Park v. Volpe',
#                 'Clapper v. Amnesty International USA',
#                 'Clinton v. City of New York',
#                 'Clinton v. Jones',
#                 'Cohens v. Virginia',
#                 'Colegrove v. Green',
#                 'Colegrove v. Green',
#                 'Cooley v. Board of Wardens',
#                 'Coyle v. Smith',
#                 'Dartmouth College v. Woodward',
#                 'DeFunis v. Odegaard',
#                 'Department of Air Force v. Rose',
#                 'District of Columbia v. Heller',
#                 'Dolan v. City of Tigard',
#                 'Dolan v. City of Tigard',
#                 'Dow Chemical Co. v. U.S.',
#                 'Dred Scott v. Sandford',
#                 'Enterprise Fund v. Public Company Accounting Oversight Board',
#                 'Ex Parte McCardle',
#                 'Ex Parte Milligan',
#                 'FDA v. Brown & Williamson',
#                 'Ferguson v. Skrupa',
#                 'Field v. Clark',
#                 'Flast v. Cohen',
#                 'Garcia v. San Antonio Metro',
#                 'Gibbons v. Ogden',
#                 'Gitlow v. New York',
#                 'Goldberg v. Kelly',
#                 'Gomillion v. Lightfoot',
#                 'Gonzales v. Raich',
#                 'Goss v. Lopez',
#                 'Griswold v. Connecticut',
#                 'Haig v. Agee',
#                 'Hamdan v. Rumsfeld',
#                 'Hamdi v. Rumsfeld',
#                 'Hammer v. Dagenhart',
#                 'Hampton & Co. v. U.S.',
#                 'Hawaii Housing Authority v. Midkiff',
#                 'Heart of Atlanta Motel v. U.S.',
#                 'Heckler v. Chaney',
#                 'Hutchinson v. Proxmire',
#                 'INS v. Chadha',
#                 'INS v. Chadha',
#                 'INS v. Lopez-Mendoza',
#                 'Kelo v. City of New London',
#                 'Kelo v. City of New London',
#                 'Korematsu v. U.S.',
#                 'Korematsu v. U.S.',
#                 'Lochner v. New York',
#                 'Lochner v. New York',
#                 'Lujan v. Defenders of Wildlife',
#                 'Luther v. Borden',
#                 'Marathon Oil v. Environmental Protection Agency',
#                 'Marbury v. Madison',
#                 'Marshall v. Barlows Inc.',
#                 'Martin v. Hunters Lessee',
#                 'Massachusetts v. Mellon',
#                 'Frothingham v. Mellon',
#                 'Mathew v. Eldridge',
#                 'McCulloch v. Maryland',
#                 'Milner v. Department of the Navy',
#                 'Mistretta v. U.S.',
#                 'Morgan v. U.S.',
#                 'Motor Vehicle Manufacturers Association v. State Farm',
#                 'Munn v. Illinois',
#                 'Munn v. Illinois',
#                 'NAACP v. Federal Power Commission',
#                 'NRDC v. Vermont Yankee Nuclear Power Corp.',
#                 'National Labor Relations Board v. Jones & Laughlin',
#                 'National Labor Relations Board v. Jones & Laughlin Steel Corporation',
#                 'National League of Cities v. Usery',
#                 'Nebbia v. New York',
#                 'New York Times v. U.S.',
#                 'Nollan v. California Coastal Commission',
#                 'Nollan v. California Coastal Commission',
#                 'Palko v. Connecticut',
#                 'Pennsylvania v. Nelson',
#                 'Plessy v. Ferguson',
#                 'Plyer v. Doe',
#                 'Powell v. McCormack',
#                 'Printz v. U.S.',
#                 'Rachel v. Walker',
#                 'Reno v. Condon',
#                 'Rose v. Council for Better Education',
#                 'San Antonio School District v. Rodriguez',
#                 'San Antonio School District v. Rodriguez',
#                 'Schechter Poultry Corp. v. U.S.',
#                 'Scott v. Emerson',
#                 'Shaw v. Reno',
#                 'Shelby County v. Holder',
#                 'Slaughterhouse Cases',
#                 'South Central Timber Development Inc. v. Wunnicke',
#                 'St. Josephs Abbey v. State of Louisiana Board of Embalmers and Funeral Directors',
#                 'Stop the Beach Renourishment v. Florida Department of Environmental Protection',
#                 'Strader v. Graham',
#                 'Texas v. White',
#                 'The Prize Cases',
#                 'U.S. v. Butler',
#                 'U.S. v. California',
#                 'U.S. v. Carolene Products',
#                 'U.S. v. Comstock',
#                 'U.S. v. Curtiss-Wright',
#                 'U.S. v. Darby',
#                 'U.S. v. E.C. Knight',
#                 'U.S. v. Grimaud',
#                 'U.S. v. Lopez',
#                 'U.S. v. Morrison',
#                 'U.S. v. Nixon',
#                 'U.S. v. U.S. District Court',
#                 'Valley Forge Christian College v. Americans United',
#                 'Vance v. Ball State University',
#                 'Ventura v. Shalala',
#                 'Vermont Yankee Nuclear Power Corp. v. NRDC',
#                 'Walters v. National Association of Radiation Survivors',
#                 'Watkins v. U.S. / Barenblatt v. U.S.',
#                 'West Coast Hotel v. Parrish',
#                 'Wickard v. Filburn',
#                 'Wyman v. James',
#                 'Youngstown v. Sawyer']


# def makeBingUrls(addedString):
#     case_urls = []
#     for case_name in case_names:
#         word = case_name + ' ' + addedString
#         words = word.split(' ')
#         search_query = reduce((lambda x, y: x + '+' + y), words)
#         url = 'https://www.bing.com/search?q=' + search_query
#         case_urls.append([case_name, url])
#     return case_urls

# wiki_urls = makeBingUrls('wikipedia')
# wiki_urls

In [None]:
# def get_case_info_from_wiki_page(url, data):
    
#     case_name = full_case_name = dt_argued = dt_reargued = dt_decided = full_case_name = np.nan
#     citations = prior_history = holding = c_justice = a_justices = majority = pluralities = np.nan
#     concurrences = dissents = concur_dissents = seriatim = no_part = per_curiam = laws_applied = np.nan
#     docket_nos = overturned = superseded = np.nan
    
#     strings_not_caught = []
    
#     page = requests.get(url)
#     soup = BeautifulSoup(page.content, 'html.parser')
#     table = soup.find_all('table', {'class': 'infobox scotus'})[0]
#     tr_list = table.find_all('tr')
#     strings_to_ignore = ['\n', '\xa0', '·', ' ']
#     tr_string_list = []
#     for tr in tr_list:
#         string_list = []
#         for string in tr.strings:
#             if string not in strings_to_ignore:
#                 string_list.append(string)
#         tr_string_list.append(string_list)
    
#     case_name = tr_string_list[0][0]
    
#     previous_heading = np.nan
#     for string_list in tr_string_list[2:]:
# #         print(string_list)
#         if len(string_list) > 0:
#             if ('Argued' in string_list[0]) | ('Decided' in string_list[0]):
#                 for string in string_list:
#                     if 'Argued' in string:
#                         dt_argued = string.split('Argued ')[1]
#                         continue
#                     elif 'Reargued' in string:
#                         dt_reargued = string.split('Reargued ')[1]
#                         continue
#                     elif 'Decided' in string:
#                         dt_decided = string.split('Decided ')[1]
#                         continue
#             elif string_list[0] == 'Full case name':
#                 full_case_name = string_list[1]
#                 continue
#             elif string_list[0] == 'Citations':
#                 citations = string_list[1:]
#                 continue
#             elif string_list[0] == 'Prior history':
#                 prior_history = string_list[1:]
#                 continue
#             elif string_list[0] == 'Holding':
#                 previous_heading = 'holding'
#                 continue
#             elif previous_heading == 'holding':
#                 holding = string_list[0:]
#                 previous_heading = np.nan
#                 continue
#             elif string_list[0] == 'Court membership':
#                 previous_heading = 'court membership'
#                 continue
#             elif previous_heading == 'court membership':
#                 c_justice = string_list[1]
#                 a_justices = string_list[3:]
#                 previous_heading = np.nan
#                 continue
#             elif string_list[0] == 'Case opinions':
#                 previous_heading = 'case opinions'

#                 majority = []
#                 pluralities = [] 
#                 concurrences = []
#                 dissents = []
#                 concur_dissents = []
#                 seriatim = []
#                 no_part = []

#                 per_curiam = False
#                 continue
#             elif previous_heading == 'case opinions':
#                 opinions_list = string_list[0]
#                 author = opinions_list
#     #             joined = []
#                 joined = []
#                 if len(string_list) > 1:
#                     joined = string_list[1].split(', joined by ')

#     #             author = string_list[1].split(', joined by')[0]
#     #             joined = string_list[1].split(', joined by')[1].split(', ')

#                 if string_list[0] == 'Per curiam.':
#                     per_curiam = True
#                     continue
#                 elif string_list[0] == 'Majority':
#                     majority.append([author, joined])
#                     continue
#                 elif string_list[0] == 'Plurality':
#                     pluralities.append([author, joined])
#                     continue
#                 elif string_list[0] == 'Concurrence':
#                     concurrences.append([author, joined])
#                     continue
#                 elif string_list[0] == 'Dissent':
#                     dissents.append([author, joined])
#                     continue
#                 elif string_list[0] == 'Concur/Dissent':
#                     concur_dissents.append([author, joined])
#                     continue
#                 elif string_list[0] == 'Seriatim opinion':
#                     seriatim.append([author, joined])
#                     continue
#                 elif ' took no part' in string_list[0]:
#                     no_part.append(string_list[0].split(' took no part')[0])
#                     continue
#                 continue
#             elif string_list[0] == 'Laws applied':
#                 previous_heading = 'laws_applied'
#                 continue
#             elif previous_heading == 'laws_applied':
#                 laws_applied = string_list[0]
#     #             laws_applied = sum(string_list[0])
#                 previous_heading = np.nan
#             elif string_list[0] == 'Docket nos.':
#                 docket_nos = string_list[1:]
#             elif 'overturned a previous' in string_list[0]:
#                 previous_heading = 'overturned'
#             elif previous_heading == 'overturned':
#                 overturned = string_list[0]
#                 previous_heading = np.nan
#             elif 'Superseded' in string_list[0]:
#                 superseded = string_list[0]
#                 previous_heading = np.nan
#             else:
#                 strings_not_caught.append(string_list[0])

#     array = [case_name, full_case_name, dt_argued, dt_reargued, dt_decided, full_case_name,
#         citations, prior_history, holding, c_justice, a_justices, majority, pluralities,
#         concurrences, dissents, concur_dissents, seriatim, no_part, per_curiam, laws_applied,
#         docket_nos, overturned, superseded, strings_not_caught]

#     data[str(case_name)] = array

In [None]:
# columns = ['case_name', 'full_case_name', 'dt_argued', 'dt_reargued', 'dt_decided', 'full_case_name',
#         'citations', 'prior_history', 'holding', 'c_justice', 'a_justices', 'majority', 'pluralities',
#         'concurrences', 'dissents', 'concur_dissents', 'seriatim', 'no_part', 'per_curiam', 'laws_applied',
#         'docket_nos', 'overturned', 'superseded', 'strings_not_caught']

# data = pd.DataFrame(index=columns)
# # wiki_url_list = all_scotus_cases['wiki_link'].loc[all_scotus_cases['wiki_link'].str.contains('wikipedia') == True]

# # wiki_url_list
# for wiki_url in wiki_url_list:
#     try:
#         get_case_info_from_wiki_page(wiki_url, data)
#     except Exception:
#         print(Exception)
#     else:
#         continue
# # [get_case_info_from_wiki_page(wiki_url, data) for wiki_url in wiki_url_list]
# data

In [None]:
# def get_case_info_from_wiki_page2(url, data):
#     print(url)
#     for column in data.columns:
#         data.loc[url][column] = np.nan
    
#     strings_not_caught = []
    
#     page = requests.get(url)
#     soup = BeautifulSoup(page.content, 'html.parser')
#     tables = soup.find_all('table', {'class': 'infobox scotus'})
# #     if len(tables) == 0:
# #         raise Exception('no scotus tables')
# #         return
#     table = tables[0]
#     tr_list = table.find_all('tr')
#     strings_to_ignore = ['\n', '\xa0', '·', ' ']
#     tr_string_list = []
#     for tr in tr_list:
#         string_list = []
#         for string in tr.strings:
#             if string not in strings_to_ignore:
#                 string_list.append(string)
#         tr_string_list.append(string_list)
    
#     data.loc[url]['case_name'] = tr_string_list[0][0]
    
#     previous_heading = np.nan
#     for string_list in tr_string_list[2:]:
#         if len(string_list) > 0:
#             if ('Argued' in string_list[0]) | ('Decided' in string_list[0]):
#                 for string in string_list:
#                     if 'Argued' in string:
#                         data.loc[url]['dt_argued'] = string.split('Argued ')[1]
#                         continue
#                     elif 'Reargued' in string:
#                         data.loc[url]['dt_reargued'] = string.split('Reargued ')[1]
#                         continue
#                     elif 'Decided' in string:
#                         data.loc[url]['dt_decided'] = string.split('Decided ')[1]
#                         continue
#             elif string_list[0] == 'Full case name':
#                 data.loc[url]['full_case_name'] = string_list[1]
#                 continue
#             elif string_list[0] == 'Citations':
#                 data.loc[url]['citations'] = string_list[1:]
#                 continue
#             elif string_list[0] == 'Prior history':
#                 data.loc[url]['prior_history'] = string_list[1:]
#                 continue
#             elif string_list[0] == 'Subsequent history':
#                 data.loc[url]['subsequent_history'] = string_list[1:]
#                 continue
#             elif string_list[0] == 'Holding':
#                 previous_heading = 'holding'
#                 continue
#             elif previous_heading == 'holding':
#                 data.loc[url]['holding'] = string_list[0:]
#                 previous_heading = np.nan
#                 continue
#             elif string_list[0] == 'Court membership':
#                 previous_heading = 'court membership'
#                 continue
#             elif previous_heading == 'court membership':
#                 data.loc[url]['c_justice'] = string_list[1]
#                 data.loc[url]['a_justices'] = string_list[3:]
#                 previous_heading = np.nan
#                 continue
#             elif string_list[0] == 'Case opinions':
#                 previous_heading = 'case opinions'                
#                 plu_num = con_num = dis_num = con_dis_num = ser_num = 1
#                 data.loc[url]['per_curiam'] = False
#                 continue
#             elif previous_heading == 'case opinions':
#                 opinions_list = string_list[0]
#                 author = opinions_list
#                 justices = []
#                 if len(string_list) > 1:
#                     justices = string_list[1].split(', joined by ')
#                 if string_list[0] == 'Per curiam.':
#                     data.loc[url]['per_curiam'] = True
#                     continue
#                 elif string_list[0] == 'Majority':
#                     data.loc[url]['majority'] = justices
#                     continue
#                 elif string_list[0] == 'Plurality':
#                     data.loc[url]['plurality_' + str(plu_num)] = justices
#                     plu_num = plu_num + 1
#                     continue
#                 elif string_list[0] == 'Concurrence':
#                     data.loc[url]['concur_' + str(con_num)] = justices
#                     con_num = con_num + 1
#                     continue
#                 elif string_list[0] == 'Dissent':
#                     data.loc[url]['dissent_' + str(dis_num)] = justices
#                     dis_num = dis_num + 1
#                     continue
#                 elif string_list[0] == 'Concur/Dissent':
#                     data.loc[url]['concur/dissent_' + str(con_dis_num)] = justices
#                     con_dis_num = con_dis_num + 1
#                     continue
#                 elif string_list[0] == 'Seriatim opinion':
#                     data.loc[url]['seriatim_' + str(ser_num)] = justices
#                     ser_num = ser_num + 1
#                     continue
#                 elif ' took no part' in string_list[0]:
#                     data.loc[url]['no_part'] = string_list[0].split(' took no part')[0]
#                     continue
#                 continue
#             elif string_list[0] == 'Laws applied':
#                 previous_heading = 'laws_applied'
#                 continue
#             elif previous_heading == 'laws_applied':
#                 data.loc[url]['laws_applied'] = string_list[0]
#                 previous_heading = np.nan
#             elif string_list[0] == 'Docket nos.':
#                 data.loc[url]['docket_nos'] = string_list[1:]
#             elif 'overturned a previous' in string_list[0]:
#                 previous_heading = 'overturned'
#             elif previous_heading == 'overturned':
#                 data.loc[url]['overturned'] = string_list[0]
#                 previous_heading = np.nan
#             elif 'Superseded' in string_list[0]:
#                 data.loc[url]['superseded'] = string_list[0]
#                 previous_heading = np.nan
#             else:
#                 strings_not_caught.append(string_list[0])

#     data.loc[index]['strings_not_caught'] = strings_not_caught

In [None]:
# columns = ['case_name', 'wiki_url', 'full_case_name', 'dt_argued', 'dt_reargued', 'dt_decided', 'full_case_name',
#         'citations', 'prior_history', 'subsequent_history', 'holding', 'c_justice', 'a_justices', 
#         'majority', 'plurality_1', 'plurality_2', 'plurality_3', 'plurality_4', 'plurality_5',
#         'dissent_1', 'dissent_2', 'dissent_3', 'dissent_4', 'dissent_5', 'dissent_6',
#         'concur_1', 'concur_2', 'concur_3', 'concur_4', 'concur_5', 'concur_6', 
#         'concur/dissent_1', 'concur/dissent_2', 'concur/dissent_3', 'concur/dissent_4',
#         'seriatim_1', 'seriatim_2', 'seriatim_3', 'seriatim_4', 'seriatim_5', 'seriatim_6', 
#         'no_part', 'per_curiam', 
#         'laws_applied', 'docket_nos', 'overturned', 'superseded', 'strings_not_caught']

# # data = pd.DataFrame(columns=columns)
# wiki_url_list = all_scotus_cases['wiki_link'].loc[all_scotus_cases['wiki_link'].str.contains('wikipedia') == True]
# # wiki_url_list = all_scotus_cases['wiki_link'].loc[all_scotus_cases['wiki_link'].str.contains('wikipedia') == True][0:100]

# data2 = pd.DataFrame(index=wiki_url_list, columns=columns)

# for wiki_url in wiki_url_list:
# #     get_case_info_from_wiki_page2(wiki_url, data2)
#     try:
#         get_case_info_from_wiki_page2(wiki_url, data2)
#     except Exception:
#         print(Exception)
#     else:
#         continue
# # [get_case_info_from_wiki_page(wiki_url, data) for wiki_url in wiki_url_list]
# data2