## Basic python imports and setup stuff

In [1]:
import numpy as np
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import datetime

In [189]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

# Scraping for case data

### Oyez scraping


#### Getting "oyez_cases_by_year_urls"

(Urls with lists of urls for all cases for each year)

In [10]:
oyez_cases_by_year_urlTags = ['1789-1850', '1850-1900', '1900-1940', '1940-1955']

for year in list(range(1955, int(datetime.datetime.now().year))):
    oyez_cases_by_year_urlTags.append(str(year))

url_base = 'https://www.oyez.org/cases/'
oyez_cases_by_year_urls = [url_base + urlTag for urlTag in oyez_cases_by_year_urlTags]

['https://www.oyez.org/cases/1789-1850',
 'https://www.oyez.org/cases/1850-1900',
 'https://www.oyez.org/cases/1900-1940',
 'https://www.oyez.org/cases/1940-1955',
 'https://www.oyez.org/cases/1955',
 'https://www.oyez.org/cases/1956',
 'https://www.oyez.org/cases/1957',
 'https://www.oyez.org/cases/1958',
 'https://www.oyez.org/cases/1959',
 'https://www.oyez.org/cases/1960',
 'https://www.oyez.org/cases/1961',
 'https://www.oyez.org/cases/1962',
 'https://www.oyez.org/cases/1963',
 'https://www.oyez.org/cases/1964',
 'https://www.oyez.org/cases/1965',
 'https://www.oyez.org/cases/1966',
 'https://www.oyez.org/cases/1967',
 'https://www.oyez.org/cases/1968',
 'https://www.oyez.org/cases/1969',
 'https://www.oyez.org/cases/1970',
 'https://www.oyez.org/cases/1971',
 'https://www.oyez.org/cases/1972',
 'https://www.oyez.org/cases/1973',
 'https://www.oyez.org/cases/1974',
 'https://www.oyez.org/cases/1975',
 'https://www.oyez.org/cases/1976',
 'https://www.oyez.org/cases/1977',
 'https:

#### Looping through url index lists and scraping urls for actual cases

In [33]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from functools import reduce
from selenium.webdriver.common.by import By
import time

oyez_cases_links = []

browser = webdriver.Chrome()
for url in oyez_cases_by_year_urls:
    try:
        browser.get(url)
        time.sleep(1)
        html = browser.page_source
        soup = BeautifulSoup(html)
        set_of_headers = soup.findAll('h2')
        for header in set_of_headers:
            href = header.select('a')[0].get('href')
            oyez_cases_links.append('https://www.oyez.org/' + href)
    except NoSuchElementException:
        pass
browser.close()
oyez_cases_links



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


['https://www.oyez.org/cases/1789-1850/19us204',
 'https://www.oyez.org/cases/1789-1850/32us243',
 'https://www.oyez.org/cases/1789-1850/36us257',
 'https://www.oyez.org/cases/1789-1850/3us386',
 'https://www.oyez.org/cases/1789-1850/2us419',
 'https://www.oyez.org/cases/1789-1850/19us264',
 'https://www.oyez.org/cases/1789-1850/10us87',
 'https://www.oyez.org/cases/1789-1850/22us1',
 'https://www.oyez.org/cases/1789-1850/40us449',
 'https://www.oyez.org/cases/1789-1850/3us171',
 'https://www.oyez.org/cases/1789-1850/21us543',
 'https://www.oyez.org/cases/1789-1850/46us215',
 'https://www.oyez.org/cases/1789-1850/6us170',
 'https://www.oyez.org/cases/1789-1850/48us1',
 'https://www.oyez.org/cases/1789-1850/5us137',
 'https://www.oyez.org/cases/1789-1850/14us304',
 'https://www.oyez.org/cases/1789-1850/36us102',
 'https://www.oyez.org/cases/1789-1850/17us316',
 'https://www.oyez.org/cases/1789-1850/11us164',
 'https://www.oyez.org/cases/1789-1850/25us213',
 'https://www.oyez.org/cases/1

### Looping through all `oyez_case_links` and scraping all case data (except opinion announcement text) into `oyez_data`

In [166]:
columns = ['oyez_name', 'oyez_link', 'documents', 'media', 'decisions', 'petitioner', 'respondent', 
           'location', 'docket_no.','decided_by', 'lower_court', 'citation', 'granted', 'dismissed',
           'argued', 'decided', 'advocates', 'appellant', 'appellee', 'facts', 'questions', 
           'conclusion', 'juris_postponed', 'affirmed_by_an_equally_divided_court', 
           'rehearing_granted', 'rehearing_granted,_judgment_vacated,_and_restored_to_the_calendar',
           'rehearing_granted,_judgment_vacated_and_case_restored_for_reargument',
           'restored_to_the_calendar_and_set_for_reargument', 'revoked_order_of_june_23_1975',
           'question_certified', 'question_certified_to_the_supreme_court_of_florida']
    
oyez_data = pd.DataFrame(columns=columns)
browser = webdriver.Chrome()
issues = []
cell_headings = []
for url in oyez_cases_links:
    index = len(oyez_data)
    oyez_data.loc[index] = np.nan
    case_documents = []
    case_media = []
    try:
        browser.get(url)
        time.sleep(2)
        html = browser.page_source
        soup = BeautifulSoup(html)
        
        sidebar = soup.findAll('div', {'class': 'full-sidebar'})[0]
        if len(sidebar.findAll('h1')) > 0:
            case_name = sidebar.findAll('h1')[0].get_text()
        else: 
            oyez_data.loc[index]['oyez_link'] = url
            issues.append(url)
            continue
        
        oyez_data.loc[index]['oyez_name'] = case_name
        oyez_data.loc[index]['oyez_link'] = url
        
        # Getting information about related documents (opinions)
        documents = sidebar.findAll('div', {'class': 'documents'})[0].findAll('li')
        for li in documents:
            a = li.findAll('a')[0]
            text = a.get_text()
            href = a.get('href')
            case_documents.append([text, href])
        
        if case_documents == []:
            case_documents = np.nan
        oyez_data.loc[index]['documents'] = case_documents
        
        # Getting information about media (opinion announcements, etc.)
        media = sidebar.findAll('div', {'class': 'media'})[0].findAll('li')
        for li in media:
            a = li.findAll('a')[0]
            text = a.get_text()
            iframe_url = str(a.get('iframe-url'))
            case_media.append([text, iframe_url])
        if case_media == []:
            case_media = np.nan
        oyez_data.loc[index]['media'] = case_media
        
        # Looping through "cells" and their "subcells" of data to 
        data_cells = soup.findAll('div', {'class': 'content-inner'})[0].findAll('div', {'class': 'cell'})
        for cell in data_cells:
            data_subcells = cell.findAll('div', {'class': 'subcell'})
            if len(data_subcells) > 0:
                for subcell in data_subcells:
                    subcell_heading = subcell.findAll('h3')[0].get_text()
                    if subcell_heading == 'Advocates':
                        advocates = []
                        advocate_divs = subcell.findAll('div', {'class': 'advocate'})
                        for advocate in advocate_divs:
                            advocate_name = advocate.findAll('a')[0].get_text()
                            advocate_url = 'https://www.oyez.org/' + advocate.findAll('a')[0].get('href')
                            advocate_description = advocate.findAll('span')[0].get_text()
                            advocates.append([advocate_name, advocate_description, advocate_url])
                        oyez_data.loc[index][subcell_heading.lower()] = advocates
                    elif len(subcell.select('h3')) > 0:
                        subcell_text = subcell.get_text().replace(subcell_heading, '')
                        subcell_heading = subcell_heading.replace(' ', '_')
                        cell_headings.append(subcell_heading)
                        if len(subcell.select('a')) > 0:
                            subcell_a = subcell.select('a')[0]
                            if subcell_a.has_attr('href'):
                                subcell_href = subcell_a.get('href')
                                if subcell_href[0] == '/':
                                    subcell_href = 'https://www.oyez.org' + subcell_href
                            elif subcell_a.has_attr('iframe-url'):
                                subcell_href = subcell_a.get('iframe-url')
                            oyez_data.loc[index][subcell_heading.lower()] = [subcell_text, subcell_href]
                        else:
                            oyez_data.loc[index][subcell_heading.lower()] = subcell_text
            
            elif len(cell.select('h3')) > 0:
                cell_heading = cell.select('h3')[0].get_text()
                cell_text = cell.get_text().replace(cell_heading, '')
                cell_heading = cell_heading.replace(' ', '_')
                cell_headings.append(cell_heading)
                if len(cell.select('a')) > 0:
                    cell_a = cell.select('a')[0]
                    if cell_a.has_attr('href'):
                        cell_href = cell_a.get('href')
                        if cell_href[0] == '/':
                            cell_href = 'https://www.oyez.org' + cell_href
                    elif cell_a.has_attr('iframe-url'):
                        cell_href = cell_a.get('iframe-url')
                    oyez_data.loc[index][cell_heading.lower()] = [cell_text, cell_href]
                else:
                    oyez_data.loc[index][cell_heading.lower()] = cell_text
        
        # Getting information about the facts of the case
        facts = soup.findAll('div', {'ng-bind-html': 'case.facts_of_the_case'})
        facts_text = []
        if len(facts) > 0:
            p_list = facts[0].findAll('p')
            for p in p_list:
                facts_text.append(p.get_text())
        if facts_text == []:
            facts_text = np.nan
        oyez_data.loc[index]['facts'] = facts_text
        
        # Getting information about the questions at the heart of a case
        questions = soup.findAll('div', {'ng-bind-html': 'case.question'})
        questions_text = []
        if len(questions) > 0:
            q_list = questions[0].findAll('li')
            for q in q_list:
                questions_text.append(q.get_text())
            q_list = questions[0].findAll('p')
            for q in q_list:
                questions_text.append(q.get_text())
        if questions_text == []:
            questions_text = np.nan
        oyez_data.loc[index]['questions'] = questions_text
        
        # Getting information about the conclusions of a case
        conclusions = soup.findAll('div', {'ng-bind-html': 'case.conclusion'})
        conclusions_text = []
        if len(conclusions) > 0:
            p_list = conclusions[0].findAll('p')
            for p in p_list:
                conclusions_text.append(p.get_text())
        if conclusions == []:
            conclusions = np.nan
        oyez_data.loc[index]['conclusion'] = conclusions_text
        
        # Scraping information about opinions and decisions
        decision_div = soup.findAll('div', {'class': 'decisions'})
        if len(decision_div) > 0:
            decision_figs = decision_div[0].findAll('figure', {'class': 'oy-decision'})
            decisions = []
            if len(decision_figs) > 0:
                for decision_fig in decision_figs:
                    vote = ''
                    if len(decision_fig.findAll('span', {'class': 'vote'})) > 0:
                        vote = decision_fig.findAll('span', {'class': 'vote'})[0].get_text()
                    winner = ''
                    if len(decision_fig.findAll('span', {'class': 'winner'})) > 0:
                        winner = decision_fig.findAll('span', {'class': 'winner'})[0].get_text() 
                    author = ''
                    if len(decision_fig.findAll('span', {'class': 'author'})) > 0:
                        author = decision_fig.findAll('span', {'class': 'author'})[0].get_text() 
                    holding = ''
                    if len(decision_fig.findAll('p', {'class': 'holding'})) > 0:
                        holding = decision_fig.findAll('p', {'class': 'holding'})[0].get_text() 
                    justices_for = []
                    justices_against = []
                    if len(decision_fig.findAll('div', {'class': 'decision-image'})) > 0:
                        decision_image = decision_fig.findAll('div', {'class': 'decision-image'})[0]    
                        justices_maj = decision_image.findAll('figure', {'class': 'majority'})
                        justices_plu = decision_image.findAll('figure', {'class': 'plurality'})
                        justices_min = decision_image.findAll('figure', {'class': 'minority'})
                        if len(justices_maj) > 0:
                            for justice in justices_maj:
                                justice_name = justice.findAll('span', {'class': 'long'})[0].get_text()
                                justices_for.append(justice_name)
                        if len(justices_plu) > 0:
                            for justice in justices_plu:
                                justice_name = justice.findAll('span', {'class': 'long'})[0].get_text()
                                justices_for.append(justice_name)
                        if len(justices_min) > 0:
                            for justice in justices_min:
                                justice_name = justice.findAll('span', {'class': 'long'})[0].get_text()
                                justices_against.append(justice_name)
                    
                    decision = [vote, winner, author, holding, justices_for, justices_against]
                    decisions.append(decision)
        if decisions == []:
            decisions = np.nan
        oyez_data.loc[index]['decisions'] = decisions
        
        # (End of loop)
        
    except NoSuchElementException:
        pass
browser.close()
oyez_data



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Unnamed: 0,oyez_name,oyez_link,documents,media,decisions,petitioner,respondent,location,docket_no.,decided_by,lower_court,citation,granted,dismissed,argued,decided,advocates,appellant,appellee,facts,questions,conclusion,juris_postponed,affirmed_by_an_equally_divided_court,rehearing_granted,"rehearing_granted,_judgment_vacated,_and_restored_to_the_calendar","rehearing_granted,_judgment_vacated_and_case_restored_for_reargument",restored_to_the_calendar_and_set_for_reargument,revoked_order_of_june_23_1975,question_certified,question_certified_to_the_supreme_court_of_florida
0,Anderson v. Dunn,https://www.oyez.org/cases/1789-1850/19us204,"[[ Syllabus , https://supreme.justia.com/case...",,,Anderson,Dunn,,,"[ Marshall Court , https://www.oyez.org/cou...",,"[ 19 US 204 (1821) , https://supreme.justi...",,,"Feb 20 - 26, 1821","Mar 2, 1821",,,,[Anderson attempted to bribe a member of Congr...,[Does the Congress have the power to punish no...,[Yes. Though the contempt power will not be fo...,,,,,,,,,
1,Barron ex rel. Tiernan v. Mayor of Baltimore,https://www.oyez.org/cases/1789-1850/32us243,"[[ Syllabus , https://supreme.justia.com/case...",,"[[Decision, for Mayor of Baltimore, , The pro...",John Barron ex rel. Tiernan,Mayor of Baltimore,,,"[ Marshall Court , https://www.oyez.org/cou...",,"[ 32 US 243 (1833) , https://supreme.justi...",,,"Feb 8 - 11, 1833","Feb 16, 1833",,,,[Baltimore wharf owner John Barron alleged tha...,[Does the Fifth Amendment deny the states as w...,"[Writing for the unanimous Court, Chief Justic...",,,,,,,,,
2,Briscoe v. Bank of Commonwealth of Kentucky,https://www.oyez.org/cases/1789-1850/36us257,"[[ Syllabus , https://supreme.justia.com/case...",,,Briscoe,Bank of Commonwealth of Kentucky,,,"[ Taney Court , https://www.oyez.org/courts...",,"[ 36 US 257 (1837) , https://supreme.justi...",,,1/28/37; 1/31/37; 2/1/37,"Feb 11, 1837",,,,[Kentucky authorized a state owned and operate...,"[By issuing notes and currency, did the bank v...",[The Court rejected Briscoe's argument. The cl...,,,,,,,,,
3,Calder v. Bull,https://www.oyez.org/cases/1789-1850/3us386,"[[ Syllabus , https://supreme.justia.com/case...",,,Calder,Bull,,,"[ Ellsworth Court , https://www.oyez.org/co...",,"[ 3 US 386 (1798) , https://supreme.justia...",,,"Feb 8 - 13, 1798","Aug 8, 1798",,,,"[Mr. and Mrs. Caleb Bull, the stated beneficia...",[Was the Connecticut legislation a violation o...,"[In a unanimous decision, the Court held that ...",,,,,,,,,
4,Chisholm v. Georgia,https://www.oyez.org/cases/1789-1850/2us419,"[[ Syllabus , https://supreme.justia.com/case...",,,Chisholm,Georgia,,,"[ Jay Court , https://www.oyez.org/courts?c...",,"[ 2 US 419 (1793) , https://supreme.justia...",,,"Feb 5, 1793","Feb 19, 1793",,,,"[In 1792, Alexander Chisholm attempted to sue ...",[Can state citizens sue state governments in f...,"[In a 4-to-1 decision, the Court ruled for the...",,,,,,,,,
5,Cohens v. Virginia,https://www.oyez.org/cases/1789-1850/19us264,"[[ Syllabus , https://supreme.justia.com/case...",,"[[Decision, for Virginia, , Jurisdiction, not...",Philip and Mendes Cohen,Virginia,[ Elizabeth River Parish (now site of Norfolk...,,"[ Marshall Court , https://www.oyez.org/cou...",State trial court,"[ 19 US 264 (1821) , https://supreme.justi...",,,2/13/21; 2/19/21; 2/20/21; 3/2/21,"Mar 5, 1821",,,,[An act of Congress authorized the operation o...,[Did the Supreme Court have the power under th...,"[In a unanimous decision, the Court held that ...",,,,,,,,,
6,Fletcher v. Peck,https://www.oyez.org/cases/1789-1850/10us87,"[[ Syllabus , https://supreme.justia.com/case...",,"[[Unanimous decision, for Peck, , Under the ...",Robert Fletcher,John Peck,,,"[ Marshall Court , https://www.oyez.org/cou...",,"[ 10 US 87 (1810) , https://supreme.justia...",,,2/15/10; 3/1/09; 3/2/09; 3/3/09; 3/4/09,"Mar 16, 1810",,,,"[In 1795, the Georgia state legislature passed...",[Could the contract between Fletcher and Peck ...,[The legislature’s repeal of the law was uncon...,,,,,,,,,
7,Gibbons v. Ogden,https://www.oyez.org/cases/1789-1850/22us1,"[[ Syllabus , https://supreme.justia.com/case...",,"[[Unanimous decision, for Gibbons, , Under th...",,,"[ Ferry line between Elizabeth, New Jersey, a...",,"[ Marshall Court , https://www.oyez.org/cou...",State appellate court,"[ 22 US 1 (1824) , https://supreme.justia....",,,"Feb 4 - 9, 1824","Mar 2, 1824","[[Daniel Webster, for Gibbons, https://www.oye...",Thomas Gibbons,Aaron Ogden,[A New York state law gave Robert R. Livingsto...,[Does the Commerce Clause give Congress author...,[Justice Marshall concluded that regulation of...,,,,,,,,,
8,Groves v. Slaughter,https://www.oyez.org/cases/1789-1850/40us449,"[[ Syllabus , https://supreme.justia.com/case...",,,Groves,Slaughter,,,"[ Taney Court , https://www.oyez.org/courts...",,"[ 40 US 449 (1841) , https://supreme.justi...",,,"Feb 12 - 19, 1841","Mar 10, 1841",,,,[Slaughter took a group of slaves to Mississip...,[Does the Mississippi constitutional provision...,[The provision did not become effective until ...,,,,,,,,,
9,Hylton v. United States,https://www.oyez.org/cases/1789-1850/3us171,"[[ Syllabus , https://supreme.justia.com/case...",,,Hylton,United States,,,"[ Ellsworth Court , https://www.oyez.org/co...",,"[ 3 US 171 (1796) , https://supreme.justia...",,,"Feb 23 - 25, 1796","Mar 8, 1796",,,,"[In 1794, Congress enacted a tax of sixteen do...","[Was the carriage tax a direct tax, which woul...",[The Court concluded that the carriage tax was...,,,,,,,,,


### Looping again through urls that were marked "pending" (which might be a sign that the data hadn't fully loaded by the time it was scraped)

In [183]:
pending_oyez = oyez_data.loc[(oyez_data['decided_by'].str.contains('pending') == True) | 
                             (oyez_data['citation'].str.contains('pending') == True) |
                             (oyez_data['oyez_name'].isnull() == True)]
# pending_oyez[['oyez_name', 'decided_by', 'citation']]
pending_links = pending_oyez['oyez_link']
oyez_pending_index = pending_links.index

browser = webdriver.Chrome()
issues = []
cell_headings = []
for index in oyez_pending_index:
    url = pending_links[index]
#     index = len(oyez_data)
    oyez_data.loc[index] = np.nan
    case_documents = []
    case_media = []
    try:
        browser.get(url)
        time.sleep(5)
        html = browser.page_source
        soup = BeautifulSoup(html)
        
        sidebar = soup.findAll('div', {'class': 'full-sidebar'})[0]
        if len(sidebar.findAll('h1')) > 0:
            case_name = sidebar.findAll('h1')[0].get_text()
        else: 
            oyez_data.loc[index]['oyez_link'] = url
            issues.append(url)
            continue
        
        oyez_data.loc[index]['oyez_name'] = case_name
        oyez_data.loc[index]['oyez_link'] = url
        
        documents = sidebar.findAll('div', {'class': 'documents'})[0].findAll('li')
        for li in documents:
            a = li.findAll('a')[0]
            text = a.get_text()
            href = a.get('href')
            case_documents.append([text, href])
        
        if case_documents == []:
            case_documents = np.nan
        oyez_data.loc[index]['documents'] = case_documents
        
        media = sidebar.findAll('div', {'class': 'media'})[0].findAll('li')
        for li in media:
            a = li.findAll('a')[0]
            text = a.get_text()
            iframe_url = str(a.get('iframe-url'))
            case_media.append([text, iframe_url])
        
        if case_media == []:
            case_media = np.nan
        oyez_data.loc[index]['media'] = case_media
        
        data_cells = soup.findAll('div', {'class': 'content-inner'})[0].findAll('div', {'class': 'cell'})
        for cell in data_cells:
            data_subcells = cell.findAll('div', {'class': 'subcell'})
            if len(data_subcells) > 0:
                for subcell in data_subcells:
                    subcell_heading = subcell.findAll('h3')[0].get_text()
                    if subcell_heading == 'Advocates':
                        advocates = []
                        advocate_divs = subcell.findAll('div', {'class': 'advocate'})
                        for advocate in advocate_divs:
                            advocate_name = advocate.findAll('a')[0].get_text()
                            advocate_url = 'https://www.oyez.org/' + advocate.findAll('a')[0].get('href')
                            advocate_description = advocate.findAll('span')[0].get_text()
                            advocates.append([advocate_name, advocate_description, advocate_url])
                        oyez_data.loc[index][subcell_heading.lower()] = advocates
                    elif len(subcell.select('h3')) > 0:
                        subcell_text = subcell.get_text().replace(subcell_heading, '')
                        subcell_heading = subcell_heading.replace(' ', '_')
                        cell_headings.append(subcell_heading)
                        if len(subcell.select('a')) > 0:
                            subcell_a = subcell.select('a')[0]
                            if subcell_a.has_attr('href'):
                                subcell_href = subcell_a.get('href')
                                if subcell_href[0] == '/':
                                    subcell_href = 'https://www.oyez.org' + subcell_href
                            elif subcell_a.has_attr('iframe-url'):
                                subcell_href = subcell_a.get('iframe-url')
                            oyez_data.loc[index][subcell_heading.lower()] = [subcell_text, subcell_href]
                        else:
                            oyez_data.loc[index][subcell_heading.lower()] = subcell_text
            
            elif len(cell.select('h3')) > 0:
                cell_heading = cell.select('h3')[0].get_text()
                cell_text = cell.get_text().replace(cell_heading, '')
                cell_heading = cell_heading.replace(' ', '_')
                cell_headings.append(cell_heading)
                if len(cell.select('a')) > 0:
                    cell_a = cell.select('a')[0]
                    if cell_a.has_attr('href'):
                        cell_href = cell_a.get('href')
                        if cell_href[0] == '/':
                            cell_href = 'https://www.oyez.org' + cell_href
                    elif cell_a.has_attr('iframe-url'):
                        cell_href = cell_a.get('iframe-url')
                    oyez_data.loc[index][cell_heading.lower()] = [cell_text, cell_href]
                else:
                    oyez_data.loc[index][cell_heading.lower()] = cell_text
                
        facts = soup.findAll('div', {'ng-bind-html': 'case.facts_of_the_case'})
        facts_text = []
        if len(facts) > 0:
            p_list = facts[0].findAll('p')
            for p in p_list:
                facts_text.append(p.get_text())
        if facts_text == []:
            facts_text = np.nan
        oyez_data.loc[index]['facts'] = facts_text
    
        questions = soup.findAll('div', {'ng-bind-html': 'case.question'})
        questions_text = []
        if len(questions) > 0:
            q_list = questions[0].findAll('li')
            for q in q_list:
                questions_text.append(q.get_text())
            q_list = questions[0].findAll('p')
            for q in q_list:
                questions_text.append(q.get_text())
        if questions_text == []:
            questions_text = np.nan
        oyez_data.loc[index]['questions'] = questions_text
        
        conclusions = soup.findAll('div', {'ng-bind-html': 'case.conclusion'})
        conclusions_text = []
        if len(conclusions) > 0:
            p_list = conclusions[0].findAll('p')
            for p in p_list:
                conclusions_text.append(p.get_text())
        if conclusions == []:
            conclusions = np.nan
        oyez_data.loc[index]['conclusion'] = conclusions_text
        
        decision_div = soup.findAll('div', {'class': 'decisions'})
        if len(decision_div) > 0:
            decision_figs = decision_div[0].findAll('figure', {'class': 'oy-decision'})
            decisions = []
            if len(decision_figs) > 0:
                for decision_fig in decision_figs:
                    vote = ''
                    if len(decision_fig.findAll('span', {'class': 'vote'})) > 0:
                        vote = decision_fig.findAll('span', {'class': 'vote'})[0].get_text()
                    winner = ''
                    if len(decision_fig.findAll('span', {'class': 'winner'})) > 0:
                        winner = decision_fig.findAll('span', {'class': 'winner'})[0].get_text() 
                    author = ''
                    if len(decision_fig.findAll('span', {'class': 'author'})) > 0:
                        author = decision_fig.findAll('span', {'class': 'author'})[0].get_text() 
                    holding = ''
                    if len(decision_fig.findAll('p', {'class': 'holding'})) > 0:
                        holding = decision_fig.findAll('p', {'class': 'holding'})[0].get_text() 
                    justices_for = []
                    justices_against = []
                    if len(decision_fig.findAll('div', {'class': 'decision-image'})) > 0:
                        decision_image = decision_fig.findAll('div', {'class': 'decision-image'})[0]    
                        justices_maj = decision_image.findAll('figure', {'class': 'majority'})
                        justices_plu = decision_image.findAll('figure', {'class': 'plurality'})
                        justices_min = decision_image.findAll('figure', {'class': 'minority'})
                        if len(justices_maj) > 0:
                            for justice in justices_maj:
                                justice_name = justice.findAll('span', {'class': 'long'})[0].get_text()
                                justices_for.append(justice_name)
                        if len(justices_plu) > 0:
                            for justice in justices_plu:
                                justice_name = justice.findAll('span', {'class': 'long'})[0].get_text()
                                justices_for.append(justice_name)
                        if len(justices_min) > 0:
                            for justice in justices_min:
                                justice_name = justice.findAll('span', {'class': 'long'})[0].get_text()
                                justices_against.append(justice_name)
                    
                    decision = [vote, winner, author, holding, justices_for, justices_against]
                    decisions.append(decision)
        if decisions == []:
            decisions = np.nan
        oyez_data.loc[index]['decisions'] = decisions
        
    except NoSuchElementException:
        pass
browser.close()
oyez_data



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


NoSuchWindowException: Message: no such window: window was already closed
  (Session info: chrome=71.0.3578.98)
  (Driver info: chromedriver=2.43.600229 (3fae4d0cda5334b4f533bede5a4787f7b832d052),platform=Mac OS X 10.14.2 x86_64)


In [191]:
oyez_data.to_csv('Desktop/oyez_data_1_26.csv')

In [185]:
oyez_data = data_oyez
oyez_data

Unnamed: 0,oyez_name,oyez_link,documents,media,decisions,petitioner,respondent,location,docket_no.,decided_by,lower_court,citation,granted,dismissed,argued,decided,advocates,appellant,appellee,facts,questions,conclusion,juris_postponed,affirmed_by_an_equally_divided_court,rehearing_granted,"rehearing_granted,_judgment_vacated,_and_restored_to_the_calendar","rehearing_granted,_judgment_vacated_and_case_restored_for_reargument",restored_to_the_calendar_and_set_for_reargument,revoked_order_of_june_23_1975,question_certified,question_certified_to_the_supreme_court_of_florida
0,Anderson v. Dunn,https://www.oyez.org/cases/1789-1850/19us204,"[[ Syllabus , https://supreme.justia.com/case...",,,Anderson,Dunn,,,"[ Marshall Court , https://www.oyez.org/cou...",,"[ 19 US 204 (1821) , https://supreme.justi...",,,"Feb 20 - 26, 1821","Mar 2, 1821",,,,[Anderson attempted to bribe a member of Congr...,[Does the Congress have the power to punish no...,[Yes. Though the contempt power will not be fo...,,,,,,,,,
1,Barron ex rel. Tiernan v. Mayor of Baltimore,https://www.oyez.org/cases/1789-1850/32us243,"[[ Syllabus , https://supreme.justia.com/case...",,"[[Decision, for Mayor of Baltimore, , The pro...",John Barron ex rel. Tiernan,Mayor of Baltimore,,,"[ Marshall Court , https://www.oyez.org/cou...",,"[ 32 US 243 (1833) , https://supreme.justi...",,,"Feb 8 - 11, 1833","Feb 16, 1833",,,,[Baltimore wharf owner John Barron alleged tha...,[Does the Fifth Amendment deny the states as w...,"[Writing for the unanimous Court, Chief Justic...",,,,,,,,,
2,Briscoe v. Bank of Commonwealth of Kentucky,https://www.oyez.org/cases/1789-1850/36us257,"[[ Syllabus , https://supreme.justia.com/case...",,,Briscoe,Bank of Commonwealth of Kentucky,,,"[ Taney Court , https://www.oyez.org/courts...",,"[ 36 US 257 (1837) , https://supreme.justi...",,,1/28/37; 1/31/37; 2/1/37,"Feb 11, 1837",,,,[Kentucky authorized a state owned and operate...,"[By issuing notes and currency, did the bank v...",[The Court rejected Briscoe's argument. The cl...,,,,,,,,,
3,Calder v. Bull,https://www.oyez.org/cases/1789-1850/3us386,"[[ Syllabus , https://supreme.justia.com/case...",,,Calder,Bull,,,"[ Ellsworth Court , https://www.oyez.org/co...",,"[ 3 US 386 (1798) , https://supreme.justia...",,,"Feb 8 - 13, 1798","Aug 8, 1798",,,,"[Mr. and Mrs. Caleb Bull, the stated beneficia...",[Was the Connecticut legislation a violation o...,"[In a unanimous decision, the Court held that ...",,,,,,,,,
4,Chisholm v. Georgia,https://www.oyez.org/cases/1789-1850/2us419,"[[ Syllabus , https://supreme.justia.com/case...",,,Chisholm,Georgia,,,"[ Jay Court , https://www.oyez.org/courts?c...",,"[ 2 US 419 (1793) , https://supreme.justia...",,,"Feb 5, 1793","Feb 19, 1793",,,,"[In 1792, Alexander Chisholm attempted to sue ...",[Can state citizens sue state governments in f...,"[In a 4-to-1 decision, the Court ruled for the...",,,,,,,,,
5,Cohens v. Virginia,https://www.oyez.org/cases/1789-1850/19us264,"[[ Syllabus , https://supreme.justia.com/case...",,"[[Decision, for Virginia, , Jurisdiction, not...",Philip and Mendes Cohen,Virginia,[ Elizabeth River Parish (now site of Norfolk...,,"[ Marshall Court , https://www.oyez.org/cou...",State trial court,"[ 19 US 264 (1821) , https://supreme.justi...",,,2/13/21; 2/19/21; 2/20/21; 3/2/21,"Mar 5, 1821",,,,[An act of Congress authorized the operation o...,[Did the Supreme Court have the power under th...,"[In a unanimous decision, the Court held that ...",,,,,,,,,
6,Fletcher v. Peck,https://www.oyez.org/cases/1789-1850/10us87,"[[ Syllabus , https://supreme.justia.com/case...",,"[[Unanimous decision, for Peck, , Under the ...",Robert Fletcher,John Peck,,,"[ Marshall Court , https://www.oyez.org/cou...",,"[ 10 US 87 (1810) , https://supreme.justia...",,,2/15/10; 3/1/09; 3/2/09; 3/3/09; 3/4/09,"Mar 16, 1810",,,,"[In 1795, the Georgia state legislature passed...",[Could the contract between Fletcher and Peck ...,[The legislature’s repeal of the law was uncon...,,,,,,,,,
7,Gibbons v. Ogden,https://www.oyez.org/cases/1789-1850/22us1,"[[ Syllabus , https://supreme.justia.com/case...",,"[[Unanimous decision, for Gibbons, , Under th...",,,"[ Ferry line between Elizabeth, New Jersey, a...",,"[ Marshall Court , https://www.oyez.org/cou...",State appellate court,"[ 22 US 1 (1824) , https://supreme.justia....",,,"Feb 4 - 9, 1824","Mar 2, 1824","[[Daniel Webster, for Gibbons, https://www.oye...",Thomas Gibbons,Aaron Ogden,[A New York state law gave Robert R. Livingsto...,[Does the Commerce Clause give Congress author...,[Justice Marshall concluded that regulation of...,,,,,,,,,
8,Groves v. Slaughter,https://www.oyez.org/cases/1789-1850/40us449,"[[ Syllabus , https://supreme.justia.com/case...",,,Groves,Slaughter,,,"[ Taney Court , https://www.oyez.org/courts...",,"[ 40 US 449 (1841) , https://supreme.justi...",,,"Feb 12 - 19, 1841","Mar 10, 1841",,,,[Slaughter took a group of slaves to Mississip...,[Does the Mississippi constitutional provision...,[The provision did not become effective until ...,,,,,,,,,
9,Hylton v. United States,https://www.oyez.org/cases/1789-1850/3us171,"[[ Syllabus , https://supreme.justia.com/case...",,,Hylton,United States,,,"[ Ellsworth Court , https://www.oyez.org/co...",,"[ 3 US 171 (1796) , https://supreme.justia...",,,"Feb 23 - 25, 1796","Mar 8, 1796",,,,"[In 1794, Congress enacted a tax of sixteen do...","[Was the carriage tax a direct tax, which woul...",[The Court concluded that the carriage tax was...,,,,,,,,,


In [188]:
oyez_data.loc[oyez_data['rehearing_granted,_judgment_vacated,_and_restored_to_the_calendar'].isnull()==False]

Unnamed: 0,oyez_name,oyez_link,documents,media,decisions,petitioner,respondent,location,docket_no.,decided_by,lower_court,citation,granted,dismissed,argued,decided,advocates,appellant,appellee,facts,questions,conclusion,juris_postponed,affirmed_by_an_equally_divided_court,rehearing_granted,"rehearing_granted,_judgment_vacated,_and_restored_to_the_calendar","rehearing_granted,_judgment_vacated_and_case_restored_for_reargument",restored_to_the_calendar_and_set_for_reargument,revoked_order_of_june_23_1975,question_certified,question_certified_to_the_supreme_court_of_florida
666,Ladner v. United States,https://www.oyez.org/cases/1958/2,"[[ Syllabus , https://supreme.justia.com/case...","[[Oral Reargument - October 22, 1958 (Part 1),...","[[8–1 decision, for Ladner, , An assault with...",Lovander Ladner,United States,,2,"[ Warren Court , https://www.oyez.org/court...",,"[ 358 US 169 (1958) , https://supreme.just...",,,"Nov 19, 1957","Dec 15, 1958","[[Harold Rosenwald, for the petitioner, https:...",,,,,[],,"Jan 6, 1958",,"May 26, 1958",,,,,


NameError: name 'scotus_volume_urls' is not defined

Unnamed: 0,case_name,wiki_link,volume,justia_link,holding,topic,sub_topic
0,"Appointment Of Justices,",,2,,"Appointment Of Justices, 2 U.S. (2 Dall.) 399 ...",,
1,"Qualification Of Counsellors And Attorneys,",,2,,"Qualification Of Counsellors And Attorneys, 2 ...",,
2,"West v. Barnes,",https://www.wikipedia.org/wiki/West_v._Barnes,2,,"West v. Barnes, 2 U.S. (2 Dall.) 401 (1791)",,
3,"Oswald v. New York,",https://www.wikipedia.org/wiki/Oswald_v._New_York,2,,"Oswald v. New York, 2 U.S. (2 Dall.) 401 (1791)",,
4,"Georgia v. Brailsford,",https://www.wikipedia.org/wiki/Georgia_v._Brai...,2,,"Georgia v. Brailsford, 2 U.S. (2 Dall.) 402 (1...",,
5,Hayburn's Case,https://www.wikipedia.org/wiki/Hayburn%27s_Case,2,,"Hayburn's Case, 2 U.S. (2 Dall.) 409 (1792)",,
6,"Georgia v. Brailsford,",https://www.wikipedia.org/wiki/Georgia_v._Brai...,2,,"Georgia v. Brailsford, 2 U.S. (2 Dall.) 415 (1...",,
7,"Chisholm v. Georgia,",https://www.wikipedia.org/wiki/Chisholm_v._Geo...,2,,"Chisholm v. Georgia, 2 U.S. (2 Dall.) 419 (1793)",,
8,"Appointment Of Paterson,",,2,,"Appointment Of Paterson, 2 U.S. (2 Dall.) 479 ...",,
9,Georgia v. Brailsford,https://www.wikipedia.org/wiki/Georgia_v._Brai...,3,http://openjurist.org/3/us/1/,,,


Unnamed: 0,case_name,topic,sub_topic,wiki_url,justia_link,holding
Dred Scott v. Sandford,Dred Scott v. Sandford,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Dred_Scott_v._S...,https://supreme.justia.com/cases/federal/us/60...,"Dred Scott v. Sandford, 60 U.S. 393 (1857) Peo..."
Strauder v. West Virginia,Strauder v. West Virginia,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Strauder_v._Wes...,https://supreme.justia.com/cases/federal/us/10...,"Strauder v. West Virginia, 100 U.S. 303 (1880)..."
Civil Rights Cases,Civil Rights Cases,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Civil_Rights_Cases,https://supreme.justia.com/cases/federal/us/10...,"Civil Rights Cases, 109 U.S. 3 (1883) Neither ..."
Plessy v. Ferguson,Plessy v. Ferguson,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Plessy_v._Ferguson,https://supreme.justia.com/cases/federal/us/16...,"Plessy v. Ferguson, 163 U.S. 537 (1896) Segreg..."
New Negro Alliance v. Sanitary Grocery Co.,New Negro Alliance v. Sanitary Grocery Co.,First Amendment rights,Freedom of speech and of the press,https://www.wikipedia.org/wiki/New_Negro_Allia...,https://supreme.justia.com/cases/federal/us/30...,"New Negro Alliance v. Sanitary Grocery Co., 30..."
Smith v. Allwright,Smith v. Allwright,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Smith_v._Allwright,https://supreme.justia.com/cases/federal/us/32...,"Smith v. Allwright, 321 U.S. 649 (1944) Primar..."
Korematsu v. United States,Korematsu v. United States,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Korematsu_v._Un...,https://supreme.justia.com/cases/federal/us/32...,"Korematsu v. United States, 323 U.S. 214 (1944..."
Morgan v. Virginia,Morgan v. Virginia,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Irene_Morgan#U....,https://supreme.justia.com/cases/federal/us/32...,"Morgan v. Virginia, 328 U.S. 373 (1946) A Virg..."
Shelley v. Kraemer,Shelley v. Kraemer,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Shelley_v._Kraemer,https://supreme.justia.com/cases/federal/us/33...,"Shelley v. Kraemer, 334 U.S. 1 (1948) Courts m..."
Henderson v. United States,Henderson v. United States,Individual rights,Discrimination based on race and ethnicity,https://www.wikipedia.org/wiki/Henderson_v._Un...,https://supreme.justia.com/cases/federal/us/33...,"Henderson v. United States, 339 U.S. 816 (1950..."


In [192]:
data3.to_csv('Desktop/data3.csv')

NameError: name 'data3' is not defined

In [None]:
    case_name = np.nan
    full_case_name = 
    dt_argued = 
    dt_reargued = 
    dt_decided = 
    full_case_name = 
    citations = 
    prior_history = 
    holding = 
    c_justice = 
    a_justices =
    maj_op = maj_join = 
    
    plu_op = plu_join = plu_2_op = plu_2_join = plu_3_op = plu_3_join = np.nan
    dis_op = dis_join = dis_2_op = dis_2_join = dis_3_op = dis_3_join = dis_4_op = dis_4_join = dis_5_op = dis_5_join = np.nan
    con_op = con_join = con_2_op = con_2_join = con_3_op = con_3_join = con_4_op = con_4_join = con_5_op = con_5_join = np.nan
    con_dis_op = con_dis_join = con_dis_2_op = con_dis_2_join = con_dis_3_op = con_dis_3_join = con_dis_4_op = con_dis_4_join = np.nan
    ser_op = ser_2_op = ser_3_op = ser_4_op = ser_5_op = np.nan
    
    con_dis_op
    
    




data = pd.DataFrame(columns=['will_case'], index=case_names)

# for url in urls:
#     data['case_url'].loc[url[0]] = url[1]
#     print(url[1])
# #     print(requests.get(url[1]))
#     page = requests.get(url[1])
#     soup = BeautifulSoup(page.content, 'html.parser')
# #     if soup.select('h3')[0].get_text():
# #     if soup.select('h3')[0].get('href'):
# #     print(soup.findAll('div', {'class', 'r'}))
#     url_b = soup.select('h2')[0].select('a')[0].get('href')
# #     data['case_url'][url[0]] = url_b
#     print(soup.select('h2')[0].select('a')[0].get('href'))
# #         print(soup.select('h3')[0].get_text())

data['will_case'] = data['will_case'] = True
data

NameError: name 'data' is not defined

In [None]:
case_data = pd.DataFrame()

error_urls = []
headers = []

def scrapeUrls(url_list):
    case_data = pd.DataFrame()
    for url in url_list:
        try:
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')

            headers.append(soup.select('h1')[0].get_text())
    #         if soup.select('h1')[0].get_text() == "Search Results":
    #             continue

            th_array = []
            td_array = []
            array = []
            previous_heading = ''
            opinions = []

            for tr in soup.select('table tr'):
                children_th = tr.findChildren('th')
                children_td = tr.findChildren('td')
                children_dl = tr.findChildren('dl')
                if children_dl != []:
                    children_dl_dt = children_dl[0].findChildren('dt')
                    children_dl_dd = children_dl[0].findChildren('dd')

                if children_td == []:
                    children_td = ''
                else:
                    children_td = children_td[0].get_text()

                if children_th == []:
                    children_th = ''
                else:
                    children_th = children_th[0].get_text()

                if array == []:
                    scotus_table = soup.find_all('table', {'class': 'scotus'})
#                     print(soup.find_all('table', {'class': 'scotus'}))
                    print('length: ' + str(len(scotus_table)))
                    case_title = str(soup.select('h1')[0].get_text())
                    
#                     print(soup.select('h1')[0].get_text())
                    
                    if str(len(scotus_table)) != '1':
                        error_urls.append(url)
                        print(url)
                        print(soup.select('h1')[0].get_text())
                        
#                     if str(len(scotus_table)) == '1':
#                         children_th = scotus_table.select('th')[0].get_text()
#                     array.append(['case_name', children_th])
                    array.append(['case_name', case_title])

                if children_th.split(' ')[0] == 'Argued':
                    children_th = children_th.replace('Argued ', '')
                    children_th = children_th.replace('Decided ', 'DELIMITER')
                    children_th = children_th.split('DELIMITER')
                    array.append(['argued', children_th[0]])
                    array.append(['decided', children_th[1]])

                if children_th == 'Full case name':
                    array.append(['case_name_full', children_td])

                if children_th == 'Citations':
                    array.append(['citations', children_td.split('; ')])

                if children_th == 'Prior history':
                    array.append(['prior_history', children_td])

                if children_th == 'Subsequent history':
                    array.append(['subsequent_history', children_td])

                if children_th == 'Holding':
                    previous_heading = 'holding'
                    continue

                if (children_th == '') & (previous_heading == 'holding'):
                    array.append([previous_heading, children_td])

                if children_th == 'Court membership':
                    previous_heading = 'court_members'
                    continue

                if (children_th == '') & (previous_heading == 'court_members'):        
                    chief_justice = [a.get_text() for a in children_dl_dd[0].findChildren('a')]
                    associate_justices = [a.get_text() for a in children_dl_dd[1].findChildren('a')]
                    array.append(['chief_justice', chief_justice[0]])
                    justice_num = 1
                    for justice in associate_justices:
                        justice_id = 'associate_justice' + str(justice_num)
                        array.append([justice_id, justice])
                        justice_num = justice_num + 1

                if (children_th == 'Case opinions'):
                    previous_heading = 'opinions'
                    continue

                if (previous_heading == 'opinions') & ((children_th == 'Majority') | 
                                                       (children_th == 'Dissent') | 
                                                       (children_th == 'Concurrence') | 
                                                       (children_th == 'Seriatim opinion')):
                    justices_list = children_td.split(', ')
                    print(justices_list)
                    opinions.append([children_th, children_td])
                    continue

                if children_th == 'Laws applied':
                    previous_heading = 'applied_laws'
                    continue

                if (previous_heading == 'applied_laws') & (children_th == ''):
                    array.append([previous_heading, children_td])
                    previous_heading = ''

                if children_td == 'Overruled by':
                    previous_heading = 'overruled_by'
                    continue

                if children_td == 'This case overturned a previous ruling or rulings':
                    previous_heading = 'overruled'
                    continue

                if children_td == 'Superseded by':
                    previous_heading = 'superseded'
                    continue

                if (previous_heading == 'overruled_by') & (children_th == ''):
                    array.append([previous_heading, children_td])
                    previous_heading = ''

                if (previous_heading == 'overruled') & (children_th == ''):
                    array.append([previous_heading, children_td])
                    previous_heading = ''

                if (previous_heading == 'superseded') & (children_th == ''):
                    array.append([previous_heading, children_td])
                    previous_heading = ''

            maj_num = concur_num = dissent_num = seriatim_num = 1 
            for opinion in opinions:
                opinion_id = ''
                if opinion[0] == 'Majority':
                    opinion_id = 'opinion_maj'
                if opinion[0] == 'Concurrence':
                    opinion_id = 'opinion_concur_' + str(concur_num)
                    concur_num = concur_num + 1
                if opinion[0] == 'Dissent':
                    opinion_id = 'opinion_dissent_' + str(dissent_num)
                    dissent_num = dissent_num + 1
                if opinion[0] == 'Seriatim opinion':
                    opinion_id = 'opinion_seriatim_' + str(seriatim_num)
                    seriatim_num = seriatim_num + 1
                array.append([opinion_id, opinion[1]])

#             print('length: ' + str(len(array)))
            
            if len(array) > 1:
                
                df = pd.DataFrame(array)
                df.index = df[0]
                df = df.drop(columns=[0])
                df = df.T
                case_data = pd.concat([case_data, df], sort=False)

            if len(array) == 0:
                error_urls.append(url)

        except Exception:
            error_urls.append(url)
            pass
    
    return case_data


In [None]:
data = scrapeUrls(case_urls)
data
headers
error_urls

In [None]:
def fixErrorUrls(url_list):
    fixed_urls = []
    for url in url_list:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        next_url_li = soup.select('li')[4]
        next_url_a = next_url_li.select('a')[0]
#         nex
        fixed_url = 'https://en.wikipedia.org' + next_url_a.get('href')
        fixed_urls.append(fixed_url)
#         print([url, next_url_a.get('href')])
#         headers.append(soup.select('h1')[0].get_text())
    return fixed_urls
fixed_urls = fixErrorUrls(error_urls)

fixed_urls

In [None]:
newer_data = scrapeUrls(fixed_urls)

newer_data

In [None]:


newer_data = pd.concat([data, newer_data], sort=False)

newer_data

In [None]:
newer_data['reargued'] = np.where(newer_data['argued'].str.contains('Reargued') == True, 
                                  newer_data['argued'].str.split('Reargued ')[1],
                                  np.nan)
# newer_data['argued'] = np.where(newer_data['argued'].str.contains('Reargued') == True,
#                                 newer_data['argued'].str.split('Reargued ')[0],
#                                 newer_data['argued'])
newer_data

In [None]:
newer_data.to_csv('Desktop/newer_data.csv')

In [None]:
th_array = []
td_array = []
array = []
previous_heading = ''
opinions = []

for tr in soup.select('table tr'):
    children_th = tr.findChildren('th')
    children_td = tr.findChildren('td')
    children_dl = tr.findChildren('dl')
    if children_dl != []:
        children_dl_dt = children_dl[0].findChildren('dt')
        children_dl_dd = children_dl[0].findChildren('dd')

    if children_td == []:
        children_td = ''
    else:
        children_td = children_td[0].get_text()
    
    if children_th == []:
        children_th = ''
    else:
        children_th = children_th[0].get_text()
    
    if array == []:
        array.append(['case_name', children_th])
    
    if children_th.split(' ')[0] == 'Argued':
        children_th = children_th.replace('Argued ', '')
        children_th = children_th.replace('Decided ', 'DELIMITER')
        children_th = children_th.split('DELIMITER')
        array.append(['argued', children_th[0]])
        array.append(['decided', children_th[1]])
        
    if children_th == 'Full case name':
        array.append(['case_name_full', children_td])
        
    if children_th == 'Citations':
        array.append(['citations', children_td.split('; ')])
        
    if children_th == 'Prior history':
        array.append(['prior_history', children_td])
        
    if children_th == 'Subsequent history':
        array.append(['subsequent_history', children_td])
    
    if children_th == 'Holding':
        previous_heading = 'holding'
        continue
        
    if (children_th == '') & (previous_heading == 'holding'):
        array.append([previous_heading, children_td])
        
    if children_th == 'Court membership':
        previous_heading = 'court_members'
        continue
        
    if (children_th == '') & (previous_heading == 'court_members'):        
        chief_justice = [a.get_text() for a in children_dl_dd[0].findChildren('a')]
        associate_justices = [a.get_text() for a in children_dl_dd[1].findChildren('a')]
        array.append(['chief_justice', chief_justice[0]])
        justice_num = 1
        for justice in associate_justices:
            justice_id = 'associate_justice' + str(justice_num)
            array.append([justice_id, justice])
            justice_num = justice_num + 1
        
    if (children_th == 'Case opinions'):
        previous_heading = 'opinions'
        continue
    
    if (previous_heading == 'opinions') & ((children_th == 'Majority') | 
                                           (children_th == 'Dissent') | 
                                           (children_th == 'Concurrence') | 
                                           (children_th == 'Seriatim opinion')):
        opinions.append([children_th, children_td])
        continue
    
    if children_th == 'Laws applied':
        previous_heading = 'applied_laws'
        continue
    
    if (previous_heading == 'applied_laws') & (children_th == ''):
        array.append([previous_heading, children_td])
        previous_heading = ''
        
    if children_td == 'Overruled by':
        previous_heading = 'overruled_by'
        continue
        
    if children_td == 'This case overturned a previous ruling or rulings':
        previous_heading = 'overruled'
        continue
        
    if children_td == 'Superseded by':
        previous_heading = 'superseded'
        continue
        
    if (previous_heading == 'overruled_by') & (children_th == ''):
        array.append([previous_heading, children_td])
        previous_heading = ''
        
    if (previous_heading == 'overruled') & (children_th == ''):
        array.append([previous_heading, children_td])
        previous_heading = ''
        
    if (previous_heading == 'superseded') & (children_th == ''):
        array.append([previous_heading, children_td])
        previous_heading = ''
    
maj_num = concur_num = dissent_num = seriatim_num = 1 
for opinion in opinions:
    opinion_id = ''
    if opinion[0] == 'Majority':
        opinion_id = 'opinion_maj'
    if opinion[0] == 'Concurrence':
        opinion_id = 'opinion_concur_' + str(concur_num)
        concur_num = concur_num + 1
    if opinion[0] == 'Dissent':
        opinion_id = 'opinion_dissent_' + str(dissent_num)
        dissent_num = dissent_num + 1
    if opinion[0] == 'Seriatim opinion':
        opinion_id = 'opinion_seriatim_' + str(seriatim_num)
        seriatim_num = seriatim_num + 1
    array.append([opinion_id, opinion[1]])

columns = ['case_name', 'argued_decided', 'case_name_full', 'citations', 'prior_history', 'holding', 'court_members', 'applied_laws', 'overruled_by']

df = pd.DataFrame(array)
df.index = df[0]
df = df.drop(columns=[0])
# df = df.drop(columns=['data_type'], axis=1)
df = df.T
df

In [None]:
def checkForData(table):
    print(table.get_text)

tbl = soup.find_all('table', class_='scotus')[0]    

checkForData(tbl)

In [None]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from functools import reduce
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
# for a in soup.f ind_all('a', {'href': re.compile("^/watch")}):
for case_name in case_names:
    case_name_words = case_name.split(' ')
    search_query = reduce((lambda x, y: x + '+' + y), case_name_words)
#     print(query)
    try:
        url = 'https://en.wikipedia.org/w/index.php?search=' + search_query
        browser.get(url)
    except NoSuchElementException:
        pass
browser.close()