# Web Scraping Data from HTML

In [1]:
import requests #library used to connect to a website

In [2]:
#specify the url
URL = "https://simple.wikipedia.org/wiki/List_of_U.S._state_capitals"

In [3]:
#connect to the website as the variable 'page'
page = requests.get(URL)

In [4]:
#verify successful connection to website
page.status_code

200

In [5]:
#save string format of website HTML into a variable
HTMLstr = page.text

In [6]:
#import the Beautiful soup functions to parse the data returned from the website
from bs4 import BeautifulSoup

In [7]:
#Parse the html in the 'page' variable, and store it in Beautiful Soup format
soup = BeautifulSoup(HTMLstr, "lxml")

In [8]:
#look at contents of page - wall of text
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of U.S. state capitals - Simple English Wikipedia, the free encyclopedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_U.S._state_capitals","wgTitle":"List of U.S. state capitals","wgCurRevisionId":6669739,"wgRevisionId":6669739,"wgArticleId":18635,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["State capitals in the United States","Lists of cities in the United States"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","J

In [9]:
#format page contents to include indentation
print (soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of U.S. state capitals - Simple English Wikipedia, the free encyclopedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_U.S._state_capitals","wgTitle":"List of U.S. state capitals","wgCurRevisionId":6669739,"wgRevisionId":6669739,"wgArticleId":18635,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["State capitals in the United States","Lists of cities in the United States"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonth

In [10]:
# soup.<tag>: Return content between opening and closing tag including tag.
soup.title

<title>List of U.S. state capitals - Simple English Wikipedia, the free encyclopedia</title>

In [11]:
# soup.<tag>.string: Return string within given tag
print(soup.title.string)

List of U.S. state capitals - Simple English Wikipedia, the free encyclopedia


In [12]:
#shows the first <a> tag on the page
soup.a

<a id="top"></a>

In [13]:
#finds all <a> tags on the page
soup.find_all("a")

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a href="/wiki/United_States" title="United States">United States</a>,
 <a href="/wiki/U.S._state" title="U.S. state">state</a>,
 <a class="mw-redirect" href="/wiki/Capital_(city)" title="Capital (city)">capital</a>,
 <a href="/wiki/City" title="City">cities</a>,
 <a href="/wiki/Legislature" title="Legislature">capitol</a>,
 <a href="#cite_note-1">[1]</a>,
 <a class="image" href="/wiki/File:US_states_in_which_the_capital_is_the_largest_city.svg"><img alt="" class="thumbimage" data-file-height="593" data-file-width="959" decoding="async" height="216" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/e0/US_states_in_which_the_capital_is_the_largest_city.svg/350px-US_states_in_which_the_capital_is_the_largest_city.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/e0/US_states_in_which_the_capital_is_the_largest_city.svg/525p

In [14]:
#show hyperlink reference for all <a> tags
all_links=soup.find_all("a")
for link in all_links:
    print (link.get("href"))

None
#mw-head
#p-search
/wiki/United_States
/wiki/U.S._state
/wiki/Capital_(city)
/wiki/City
/wiki/Legislature
#cite_note-1
/wiki/File:US_states_in_which_the_capital_is_the_largest_city.svg
/wiki/File:US_states_in_which_the_capital_is_the_largest_city.svg
/wiki/United_States_Constitution
/wiki/List_of_United_States_cities_by_population
/w/index.php?title=List_of_metropolitan_statistical_areas&action=edit&redlink=1
/wiki/Alabama
/wiki/Montgomery,_Alabama
/wiki/Birmingham,_Alabama
/wiki/Alaska
/wiki/Juneau,_Alaska
/wiki/Arizona
/wiki/Phoenix,_Arizona
/wiki/Arkansas
/wiki/Little_Rock,_Arkansas
/wiki/California
/wiki/Sacramento,_California
/wiki/Colorado
/wiki/Denver,_Colorado
/wiki/Connecticut
/wiki/Hartford,_Connecticut
/wiki/Delaware
/wiki/Dover,_Delaware
/wiki/Florida
/wiki/Tallahassee,_Florida
/wiki/Georgia_(U.S._state)
/wiki/Atlanta
/wiki/Hawaii
/wiki/Honolulu
/wiki/Idaho
/wiki/Boise,_Idaho
/wiki/Illinois
/wiki/Springfield,_Illinois
/wiki/Springfield,_Illinois
/wiki/Abraham_Lincoln
/

In [15]:
#fina all the <table> tags
all_tables=soup.find_all('table')
all_tables

[<table class="wikitable sortable">
 <caption>State capitals of the United States
 </caption>
 <tbody><tr>
 <th rowspan="2">State</th>
 <th rowspan="2">Abr.</th>
 <th rowspan="2">State-hood</th>
 <th rowspan="2">Capital</th>
 <th rowspan="2">Capital since</th>
 <th rowspan="2">Area (mi²)</th>
 <th colspan="4">Population (2010)</th>
 <th rowspan="2">Notes
 </th></tr>
 <tr>
 <th><a href="/wiki/List_of_United_States_cities_by_population" title="List of United States cities by population">Municipal</a> (Within city proper boundaries)
 </th>
 <th><a class="new" href="/w/index.php?title=List_of_metropolitan_statistical_areas&amp;action=edit&amp;redlink=1" title="List of metropolitan statistical areas (not yet started)">Metropolitan</a> (Both within the capital city proper and the surrounding area of the city proper)
 </th>
 <th>Rank in state
 </th>
 <th>Rank in US
 </th></tr>
 <tr>
 <td><a href="/wiki/Alabama" title="Alabama">Alabama</a></td>
 <td>AL</td>
 <td align="center">1819</td>
 <td><

In [16]:
#get the <table> tag that contains the data we want to scrape
right_table=soup.find('table', class_='wikitable sortable')
right_table

<table class="wikitable sortable">
<caption>State capitals of the United States
</caption>
<tbody><tr>
<th rowspan="2">State</th>
<th rowspan="2">Abr.</th>
<th rowspan="2">State-hood</th>
<th rowspan="2">Capital</th>
<th rowspan="2">Capital since</th>
<th rowspan="2">Area (mi²)</th>
<th colspan="4">Population (2010)</th>
<th rowspan="2">Notes
</th></tr>
<tr>
<th><a href="/wiki/List_of_United_States_cities_by_population" title="List of United States cities by population">Municipal</a> (Within city proper boundaries)
</th>
<th><a class="new" href="/w/index.php?title=List_of_metropolitan_statistical_areas&amp;action=edit&amp;redlink=1" title="List of metropolitan statistical areas (not yet started)">Metropolitan</a> (Both within the capital city proper and the surrounding area of the city proper)
</th>
<th>Rank in state
</th>
<th>Rank in US
</th></tr>
<tr>
<td><a href="/wiki/Alabama" title="Alabama">Alabama</a></td>
<td>AL</td>
<td align="center">1819</td>
<td><a href="/wiki/Montgomery,_A

In [17]:
#set empty lists to hold data of each column
A=[]
B=[]
C=[]
D=[]
E=[]
F=[]
G=[]
H=[]
I=[]
J=[]
K=[]

#find all <tr> tags in the table and go through each one (row)
for row in right_table.findAll("tr"):
    
    #get all the <td> tags for each <tr> tag
    cells = row.findAll('td')
    
    #if there are 11 <td> tags
    if len(cells)==11: 
        
        A.append(cells[0].find(text=True)) #gets info in State column and adds it to list A
        B.append(cells[1].find(text=True)) # gets info from Abr. column and adds it to list B
        C.append(cells[2].find(text=True)) # gets info from Statehood column; add it to list C
        D.append(cells[3].find(text=True)) # gets info from Capital column and adds it to list D
        E.append(cells[4].find(text=True)) # gets info from Capital since column and adds it to list E
        F.append(cells[5].find(text=True)) # gets info from Area column and adds it to list F
        G.append(cells[6].find(text=True)) # gets info from Municipal column and adds it to list G
        H.append(cells[7].find(text=True)) # gets info from Metropolitan column and adds it to list H
        I.append(cells[8].find(text=True)) # gets info from Rank in state column and adds it to list I
        J.append(cells[9].find(text=True)) # gets info from Rank in US column and adds it to list J
        K.append(cells[10].find(text=True)) # gets info from Notes column and adds it to list K

In [18]:
#verify data in list A
A

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

In [21]:
len(A)

50

In [19]:
#import pandas to convert list to data frame
import pandas as pd

df=pd.DataFrame(A, columns=['State']) #turn list A into dataframe first

#add other lists as new columns
df['Abr'] = B
df['Statehood'] = C
df['Capital'] = D
df['Capital_Since'] = E
df['Area'] = F
df['Municipal'] = G
df['Metropolitan'] = H
df['StateRank'] = I
df['USRank'] = J
df['Notes'] = K

#show first 5 rows of created dataframe
df.head()

Unnamed: 0,State,Abr,Statehood,Capital,Capital_Since,Area,Municipal,Metropolitan,StateRank,USRank,Notes
0,Alabama,AL,1819,Montgomery,1846,155.4,205764,374536,2,102.0,Birmingham
1,Alaska,AK,1959,Juneau,1906,2716.7,31275,710231,3,,Largest capital by municipal land area.
2,Arizona,AZ,1912,Phoenix,1889,474.9,1445632,4192887,1,6.0,Phoenix is the most populous capital city in t...
3,Arkansas,AR,1836,Little Rock,1821,116.2,193524,699757,1,117.0,
4,California,CA,1850,Sacramento,1854,97.2,466488,2149127,6,35.0,


In [22]:
#export scraped data to a csv file
df.to_csv("CapitalList.csv", index=False)