In [1]:
import requests
from bs4 import BeautifulSoup
import pandas

What we want to get: https://www.softschools.com/social_studies/state_abbreviations/

The page https://www.softschools.com/robots.txt contains
    
<pre>
User-agent: *
Disallow: /
</pre>

The "User-agent: *" means this section applies to all robots. The "Disallow: /" tells the robot that it should not visit any pages on the site.

The site's terms and conditions are also worth reading before scraping: https://www.softschools.com/terms_conditions.jsp

Let's try loading the data into pandas directly

In [2]:
# Let pandas download the page and parse every <table> it can find.
state_page_url = "https://www.softschools.com/social_studies/state_abbreviations/"
page_content = pandas.read_html(state_page_url)
type(page_content)

list

In [3]:
# How many entries read_html returned.
len(page_content)

4

In [4]:
# Check what kind of object the first list element is.
type(page_content[0])

pandas.core.frame.DataFrame

In [5]:
# (rows, columns) of the first parsed table.
page_content[0].shape

(1, 1)

In [6]:
# Peek at the contents of the first parsed table.
page_content[0].head()

Unnamed: 0,0
0,State Abbreviations State Abbreviations List ...


pandas finds tables in the HTML, but the first one comes back as a single (1, 1) cell with all the text squashed together, so let's see what requests + BeautifulSoup (bs4) can do.

In [7]:
# Fetch the raw HTML ourselves. An explicit timeout keeps this cell from
# hanging forever if the server never answers (requests sets no timeout
# by default).
response = requests.get(
    "https://www.softschools.com/social_studies/state_abbreviations/",
    timeout=30,
)
# True when the HTTP status code is below 400.
response.ok

True

In [8]:
# Parse the downloaded HTML into a tree, then print it with one tag per
# line so the document structure is easier to read.
page_html = response.text
soup = BeautifulSoup(page_html, 'html.parser')
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" >
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <title>
   State Abbreviations list : 50 states abbreviations
  </title>
  <meta content="State Abbreviations list : 50 states abbreviations" name="DESCRIPTION"/>
  <meta content="State Abbreviations list , 50 states abbreviations" name="KEYWORDS"/>
  <meta content="text/html;charset=utf-8" http-equiv="content-type">
   <meta content="text/css" http-equiv="Content-Style-Type">
    <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
    <link href="/css2/rd/bootstrap.css" rel="stylesheet" type="text/css"/>
    <link href="/css2/rd/ssdefault300Nav.css" rel="stylesheet" type="text/css"/>
    <link href="/css2/rd/responsivenav.css" rel="stylesheet" type="text/css"/>
    <script src="/js2/rd/jquery-1.11.1.min.js" type="text/javascript">
    </script>
    <script src="/js

If we knew in advance that we only cared about tables, we could use the SoupStrainer class to choose which parts of an incoming document are parsed.

In [9]:
# Collect every <table> tag in the document. find_all is the current bs4
# spelling; findAll is a legacy alias kept only for backward compatibility.
every_table = soup.find_all('table')
print(type(every_table))
print(len(every_table))

<class 'bs4.element.ResultSet'>
4


Reminder: the list "every_table" contains BeautifulSoup Tags

In [10]:
# Check the type of a single element of the result set.
type(every_table[0])

bs4.element.Tag

Look at the HTML contents

In [11]:
# Show the raw HTML of the first table found on the page.
every_table[0]

<table border="0" cellpadding="0" cellspacing="0" class="colorBgGreen">
<tr>
<td>
<table border="0" cellpadding="2" cellspacing="1">
<tr>
<td align="center" class="colorBgGreen" colspan="1"><font face="Arial,Helvetica,sans-serif" size="+1"><strong class="colorBgGreenText2"> State Abbreviations<strong></strong></strong></font></td>
</tr>
<tr>
<td align="left" class="colorBgGreenLight" style="padding-left:15px">
<strong class="colorBgGreenText">State Abbreviations List</strong><br>
<br/>
<span class="myFont14"><br/>
<br/>
<table>
<tr align="left">
<th>

 State </th>
<th align="CENTER">
  Abbreviation</th>
</tr>
<tr align="left">
<td>ALABAMA</td>
<td>AL</td>
</tr>
<tr align="left">
<td>ALASKA</td>
<td>AK</td>
</tr>
<tr align="left">
<td>ARIZONA </td>
<td>AZ</td>
</tr>
<tr align="left">
<td>ARKANSAS</td>
<td>AR</td>
</tr>
<tr align="left">
<td>CALIFORNIA </td>
<td>CA</td>
</tr>
<tr align="left">
<td>COLORADO </td>
<td>CO</td>
</tr>
<tr align="left">
<td>CONNECTICUT</td>
<td>CT</td>
</tr>
<

Careful observation reveals that the page has nested HTML tables.

Once we realize this, we can extract the table from the table

In [12]:
# Search inside the first table on the page for the tables nested within it.
outer_table = every_table[0]
inner_table = outer_table.find_all('table')
print(inner_table)

[<table border="0" cellpadding="2" cellspacing="1">
<tr>
<td align="center" class="colorBgGreen" colspan="1"><font face="Arial,Helvetica,sans-serif" size="+1"><strong class="colorBgGreenText2"> State Abbreviations<strong></strong></strong></font></td>
</tr>
<tr>
<td align="left" class="colorBgGreenLight" style="padding-left:15px">
<strong class="colorBgGreenText">State Abbreviations List</strong><br>
<br/>
<span class="myFont14"><br/>
<br/>
<table>
<tr align="left">
<th>

 State </th>
<th align="CENTER">
  Abbreviation</th>
</tr>
<tr align="left">
<td>ALABAMA</td>
<td>AL</td>
</tr>
<tr align="left">
<td>ALASKA</td>
<td>AK</td>
</tr>
<tr align="left">
<td>ARIZONA </td>
<td>AZ</td>
</tr>
<tr align="left">
<td>ARKANSAS</td>
<td>AR</td>
</tr>
<tr align="left">
<td>CALIFORNIA </td>
<td>CA</td>
</tr>
<tr align="left">
<td>COLORADO </td>
<td>CO</td>
</tr>
<tr align="left">
<td>CONNECTICUT</td>
<td>CT</td>
</tr>
<tr align="left">
<td>DELAWARE</td>
<td>DE</td>
</tr>
<tr align="left">
<td>FLORID

Let's validate the content is what we seek

In [13]:
# Gather every <tr> under the first inner table; this also picks up the
# rows of the table nested further inside it. find_all replaces the legacy
# findAll alias so the notebook uses one spelling throughout.
table_rows = inner_table[0].find_all('tr')
print(table_rows[0])

<tr>
<td align="center" class="colorBgGreen" colspan="1"><font face="Arial,Helvetica,sans-serif" size="+1"><strong class="colorBgGreenText2"> State Abbreviations<strong></strong></strong></font></td>
</tr>


That's not relevant. Look at the second row in our list

In [14]:
# Second <tr>: the row whose cell wraps the nested state table.
print(table_rows[1])

<tr>
<td align="left" class="colorBgGreenLight" style="padding-left:15px">
<strong class="colorBgGreenText">State Abbreviations List</strong><br>
<br/>
<span class="myFont14"><br/>
<br/>
<table>
<tr align="left">
<th>

 State </th>
<th align="CENTER">
  Abbreviation</th>
</tr>
<tr align="left">
<td>ALABAMA</td>
<td>AL</td>
</tr>
<tr align="left">
<td>ALASKA</td>
<td>AK</td>
</tr>
<tr align="left">
<td>ARIZONA </td>
<td>AZ</td>
</tr>
<tr align="left">
<td>ARKANSAS</td>
<td>AR</td>
</tr>
<tr align="left">
<td>CALIFORNIA </td>
<td>CA</td>
</tr>
<tr align="left">
<td>COLORADO </td>
<td>CO</td>
</tr>
<tr align="left">
<td>CONNECTICUT</td>
<td>CT</td>
</tr>
<tr align="left">
<td>DELAWARE</td>
<td>DE</td>
</tr>
<tr align="left">
<td>FLORIDA</td>
<td>FL</td>
</tr>
<tr align="left">
<td>GEORGIA</td>
<td>GA</td>
</tr>
<tr align="left">
<td>HAWAII</td>
<td>HI</td>
</tr>
<tr align="left">
<td>IDAHO</td>
<td>ID</td>
</tr>
<tr align="left">
<td>ILLINOIS</td>
<td>IL</td>
</tr>
<tr align="left">
<td>I

That's what we wanted. 

In [15]:
# Third <tr>: the header row of the nested table (<th> cells only).
print(table_rows[2])

<tr align="left">
<th>

 State </th>
<th align="CENTER">
  Abbreviation</th>
</tr>


In [16]:
# Fourth <tr>: the first actual data row (state name + abbreviation).
print(table_rows[3])

<tr align="left">
<td>ALABAMA</td>
<td>AL</td>
</tr>


Write a loop to extract what we desire

In [17]:
# Build one list per <tr> holding the text of each <td> found in that row.
# Rows that contain only <th> cells yield an empty list.
list_of_table_content = []
for tr in table_rows:
    cells = tr.find_all('td')
    # Use a distinct name for the cell so it does not shadow the row
    # variable `tr` of the enclosing loop.
    row = [cell.text for cell in cells]
    list_of_table_content.append(row)

In [18]:
# One entry was appended per <tr>, so this is the number of rows visited.
len(list_of_table_content)

56

In [19]:
# Entry for the first row: the table's title cell.
list_of_table_content[0]

[' State Abbreviations']

In [20]:
# Entry for the second row: the wrapper cell, whose text contains the
# whole nested table run together.
list_of_table_content[1]

['\nState Abbreviations List\n\n\n\n\n\n\n\n State \n\n  Abbreviation\n\n\nALABAMA\nAL\n\n\nALASKA\nAK\n\n\nARIZONA \nAZ\n\n\nARKANSAS\nAR\n\n\nCALIFORNIA \nCA\n\n\nCOLORADO \nCO\n\n\nCONNECTICUT\nCT\n\n\nDELAWARE\nDE\n\n\nFLORIDA\nFL\n\n\nGEORGIA\nGA\n\n\nHAWAII\nHI\n\n\nIDAHO\nID\n\n\nILLINOIS\nIL\n\n\nINDIANA\nIN\n\n\nIOWA\nIA\n\n\nKANSAS\nKS\n\n\nKENTUCKY\nKY\n\n\nLOUISIANA\nLA\n\n\nMAINE\nME\n\n\nMARYLAND\nMD\n\n\nMASSACHUSETTS\nMA\n\n\nMICHIGAN\nMI\n\n\nMINNESOTA\nMN\n\n\nMISSISSIPPI\nMS\n\n\nMISSOURI\nMO\n\n\nMONTANA\nMT\n\n\nNEBRASKA\nNE\n\n\nNEVADA\nNV\n\n\nNEW HAMPSHIRE\nNH\n\n\nNEW JERSEY\nNJ\n\n\nNEW MEXICO\nNM\n\n\nNEW YORK\nNY\n\n\nNORTH CAROLINA\nNC\n\n\nNORTH DAKOTA\nND\n\n\nOHIO\nOH\n\n\nOKLAHOMA\nOK\n\n\nOREGON\nOR\n\n\nPENNSYLVANIA\nPA\n\n\nRHODE ISLAND\nRI\n\n\nSOUTH CAROLINA\nSC\n\n\nSOUTH DAKOTA\nSD\n\n\nTENNESSEE\nTN\n\n\nTEXAS\nTX\n\n\nUTAH\nUT\n\n\nVERMONT\nVT\n\n\nVIRGINIA \nVA\n\n\nWASHINGTON\nWA\n\n\nWEST VIRGINIA\nWV\n\n\nWISCONSIN\nWI\n\n\nWYOMING\nWY\n\n 

In [21]:
# Entry for the header row: it has <th> cells but no <td>, hence empty.
list_of_table_content[2]

[]

In [22]:
# First real data entry.
list_of_table_content[3]

['ALABAMA', 'AL']

In [23]:
# Second real data entry.
list_of_table_content[4]

['ALASKA', 'AK']

In [24]:
# Skip the first three entries (title, wrapper and header rows) and keep
# everything after them; an open-ended slice already runs to the end.
list_of_table_content[3:]

[['ALABAMA', 'AL'],
 ['ALASKA', 'AK'],
 ['ARIZONA ', 'AZ'],
 ['ARKANSAS', 'AR'],
 ['CALIFORNIA ', 'CA'],
 ['COLORADO ', 'CO'],
 ['CONNECTICUT', 'CT'],
 ['DELAWARE', 'DE'],
 ['FLORIDA', 'FL'],
 ['GEORGIA', 'GA'],
 ['HAWAII', 'HI'],
 ['IDAHO', 'ID'],
 ['ILLINOIS', 'IL'],
 ['INDIANA', 'IN'],
 ['IOWA', 'IA'],
 ['KANSAS', 'KS'],
 ['KENTUCKY', 'KY'],
 ['LOUISIANA', 'LA'],
 ['MAINE', 'ME'],
 ['MARYLAND', 'MD'],
 ['MASSACHUSETTS', 'MA'],
 ['MICHIGAN', 'MI'],
 ['MINNESOTA', 'MN'],
 ['MISSISSIPPI', 'MS'],
 ['MISSOURI', 'MO'],
 ['MONTANA', 'MT'],
 ['NEBRASKA', 'NE'],
 ['NEVADA', 'NV'],
 ['NEW HAMPSHIRE', 'NH'],
 ['NEW JERSEY', 'NJ'],
 ['NEW MEXICO', 'NM'],
 ['NEW YORK', 'NY'],
 ['NORTH CAROLINA', 'NC'],
 ['NORTH DAKOTA', 'ND'],
 ['OHIO', 'OH'],
 ['OKLAHOMA', 'OK'],
 ['OREGON', 'OR'],
 ['PENNSYLVANIA', 'PA'],
 ['RHODE ISLAND', 'RI'],
 ['SOUTH CAROLINA', 'SC'],
 ['SOUTH DAKOTA', 'SD'],
 ['TENNESSEE', 'TN'],
 ['TEXAS', 'TX'],
 ['UTAH', 'UT'],
 ['VERMONT', 'VT'],
 ['VIRGINIA ', 'VA'],
 ['WASHINGTON',