## Data Scraping Using Beautiful Soup

- Beautiful Soup
- Make a GET Request to Fetch Page Data
- Parse HTML
- Filter Relevant Parts

### 1. Fetching Data

In [1]:
# Installation (run once). The PyPI distribution is named `beautifulsoup4`; it is imported as `bs4`.
# %pip install beautifulsoup4

In [2]:
import csv
from urllib.request import urlopen

import pandas as pd


In [3]:
# Wikipedia page whose tables are scraped in the cells below.
android_url = "https://en.wikipedia.org/wiki/Android_version_history"

In [4]:
# Open the page over HTTP. The response object is read and then explicitly
# closed in the following cells, so it must stay open after this cell.
android_data = urlopen(android_url)
print(type(android_data))

<class 'http.client.HTTPResponse'>


In [5]:
# Read the whole response body (the page's HTML) while the connection is still open.
android_html = android_data.read()
# print(android_html)

In [6]:
# Release the HTTP connection now that the body has been read into android_html.
android_data.close()

### 2. Parsing Data

In [7]:
from bs4 import BeautifulSoup as soup

In [8]:
# Parse the downloaded HTML using the 'html.parser' backend.
android_soup = soup(android_html, 'html.parser')
# print(android_soup)

In [9]:
# Confirm that parsing produced a BeautifulSoup object.
print(type(android_soup))

<class 'bs4.BeautifulSoup'>


In [10]:
# Attribute-style access: show the page's <h1> heading tag.
print(android_soup.h1)

<h1 class="firstHeading" id="firstHeading" lang="en">Android version history</h1>


In [11]:
# List every <h1> tag in the document. `find_all` is the current method name;
# `findAll` is the legacy Beautiful Soup 3 spelling, and the empty attrs dict
# it was given added no filtering.
print(android_soup.find_all('h1'))

[<h1 class="firstHeading" id="firstHeading" lang="en">Android version history</h1>]


In [12]:
# Collect every <table> that carries the "wikitable" CSS class.
# `find_all` replaces the legacy `findAll` spelling; `class_` is the keyword
# form of the {'class': ...} attribute filter.
tables = android_soup.find_all('table', class_='wikitable')
print(len(tables))

31


In [13]:
# The first wikitable is the version overview (name, version number, release date, ...).
android_table = tables[0]
print(android_table)

<table class="wikitable">
<tbody><tr>
<th>Name
</th>
<th>Version number(s)
</th>
<th>Initial release date
</th>
<th>Supported
</th>
<th>API level
</th>
<th>References
</th></tr>
<tr>
<td rowspan="2">No codename
</td>
<td>1.0
</td>
<td>September 23, 2008
</td>
<td>No
</td>
<td>1
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup>
</td></tr>
<tr>
<td>1.1
</td>
<td>February 9, 2009
</td>
<td>No
</td>
<td>2
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-2"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-11"><a href="#cite_note-11">[11]</a></sup>
</td></tr>
<tr>
<td><a href="/wiki/Android_Cupcake" title="Android Cupcake">Cupcake</a>
</td>
<td>1.5
</td>
<td>April 27, 2009
</td>
<td>No
</td>
<td>3
</td>
<td>
</td></tr>
<tr>
<td><a href="/wiki/Android_Donut" title="Android Donut">Donut</a>
</td>
<td>

### 3. Extracting Useful Information

- Remove Undesired Tags
- Extract Table Header & Data

In [14]:
# The <th> cells of the overview table hold the column names.
# `find_all` replaces the legacy `findAll` spelling.
headers = android_table.find_all('th')
print(headers)

[<th>Name
</th>, <th>Version number(s)
</th>, <th>Initial release date
</th>, <th>Supported
</th>, <th>API level
</th>, <th>References
</th>]


In [15]:
# Number of header cells, i.e. columns in the table.
print(len(headers))

6


In [16]:
# .text keeps the newline that follows the label inside the <th> element,
# which is why the printed value is followed by an extra blank line.
print(headers[0].text)

Name



In [17]:
# Build the list of column names from the header cells.
# Strip surrounding whitespace rather than slicing off the last character:
# the slice silently truncated any header that did not end in a newline.
column_title = [header.text.strip() for header in headers]
print(column_title)

['Name', 'Version number(s)', 'Initial release date', 'Supported', 'API level', 'References']


In [18]:
# Every <tr> after the first is a data row; the first <tr> holds the headers.
# `find_all` replaces the legacy `findAll` spelling.
rows_data = android_table.find_all('tr')[1:]
print(len(rows_data))

18


In [19]:
# Inspect the raw HTML of the first data row.
print(rows_data[0])

<tr>
<td rowspan="2">No codename
</td>
<td>1.0
</td>
<td>September 23, 2008
</td>
<td>No
</td>
<td>1
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup>
</td></tr>


In [20]:
# Print the text of each <td> cell in the first data row.
# `find_all` replaces the legacy `findAll` spelling; the empty attrs dict
# passed before added no filtering and is dropped.
first_row = rows_data[0].find_all('td')
for d in first_row:
    print(d.text)

No codename

1.0

September 23, 2008

No

1

[9]



In [21]:
# Extract the text of every <td> cell, one inner list per table row.
# - `.strip()` replaces the `[:-1]` slice, which dropped the last character
#   even when a cell did not end in a newline.
# - The unused `enumerate` index is gone and `find_all` replaces `findAll`.
# Rows keep exactly the cells present in their <tr>, so a row whose Name cell
# is covered by a rowspan from the row above is one element shorter.
table_rows = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in rows_data
]

In [22]:
# Inspect the extracted rows (one list of cell strings per table row).
print(table_rows)

[['No codename', '1.0', 'September 23, 2008', 'No', '1', '[9]'], ['1.1', 'February 9, 2009', 'No', '2', '[9][11]'], ['Cupcake', '1.5', 'April 27, 2009', 'No', '3', ''], ['Donut', '1.6', 'September 15, 2009', 'No', '4', '[12]'], ['Eclair', '2.0 – 2.1', 'October 26, 2009', 'No', '5 – 7', '[13]'], ['Froyo', '2.2 – 2.2.3', 'May 20, 2010', 'No', '8', '[14]'], ['Gingerbread', '2.3 – 2.3.7', 'December 6, 2010', 'No', '9 – 10', '[15]'], ['Honeycomb', '3.0 – 3.2.6', 'February 22, 2011', 'No', '11 – 13', '[16]'], ['Ice Cream Sandwich', '4.0 – 4.0.4', 'October 18, 2011', 'No', '14 – 15', '[17]'], ['Jelly Bean', '4.1 – 4.3.1', 'July 9, 2012', 'No', '16 – 18', '[18]'], ['KitKat', '4.4 – 4.4.4', 'October 31, 2013', 'No', '19 – 20', '[19]'], ['Lollipop', '5.0 – 5.1.1', 'November 12, 2014', 'No', '21 – 22', '[20]'], ['Marshmallow', '6.0 – 6.0.1', 'October 5, 2015', 'No', '23', '[21]'], ['Nougat', '7.0 – 7.1.2', 'August 22, 2016', 'No', '24 – 25', '[22][23][24][25]'], ['Oreo', '8.0 – 8.1', 'August 21, 

### 4. Writing & Reading CSV Files

In [23]:
filename = "Android_Version_History.csv"

# Naive export: cells are joined with commas exactly as extracted, so a cell
# that itself contains a comma (such as the release dates) spills into an
# extra column when the file is read back.
with open(filename, 'w', encoding='utf-8') as f:
    # Header line first, then one line per table row.
    f.write(",".join(column_title) + '\n')
    for row in table_rows:
        f.write(','.join(row) + '\n')

### 5. Data Cleaning

- Remove Unwanted Commas & Symbols
- Remove Undesired Information

In [24]:
filename = "Android_Version_History.csv"

# Write the cleaned table with the csv module instead of hand-built strings,
# so cells containing quotes or newlines are quoted correctly.
# newline='' lets the csv writer control line endings itself.
with open(filename, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')

    # Write The Header
    writer.writerow(column_title)

    # Write Row Data
    for idx, row in enumerate(table_rows):
        # Remove commas inside cells, e.g. "September 23, 2008" -> "September 23 2008".
        cleaned = [w.replace(',', '') for w in row]
        if idx == 1:
            # The second row has no Name cell of its own: the first row's
            # "No codename" cell spans both rows (rowspan="2"), so re-insert it.
            cleaned.insert(0, "No codename")
        writer.writerow(cleaned)

In [25]:
# Load the cleaned CSV back in and preview the first rows.
df = pd.read_csv("Android_Version_History.csv")
df.head()

Unnamed: 0,Name,Version number(s),Initial release date,Supported,API level,References
0,No codename,1.0,September 23 2008,No,1,[9]
1,No codename,1.1,February 9 2009,No,2,[9][11]
2,Cupcake,1.5,April 27 2009,No,3,
3,Donut,1.6,September 15 2009,No,4,[12]
4,Eclair,2.0 – 2.1,October 26 2009,No,5 – 7,[13]


In [26]:
# Preview a longer slice of the table (up to 15 rows).
df.head(n=15)

Unnamed: 0,Name,Version number(s),Initial release date,Supported,API level,References
0,No codename,1.0,September 23 2008,No,1,[9]
1,No codename,1.1,February 9 2009,No,2,[9][11]
2,Cupcake,1.5,April 27 2009,No,3,
3,Donut,1.6,September 15 2009,No,4,[12]
4,Eclair,2.0 – 2.1,October 26 2009,No,5 – 7,[13]
5,Froyo,2.2 – 2.2.3,May 20 2010,No,8,[14]
6,Gingerbread,2.3 – 2.3.7,December 6 2010,No,9 – 10,[15]
7,Honeycomb,3.0 – 3.2.6,February 22 2011,No,11 – 13,[16]
8,Ice Cream Sandwich,4.0 – 4.0.4,October 18 2011,No,14 – 15,[17]
9,Jelly Bean,4.1 – 4.3.1,July 9 2012,No,16 – 18,[18]


### 6. Loading Local Files

In [27]:
# Example (left disabled): parse a saved HTML file from disk instead of fetching it.
# Presumably "android.html" is a local copy of the page; nothing in this notebook creates it.
# with open("android.html", encoding='utf-8') as f:
#     page_soup = soup(f, 'html.parser')