## Data Scraping using Beautiful Soup
- Import Beautiful soup
- Make a GET request to fetch Page Data
-Parse HTML
- Filter Relevant Parts

In [11]:
!pip install bs4



## GET request

In [12]:
from urllib.request import urlopen

In [13]:
android_url="https://en.wikipedia.org/wiki/Android_version_history"

In [14]:
android_data=urlopen(android_url)
print(type(android_data))

<class 'http.client.HTTPResponse'>


In [15]:
android_html=android_data.read()
#print(android_html)

In [16]:
android_data.close()

## Extracting HTML

In [17]:
from bs4 import BeautifulSoup as soup

In [18]:
android_soup=soup(android_html,'html.parser')
print(android_soup)

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Android version history - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"32370298-a98d-4117-9e12-72a5cf5cd264","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Android_version_history","wgTitle":"Android version history","wgCurRevisionId":1041572562,"wgRevisionId":1041572562,"wgArticleId":30752816,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Wikipedia indefinitely semi-protected pages","Articles with short description","Short description is different from Wikidata","Use mdy 

In [19]:
print(type(android_soup))

<class 'bs4.BeautifulSoup'>


In [20]:
android_soup.findAll('h1',{})

[<h1 class="firstHeading" id="firstHeading">Android version history</h1>]

In [22]:
tables=android_soup.findAll('table',{'class':'wikitable'})
print(len(tables))

32


In [23]:
android_table=tables[0]
print(android_table)

<table class="wikitable">
<tbody><tr>
<th>Name
</th>
<th>Internal codename
</th>
<th>Version number(s)
</th>
<th>Initial stable<br/>release date
</th>
<th>Supported<br/>(security fixes)
</th>
<th>API level
</th>
<th>References
</th></tr>
<tr>
<td><a class="mw-redirect" href="/wiki/Android_1.0" title="Android 1.0">Android 1.0</a>
</td>
<td class="table-na" data-sort-value="" style="background: #ececec; color: #2C2C2C; vertical-align: middle; text-align: center;">N/A
</td>
<td>1.0
</td>
<td>September 23, 2008
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>1
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-:0_14-0"><a href="#cite_note-:0-14">[14]</a></sup>
</td></tr>
<tr>
<td><a class="mw-redirect" href="/wiki/Android_1.1" title="Android 1.1">Android 1.1</a>
</td>
<td>Petit Four
</td>
<td>1.1

## Extract uselful info from HTML

In [24]:
#extract table header
headers=android_table.findAll('th')
print(headers)

[<th>Name
</th>, <th>Internal codename
</th>, <th>Version number(s)
</th>, <th>Initial stable<br/>release date
</th>, <th>Supported<br/>(security fixes)
</th>, <th>API level
</th>, <th>References
</th>]


In [27]:
#extract data in this tag
column_titles=[ct.text[:-1] for ct in headers]
print(column_titles)

['Name', 'Internal codename', 'Version number(s)', 'Initial stablerelease date', 'Supported(security fixes)', 'API level', 'References']


In [33]:
rows_data=android_table.findAll('tr')[1:]
print(len(rows_data))
first_row=rows_data[0].findAll('td',{})
for d in first_row:
    print(d.text)

31
Android 1.0

N/A

1.0

September 23, 2008

No

1

[9][14]



In [57]:
table_rows=[]
for row in rows_data:
    current_row=[]
    row_data=row.findAll('td',{})
    for idx,data in enumerate(row_data):
        
        current_row.append(data.text[:-1])
       
    table_rows.append(current_row)

In [58]:
print(table_rows)

[['Android 1.0', 'N/A', '1.0', 'September 23, 2008', 'No', '1', '[9][14]'], ['Android 1.1', 'Petit Four', '1.1', 'February 9, 2009', 'No', '2', '[9][14][15]'], ['Android Cupcake', 'Cupcake', '1.5', 'April 27, 2009', 'No', '3', '[14][16]'], ['Android Donut', 'Donut', '1.6', 'September 15, 2009', 'No', '4', '[14][17]'], ['Android Eclair', 'Eclair', '2.0', 'October 27, 2009', 'No', '5', '[14][18][19]'], ['2.0.1', 'December 3, 2009', 'No', '6', ''], ['2.1', 'January 11, 2010', 'No', '7', '[20]'], ['Android Froyo', 'Froyo', '2.2 – 2.2.3', 'May 20, 2010', 'No', '8', '[14][21]'], ['Android Gingerbread', 'Gingerbread', '2.3 – 2.3.2', 'December 6, 2010', 'No', '9', '[14][22]'], ['2.3.3 - 2.3.7', 'February 9, 2011', 'No', '10', ''], ['Android Honeycomb', 'Honeycomb', '3.0', 'February 22, 2011', 'No', '11', '[14][23]'], ['3.1', 'May 10, 2011', 'No', '12', ''], ['3.2 - 3.2.6', 'July 15, 2011', 'No', '13', ''], ['Android Ice Cream Sandwich', 'Ice Cream Sandwich', '4.0 – 4.0.2', 'October 18, 2011', 

### Writing and Reading CSV Files -- 
Comma Separated files

In [84]:
filename='android_version_history.csv'
with open(filename,'w',encoding='utf-8') as f:
    #write the header
    header_string=','.join(column_titles)
    header_string+='\n'
    f.write(header_string)
    
    for row in table_rows[:-1]:
        row_string=""
        for w in row:
            w=w.replace(', ',' ')
            row_string+=w+','
        row_string=row_string[:-1]
        row_string+='\n'
        f.write(row_string)

### Data Cleaning
- remove unwanted commas& symbols
-undesired information

In [85]:
import pandas as pd

In [86]:
df=pd.read_csv('android_version_history.csv')

In [87]:
df.head()

Unnamed: 0,Name,Internal codename,Version number(s),Initial stablerelease date,Supported(security fixes),API level,References
0,Android 1.0,,1.0,September 23 2008,No,1.0,[9][14]
1,Android 1.1,Petit Four,1.1,February 9 2009,No,2.0,[9][14][15]
2,Android Cupcake,Cupcake,1.5,April 27 2009,No,3.0,[14][16]
3,Android Donut,Donut,1.6,September 15 2009,No,4.0,[14][17]
4,Android Eclair,Eclair,2.0,October 27 2009,No,5.0,[14][18][19]
