# Web Scraping
A technique of extracting information from websites, mostly focuses on the transformation of unstructured data (HTM L format) on the web into structured data (spreadsheet or database).

In [1]:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
wiki = "https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"

In [3]:
http = urllib3.PoolManager()
r = http.request('GET', wiki)
# r.status
page = r.data



Parse the html in the 'page' variable, and store it in Beautiful Soup format

In [4]:
soup = BeautifulSoup(page, 'lxml')

Working with HTML tags

In [5]:
soup.title

<title>List of state and union territory capitals in India - Wikipedia</title>

In [6]:
soup.title.string

'List of state and union territory capitals in India - Wikipedia'

In [7]:
soup.a

<a id="top"></a>

In [8]:
soup.find_all('a')

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Featured_lists" title="This is a featured list. Click here for more information."><img alt="This is a featured list. Click here for more information." data-file-height="438" data-file-width="462" height="19" src="//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/20px-Cscr-featured.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/30px-Cscr-featured.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/e/e7/Cscr-featured.svg/40px-Cscr-featured.svg.png 2x" width="20"/></a>,
 <a href="#mw-head">navigation</a>,
 <a href="#p-search">search</a>,
 <a href="/wiki/States_and_union_territories_of_India" title="States and union territories of India">States and union<br/>
 territories of India</a>,
 <a class="image" href="/wiki/File:Flag_of_India.svg"><img alt="Flag of India.svg" data-file-height="900" data-file-width="1350" height="47" src="//upload.wikimedia.org/wikipedia/en/thumb/4/41/Flag_of_India.svg

the above code shows all the links including titles, links and other information.

In [9]:
all_links = soup.find_all('a')
for link in all_links:
    print(link.get('href'))

None
/wiki/Wikipedia:Featured_lists
#mw-head
#p-search
/wiki/States_and_union_territories_of_India
/wiki/File:Flag_of_India.svg
/wiki/List_of_states_and_union_territories_of_India_by_area
/wiki/List_of_states_and_union_territories_of_India_by_population
/wiki/ISO_3166-2:IN
None
/wiki/List_of_Indian_states_by_Child_Nutrition
/wiki/Indian_states_and_territories_ranking_by_crime_rate
/wiki/Indian_states_ranking_by_households_having_electricity
/wiki/Indian_states_ranking_by_fertility_rate
/wiki/Forest_cover_by_state_in_India
/wiki/List_of_Indian_states_and_union_territories_by_GDP
/wiki/List_of_Indian_states_by_GDP_per_capita
/wiki/List_of_Indian_states_and_territories_by_highest_point
/wiki/Indian_states_ranked_by_HIV_awareness
/wiki/List_of_Indian_states_and_territories_by_Human_Development_Index
/wiki/Indian_states_ranking_by_families_owning_house
/wiki/Indian_states_ranking_by_household_size
/wiki/Indian_states_and_territories_ranked_by_incidents_of_human_trafficking
/wiki/Indian_stat

In [10]:
all_tables = soup.find_all('table')
right_table = soup.find('table', class_='wikitable sortable plainrowheaders')
right_table

<table class="wikitable sortable plainrowheaders">
<tr>
<th scope="col">No.</th>
<th scope="col">State or<br/>
union territory</th>
<th scope="col">Administrative capitals</th>
<th scope="col">Legislative capitals</th>
<th scope="col">Judiciary capitals</th>
<th scope="col">Year capital was established</th>
<th scope="col">The Former capital</th>
</tr>
<tr>
<td>1</td>
<th scope="row"><a href="/wiki/Andaman_and_Nicobar_Islands" title="Andaman and Nicobar Islands">Andaman and Nicobar Islands</a> <img alt="union territory" data-file-height="14" data-file-width="9" height="14" src="//upload.wikimedia.org/wikipedia/commons/3/37/Dagger-14-plain.png" width="9"/></th>
<td><b><a href="/wiki/Port_Blair" title="Port Blair">Port Blair</a></b></td>
<td>Port Blair</td>
<td>Kolkata</td>
<td>1955</td>
<td>Calcutta (1945–1955)</td>
</tr>
<tr>
<td>2</td>
<th scope="row"><a href="/wiki/Andhra_Pradesh" title="Andhra Pradesh">Andhra Pradesh</a></th>
<td><a class="mw-redirect" href="/wiki/Hyderabad,_India" 

It can be seen that in each <tr> (table row) there are 6 <td> and 1 <th> so we need to take care of this. To access value of each element, we will use find(text=True).

In [11]:
# List for each column of a table
A = []; B= []; C = []; D=[]; E=[]; F=[]; G=[]

In [12]:
for row in right_table.findAll('tr'):
    cells = row.findAll('td')
    states = row.findAll('th')
    if len(cells) == 6:
        A.append(cells[0].find(text=True))
        B.append(states[0].find(text=True))
        C.append(cells[1].find(text=True))
        D.append(cells[2].find(text=True))
        E.append(cells[3].find(text=True))
        F.append(cells[4].find(text=True))
        G.append(cells[5].find(text=True))

In [13]:
df=pd.DataFrame(A,columns=['Number'])
df['State/UT']=B
df['Admin_Capital']=C
df['Legislative_Capital']=D
df['Judiciary_Capital']=E
df['Year_Capital']=F
df['Former_Capital']=G
df

Unnamed: 0,Number,State/UT,Admin_Capital,Legislative_Capital,Judiciary_Capital,Year_Capital,Former_Capital
0,1,Andaman and Nicobar Islands,Port Blair,Port Blair,Kolkata,1955,Calcutta (1945–1955)
1,2,Andhra Pradesh,Hyderabad,Amaravati,Hyderabad,1956,Kurnool
2,3,Arunachal Pradesh,Itanagar,Itanagar,Guwahati,1986,
3,4,Assam,Dispur,Guwahati,Guwahati,1975,Shillong
4,5,Bihar,Patna,Patna,Patna,1912,
5,6,Chandigarh,Chandigarh,—,Chandigarh,1966,—
6,7,Chhattisgarh,Naya Raipur,Raipur,Bilaspur,2000,—
7,8,Dadra and Nagar Haveli,Silvassa,—,Mumbai,1945,Mumbai (1954–1961)
8,9,Daman and Diu,Daman,—,Mumbai,1987,Ahmedabad
9,10,National Capital Territory of Delhi,New Delhi,New Delhi,New Delhi,1931,—


In [14]:
UT_list = ['Chandigarh', 'Dadra and Nagar Haveli', 'Daman and Diu', 'Lakshadweep',
           'Puducherry', 'Andaman and Nicobar Islands','National Capital Territory of Delhi']

In [15]:
UT = df[df['State/UT'].isin(UT_list)]
UT.reset_index(drop=True)

Unnamed: 0,Number,State/UT,Admin_Capital,Legislative_Capital,Judiciary_Capital,Year_Capital,Former_Capital
0,1,Andaman and Nicobar Islands,Port Blair,Port Blair,Kolkata,1955,Calcutta (1945–1955)
1,6,Chandigarh,Chandigarh,—,Chandigarh,1966,—
2,8,Dadra and Nagar Haveli,Silvassa,—,Mumbai,1945,Mumbai (1954–1961)
3,9,Daman and Diu,Daman,—,Mumbai,1987,Ahmedabad
4,10,National Capital Territory of Delhi,New Delhi,New Delhi,New Delhi,1931,—
5,19,Lakshadweep,Kavaratti,Kavaratti,Kochi,1956,
6,27,Puducherry,Puducherry,Puducherry,Chennai,1954,Madras


In [16]:
States = df[df['State/UT'].isin(UT_list)==False]
States.reset_index(drop=True)

Unnamed: 0,Number,State/UT,Admin_Capital,Legislative_Capital,Judiciary_Capital,Year_Capital,Former_Capital
0,2,Andhra Pradesh,Hyderabad,Amaravati,Hyderabad,1956,Kurnool
1,3,Arunachal Pradesh,Itanagar,Itanagar,Guwahati,1986,
2,4,Assam,Dispur,Guwahati,Guwahati,1975,Shillong
3,5,Bihar,Patna,Patna,Patna,1912,
4,7,Chhattisgarh,Naya Raipur,Raipur,Bilaspur,2000,—
5,11,Goa,Panaji,Porvorim,Mumbai,1961,Panaji (1961–1987)
6,12,Gujarat,Gandhinagar,Gandhinagar,Ahmedabad,1960,Ahmedabad
7,13,Haryana,Chandigarh,Chandigarh,Chandigarh,1966,—
8,14,Himachal Pradesh,Shimla,Shimla (Summer),Shimla,1971,Bilaspur
9,15,Jammu and Kashmir,Srinagar,Srinagar (Summer),Srinagar (Summer),1947,—
