# Data Extraction and Preparation
## 1. Install and Load required libraries

In [None]:
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes
!conda install -c conda-forge beautifulsoup4 --yes

In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
#import geocoder # to get coordinates

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


## 2. Data Extaraction - (A) Delhi district and population data

In [10]:
link=("https://www.census2011.co.in/census/state/districtlist/delhi.html")
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find("table")
output_rows = []
for table_row in table.findAll('tr'):
    columns = table_row.findAll('td')
    output_row = []
    for column in columns:
        output_row.append(column.text)
    output_rows.append(output_row)
names = ['#','District','Sub-District','Population','Increase','Sex Ratio','Literacy','Density']
df = pd.DataFrame(data=output_rows,columns = names)
df = df.replace('\n','', regex=True)
df = df.dropna(how='any', axis=0) 
df.drop(["#","Sub-District"],axis=1,inplace=True)
df    

Unnamed: 0,District,Population,Increase,Sex Ratio,Literacy,Density
1,North West Delhi,3656539,27.81 %,865,84.45 %,8254
2,South Delhi,2731929,20.51 %,862,86.57 %,11060
3,West Delhi,2543243,19.46 %,875,86.98 %,19563
4,South West Delhi,2292958,30.65 %,840,88.28 %,5446
5,North East Delhi,2241624,26.78 %,886,83.09 %,36155
6,East Delhi,1709346,16.79 %,884,89.31 %,27132
7,North Delhi,887978,13.62 %,869,86.85 %,14557
9,Central Delhi,582320,-9.91 %,892,85.14 %,27730
10,New Delhi,142004,-20.72 %,822,88.34 %,4057


## (B) List of Sub Division & Headquareters

In [15]:
link=("https://en.wikipedia.org/wiki/List_of_districts_of_Delhi")
wikipedia_page = requests.get(link)
soup = BeautifulSoup(wikipedia_page.content, 'html.parser')
table = soup.find('table', {'class':'wikitable'})
output_rows = []
for table_row in table.findAll('tr'):
    columns = table_row.findAll('td')
    output_row = []
    for column in columns:
        output_row.append(column.text)
    output_rows.append(output_row)
names = ['#','District','Headquarter','Sub_divisions_1','Sub_divisions_2','Sub_divisions_3']
df1 = pd.DataFrame(data=output_rows,columns = names)
df1 = df1.replace('\n','', regex=True)
df1 = df1.dropna(how='any', axis=0)
df1.drop(["#"],axis=1,inplace=True)
df1


Unnamed: 0,District,Headquarter,Sub_divisions_1,Sub_divisions_2,Sub_divisions_3
1,New Delhi,Connaught Place,Chanakyapuri,Delhi Cantonment,Vasant Vihar
2,North Delhi,Alipur,Model Town[3],Narela,Alipur
3,North West Delhi,Kanjhawala,Rohini,Kanjhawala,Saraswati Vihar
4,West Delhi,Rajouri Garden,Patel Nagar,Punjabi Bagh,Rajouri Garden
5,South West Delhi,Dwarka,Dwarka,Najafgarh,Kapashera
6,South Delhi,Saket,Saket,Hauz Khas,Mehrauli
7,South East Delhi,Defence Colony,Defence Colony,Kalkaji,Sarita Vihar
8,Central Delhi,Daryaganj,Kotwali,Civil Lines,Karol Bagh
9,North East Delhi,Nand Nagri,Seelampur,Yamuna Vihar,Karawal Nagar
10,Shahdara,Shahdara,Shahdara,Seemapuri,Vivek Vihar


In [26]:
Data=pd.merge(df,df1,on="District")
Data

Unnamed: 0,District,Population,Increase,Sex Ratio,Literacy,Density,Headquarter,Sub_divisions_1,Sub_divisions_2,Sub_divisions_3
0,North West Delhi,3656539,27.81 %,865,84.45 %,8254,Kanjhawala,Rohini,Kanjhawala,Saraswati Vihar
1,South Delhi,2731929,20.51 %,862,86.57 %,11060,Saket,Saket,Hauz Khas,Mehrauli
2,West Delhi,2543243,19.46 %,875,86.98 %,19563,Rajouri Garden,Patel Nagar,Punjabi Bagh,Rajouri Garden
3,North East Delhi,2241624,26.78 %,886,83.09 %,36155,Nand Nagri,Seelampur,Yamuna Vihar,Karawal Nagar
4,East Delhi,1709346,16.79 %,884,89.31 %,27132,Preet Vihar,Gandhi Nagar,Preet Vihar,Mayur Vihar
5,North Delhi,887978,13.62 %,869,86.85 %,14557,Alipur,Model Town[3],Narela,Alipur
6,Central Delhi,582320,-9.91 %,892,85.14 %,27730,Daryaganj,Kotwali,Civil Lines,Karol Bagh
7,New Delhi,142004,-20.72 %,822,88.34 %,4057,Connaught Place,Chanakyapuri,Delhi Cantonment,Vasant Vihar


## (C) Number of health care facilities already present in each distric

In [33]:
# Reference for this data : https://data.gov.in/catalog/district-wise-availability-health-centres-india-0?filters%5Bfield_catalog_reference%5D=95534&format=json&offset=0&limit=6&sort%5Bcreated%5D=desc
# Data is downloaded from this source and cleaned using MS-Excel to get the below list
df2 = pd.read_csv(r'Hospital_District.csv')
df2

Unnamed: 0,District,Health Facility
0,Central Delhi,415
1,East Delhi,232
2,New Delhi,267
3,North Delhi,323
4,North West Delhi,464
5,North East Delhi,205
6,Shahdara,227
7,South Delhi,187
8,South East Delhi,282
9,South West Delhi,268


## Final Data Frame 

In [28]:
Data = pd.merge(Data,df2,on="District")
Data

Unnamed: 0,District,Population,Increase,Sex Ratio,Literacy,Density,Headquarter,Sub_divisions_1,Sub_divisions_2,Sub_divisions_3,Health Facility
0,North West Delhi,3656539,27.81 %,865,84.45 %,8254,Kanjhawala,Rohini,Kanjhawala,Saraswati Vihar,464
1,South Delhi,2731929,20.51 %,862,86.57 %,11060,Saket,Saket,Hauz Khas,Mehrauli,187
2,West Delhi,2543243,19.46 %,875,86.98 %,19563,Rajouri Garden,Patel Nagar,Punjabi Bagh,Rajouri Garden,492
3,North East Delhi,2241624,26.78 %,886,83.09 %,36155,Nand Nagri,Seelampur,Yamuna Vihar,Karawal Nagar,205
4,East Delhi,1709346,16.79 %,884,89.31 %,27132,Preet Vihar,Gandhi Nagar,Preet Vihar,Mayur Vihar,232
5,North Delhi,887978,13.62 %,869,86.85 %,14557,Alipur,Model Town[3],Narela,Alipur,323
6,Central Delhi,582320,-9.91 %,892,85.14 %,27730,Daryaganj,Kotwali,Civil Lines,Karol Bagh,415
7,New Delhi,142004,-20.72 %,822,88.34 %,4057,Connaught Place,Chanakyapuri,Delhi Cantonment,Vasant Vihar,267


In [29]:
Data.to_csv("DelhiDistrictData.csv")