# Simple Scraper

## Global Imports

In [9]:
import urllib.request

import pandas
from bs4 import BeautifulSoup

## Accessing Website

In [28]:
# Open the Wikipedia article listing the capitals of India's states and union territories.
url = "https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"
# The response object is kept in `page`; the next section hands it to BeautifulSoup.
page = urllib.request.urlopen(url)
# Echo the HTTP status code of the response as this cell's output.
page.status

200

## Converting Webpage To bs4 Object

In [29]:
# Parse the fetched response into a BeautifulSoup tree using the "lxml" parser.
soup = BeautifulSoup(page,"lxml")

In [33]:
# Echo the parsed document as this cell's output; the output is the page's HTML markup.
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of state and union territory capitals in India - Wikipedia</title>
<script>document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_state_and_union_territory_capitals_in_India","wgTitle":"List of state and union territory capitals in India","wgCurRevisionId":904438853,"wgRevisionId":904438853,"wgArticleId":2371868,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Use dmy dates from November 2018","Use Indian English from November 2018","All Wikipedia articles written in Indian English","States and union territories of India-related lists","Indian capital cities","Lists of cities in India","Lists of capitals of country subdivisions","Cities and towns in 

## Extracting Page Title

In [35]:
# Text content of the document's <title> element.
soup.title.text

'List of state and union territory capitals in India - Wikipedia'

## Find All Links From The Page

In [40]:
# Gather the href attribute of every <a> tag on the page.
# Anchors that carry no href contribute None to the list.
allLinks = soup.findAll("a")
linksList = [anchor.get("href") for anchor in allLinks]
linksList

[None,
 '#mw-head',
 '#p-search',
 '/wiki/States_and_union_territories_of_India',
 '/wiki/File:Flag_of_India.svg',
 '/wiki/List_of_states_and_union_territories_of_India_by_area',
 '/wiki/List_of_states_and_union_territories_of_India_by_population',
 '/wiki/List_of_Indian_states_and_union_territories_by_GDP',
 '/wiki/List_of_Indian_states_and_union_territories_by_GDP_per_capita',
 '/wiki/ISO_3166-2:IN',
 None,
 '/wiki/List_of_Indian_states_by_Child_Nutrition',
 '/wiki/List_of_states_and_union_territories_of_India_by_crime_rate',
 '/wiki/List_of_states_and_union_territories_of_India_by_households_having_electricity',
 '/wiki/List_of_states_and_union_territories_of_India_by_fertility_rate',
 '/wiki/Forest_cover_by_state_in_India',
 '/wiki/Ease_of_doing_business_ranking_of_states_of_India',
 '/wiki/List_of_Indian_states_and_territories_by_highest_point',
 '/wiki/Indian_states_ranked_by_HIV_awareness',
 '/wiki/List_of_Indian_states_and_territories_by_Human_Development_Index',
 '/wiki/Indian

## Finding Table Element

In [41]:
# Locate the first <table> whose class attribute matches "wikitable sortable plainrowheaders".
# Per the Beautiful Soup docs, a multi-word class_ string is compared against the whole
# attribute value, and find() returns None when nothing matches.
right_table=soup.find('table', class_='wikitable sortable plainrowheaders')

## Extracting Information From Table

In [47]:
# Turn each data row of the capitals table into one dict and collect them in statesList.
statesList = []
for tableRow in right_table.findAll("tr"):
    dataCells = tableRow.findAll('td')
    headerCells = tableRow.findAll('th')  # the first <th> of the row supplies the State/UT value
    # Only rows with exactly six <td> cells are table-body rows; the heading row is skipped.
    if len(dataCells) != 6:
        continue
    # find(text=True) keeps the first text node of each cell; dataCells[0] is not used.
    statesList.append({
        'State/UT': headerCells[0].find(text=True),
        'Admin_Capital': dataCells[1].find(text=True),
        'Legislative_Capital': dataCells[2].find(text=True),
        'Judiciary_Capital': dataCells[3].find(text=True),
        'Year_Capital': dataCells[4].find(text=True),
        'Former_Capital': dataCells[5].find(text=True),
    })

In [48]:
# Tabulate the scraped rows and save them as a UTF-8 CSV without the index column.
# The column order is pinned to the order in which the row dicts are filled, so the
# CSV layout does not depend on the pandas / Python version and a header row is
# written even when no table rows were collected. Keep this list in sync with the
# keys used when building statesList.
statesDf = pandas.DataFrame(
    statesList,
    columns=[
        'State/UT',
        'Admin_Capital',
        'Legislative_Capital',
        'Judiciary_Capital',
        'Year_Capital',
        'Former_Capital',
    ],
)
statesDf.to_csv("states_details.csv",index=False,encoding='utf-8')

# Advanced Scraping Practice

## Global Imports

In [1]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
import pandas

## Initializing Webdriver

In [2]:
# Start a Chrome WebDriver session using the chromedriver binary at the relative path "chromedriver.exe".
driver = webdriver.Chrome(executable_path="chromedriver.exe")

## Setting Useragent and Accessing Website

In [3]:
# Browser-like HTTP request headers, kept in a module-level dict.
# NOTE(review): driver.get() below receives only the URL; these headers are not
# handed to the webdriver in this cell.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# Load the education listing page in the browser session.
driver.get("https://angel.co/education")

## Finding An Element And Clicking It Ten Times

In [4]:
# Click the "more" control ten times, pausing 2 seconds before each click.
# The section heading calls for ten clicks; range(1, 10) only produced nine.
for clickNumber in range(10):
    time.sleep(2)
    # Re-locate the element on every pass rather than reusing a previous reference.
    more = driver.find_element_by_xpath('//div[contains(@class, "more hidden")]')
    more.click()

## Accessing All Companies

In [5]:
# Collect every <div> whose class attribute contains the substring "base item";
# each one is treated as a single company entry by the next cell.
baseLinks = driver.find_elements_by_xpath('//div[contains(@class, "base item")]')

## Getting Each Company Details

In [6]:
# XPath, relative to one company element, for each field that is read from element text.
textFieldXpaths = [
    ("Name", 'descendant-or-self::div[contains(@class, "name")]'),
    ("Description", 'descendant-or-self::div[contains(@class, "blurb")]'),
    ("Location", 'descendant-or-self::div[contains(@class, "tags")]'),
]
# The profile link is read from the anchor's href attribute instead of its text.
linkXpath = 'descendant-or-self::a[contains(@class, "startup-link")]'

# Build one dict per company element and collect them in companiesList.
companiesList = []
for companyElement in baseLinks:
    companyRecord = {}
    for fieldName, fieldXpath in textFieldXpaths:
        companyRecord[fieldName] = companyElement.find_element_by_xpath(fieldXpath).text
    companyRecord["Links"] = companyElement.find_element_by_xpath(linkXpath).get_attribute("href")
    companiesList.append(companyRecord)

## Writing Out Company Details To CSV

In [49]:
# Tabulate the company records and save them as a UTF-8 CSV without the index column.
# NOTE(review): this cell comes before the profile-enrichment cell, so in a
# top-to-bottom run the CSV holds only the fields scraped from the listing page.
companyDf = pandas.DataFrame(companiesList)
companyDf.to_csv("company_details.csv",index=False,encoding='utf-8')

# Advanced Company Details

In [77]:
# Visit each company's angel.co profile and add its funding rounds and job
# postings to the company's record. The lookup is best-effort: any failure for a
# company is printed and the company keeps whatever fields were gathered so far.
newList = []
for companyDetails in companiesList:
    # Bound before the try block so the error report below can always print it.
    companyName = None
    try:
        time.sleep(2)  # wait 2 seconds before each profile request
        # "Name" is the key under which the listing scrape stored the company name.
        # Surrounding whitespace is stripped so it cannot leak into the URL.
        companyName = companyDetails["Name"].strip().lower().replace(" ", "-")
        response = requests.get("https://angel.co/" + companyName, headers=headers).text
        # Explicit parser (the same one used for the Wikipedia page); a separate
        # name keeps the earlier `soup` object intact.
        profileSoup = BeautifulSoup(response, "lxml")

        # enumerate() numbers entries by position. list.index() returns the first
        # *equal* element, so identical-looking entries would share a number and
        # overwrite each other.
        fundingSections = profileSoup.find_all('div', class_="show section")
        for sectionNum, fundingSection in enumerate(fundingSections, start=1):
            companyDetails["FundRaised_" + str(sectionNum)] = fundingSection.find("div", class_="raised").text.strip()
            companyDetails["Date_" + str(sectionNum)] = fundingSection.find("div", class_="date_display").text.strip()

        joblinks = profileSoup.find_all('div', class_="job")
        print(companyName, len(joblinks))
        for jobNum, joblink in enumerate(joblinks, start=1):
            companyDetails["JobTitle_" + str(jobNum)] = joblink.find("div", class_="job_title").text.strip()
            jobTags = joblink.find_all("div", class_="tags")
            companyDetails["JobLocation_" + str(jobNum)] = jobTags[0].text.strip()  # first "tags" div
            companyDetails["Salary_" + str(jobNum)] = jobTags[1].text.strip()  # second "tags" div
    except Exception as e:
        print(e, companyName)
    # Keep the company whether or not its profile lookup succeeded. The record is
    # the same dict object that lives in companiesList, updated in place.
    newList.append(companyDetails)



miso-media 0
skillshare 8
looksharp



 0
elevate 0
motion-math



 1
chromatik 3
colingo



 0
after-school 0
udemy



 1
memrise 4
open-air-publishing



 0
showme 0
coachup



 0
remind 2
tutorspree



 0
knewton 0
minted



 6
cornerstone-ondemand 0
simpletuition



 0
flat-world 0
wikihow



 0
edmodo 0
silverrail-technologies



 0
gamesalad 0
voxy



 0
altius-education 0
kno



 0
piazza 0
dailybreak



 0
boundless 0
codecademy



 4
hullabalu 3
classdojo



 2
brit-+-co 0
peertransfer



 0
treehouse 0
mightybell



 0
'NoneType' object has no attribute 'text' hireart
lore



 0
nitrous 2
nitrous



 2
minerva-project 2
clever



 13
noredink 1
wufoo



 0
okpanda 2
duolingo



 0
one-month 2
panorama-education



 8
padlet 1
takelessons



 0
evertrue 3
smack



 0
kiwi-crate 0
verbling



 0
seamless-toy-company 0
tioki



 0
allclasses 0
codefights



 4
brilliant 1
make-school



 0
benchprep 0
learnsprout



 0
'NoneType' object has no attribute 'text' vittana
englishcentral



 4
uvize 0
masteryconnect



 0
codespark 0
codewars



 0
codehs 4
highlighter



 0
everfi 0
untangle



 0
educents 20
diy



 0
enuma 0
codementor



 2
classkick 0
kidblog



 0
verbalizeit 0
cambly



 0
learnup 6
teachable



 0
brightbytes 0
enki



 4
descomplica 1
flashnotes



 0
camperoo 0
kidaptive



 2
ctrl+console 0
checkio



 0
tinybop 0
betterlesson



 0
speakaboos 0
tynker



 0
nextlesson 0
campus-explorer



 0
popexpert 2
schoolmint



 5
astrid-francesca-walk 0
togetherville



 0
skilljar 5
yogome



 0
schoolzilla 0
magoosh



 2
littlebits 0
scorebeyond



 10
greengar 0
5min-media



 0
tuition.io 0
front-row-education



 4
accredible 0
instaedu



 0
chalkable 0
coursehorse



 6
bloc 0
educreations



 0
grovo 2
startupdigest



 1
mystery-science 0
homer



 0
quad 0
tykoon



 0
newsela 0
dailydrip



 0
schoola 0
curriculet



 0
panopen 4
wheelhouse.io



 0
degreed 0
mentormob



 0
brightwheel 5
berecruited



 0
keepy 2
mytonomy



 0
rentlord 0
top-hat



 14
koofers 0
wonder-workshop



 1
edify 0
peak



 0
khan-academy 0
prosky



 2
iq-technologies 0
learnzillion



 0
socrative 0
studysoup



 12
uguru.me 0
collegefeed



 0
plastiq 9
zeal



 1
avanoo 2
curios.me



 0
brainscape 0
ranku



 0
girls-in-tech 0
puzzle-piece



 0
timbuktu-labs 0
designlab



 2
nanoracks 0
cognifit



 1
floqq 0
storypanda



 0
storybird 0
instinct



 0
kodable 2
socratic



 0
volunteerspot 0
campusquad



 0
visitdays 2
pathgather



 2
edsurge 2
aristotle-circle



 0
openstudy 0
zoobean



 0
veduca 0
envoynow



 0
codecombat 0
mediacore-(acquired-by-workday)



 0
bravenew 0
creativelive



 5
inkling 0
thread



 0
clutch-prep 1
general-assembly



 24
hopscotch 1
jamalon



 0
busuu 5
fullstack-academy



 1
savvy 0
feast



 0
gojimo 6
notehall-(acquired-by-chegg-6/11)



 0
betterup 0
kibin



 0
kitereaders 0
admission-table



 1
academize 0
espark-learning



 6
yamli 0
platzi



 0
swivl 2
joytunes



 0
firsthand 5
hunie



 0
chesscademy 0
rota-dos-concursos



 0
springboard 7
kira-talent



 0
binu 0
ubi-interactive



 0
junyo 0
startup-institute



 1
quizlet 12
brainrewards



 0
branching-minds 1
italki



 0
blendspace-by-tes 0
opencurriculum



 0
piper 6
zzish



 0
cloudacademy 0
datacamp



 8
chalk-schools 0
novoed



 4
kaizena 0
parachute



 2
applykit 0
playsay



 0
dreambox-learning 0
mindmixer



 0
curiosityville 0
curious-hat



 0
shearwater-international-(techstars-‘15) 0
penpal-schools



 0
gotit! 0
digital-currency-council



 0
primo-toys 0
isoccer



 0
craftsy 0
admittedly



 0
linguatrip 0
springest



 0
skilloop 0
playrific



 0
authorbee 0
allovue



 0
learnmetrics 0
policygenius



 9
abpathfinder 0
omada-health



 0
udacity 5
'NoneType' object has no attribute 'text'



 citelighter
zyante 0
smartup.io



 0
sage-hero 1
tales2go



 0
goalbook 1
edupath



 1
modern-guild 0
schooladmissions



 0
opensesame 0
uncubed



 0
exo-labs 0
oneroom



 0
always-prepped 0
admitsee



 3
quipper 0
studyroom



 0
memorang 2
ginkgotree



 0
if-you-can 0
cybrary



 0
handshake 2
higher-learning-technologies



 0
twigtale 0
apptuto



 0
straighterline 0
mendeley



 0
fidelis-education 0
slader



 0
guidekick 0
myedmatch



 0
culturealley 0
acceptd



 0
trinket 0
meducation



 0
gradible 0
ilos



 1
tactilize 0
lightup



 0
course-hero 8
seelio



 0
wikibrains 0
gap-year



 0
crowdmark 0
clusterflunk



 0
asku 0
edthena



 1
room-choice 4
curiosity.com



 0
codarica 0
sokanu



 5
neverware 0
gitbook



 0
jumble 0
bookrenter



 0
freetextbooks 3
edukart



 0
raise.me 0
reach-robotics



 0
sagecrowd 0
edpuzzle



 3
expii 6
masterclass



 6
braingenie 0
4soils



 0
akdemia 0
aceable



 0
testmax 1
gummii



 0
storyjumper 0
bridgeu



 3
getbonkers 0
albert.io



 0
classowl 0
desmos



 1
marcopolo-learning 0
121nexus



 0
hustle-con 0
gigabryte-(formerly-tinkertags)



 0
three-ring 0
lingvist



 0
betterfly 0
principly



 0
nearpod 0
kaymbu



 2
junior-explorers 2
testive



 1
opened 2
student-loan-hero



 0
zaption 0
showbie



 0
meritful 0
monkimun



 4
pragya-systems 0
learncore



 5
unbound-concepts 0
yaklass



 0
oneclass 3
goldbean



 1
scholarpro 0
schoology



 8
dabble 0
swagger-(formerly-co-ed-supply)



 0
edools 0
booktrack



 0
thinkful 1
switchboard



 0
pi 0
edutise



 0
transtutors.com 0
cuethink



 0
cursostotales.com 0
education-funding-partners



 0
actively-learn 0
solidoodle



 0
i3zif.com 0
tackl



 0
launchpilots 0
impact-hub-oakland



 0
mision-admision 0
lesson.ly



 0
board-vitals 0
fluent-city



 4
sokikom 1
classbadges



 0
raisy 0
6dot-innovations



 0
tuva-labs 0
pathsource



 2
kinderloop 0
formative



 2
tanaza 5
ardusat



 0
packback 3
tradecraft



 0
entangled-ventures 0
inclass



 0
spirit-shop 0
deeno



 0
yellowdig 3
eduadvisor



 0
kinobi 0
edshelf



 0
fullbox 0
the-lab-miami



 0
kami-(formerly-notable-pdf) 0
oohlala-mobile



 0
log(n) 0
