### Web Scraping from an Existing HTML File

In [None]:
# Web Scraping with BeautifulSoup and Requests

# Parse the content of a website and
# pull out exactly the information you want.
# Introduce the page that will be used for web scraping.

# pip install beautifulsoup4
# pip install lxml
# pip install html5lib

"""
Introduce the concept of basic HTML tags
HTML
  HEAD
    
  HEAD

  BODY
    
  BODY
HTML
"""


In [None]:
from bs4 import BeautifulSoup

# Create simple html files and 
# parse that using bs4 to make the students understand with title, div etc

# Parse the existing HTML file. The context manager closes the file handle as
# soon as BeautifulSoup has finished reading it.
with open("mrdoob.html") as html_file:    ## existing html file
    soup = BeautifulSoup(html_file, "lxml")   # html5lib is another parser

# print (soup)
print (soup.prettify())


In [8]:
# Look the <title> tag up once, then show the tag itself and its text.
title_tag = soup.title
print(title_tag)
print(title_tag.text)

<title>Mr.doob</title>
Mr.doob


In [19]:
# Tip: in the Chrome browser, the Inspect tool helps you pick the tag to search for.
# find() gives back only the first occurrence of the requested tag.
match = soup.find(name='div')
print(match)

<div id="nav">
<div id="logo">
<map name="logo">
<area alt="Mr.doob" coords="12,13,91,32" href="/" shape="rect"/>
<area alt="Blog" coords="12,32,44,46" href="https://ello.co/mrdoob" shape="rect"/>
<area alt="Twitter" coords="47,32,88,46" href="https://twitter.com/mrdoob" shape="rect"/>
</map>
<img alt="logo" src="files/showcase/logo.svg" usemap="#logo" width="105"/>
</div>
<div id="projects"></div>
<div id="expand"><span></span><span></span><span></span></div>
<!-- <a href="/blog"><img src="files/showcase/more.png" width="60" style="float:left" /></a> -->
</div>


In [22]:
match = soup.find("div", class_= "footer")
print (match)

# find() returns None when nothing matches (this page has no div with class
# "footer"), and tag.child is None when the child tag is absent. Check before
# reading .text so a missing tag does not raise AttributeError.
if match is not None:
    for tag in (match.h2, match.p):
        print ( tag )
        if tag is not None:
            print ( tag.text )

# Print the first paragraph of every div, skipping divs that contain no <p>.
for article in soup.find_all("div"):
    paragraph = article.p
    if paragraph is None:
        continue
    headline = paragraph.text
    print (headline)


None


### Reading from the Internet

In [41]:

from bs4 import BeautifulSoup   
import requests

# Download the sample page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() turns an HTTP error response
# into an exception instead of silently parsing an error page.
response = requests.get("http://httpbin.org/html", timeout=10)
response.raise_for_status()
source = response.text
# print(source)

soup = BeautifulSoup(source,"lxml")
# print (soup)

print (soup.prettify())


<!DOCTYPE html>
<html>
 <head>
 </head>
 <body>
  <h1>
   Herman Melville - Moby-Dick
  </h1>
  <div>
   <p>
    Availing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and bowsmen to do some little job for them; altering, or repairing, or new shaping their various weapons and boat furniture. Often he would be surrounded by an eager circle, all waiting to be served; holding boat-spades, pike-heads, harpoons, and lances, and jealously watching his every sooty movement, as he toiled. Nevertheless, this old man's was a patient hammer wielded by a patient arm. No murmur, no impatience,

In [None]:
# Show the <head> tag and whatever text it contains.
head_tag = soup.head
print(head_tag)
print(head_tag.text)

# Visual separator between the head and body dumps.
print('\n', '*' * 20)
body_tag = soup.body
print(body_tag)

# Drill down step by step: body -> first div -> first paragraph -> its text.
first_div = body_tag.div
print(first_div)
first_paragraph = first_div.p
print(first_paragraph)
print(first_paragraph.text)


### Reading data from website

In [46]:
#import the Beautiful soup functions to parse the data returned from the website

from bs4 import BeautifulSoup
import requests
import urllib

#specify the url
wiki = "https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"

# Download the page once. The timeout prevents an indefinite hang and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(wiki, timeout=30)
response.raise_for_status()
source = response.text
# Alternative without requests (needs an explicit `import urllib.request`,
# a bare `import urllib` does not load the submodule):
# source = urllib.request.urlopen(wiki)

soup = BeautifulSoup(source,"lxml")
# print (soup.prettify())

all_tables= soup.find_all('table')
# print (all_tables)

right_table= soup.find('table', class_='wikitable')  ## use class underscore
# print (right_table)
if right_table is None:
    # find() returns None when no tag matches; fail with a clear message
    # rather than an AttributeError further down.
    raise ValueError("No table with class 'wikitable' was found on the page")

#class="wikitable sortable plainrowheaders jquery-tablesorter"
#Generate lists (one list per output column)
A=[]
B=[]
C=[]
D=[]
E=[]
F=[]

# Only rows with exactly six <td> cells are kept. The first column value
# comes from the row's first <th>; the remaining five come from <td> cells 1-5.
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    states = row.find_all('th')
    if len(cells) == 6:
        A.append(states[0].text.strip())
        B.append(cells[1].text.strip())
        C.append(cells[2].text.strip())
        D.append(cells[3].text.strip())
        E.append(cells[4].text.strip())
        F.append(cells[5].text.strip())


from collections import OrderedDict

col_name = ["State or UN","Admin Cap","Legis Cap","Judi Cap","Year","Capital"]
col_data = OrderedDict(zip(col_name,[A,B,C,D,E,F]))

# If you want to store the data in a csv file
import pandas as pd
df = pd.DataFrame(col_data) 
df.to_csv("states_table.csv")


### Web Scraping Using Selenium for a Static Website

In [None]:
"""
Web Scrapping using Selenium
"""
#Download 
#https://www.seleniumhq.org/download/

#installation for firefox
#https://github.com/mozilla/geckodriver/

#installation for chrome
#https://sites.google.com/a/chromium.org/chromedriver/

# Add Web Scrapping using Selenium
# !pip install selenium

"""
Real website data scraping for Kerala results
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
from bs4 import BeautifulSoup as BS

#url = "http://keralaresults.nic.in/sslc2018rgr8364/swr_sslc.htm"
url = "http://keralaresults.nic.in/sslc2019duj946/swr_sslc.htm"


#For Windows System
# NOTE(review): recent Selenium releases may expect a Service object instead of
# a driver path here -- confirm against the installed Selenium version.
browser = webdriver.Chrome("E:/Driver/chromedriver.exe")
#browser = webdriver.Firefox(executable_path="D:/geckodriver")

# For Mac System
#browser = webdriver.Chrome(executable_path="/Users/sylvester/chromedriver")
#browser = webdriver.Firefox(executable_path="/Users/sylvester/geckodriver")


# try/finally guarantees the browser is shut down even if a lookup fails.
try:
    browser.get(url)
    sleep(2)

    # find_element(By.<strategy>, value) replaces the find_element_by_* helpers
    # and is available in both older and current Selenium versions.
    school_code = browser.find_element(By.NAME, "treg")

    #School Code range from 1100 to 5104 
    school_code.send_keys("2000")
    sleep(2)

    #get_school_result = browser.find_element(By.XPATH, '//*[@id="ctrltr"]/td[3]/input[1]')
    get_school_result = browser.find_element(By.XPATH, '/html/body/form/table/tbody/tr[2]/td/table/tbody/tr[3]/td[3]/input[1]')

    get_school_result.click()
    sleep(10)

    html_page = browser.page_source

    # Name the parser explicitly, matching the "lxml" parser used in the other cells.
    soup = BS(html_page, "lxml")

    # Now you can add your logic of reading from BeautifulSoup
    # Same total pause as before (10 s + 10 s), presumably to keep the browser
    # window visible for a while before it is closed.
    sleep(20)
finally:
    browser.quit()


### Website Data Scraping Using Selenium for a Dynamic Website

In [42]:
"""
Real website data scraping for the list of states and union territories using Selenium
"""

import pandas as pd
from collections import OrderedDict
from selenium import webdriver
from selenium.webdriver.common.by import By

wiki = "https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"

# NOTE(review): recent Selenium releases may expect a Service object instead of
# a driver path here -- confirm against the installed Selenium version.
driver = webdriver.Chrome("E:/Driver/chromedriver.exe")

#driver = webdriver.Firefox(executable_path="/Users/sylvester/geckodriver")

#Generate lists (one list per output column)
A=[]
B=[]
C=[]
D=[]
E=[]
F=[]

# try/finally guarantees the browser is shut down even if a lookup fails.
try:
    # Opening the submission url
    driver.get(wiki)

    # find_element(s)(By.<strategy>, value) replaces the find_element(s)_by_*
    # helpers and is available in both older and current Selenium versions.
    right_table = driver.find_element(By.CLASS_NAME, 'wikitable')

    # Only rows with exactly six <td> cells are kept. The first column value
    # comes from the row's first <th>; the rest come from <td> cells 1-5.
    for row in right_table.find_elements(By.TAG_NAME, 'tr'):
        cells = row.find_elements(By.TAG_NAME, 'td')
        states = row.find_elements(By.TAG_NAME, 'th')
        if len(cells) == 6:
            A.append(states[0].text.strip())
            B.append(cells[1].text.strip())
            C.append(cells[2].text.strip())
            D.append(cells[3].text.strip())
            E.append(cells[4].text.strip())
            F.append(cells[5].text.strip())
finally:
    # All cell text has been copied into plain strings by now, so the browser
    # is no longer needed for building the DataFrame.
    driver.quit()


col_name = ["State or UN","Admin Cap","Legis Cap","Judi Cap","Year","Capital"]
col_data = OrderedDict(zip(col_name,[A,B,C,D,E,F]))

df = pd.DataFrame(col_data) 
df.to_csv("states_table.csv")


In [None]:

"""
Code Challenge
  Name: 
    Web Scraping the ICC Cricket Page
  Filename: 
    icccricket.py
  Problem Statement:
    Write Python code to scrape data from the ICC rankings
    page and get the ranking table for ODIs (Men).
    Create a DataFrame using pandas to store the information.
  Hint: 
    https://www.icc-cricket.com/rankings/mens/team-rankings/odi 
    
    
    #https://www.icc-cricket.com/rankings/mens/team-rankings/t20i
    #https://www.icc-cricket.com/rankings/mens/team-rankings/test
"""


"""
Code Challenge:
  Name: 
    Bid Plus
  Filename: 
    bid_plus.py
  Problem Statement:
      USE SELENIUM
      Write Python code to scrape and download data from the given URL.
      url = "https://bidplus.gem.gov.in/bidlists"
      Make list and append given data:
          1. BID NO
          2. items
          3. Quantity Required
          4. Department Name And Address
          5. Start Date/Time(Enter date and time in different columns)
          6. End Date/Time(Enter date and time in different columns)
          
          # Optional - Do not do this
          7. Name of the PDF file
          
     Make a CSV file and add all the data to it.
      csv Name: bid_plus.csv
"""
