### Web Scraping - Example 1 (Extracting Job Descriptions from Job Portal)

In [2]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import re

In [12]:
url = "https://www.iimjobs.com/k/analytics-jobs-190.html"

In [13]:
html = urlopen(url)


In [14]:
html

<http.client.HTTPResponse at 0x115b43a90>

In [15]:
soup = BeautifulSoup(html)
type(soup)

bs4.BeautifulSoup

### Problem

Extract the Job description from each job posting?

### Approach

a. Each of the Job Posting is a link. Find the url (href attribute) for each job
b. Open the job link. Find the attribute that contains the Job Description


In [16]:
# using find_all to find specific tags
all_links = []
for link in soup.find_all("a"):
    all_links.append(link.get("href"))

In [17]:
all_links[1]

'https://www.iimjobs.com/c/banking--finance-jobs-13.html'

In [18]:
len(all_links)

742

#### Approach 1: based on specific attributes

In [19]:
job_links = []
for link in soup.find_all("a"):
    if link.get("data-jobid") is not None:
        job_links.append(link.get("href"))

In [20]:
len(pd.Series(job_links).unique())

121

In [21]:
len(job_links)

242

#### Approach 2: Using parent tag element details

In [22]:
joblisting = soup.find("div", id="listingPanel")

In [23]:
all_links = []
children = joblisting.find_all("a")
for child in children:
    all_links.append(child.get("href"))

In [24]:
len(all_links)

251

In [25]:
len(pd.Series(all_links).unique())

128

#### Approach 3: Using CSS selector

In [26]:
soup.select("#listingPanel > div.listing > div:nth-child(1)")

[<div class="unfollowopt jobRow container table table-hover pdlr0" data-jobid="866004">
 <div class="col-lg-9 col-md-9 col-sm-8 container pdmobr5" style="padding-left:0px;">
 <div class="pull-left col-xs-12 col-lg-3new col-md-3new col-sm-3new pd0 hidden-xs">
 <span class="pull-left companyjobs"><i class="fa fa-suitcase greytxt"></i></span>
 <span class="pull-left" data-trigger="hover click" rel="tooltip" title="premium job">
 <i class="fa fa-bookmark darkgreyish"></i>
 </span>
 <span class="glyphicon glyphicon-plus-sign plsign plsigngrey pull-left" data-trigger="hover click" rel="tooltip"></span>
 <span class="applied-job showicon pull-left" data-trigger="hover click" rel="tooltip" title="">
 <i class="fa fa-check-square-o greytxt"></i>
 </span>
 <span act="save_job" class="glyphicon glyphicon-star-empty saved-job pull-left" data-trigger="hover click" rel="tooltip" title="save this job for future reference"></span>
 <span class="gry_txt txt12 visible-xs pull-right mt3">20/11/2020</span

In [27]:
listPanel = soup.select("#listingPanel > div.listing > *")

In [28]:
listPanel[1]

<div class="unfollowopt jobRow container table table-hover pdlr0 greybg" data-jobid="865600">
<div class="col-lg-9 col-md-9 col-sm-8 container pdmobr5" style="padding-left:0px;">
<div class="pull-left col-xs-12 col-lg-3new col-md-3new col-sm-3new pd0 hidden-xs">
<span class="pull-left companyjobs"><i class="fa fa-suitcase greytxt"></i></span>
<span class="pull-left" data-trigger="hover click" rel="tooltip" title="premium job">
<i class="fa fa-bookmark darkgreyish"></i>
</span>
<span class="pull-left prefrence" data-trigger="hover click" rel="tooltip" title="Women candidates preferred"><img src="https://static.iimjobs.com/resources/images/female_candidate.png"/></span>
<span class="applied-job showicon pull-left" data-trigger="hover click" rel="tooltip" title="">
<i class="fa fa-check-square-o greytxt"></i>
</span>
<span act="save_job" class="glyphicon glyphicon-star-empty saved-job pull-left" data-trigger="hover click" rel="tooltip" title="save this job for future reference"></span>
<s

In [29]:
[x.get("href") for x in  listPanel[0].find_all("a")]

['https://www.iimjobs.com/j/merilytics-business-analyst-data-analytics-iit-bits-nit-0-2-yrs-866004.html?ref=kp',
 'https://www.iimjobs.com/j/merilytics-business-analyst-data-analytics-iit-bits-nit-0-2-yrs-866004.html?ref=kp']

In [30]:
[x.get("href") for x in listPanel[0].find_all("a")]

['https://www.iimjobs.com/j/merilytics-business-analyst-data-analytics-iit-bits-nit-0-2-yrs-866004.html?ref=kp',
 'https://www.iimjobs.com/j/merilytics-business-analyst-data-analytics-iit-bits-nit-0-2-yrs-866004.html?ref=kp']

In [31]:
all_links = []
for i in range(len(listPanel)):
    all_links.append([x.get("href") for x in listPanel[i].find_all("a")])

In [32]:
len(all_links)

128

In [33]:
url = all_links[0][0]
url

'https://www.iimjobs.com/j/merilytics-business-analyst-data-analytics-iit-bits-nit-0-2-yrs-866004.html?ref=kp'

In [34]:
html = urlopen(url)
soup = BeautifulSoup(html)

In [None]:
#page-content-wrapper > div.page-content.inset > div.info.col-md-9.col-sm-9.col-xs-12.pdlr0.mnht520 > div.details.job-description

### Web Scraping - Example 2 (Extracting Reviews from Amazon)

In [42]:
url_template = "https://www.amazon.in/Test-Exclusive-550/product-reviews/B077Q7GW9V/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews&pageNumber=<NUM>"

In [37]:
url = re.sub("<NUM>",str(1),url_template)
url

'https://www.amazon.in/Test-Exclusive-550/product-reviews/B077Q7GW9V/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews&pageNumber=1'

In [47]:
clean_reviews = []
for i in range(10):
    try:
        url = re.sub("<NUM>",str(i),url_template)
        html = urlopen(url)
        soup = BeautifulSoup(html)

        all_reviews = soup.find_all("div", class_="a-row a-spacing-small review-data")

        for review in all_reviews:
            review_text = review.find("span",class_="a-size-base review-text review-text-content")
            clean_reviews.append(review_text.find("span").get_text())
    except Exception as e:
        print(e)
        break

HTTP Error 503: Service Unavailable


In [48]:
len(clean_reviews)

560

In [50]:
clean_reviews[500]

'\n  My opinion after buying a phone ::1. Best Performance.2. One of the best thing Snapdragon processor in Redmi Note series device at this price.3. Good battery life.4. if you like Big Display You get a 6.67-inch full-HD+ (1080x2400 -pixel). But The refresh rate of Redmi Note 9 Pro is 60Hz.5. Gorilla Glass 5 Protection.6. Good Design and The squared  camera design is also Amazing.7. Camera quality is good , portrait shots had nicely blurred background. But No LED flash lights in front.Overall This is balanced Mobile Phone\n'

### Web Scraping - Example 2 (Extracting Reviews from Amazon)

In [51]:
link = "https://www.worldometers.info/coronavirus/"
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(link,headers=hdr)
page = urlopen(req)


In [52]:
soup = BeautifulSoup(page)


In [53]:
covid_tbl = soup.find("table", id = "main_table_countries_today")

In [None]:
print(covid_tbl.prettify()) 

In [55]:
tbl_rows = covid_tbl.find_all("tr")

In [56]:
res = []
for row in tbl_rows:
    td = row.find_all("td")
    td_clean = [x.get_text() for x in td]
    res.append(td_clean)

In [57]:
res[12]

['4',
 'France',
 '2,065,138',
 '',
 '46,698 ',
 '',
 '145,391',
 '',
 '1,873,049',
 '4,775',
 '31,611',
 '715',
 '19,339,461',
 '296,031',
 '65,329,229 ',
 'Europe',
 '32',
 '1,399',
 '3']

In [58]:
res_df = pd.DataFrame(res)

In [59]:
res_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,,,,,,,,,,,,,,,,,,,
1,,\nNorth America\n,13943144,+3959,383096,+506,8730177,+4096,4829871,26194,,,,,,North America,\n,,
2,,\nAsia\n,15370419,+9760,271231,+96,13804597,+14541,1294591,25136,,,,,,Asia,\n,,
3,,\nSouth America\n,10504498,+96,313513,+9,9498954,+544,692031,17363,,,,,,South America,\n,,
4,,\nEurope\n,14694606,+18539,337780,+443,5594370,+7962,8762456,30141,,,,,,Europe,\n,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,,Total:,14694606,+18539,337780,+443,5594370,+7962,8762456,30141,,,,,,Europe,,,
232,,Total:,2026587,,48509,,1709279,,268799,2573,,,,,,Africa,,,
233,,Total:,43074,+10,1003,,32950,,9121,24,,,,,,Australia/Oceania,,,
234,,Total:,721,,15,,659,,47,4,,,,,,,,,


In [60]:
for heading in covid_tbl.find_all("th"):
    print(heading.get_text())

#
Country,Other
TotalCases
NewCases
TotalDeaths
NewDeaths
TotalRecovered
NewRecovered
ActiveCases
Serious,Critical
Tot Cases/1M pop
Deaths/1M pop
TotalTests
Tests/
1M pop

Population
Continent
1 Caseevery X ppl
1 Deathevery X ppl
1 Testevery X ppl


In [None]:
res_df.to_csv("chk.csv")

### Regular expressions

In [61]:
sample_txt = '''

Mr. John.b.Doe
Designation: Sr Software Engineer
DOB: 20-12-1989
email: johndoe@gmail.com
Mob1: 9123456780
Mob2: 8123456780

Working as a Sr Software Engg with ABC Ltd. Total 5+ yrs of experience in Data science & advanced
analytics

'''

In [39]:
import re

In [63]:
for match in re.finditer("john", sample_txt, re.IGNORECASE):
    print(match)

<re.Match object; span=(6, 10), match='John'>
<re.Match object; span=(74, 78), match='john'>


In [64]:
for match in re.finditer(" John ", sample_txt):
    print(match)

In [None]:
# basic search

In [65]:
matches = re.finditer("john",sample_txt,re.IGNORECASE)

In [66]:
for match in matches:
    print(match)

<re.Match object; span=(6, 10), match='John'>
<re.Match object; span=(74, 78), match='john'>


In [40]:
def quick_pat(pat,sample_txt):
    matches = re.finditer(pat,sample_txt,re.IGNORECASE)
    for match in matches:
        print(match)

In [69]:
quick_pat("john")

<re.Match object; span=(6, 10), match='John'>
<re.Match object; span=(74, 78), match='john'>


In [70]:
# what if we only want to match the name 
# use the word boundary indicator
quick_pat(r"\bjohn\b")

<re.Match object; span=(6, 10), match='John'>


In [None]:
# Q: Any alternate approach to extract full match and exclude partial one?

In [None]:
# find mobile no
quick_pat(r"\d{10}")

In [75]:
# alternatively, using character set
quick_pat(r"[9][0-9]{9}")

<re.Match object; span=(98, 108), match='9123456780'>


In [76]:
sample_txt = '''

abc@gmail.com    Rs.1000
klh_564@gmail.com  Rs.2000
bh.glk@yahoo.co.in Rs.3000
bh.glk@abcltd.in Rs.3000

'''

In [80]:
# find all emails
quick_pat(r"\w+@\w+\.(com|co.in)")

<re.Match object; span=(2, 15), match='abc@gmail.com'>
<re.Match object; span=(27, 44), match='klh_564@gmail.com'>
<re.Match object; span=(57, 72), match='glk@yahoo.co.in'>


In [81]:
quick_pat(r"[a-zA-Z._0-9]+@\w+\.(com|co.in)")

<re.Match object; span=(2, 15), match='abc@gmail.com'>
<re.Match object; span=(27, 44), match='klh_564@gmail.com'>
<re.Match object; span=(54, 72), match='bh.glk@yahoo.co.in'>


In [None]:
# find emails (only gmail or yahoo..)

In [None]:
quick_pat(r"[a-zA-Z._0-9]{1,}@(gmail|yahoo)\.(com|co.in)")

In [None]:
quick_pat(r"\w{1,}@(gmail|yahoo)\.(com|co.in)")

In [1]:
### Find IP address from the text

In [2]:
#https://datasetsearch.research.google.com/search?query=Server%20logs&docid=e82RkTuD5g%2BSYjjjAAAAAA%3D%3D

In [35]:
with open("access_log_sample.txt","r") as f:
    content = f.readlines()

In [42]:
content[0]

'54.36.149.41 - - [22/Jan/2019:03:56:14 +0330] "GET /filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,27|%DA%A9%D9%85%D8%AA%D8%B1%20%D8%A7%D8%B2%205%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,p53 HTTP/1.1" 200 30577 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)" "-"\n'

In [45]:
for i in content:
    quick_pat(r'\d{,3}\.\d{,3}\.\d{,3}\.\d{,3}',i)
    

<re.Match object; span=(0, 12), match='54.36.149.41'>
<re.Match object; span=(0, 11), match='31.56.96.51'>
<re.Match object; span=(0, 11), match='31.56.96.51'>
<re.Match object; span=(0, 13), match='40.77.167.129'>
<re.Match object; span=(0, 11), match='91.99.72.15'>
<re.Match object; span=(0, 13), match='40.77.167.129'>
<re.Match object; span=(0, 13), match='40.77.167.129'>
<re.Match object; span=(0, 13), match='40.77.167.129'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='40.77.167.129'>
<re.Match object; span=(0, 13), match='207.46.13.136'>
<re.Match object; span=(0, 13), match='40.77.167.129'>
<re.Match object; span=(0, 13), match='178.253.33.51'>
<re.Match object; span=(0, 13), match='40.77.167.129'>
<re.Match object; span=(0, 11), match='91.99.72.15'>
<re.Match object; span=(0, 13), match='40.77.167.129'>
<re.Match object; span=(0, 13), match='207.46.13.136'>
<re.Match object; span=(0, 13), match='40.77.167.129'>
<re.Match object; s

<re.Match object; span=(0, 12), match='17.58.102.43'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 13), match='66.111.54.249'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='54.36.148.248'>
<re.Match object; span=(0, 13), match='207.46.13.104'>
<re.Match object; span=(0, 13), match

<re.Match object; span=(0, 13), match='207.46.13.136'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; span=(0, 12), match='54.36.148.71'>
<re.Match object; span=(0, 14), match='130.185.74.243'>
<re.Match object; span=(0, 13), match='207.46.13.136'>
<re.Match object; span=(0, 12), match='5.160.157.20'>
<re.Match object; span=(0, 12), match='5.160.157.20'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='5.127.182.189'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 11), match='5.211.97.39'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 12), match='54.36.149.40'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; span=(0, 14), match='130.185.74.243'>
<re.Match object; span=(0, 13), match='5.127.182.189'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; spa

<re.Match object; span=(0, 13), match='5.117.242.204'>
<re.Match object; span=(0, 13), match='5.117.242.204'>
<re.Match object; span=(0, 14), match='130.185.74.243'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 14), match='130.185.74.243'>
<re.Match object; span=(0, 13), match='204.18.36.198'>
<re.Match object; span=(0, 14), match='130.185.74.243'>
<re.Match object; span=(0, 14), match='130.185.74.243'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 11), match='51.15.15.54'>
<re.Match object; span=(0, 14), match='130.185.74.243'>
<re.Match object; span=(0, 12), match='17.58.102.43'>
<re.Match object; span=(0, 14), match='130.185.74.243'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match

<re.Match object; span=(0, 12), match='86.55.46.218'>
<re.Match object; span=(0, 12), match='86.55.46.218'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; span=(0, 11), match='51.75.93.31'>
<re.Match object; span=(0, 12), match='86.55.46.218'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; span=(0, 12), match='79.137.62.29'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='91.251.220.58'>
<re.Match object; span=(0, 12), match='86.55.46.218'>
<re.Match object; span=(0, 11), match='37.98.49.55'>
<re.Match object; span=(0, 12), match='86.55.46.218'>
<re.Match object; span=(0, 12), match='86.55.46.218'>
<re.Match object; span=(0, 12), match='86.55.46.218'>
<re.Match object; span=(0, 12), match='79.137.62.29'>
<re.Match object; span=(0, 11), match='37.98.49.55'>
<re.Match object; span=(0, 11), match='37.98.49.55'>
<re.Match object; span=(0, 11), match='37.98.49.55'>
<re.Match object; span=(0, 12),

<re.Match object; span=(0, 11), match='91.99.72.15'>
<re.Match object; span=(0, 12), match='5.115.54.117'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; span=(0, 13), match='188.158.38.30'>
<re.Match object; span=(0, 11), match='91.99.72.15'>
<re.Match object; span=(182, 193), match='16.0.912.36'>
<re.Match object; span=(0, 11), match='5.123.88.42'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='63.143.42.246'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; span=(0, 12), match='66.249.66.91'>
<re.Match object; span=(0, 13), match='63.143.42.246'>
<re.Match object; span=(0, 11), match='91.99.72.15'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 11), match='91.99.72.15'>
<re.Match object; span=

<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 11), match='5.122.95.74'>
<re.Match object; span=(0, 12), match='54.36.148.55'>
<re.Match object; span=(0, 11), match='5.122.95.74'>
<re.Match object; span=(0, 13), match='172.80.205.91'>
<re.Match object; span=(0, 13), match='172.80.205.91'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='83.121.232.38'>
<re.Match object; span=(0, 12), match='172.20.2.174'>
<re.Match object; span=(0, 13), match='172.80.205.91'>
<re.Match object; span=(0, 13), match='83.121.232.38'>
<re.Match object; span=(0, 13), match='172.80.205.91'>
<re.Match object; span=(0, 12), match='54.36.148.95'>
<re.Match object; span=(0, 13), match='66.249.66.194'>
<re.Match object; span=(0, 13), match='172.80.205.91'>
<re.Match object; span=(0, 13), match='172.80.205.91'>
<re.Match object;

In [74]:
res_df = pd.DataFrame({"ip":all_ips,"ts":all_dates})