# SOLUTION KEY

### Problem 5: Write a python program to scrape cricket rankings from ‘www.icc-cricket.com’.

1. Top 10 ODI teams in men’s cricket along with the records for matches, points and rating.

In [42]:
# Note: ICC highlights first ranked team with blue background and larger font so it will have to be scraped separately
#       from the other teams.

from bs4 import BeautifulSoup
import requests
import pandas as pd
# Fetch the men's ODI team-rankings page and parse the HTML with lxml.
# The scraping helpers defined in the next cell read this module-level `soup`.
page = requests.get(
    'https://www.icc-cricket.com/rankings/mens/team-rankings/odi'
)
soup = BeautifulSoup(markup=page.content, features='lxml')

# One accumulator list per output column: team name, matches, points, rating.
name_list, match_list, points_list, rating_list = [], [], [], []

In [43]:
# Make some functions
def scrape_two_loops(tag, clss, subtag, lst, subclss='',  num=None, source=None):
    """Append the text of nested tags to ``lst`` (two-level search).

    Every ``tag`` element with class ``clss`` is located in the parsed
    document; for each of the first ``num`` matches, the ``.text`` of every
    ``subtag`` found inside it (filtered with ``class_=subclss``) is appended
    to ``lst``.

    Parameters
    ----------
    tag : str
        Name of the outer tag to search for.
    clss : str
        Class string passed as ``class_`` for the outer search.
    subtag : str
        Name of the tag searched for inside each outer match.
    lst : list
        Output list; it is extended in place.
    subclss : str, optional
        Class string passed as ``class_`` for the inner search. The default
        ``''`` is forwarded unchanged.
        NOTE(review): with BeautifulSoup an empty class filter appears to
        select only tags that carry no class attribute -- confirm.
    num : int or None, optional
        Upper bound on the number of outer matches used (``None`` = all).
    source : optional
        Object exposing ``find_all`` to search in. When omitted, the
        module-level ``soup`` is used, which keeps existing calls working.

    Returns
    -------
    None
    """
    # Fall back to the notebook-global document only when no explicit one is given.
    document = soup if source is None else source

    # We don't always need all the results, so the slice narrows them down.
    for outer in document.find_all(tag, class_=clss)[:num]:
        lst.extend(inner.text for inner in outer.find_all(subtag, class_=subclss))
        
def scrape_one_loop(tag, clss, lst, num=None, source=None):
    """Append the text of matching tags to ``lst`` (single-level search).

    Parameters
    ----------
    tag : str
        Name of the tag to search for.
    clss : str
        Class string passed as ``class_`` to ``find_all``.
    lst : list
        Output list; it is extended in place.
    num : int or None, optional
        Upper bound on the number of matches used (``None`` = all).
    source : optional
        Object exposing ``find_all`` to search in. When omitted, the
        module-level ``soup`` is used, which keeps existing calls working.

    Returns
    -------
    None
    """
    # Fall back to the notebook-global document only when no explicit one is given.
    document = soup if source is None else source
    lst.extend(match.text for match in document.find_all(tag, class_=clss)[:num])
        
# The first-ranked team is rendered in its own banner row, so its four values
# are collected first, one entry per list.

# Team name: the text of the 'u-hide-phablet' <span> inside the banner name cell
scrape_two_loops('td', 'rankings-block__banner--team-name', 'span', name_list, 'u-hide-phablet', 1)

# Rating, matches and points each come from their own banner cell
for banner_class, target_list in (
    ('rankings-block__banner--rating u-text-right', rating_list),
    ('rankings-block__banner--matches', match_list),
    ('rankings-block__banner--points', points_list),
):
    scrape_one_loop('td', banner_class, target_list, 1)

In [44]:
# Ranks 2-10 come from the ordinary table rows, i.e. nine more teams.
remaining_teams = 9

# Matches and points cells share the same tag and class, so both are gathered
# into one interleaved list here and split apart afterwards.
# Two cells per team -> twice as many entries as teams.
match_point = []
scrape_one_loop('td', 'table-body__cell u-center-text', match_point, 2 * remaining_teams)

# Team names
scrape_two_loops('td', 'table-body__cell rankings-table__team', 'span', name_list, 'u-hide-phablet', remaining_teams)

# Ratings
scrape_one_loop('td', 'table-body__cell u-text-right rating', rating_list, remaining_teams)

In [45]:
# De-interleave match_point: even positions hold matches, odd positions points.
match_list += match_point[0::2]
points_list += match_point[1::2]

# The banner rating text carries surrounding whitespace/newlines; trim it.
rating_list[0] = rating_list[0].strip()

# Assemble the table in one step, one column per list.
icc_odi = pd.DataFrame({
    'Team': name_list,
    'Matches': match_list,
    'Points': points_list,
    'Rating': rating_list,
})

# Shift the index so that it starts at 1 instead of 0.
icc_odi.index = icc_odi.index + 1
icc_odi

Unnamed: 0,Team,Matches,Points,Rating
1,New Zealand,17,2054,121
2,England,32,3793,119
3,Australia,28,3244,116
4,India,32,3624,113
5,South Africa,25,2459,98
6,Pakistan,27,2524,93
7,Bangladesh,30,2740,91
8,West Indies,30,2523,84
9,Sri Lanka,32,2657,83
10,Afghanistan,17,1054,62


2. Top 10 ODI batsmen in men’s cricket along with the records of their team and rating.

In [28]:
# Fetch and parse the men's ODI batting rankings. The parser is named
# explicitly (as in the team-rankings cell) so the parse tree does not depend
# on which parser BeautifulSoup happens to pick on a given machine.
page = requests.get('https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting')
soup = BeautifulSoup(page.content, 'lxml')

# Let us create 3 lists
name_list=[]
team_list=[]
rating_list=[]

# The first-ranked player is shown in a separate banner, so we get that first
scrape_one_loop('div', 'rankings-block__banner--name-large', name_list, 1)
scrape_one_loop('div', 'rankings-block__banner--nationality', team_list, 1)
scrape_one_loop('div', 'rankings-block__banner--rating', rating_list, 1)

# Now for the rest of them: the name is the text of each <a> inside a name cell
storage = soup.find_all('td', class_='table-body__cell rankings-table__name name')
for i in storage[:9]:
    for j in i.find_all('a'):
        name_list.append(j.text)

scrape_one_loop('span', 'table-body__logo-text', team_list, 9)
scrape_one_loop('td', 'table-body__cell rating', rating_list, 9)

# Remove surrounding whitespace from the first entry in team_list (the banner nationality)
team_list[0] = team_list[0].strip()

# The dataframe
batsmen = pd.DataFrame({})

batsmen['Batsman'] = name_list
batsmen['Team'] = team_list
batsmen['Rating'] = rating_list

# Let us start the index from 1
batsmen.index += 1

batsmen

Unnamed: 0,Batsman,Team,Rating
1,Babar Azam,PAK,873
2,Virat Kohli,IND,844
3,Rohit Sharma,IND,813
4,Ross Taylor,NZ,801
5,Aaron Finch,AUS,779
6,Jonny Bairstow,ENG,775
7,David Warner,AUS,762
8,Shai Hope,WI,758
9,Kane Williamson,NZ,754
10,Quinton de Kock,SA,747


3. Top 10 ODI bowlers along with the records of their team and rating.

In [29]:
# Fetch and parse the men's ODI bowling rankings. The parser is named
# explicitly (as in the team-rankings cell) so the parse tree does not depend
# on which parser BeautifulSoup happens to pick on a given machine.
page = requests.get('https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling')
soup = BeautifulSoup(page.content, 'lxml')

# Let us create 3 lists
name_list=[]
team_list=[]
rating_list=[]

# The first-ranked player is shown in a separate banner, so we get that first
scrape_one_loop('div', 'rankings-block__banner--name-large', name_list, 1)
scrape_one_loop('div', 'rankings-block__banner--nationality', team_list, 1)
scrape_one_loop('div', 'rankings-block__banner--rating', rating_list, 1)

# Now for the rest of them: the name is the text of each <a> inside a name cell
storage = soup.find_all('td', class_='table-body__cell rankings-table__name name')
for i in storage[:9]:
    for j in i.find_all('a'):
        name_list.append(j.text)

scrape_one_loop('span', 'table-body__logo-text', team_list, 9)
scrape_one_loop('td', 'table-body__cell rating', rating_list, 9)

# Remove surrounding whitespace from the first entry in team_list (the banner nationality)
team_list[0] = team_list[0].strip()

# The dataframe
bowlers = pd.DataFrame({})

bowlers['Bowlers'] = name_list
bowlers['Team'] = team_list
bowlers['Rating'] = rating_list

# Let us start the index from 1
bowlers.index += 1

bowlers

Unnamed: 0,Bowlers,Team,Rating
1,Trent Boult,NZ,737
2,Josh Hazlewood,AUS,709
3,Mujeeb Ur Rahman,AFG,708
4,Chris Woakes,ENG,700
5,Mehedi Hasan,BAN,692
6,Matt Henry,NZ,691
7,Jasprit Bumrah,IND,679
8,Mitchell Starc,AUS,652
9,Shakib Al Hasan,BAN,650
10,Kagiso Rabada,SA,646


### Problem 6: Write a python program to scrape cricket rankings from ‘www.icc-cricket.com’.

1. Top 10 ODI teams in women’s cricket along with the records for matches, points and rating.

In [48]:
# Fetch and parse the women's ODI team rankings. The parser is named
# explicitly (as in the men's team-rankings cell) so the parse tree does not
# depend on which parser BeautifulSoup happens to pick on a given machine.
page = requests.get('https://www.icc-cricket.com/rankings/womens/team-rankings/odi')
soup = BeautifulSoup(page.content, 'lxml')

# Let us create lists for the 4 columns
name_list=[]
match_list=[]
points_list=[]
rating_list=[]

# Let's add first entry in each list (the first-ranked team has its own banner row)

# Name
scrape_two_loops('td', 'rankings-block__banner--team-name', 'span', name_list, 'u-hide-phablet',  1)

# Rating, Matches, and Points
scrape_one_loop('td', 'rankings-block__banner--rating u-text-right', rating_list, 1)
scrape_one_loop('td', 'rankings-block__banner--matches', match_list, 1)
scrape_one_loop('td', 'rankings-block__banner--points', points_list, 1)


# Let us add the rest of the top 10

# Team
scrape_two_loops('td', 'table-body__cell rankings-table__team', 'span', name_list, 'u-hide-phablet', 9)

# Matches and Points
# Note: Both matches and points have the same class and tag so we need to
# separate them later. For now use a new list.
match_point=[]

# We use 18 because there are two entries per team
scrape_one_loop('td', 'table-body__cell u-center-text', match_point, 18)

# Rating
scrape_one_loop('td', 'table-body__cell u-text-right rating', rating_list, 9)

# Let us separate match_point into its two constituents
# (even positions are matches, odd positions are points)
match_list.extend(match_point[::2])
points_list.extend(match_point[1::2])

# Let us remove whitespace and \n from the first entry in rating_list
rating_list[0] = rating_list[0].strip()

# Now to make the dataframe
# NOTE: this rebinds `icc_odi`, replacing the men's table built earlier.
icc_odi = pd.DataFrame({})
icc_odi['Team'] = name_list
icc_odi['Matches'] = match_list
icc_odi['Points'] = points_list
icc_odi['Rating'] = rating_list

# Let us start the index from 1
icc_odi.index+=1

icc_odi

Unnamed: 0,Team,Matches,Points,Rating
1,Australia,19,3129,165
2,England,24,2840,118
3,South Africa,29,3390,117
4,India,24,2635,110
5,New Zealand,25,2324,93
6,West Indies,22,1872,85
7,Pakistan,20,1496,75
8,Bangladesh,5,306,61
9,Sri Lanka,11,519,47
10,Ireland,2,25,13


2. Top 10 women’s ODI batters along with the records of their team and rating.

In [31]:
# Fetch and parse the women's ODI batting rankings. The parser is named
# explicitly (as in the team-rankings cell) so the parse tree does not depend
# on which parser BeautifulSoup happens to pick on a given machine.
page = requests.get('https://www.icc-cricket.com/rankings/womens/player-rankings/odi/batting')
soup = BeautifulSoup(page.content, 'lxml')

# Let us create 3 lists
name_list=[]
team_list=[]
rating_list=[]

# The first-ranked player is shown in a separate banner, so we get that first
scrape_one_loop('div', 'rankings-block__banner--name-large', name_list, 1)
scrape_one_loop('div', 'rankings-block__banner--nationality', team_list, 1)
scrape_one_loop('div', 'rankings-block__banner--rating', rating_list, 1)

# Now for the rest of them: the name is the text of each <a> inside a name cell
storage = soup.find_all('td', class_='table-body__cell rankings-table__name name')
for i in storage[:9]:
    for j in i.find_all('a'):
        name_list.append(j.text)

scrape_one_loop('span', 'table-body__logo-text', team_list, 9)
scrape_one_loop('td', 'table-body__cell rating', rating_list, 9)

# Remove surrounding whitespace from the first entry in team_list (the banner nationality)
team_list[0] = team_list[0].strip()

# The dataframe
batter = pd.DataFrame({})

batter['Batting'] = name_list
batter['Team'] = team_list
batter['Rating'] = rating_list

# Let us start the index from 1
batter.index += 1

batter

Unnamed: 0,Batting,Team,Rating
1,Mithali Raj,IND,762
2,Lizelle Lee,SA,761
3,Alyssa Healy,AUS,756
4,Tammy Beaumont,ENG,740
5,Amy Satterthwaite,NZ,726
6,Meg Lanning,AUS,723
7,Smriti Mandhana,IND,701
8,Ellyse Perry,AUS,679
9,Laura Wolvaardt,SA,672
10,Natalie Sciver,ENG,672


3. Top 10 women’s ODI all-rounders along with the records of their team and rating.

In [32]:
# Fetch and parse the women's ODI all-rounder rankings. The parser is named
# explicitly (as in the team-rankings cell) so the parse tree does not depend
# on which parser BeautifulSoup happens to pick on a given machine.
page = requests.get('https://www.icc-cricket.com/rankings/womens/player-rankings/odi/all-rounder')
soup = BeautifulSoup(page.content, 'lxml')

# Let us create 3 lists
name_list=[]
team_list=[]
rating_list=[]

# The first-ranked player is shown in a separate banner, so we get that first
scrape_one_loop('div', 'rankings-block__banner--name-large', name_list, 1)
scrape_one_loop('div', 'rankings-block__banner--nationality', team_list, 1)
scrape_one_loop('div', 'rankings-block__banner--rating', rating_list, 1)

# Now for the rest of them: the name is the text of each <a> inside a name cell
storage = soup.find_all('td', class_='table-body__cell rankings-table__name name')
for i in storage[:9]:
    for j in i.find_all('a'):
        name_list.append(j.text)

scrape_one_loop('span', 'table-body__logo-text', team_list, 9)
scrape_one_loop('td', 'table-body__cell rating', rating_list, 9)

# Remove surrounding whitespace from the first entry in team_list (the banner nationality)
team_list[0] = team_list[0].strip()

# The dataframe
all_rounder = pd.DataFrame({})

all_rounder['All Rounder'] = name_list
all_rounder['Team'] = team_list
all_rounder['Rating'] = rating_list

# Let us start the index from 1
all_rounder.index += 1

all_rounder

Unnamed: 0,All Rounder,Team,Rating
1,Ellyse Perry,AUS,418
2,Marizanne Kapp,SA,384
3,Natalie Sciver,ENG,380
4,Deepti Sharma,IND,331
5,Stafanie Taylor,WI,322
6,Jess Jonassen,AUS,307
7,Dane van Niekerk,SA,274
8,Sophie Devine,NZ,254
9,Katherine Brunt,ENG,254
10,Ashleigh Gardner,AUS,252


### Problem 7: Write a python program to scrape details of all the mobile phones under Rs. 20,000 listed on Amazon.in. The scraped data should include Product Name, Price, Image URL and Average Rating.

In [64]:
# Fetch and parse the Amazon.in search results. The parser is named explicitly
# (as in the first ICC cell) so the parse tree does not depend on which parser
# BeautifulSoup happens to pick on a given machine.
page= requests.get('https://www.amazon.in/s?k=mobile+phones+under+20000&i=electronics&ref=nb_sb_noss_1')
soup = BeautifulSoup(page.content, 'lxml')

# Let us create the lists required
name_list=[]
price_list=[]
image_list=[]
rating_list=[]

# NOTE(review): each column below is scraped independently across the whole
# page, so rows only line up if every product exposes all four fields --
# verify the list lengths (next cell) before trusting the table.

# Product Name
scrape_one_loop('span', 'a-size-medium a-color-base a-text-normal', name_list)

# Price
scrape_one_loop('span', 'a-price-whole', price_list)

# Image URL: taken from the `src` attribute of each product image
storage = soup.find_all('img', class_='s-image')
for i in storage:
    image_list.append(i.get('src'))

# Average Rating
scrape_one_loop( 'span', 'a-icon-alt', rating_list)

# Removing the excess entries from rating_list
# NOTE(review): the count of 4 trailing non-product entries is specific to the
# page as it was scraped -- confirm whenever the cell is re-run.
rating_list = rating_list[:-4]

In [65]:
# Sanity check: all four columns must have the same number of entries.
tuple(len(column) for column in (name_list, price_list, image_list, rating_list))

(24, 24, 24, 24)

In [67]:
# The four lists have equal length, so they can be combined column-wise.
phones = pd.DataFrame({
    'Product Name': name_list,
    'Price': price_list,
    'Imag_URL': image_list,
    'Average Rating': rating_list,
})

# Shift the index so that it starts at 1 instead of 0.
phones.index = phones.index + 1

phones

Unnamed: 0,Product Name,Price,Imag_URL,Average Rating
1,"Redmi 9A (Nature Green, 2GB RAM, 32GB Storage)...",6999,https://m.media-amazon.com/images/I/71sxlhYhKW...,4.2 out of 5 stars
2,"Redmi 9 (Sky Blue, 4GB RAM, 64GB Storage) | 2....",9499,https://m.media-amazon.com/images/I/71A9Vo1Bat...,4.2 out of 5 stars
3,"OPPO A31 (Fantasy White, 6GB RAM, 128GB Storag...",12990,https://m.media-amazon.com/images/I/61CnyJ-IbM...,4.2 out of 5 stars
4,"Samsung Galaxy M31 (Ocean Blue, 8GB RAM, 128GB...",16999,https://m.media-amazon.com/images/I/71-Su4Wr0H...,4.2 out of 5 stars
5,"Redmi 9 (Carbon Black, 4GB RAM, 64GB Storage) ...",9499,https://m.media-amazon.com/images/I/716nHhG9SW...,4.2 out of 5 stars
6,"Redmi 9 Power (Mighty Black, 6GB RAM, 128GB St...",13499,https://m.media-amazon.com/images/I/61LHaUOheh...,4.2 out of 5 stars
7,"Redmi 9A (Midnight Black, 2GB RAM, 32GB Storag...",6999,https://m.media-amazon.com/images/I/71sxlhYhKW...,4.3 out of 5 stars
8,Redmi 9A (Sea Blue 2GB RAM 32GB Storage) | 2GH...,6999,https://m.media-amazon.com/images/I/71sxlhYhKW...,4.2 out of 5 stars
9,"OPPO A74 5G (Fantastic Purple,6GB RAM,128GB St...",17990,https://m.media-amazon.com/images/I/71geVdy6-O...,4.2 out of 5 stars
10,"realme C11 (2021) (Cool Grey, 2GB RAM, 32GB St...",7299,https://m.media-amazon.com/images/I/618UBhFmaQ...,4.1 out of 5 stars


### Problem 8: Write a python program to extract information about the local weather from the National Weather Service website of USA, https://www.weather.gov/ for the city, San Francisco. You need to extract data about 7 day extended forecast display for the city. The data should include period, short description, temperature and description.

In [4]:
# Fetch and parse the forecast page for San Francisco. The parser is named
# explicitly (as in the first ICC cell) so the parse tree does not depend on
# which parser BeautifulSoup happens to pick on a given machine.
page = requests.get('https://forecast.weather.gov/MapClick.php?lat=37.777120000000025&lon=-122.41963999999996#.YU3g-JpBzIU')
soup = BeautifulSoup(page.content, 'lxml')

# Create the required lists
period_list=[]
short_desc_list=[]
temp_list=[]
descr_list=[]

# Period
scrape_one_loop('p', 'period-name', period_list)

# Short description
scrape_one_loop('p', 'short-desc', short_desc_list)

# Temperature
scrape_one_loop('p', 'temp', temp_list)

# Description
scrape_one_loop('div', 'col-sm-10 forecast-text', descr_list)

# The detailed forecast has more entries than the summary tiles, so trim it to
# the number of periods actually scraped instead of a hard-coded count; this
# keeps the four lists aligned if the page shows a different number of periods.
descr_list = descr_list[:len(period_list)]

len(period_list), len(short_desc_list), len(temp_list), len(descr_list)


(9, 9, 9, 9)

In [5]:
# The four lists have equal length, so they can be combined column-wise.
weather = pd.DataFrame({
    'Period': period_list,
    'Short Description': short_desc_list,
    'Temperature': temp_list,
    'Description': descr_list,
})

# Shift the index so that it starts at 1 instead of 0.
weather.index = weather.index + 1

weather

Unnamed: 0,Period,Short Description,Temperature,Description
1,Today,Sunny thenSunny andBreezy,High: 73 °F,"Sunny, with a high near 73. Breezy, with a wes..."
2,Tonight,Partly Cloudy,Low: 56 °F,"Partly cloudy, with a low around 56. West sout..."
3,Saturday,Mostly Sunny,High: 68 °F,"Mostly sunny, with a high near 68. West southw..."
4,SaturdayNight,IncreasingClouds,Low: 55 °F,"Increasing clouds, with a low around 55. West ..."
5,Sunday,Mostly Sunny,High: 65 °F,"Mostly sunny, with a high near 65. West wind 9..."
6,SundayNight,Mostly Cloudy,Low: 55 °F,"Mostly cloudy, with a low around 55."
7,Monday,Mostly Sunny,High: 67 °F,"Mostly sunny, with a high near 67."
8,MondayNight,Slight ChanceRain,Low: 57 °F,A slight chance of rain after 11pm. Mostly cl...
9,Tuesday,Slight ChanceRain thenMostly Sunny,High: 66 °F,A slight chance of rain before 11am. Mostly s...


### Problem 9: Write a python program to scrape fresher job listings from ‘https://internshala.com/’. It should include job title, company name, CTC, and apply date.

In [9]:
# Listing pages are numbered, so the page number is formatted into a URL
# template. (Overwriting the last character of the URL only works for
# single-digit page numbers; the template works for any page count.)
url = 'https://internshala.com/fresher-jobs/page-{}'

# Let us initialise the required lists
job_list=[]
company_list=[]
ctc_list=[]
date_list=[]

# CTC and apply date are grouped together so we will create one bigger list and separate into CTC and date
ctc_date=[]

# There are 7 pages, so we request each one in turn
for x in range(1,8):
    new_url = url.format(x)
    page = requests.get(new_url)
    # Name the parser explicitly so results do not depend on which parser
    # BeautifulSoup happens to pick on a given machine.
    soup = BeautifulSoup(page.content, 'lxml')

    # Title
    scrape_two_loops('div', 'heading_4_5 profile', 'a', job_list)

    # Company
    scrape_two_loops('div', 'heading_6 company_name', 'a', company_list, 'link_display_like_text')

    # CTC & Apply Date
    scrape_two_loops('div', 'other_detail_item', 'div',  ctc_date, 'item_body')

# Now let's separate CTC and Apply Date, once, after every page has been read.
# Each listing contributes three 'item_body' entries; the second is used as
# the CTC and the third as the apply date.
# NOTE(review): the first entry of each triple is skipped -- confirm what it holds.
ctc_list = ctc_date[1::3]
date_list = ctc_date[2::3]

# Remove whitespaces from company and CTC lists
company_list_clean = [entry.strip() for entry in company_list]
ctc_list_clean = [entry.strip() for entry in ctc_list]
    

In [10]:
# Sanity check: all four columns must have the same number of entries.
tuple(map(len, (ctc_list_clean, job_list, date_list, company_list_clean)))

(280, 280, 280, 280)

In [11]:
# The four lists have equal length, so they can be combined column-wise.
freshers = pd.DataFrame({
    'Job': job_list,
    'Company': company_list_clean,
    'CTC': ctc_list_clean,
    'Apply Date': date_list,
})

# Shift the index so that it starts at 1 instead of 0.
freshers.index = freshers.index + 1

freshers

Unnamed: 0,Job,Company,CTC,Apply Date
1,Omni Sport Leader - Running/Walking,Decathlon Sports India,3 - 3.4 LPA,7 Oct' 21
2,Business Development Executive,Creative Group (CG Resettlement Private Limited),3 - 4.8 LPA,24 Oct' 21
3,Junior Social Media Marketing Executive,AMYDUS,3 - 4 LPA,24 Oct' 21
4,Operations Executive,MiStay,3 LPA,24 Oct' 21
5,Data Entry Executive,Deepdive Media Private Limited,3 LPA,24 Oct' 21
...,...,...,...,...
276,Customer Service Executive,InfyBytes AI Labs Private Limited,3 - 4 LPA,27 Sep' 21
277,Business Development Associate,XOOG EDLEARN PRIVATE LIMITED,4.5 - 7 LPA,27 Sep' 21
278,Trainee Software Engineer,NuWare Systems,3 - 4 LPA,26 Sep' 21
279,Associate (Development),Pratham InfoTech Foundation,3 - 3.5 LPA,29 Sep' 21


### Problem 10: Write a python program to scrape house details from https://www.nobroker.in/ for any location. It should include house title, location, area, emi and price

In [89]:
# Fetch and parse the NoBroker search results. The parser is named explicitly
# (as in the first ICC cell) so the parse tree does not depend on which parser
# BeautifulSoup happens to pick on a given machine.
page = requests.get('https://www.nobroker.in/property/sale/mumbai/Thakur%20Village?searchParam=W3sibGF0IjoxOS4yMTA4MzE0LCJsb24iOjcyLjg3NDc0MSwicGxhY2VJZCI6IkNoSUpWUXRGemptMzV6c1J5RUlrVkp4VjJWdyIsInBsYWNlTmFtZSI6IlRoYWt1ciBWaWxsYWdlIn1d&radius=2.0&type=RK1&price=0,100000000')
soup = BeautifulSoup(page.content, 'lxml')

# Let us create lists for the 5 columns
name_list=[]
location_list=[]
area_list=[]
emi_list=[]
price_list=[]

# Names
scrape_two_loops('h2', 'heading-6 font-semi-bold nb__1AShY', 'span', name_list)

# Locations
scrape_one_loop('div','nb__2CMjv', location_list)

# Area in sq feet
scrape_one_loop('div', 'nb__3oNyC', area_list)

# EMI: these elements are selected by id rather than by class,
# so the class-based helpers cannot be used here
storage=soup.find_all('div', id='roomType')
for i in storage:
    emi_list.append(i.text)

# Price: the text of every <span> inside each matching <div>
# (extra values are removed in the next cell)
storage=soup.find_all('div', class_='font-semi-bold heading-6')
for i in storage:
    for j in i.find_all('span'):
        price_list.append(j.text)

In [90]:
# Keep only the entries at odd positions; the ones in between are extra values.
# NOTE: this rebinds price_list, so re-running the cell trims it again.
price_list = [amount for position, amount in enumerate(price_list) if position % 2 == 1]

# Sanity check: all five columns must have the same number of entries.
tuple(len(column) for column in (name_list, location_list, area_list, emi_list, price_list))

(10, 10, 10, 10, 10)

In [91]:
# Combine the five scraped lists column-wise into the final table.
housing = pd.DataFrame({
    'Name': name_list,
    'Location': location_list,
    'Area': area_list,
    'EMI': emi_list,
    'Price': price_list,
})

# Shift the index so that it starts at 1 instead of 0.
housing.index = housing.index + 1

housing

Unnamed: 0,Name,Location,Area,EMI,Price
1,1 RK Flat For Sale In Rna Suncity Phase 1 In...,Thakur village,370 sqft,"₹33,242/Month",₹58 Lacs
2,1 RK Flat For Sale In Rna Suncity Phase 1 In...,Rna Suncity Phase 1,350 sqft,"₹34,388/Month",₹60 Lacs
3,1 RK In Independent House For Sale In Kandi...,"Independent House, Vasant Sagar near Thakur Pu...",275 sqft,"₹18,913/Month",₹33 Lacs
4,1 RK Flat For Sale In Rna Suncity Phase 1 In...,"Kandivali, Phase 1, Thakur Village,",350 sqft,"₹34,961/Month",₹61 Lacs
5,1 RK Flat For Sale In Samarth Sra Co-op Hous...,"Thakur Village, Near Parulekars Gym and guldch...",240 sqft,"₹24,931/Month",₹43.5 Lacs
6,1 RK In Independent House For Sale In Chikha...,"Independent House, Diwaldewjee wadekar chawl",200 sqft,"₹14,328/Month",₹25 Lacs
7,"1 RK Flat For Sale In Gokul Residency , In K...",opp oberoi gardens gokul tower road,450 sqft,"₹47,571/Month",₹83 Lacs
8,1 RK Flat For Sale In Blossom Chs In Kandiva...,"Gunchecha Park, Kandivali, Janupada, Huzefa Na...",440 sqft,"₹40,120/Month",₹70 Lacs
9,1 RK In Independent House For Sale In Kandiv...,"Independent House, Shyamnarayan Thakur Marg,...",350 sqft,"₹20,060/Month",₹35 Lacs
10,"1 RK Flat For Sale In Thakur Village, Kandiv...","Thakur Village, Kandivali East, Mumbai - 40010...",397 sqft,"₹42,985/Month",₹75 Lacs
