# Web Scraping Assignment with Beautiful Soup

In [37]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## 1. Write a python program to display all the header tags from ‘en.wikipedia.org/wiki/Main_Page’.

In [38]:
main_page = requests.get("https://en.wikipedia.org/wiki/Main_Page")
print(main_page) #to check the response

<Response [200]>


In [39]:
soup_wiki = BeautifulSoup(main_page.content)

# extracting the header details
headers = soup_wiki.find_all(["h1","h2","h3","h4","h5","h6"])
header_names = []
for i in headers:
    header_names.append(i.text.replace('\n',''))

# creating a dataframe
df1=pd.DataFrame({})
df1['Headers']=header_names
df1

Unnamed: 0,Headers
0,Main Page
1,From today's featured article
2,Did you know ...
3,In the news
4,On this day
5,Today's featured picture
6,Other areas of Wikipedia
7,Wikipedia's sister projects
8,Wikipedia languages
9,Navigation menu


## 2. Write a python program to display IMDB’s Top rated 100 movies’ data (i.e. Name, IMDB rating, Year of release).

In [40]:
imdb_100_movies_page = requests.get("https://www.imdb.com/list/ls091520106/?sort=list_order,asc&st_dt=&mode=simple&page=1&ref_=ttls_vw_smp")
print(imdb_100_movies_page) #to check to response
soup_imdb = BeautifulSoup(imdb_100_movies_page.content)

<Response [200]>


In [41]:
# extract the movie names, year of release, rating for the movies
top_100_movies = soup_imdb.find_all("span", class_="lister-item-header")
movie_names = []
for i in top_100_movies:
    for j in i.find_all("span"):
        movie_names.append(j.text.replace("\n",""))
name = []
for i in range(1,len(movie_names),3):
    name.append(movie_names[i])
    
year = []
for i in range(2,len(movie_names),3):
    year.append(movie_names[i])
    
imdb_rating = soup_imdb.find_all("div", class_="col-imdb-rating")
rating = []
for i in imdb_rating:
    for j in i.find_all("strong"):
        rating.append(j.text.replace("\n","").strip())
        
# creating the data frame
df2_imdb=pd.DataFrame({})
df2_imdb['Name']=name
df2_imdb['IMDB Rating']=rating
df2_imdb['Year of release']=year
df2_imdb

Unnamed: 0,Name,IMDB Rating,Year of release
0,The Shawshank Redemption(1994),9.3,(1994)
1,The Godfather(1972),9.2,(1972)
2,The Godfather: Part II(1974),9,(1974)
3,The Dark Knight(2008),9,(2008)
4,12 Angry Men(1957),9,(1957)
...,...,...,...
95,North by Northwest(1959),8.3,(1959)
96,A Clockwork Orange(1971),8.3,(1971)
97,Snatch(2000),8.3,(2000)
98,Le fabuleux destin d'Amélie Poulain(2001),8.3,(2001)


## 3. Write a python program to display IMDB’s Top rated 100 Indian movies’ data (i.e. Name, IMDB rating, Year of release).

In [42]:
top_indian_250_movies_page = requests.get("https://www.imdb.com/india/top-rated-indian-movies/?sort=ir,desc&mode=simple&page=1")
print(top_indian_250_movies_page) # to show the response output
soup_imdb_250 = BeautifulSoup(top_indian_250_movies_page.content)

<Response [200]>


In [43]:
# extract the indian movie names, year of release date, indian movie rating
imdb_250_movies = soup_imdb_250.find_all("td", attrs="titleColumn")
indian_movie_names = []
for i in imdb_250_movies[:100]:
    for j in i.find_all("a"):
        indian_movie_names.append(j.text)

year_of_release = []
for i in imdb_250_movies[:100]:
    for j in i.find_all("span"):
        year_of_release.append(j.text)

imdb_indianmovies_rating = soup_imdb_250.find_all("td", class_="ratingColumn imdbRating")
movie_ratings = []
for i in imdb_indianmovies_rating[:100]:
    for j in i.find_all("strong"):
        movie_ratings.append(j.text)
        
# creating the data frame
df3_indian_movies = pd.DataFrame({})
df3_indian_movies["Name"]=indian_movie_names
df3_indian_movies["IMDB rating"]=movie_ratings
df3_indian_movies["Year of release"]=year_of_release
df3_indian_movies

Unnamed: 0,Name,IMDB rating,Year of release
0,Nayakan,8.5,(1987)
1,Anbe Sivam,8.5,(2003)
2,Pariyerum Perumal,8.5,(2018)
3,C/o Kancharapalem,8.5,(2018)
4,Golmaal,8.5,(1979)
...,...,...,...
95,Rang De Basanti,8.1,(2006)
96,Roja,8.1,(1992)
97,Lagaan: Once Upon a Time in India,8.1,(2001)
98,Uri: The Surgical Strike,8.1,(2019)


## 4. Write a python program to scrap book name, author name, genre and book review of any 5 books from ‘www.bookpage.com’

In [44]:
page = requests.get("https://bookpage.com/reviews")
print(page)
soup=BeautifulSoup(page.content)

<Response [200]>


In [45]:
# extract all the book names, all the author details, all the genre details
books=soup.find_all("h4", class_="italic")
booknames = []
for i in books[:5]:
    booknames.append(i.text.replace("★","").strip())

author=soup.find_all("p", class_="sans bold")
authornames = []
for i in author[:5]:
    authornames.append(i.text.replace("\n",""))

genre=soup.find_all("p", class_="genre-links hidden-phone")
genrelist = []
for i in genre[:5]:
    genrelist.append(i.text.replace("\n",""))

# getting all the urls
urls = []
for i in books[:5]:
    for j in i.find_all("a", href=True):
        urls.append(j['href'])

# extract all the reviews for the books
reviewdetails = []
for i in urls[:5]:
    link = requests.get("https://www.bookpage.com"+i)
    soup = BeautifulSoup(link.content, 'html.parser')
    review = soup.find("div", class_="article-body")
    reviewdetails.append(review.text.replace("\n",""))

# checking the length of the columns to create a data frame
print("Length of book names, author names, genre and reviews are:", len(booknames), 
      len(authornames), len(genrelist), len(reviewdetails))

# creating the data frame now
df4=pd.DataFrame({})
df4["Book Name"]=booknames
df4["Author Name"]=authornames
df4["Genre"]=genrelist
df4["Book Review"]=reviewdetails
df4

Length of book names, author names, genre and reviews are: 5 5 5 5


Unnamed: 0,Book Name,Author Name,Genre,Book Review
0,The Secret History of Food,Matt Siegel,Nonfiction / History / Food,When we pour a bowl of cereal or enjoy a dish ...
1,So Many Beginnings,Bethany C. Morrow,YA / YA Fiction,"In So Many Beginnings, Bethany C. Morrow (A So..."
2,Fallout,Steve Sheinkin,Children's / Middle Grade,Bestselling author Steve Sheinkin is best know...
3,Seeing Ghosts,Kat Chow,Nonfiction / Memoir / Family & Relationships,"Early in her debut memoir, Seeing Ghosts, jour..."
4,Hero of Two Worlds,Mike Duncan,Nonfiction / History / American History,"In this engrossing biography, author and histo..."


## 5. Write a python program to scrape cricket rankings from ‘www.icc-cricket.com’. You have to scrape:
## i) Top 10 ODI teams in men’s cricket along with the records for matches, points and rating.
## ii) Top 10 ODI Batsmen in men along with the records of their team and rating.
## iii) Top 10 ODI bowlers along with the records of their team and rating.

### i) Top 10 ODI teams in men’s cricket along with the records for matches, points and rating.

In [46]:
page=requests.get("https://www.icc-cricket.com/rankings/mens/team-rankings/odi")
print(page) # to show the response output
soup=BeautifulSoup(page.content)

# top 10 odi team in men
odi_team_men=soup.find_all("span", class_="u-hide-phablet")
top_10_odi_team_men=[]
for i in odi_team_men[:10]:
    top_10_odi_team_men.append(i.text)

# top 10 odi matches in men
odi_matches_men=soup.find_all("td", class_="rankings-block__banner--matches")
top_10_odi_matches_men=[]
for i in odi_matches_men[:10]:
    top_10_odi_matches_men.append(i.text)
matches_points=[]
odi_matches_scores_men=soup.find_all("td", class_="table-body__cell u-center-text")
for i in odi_matches_scores_men[:18]:
    matches_points.append(i.text)
for i in range(0, len(matches_points), 2):
    top_10_odi_matches_men.append(matches_points[i])

# top 10 odi points in men
odi_points_men=soup.find_all("td", class_="rankings-block__banner--points")
top_10_odi_points_men=[]
for i in odi_points_men:
    top_10_odi_points_men.append(i.text)
for i in range(1, len(matches_points), 2):
    top_10_odi_points_men.append(matches_points[i])

# top 10 odi ratings in men
odi_ratings_men=soup.find_all("td", class_="rankings-block__banner--rating u-text-right")
top_10_odi_ratings_men=[]
for i in odi_ratings_men:
    top_10_odi_ratings_men.append(i.text.replace("\n","").strip())
odi_ratings_men1=soup.find_all("td", class_="table-body__cell u-text-right rating")
for i in odi_ratings_men1[:9]:
    top_10_odi_ratings_men.append(i.text.replace("\n","").strip())

# checking the length of the columns to create a data frame
print(f"Length of Men's ODI team, matches, points and rating are:", len(top_10_odi_team_men), 
      len(top_10_odi_matches_men), len(top_10_odi_points_men), len(top_10_odi_ratings_men))

# creating the data frame now
df_to_10_men_odi_teams=pd.DataFrame({})
df_to_10_men_odi_teams["Top 10 Men's ODI Team Names"]=top_10_odi_team_men
df_to_10_men_odi_teams["Top 10 Men's ODI Team Matches"]=top_10_odi_matches_men
df_to_10_men_odi_teams["Top 10 Men's ODI Team Points"]=top_10_odi_points_men
df_to_10_men_odi_teams["Top 10 Men's ODI Team Ratings"]=top_10_odi_ratings_men
df_to_10_men_odi_teams

<Response [200]>
Length of Men's ODI team, matches, points and rating are: 10 10 10 10


Unnamed: 0,Top 10 Men's ODI Team Names,Top 10 Men's ODI Team Matches,Top 10 Men's ODI Team Points,Top 10 Men's ODI Team Ratings
0,New Zealand,17,2054,121
1,England,32,3793,119
2,Australia,28,3244,116
3,India,32,3624,113
4,South Africa,22,2267,103
5,Pakistan,27,2524,93
6,Bangladesh,29,2639,91
7,West Indies,30,2523,84
8,Sri Lanka,29,2303,79
9,Afghanistan,17,1054,62


### ii) Top 10 ODI Batsmen in men along with the records of their team and rating.

In [47]:
page1=requests.get("https://www.icc-cricket.com/rankings/mens/player-rankings/odi")
print(page1) # to show the response output
soup1=BeautifulSoup(page1.content)

# top 10 odi bats men
odi_matches_bats_men=soup1.find_all("div", class_="rankings-block__banner--name")
top_odi_matches_bats_men=[]
for i in odi_matches_bats_men[:1]:
    top_odi_matches_bats_men.append(i.text)
odi_matches_bats_men1=soup1.find_all("td", class_="table-body__cell name")
for i in odi_matches_bats_men1[:9]:
    for j in i.find_all("a"):
        top_odi_matches_bats_men.append(j.text)

# top 10 odi country bats men
odi_bats_men_country=soup1.find_all("div", class_="rankings-block__banner--nationality")
top_odi_country_bats_men=[]
for i in odi_bats_men_country[:1]:
    top_odi_country_bats_men.append(i.text.replace("\n",""))
top_odi_country_bats_men = list(map(lambda x: x.split(' ')[0], top_odi_country_bats_men))
    # Separating the team and rating inside the list
odi_bats_men_country1=soup1.find_all("span", class_="table-body__logo-text")
for i in odi_bats_men_country1[:9]:
    top_odi_country_bats_men.append(i.text)

# top 10 odi rating bats men
odi_bats_men_rating=soup1.find_all("div", class_="rankings-block__banner--rating")
top_odi_ratings_bats_men=[]
for i in odi_bats_men_rating[:1]:
    top_odi_ratings_bats_men.append(i.text)
odi_bats_men_rating1=soup1.find_all("td", class_="table-body__cell u-text-right rating")
for i in odi_bats_men_rating1[:9]:
    top_odi_ratings_bats_men.append(i.text)
    
# checking the length of the columns
print(f"Length of Top 10 ODI Batsman name, country and rating are:", len(top_odi_matches_bats_men), 
      len(top_odi_country_bats_men), len(top_odi_ratings_bats_men))

# creating a dataframe
df_batsman=pd.DataFrame({})
df_batsman["Top 10 ODI Batsman Names"]=top_odi_matches_bats_men
df_batsman["Top 10 ODI Batsman Country"]=top_odi_country_bats_men
df_batsman["Top 10 ODI Batsman Rating"]=top_odi_ratings_bats_men
df_batsman

<Response [200]>
Length of Top 10 ODI Batsman name, country and rating are: 10 10 10


Unnamed: 0,Top 10 ODI Batsman Names,Top 10 ODI Batsman Country,Top 10 ODI Batsman Rating
0,Babar Azam,PAK,873
1,Virat Kohli,IND,844
2,Rohit Sharma,IND,813
3,Ross Taylor,NZ,801
4,Aaron Finch,AUS,779
5,Jonny Bairstow,ENG,775
6,David Warner,AUS,762
7,Quinton de Kock,SA,758
8,Shai Hope,WI,758
9,Kane Williamson,NZ,754


### iii) Top 10 ODI bowlers along with the records of their team and rating.

In [48]:
# top 10 bowler names in mean
odi_matches_bowler_men=soup1.find_all("div", class_="rankings-block__banner--name")
top_odi_bowler_name_men=[]
for i in odi_matches_bowler_men[1:2]:
    top_odi_bowler_name_men.append(i.text)
odi_matches_bowler_men1=soup1.find_all("td", class_="table-body__cell name")
for i in odi_matches_bowler_men1[9:18]:
    top_odi_bowler_name_men.append(i.text.replace("\n",""))

# top 10 bowler team name in men
odi_team_bowler_men=soup1.find_all("div", class_="rankings-block__banner--nationality")
top_odi_bowler_team_men=[]
for i in odi_team_bowler_men[1:2]:
    top_odi_bowler_team_men.append(i.text.replace("\n",""))
top_odi_bowler_team_men = list(map(lambda x: x.split(' ')[0], top_odi_bowler_team_men))
    # Separating the team and rating inside the list
odi_team_bowler_men1=soup1.find_all("span", class_="table-body__logo-text")
for i in odi_team_bowler_men1[36:45]:
    top_odi_bowler_team_men.append(i.text.replace("\n",""))

# top 10 bowler rating in men
odi_rating_bowler_men=soup1.find_all("div", class_="rankings-block__banner--rating")
top_odi_bowler_rating_men=[]
for i in odi_rating_bowler_men[1:2]:
    top_odi_bowler_rating_men.append(i.text)
odi_rating_bowler_men1=soup1.find_all("td", class_="table-body__cell u-text-right rating")
for i in odi_rating_bowler_men1[36:45]:
    top_odi_bowler_rating_men.append(i.text)

# checking the length of the columns
print(f"Length of Top 10 ODI Bowler's Men name, coutry and rating are:", len(top_odi_bowler_name_men), 
      len(top_odi_bowler_team_men), len(top_odi_bowler_rating_men))

# creating a dataframe
df_bowlers=pd.DataFrame({})
df_bowlers["Top 10 ODI Bowler's Men Names"]= top_odi_bowler_name_men
df_bowlers["Top 10 ODI Bowler's Men Teams"]= top_odi_bowler_team_men
df_bowlers["Top 10 ODI Bowler's Men Ratings"]= top_odi_bowler_rating_men
df_bowlers

Length of Top 10 ODI Bowler's Men name, coutry and rating are: 10 10 10


Unnamed: 0,Top 10 ODI Bowler's Men Names,Top 10 ODI Bowler's Men Teams,Top 10 ODI Bowler's Men Ratings
0,Trent Boult,NZ,737
1,Josh Hazlewood,AUS,709
2,Mujeeb Ur Rahman,AFG,708
3,Chris Woakes,ENG,700
4,Mehedi Hasan,BAN,692
5,Matt Henry,NZ,691
6,Jasprit Bumrah,IND,679
7,Mitchell Starc,AUS,652
8,Shakib Al Hasan,BAN,650
9,Kagiso Rabada,SA,648


## 6. Write a python program to scrape cricket rankings from ‘www.icc-cricket.com’. You have to scrape:
## i) Top 10 ODI teams in women’s cricket along with the records for matches, points and rating.
## ii) Top 10 women’s ODI players along with the records of their team and rating.
## iii) Top 10 women’s ODI all-rounder along with the records of their team and rating.

### i) Top 10 ODI teams in women’s cricket along with the records for matches, points and rating.

In [49]:
page=requests.get("https://www.icc-cricket.com/rankings/womens/team-rankings/odi")
print(page) # to show the response output
soup=BeautifulSoup(page.content)

# top 10 women odi team
women_odi_team=soup.find_all("span", class_="u-hide-phablet")
top_10_women_odi_teams=[]
for i in women_odi_team[:10]:
    top_10_women_odi_teams.append(i.text)

# top 10 women odi matches
women_odi_matches=soup.find_all("td", class_="rankings-block__banner--matches")
top_10_women_odi_matches=[]
for i in women_odi_matches:
    top_10_women_odi_matches.append(i.text)

# top 10 women odi scores
women_odi_matches1=soup.find_all("td", class_="table-body__cell u-center-text")
matches_and_scores=[]
for i in women_odi_matches1:
    matches_and_scores.append(i.text)
for i in range(0, len(matches_and_scores), 2):
    top_10_women_odi_matches.append(matches_and_scores[i])
women_odi_points=soup.find_all("td", class_="rankings-block__banner--points")
top_10_women_odi_points=[]
for i in women_odi_points:
    top_10_women_odi_points.append(i.text)
for i in range(1, len(matches_and_scores), 2):
    top_10_women_odi_points.append(matches_and_scores[i])

# top 10 women odi ratings
women_odi_ratings=soup.find_all("td", class_="rankings-block__banner--rating u-text-right")
top_10_women_odi_ratings=[]
for i in women_odi_ratings:
    top_10_women_odi_ratings.append(i.text.replace("\n","").strip())
women_odi_ratings1=soup.find_all("td", class_="table-body__cell u-text-right rating")
for i in women_odi_ratings1:
    top_10_women_odi_ratings.append(i.text.replace("\n","").strip())

# checking the length of the columns
print(f"Length of Women's ODI team, matches, points and ratings are:", len(top_10_women_odi_teams), 
      len(top_10_women_odi_matches), len(top_10_women_odi_points), len(top_10_women_odi_ratings))

# creating the dataframe
df_to_10_women_odi_teams=pd.DataFrame({})
df_to_10_women_odi_teams["Top 10 Women's ODI Team Names"]=top_10_women_odi_teams
df_to_10_women_odi_teams["Top 10 Women's ODI Team Matches"]=top_10_women_odi_matches
df_to_10_women_odi_teams["Top 10 Women's ODI Team Points"]=top_10_women_odi_points
df_to_10_women_odi_teams["Top 10 Women's ODI Team Ratings"]=top_10_women_odi_ratings
df_to_10_women_odi_teams

<Response [200]>
Length of Women's ODI team, matches, points and ratings are: 10 10 10 10


Unnamed: 0,Top 10 Women's ODI Team Names,Top 10 Women's ODI Team Matches,Top 10 Women's ODI Team Points,Top 10 Women's ODI Team Ratings
0,Australia,18,2955,164
1,England,20,2370,119
2,South Africa,24,2828,118
3,India,23,2535,110
4,New Zealand,21,1947,93
5,West Indies,17,1427,84
6,Pakistan,20,1496,75
7,Bangladesh,5,306,61
8,Sri Lanka,11,519,47
9,Ireland,2,25,13


### ii) Top 10 women’s ODI players along with the records of their team and rating.

In [50]:
page2=requests.get("https://www.icc-cricket.com/rankings/womens/player-rankings/odi")
print(page2) # to check the respose output
soup2=BeautifulSoup(page2.content)

# top 10 odi bats women names
women_odi_bat_name=soup2.find_all("div", class_="rankings-block__banner--name")
top_10_women_odi_bat_name=[]
for i in women_odi_bat_name[0:1]:
    top_10_women_odi_bat_name.append(i.text)
women_odi_bat_name1=soup2.find_all("td", class_="table-body__cell name")
for i in women_odi_bat_name1[0:9]:
    top_10_women_odi_bat_name.append(i.text.replace("\n",""))

# top 10 odi bats women teams
women_odi_bat_team=soup2.find_all("div", class_="rankings-block__banner--nationality")
top_10_women_odi_bat_team=[]
for i in women_odi_bat_team[0:1]:
    top_10_women_odi_bat_team.append(i.text.replace("\n",""))
top_10_women_odi_bat_team = list(map(lambda x: x.split(' ')[0], top_10_women_odi_bat_team))
    # Separating the team and rating inside the list
women_odi_bat_team1=soup2.find_all("span", class_="table-body__logo-text")
for i in women_odi_bat_team1[:9]:
    top_10_women_odi_bat_team.append(i.text.replace("\n",""))

# top 10 odi bats women ratings
women_odi_bat_rating=soup2.find_all("div", class_="rankings-block__banner--rating")
top_10_women_odi_bat_rating=[]
for i in women_odi_bat_rating[0:1]:
    top_10_women_odi_bat_rating.append(i.text)
women_odi_bat_rating1=soup2.find_all("td", class_="table-body__cell u-text-right rating")
for i in women_odi_bat_rating1[:9]:
    top_10_women_odi_bat_rating.append(i.text)

# checking the length of the columns
print(f"Length of Top 10 Women ODI Batting players, team and ratings are:", len(top_10_women_odi_bat_name), 
      len(top_10_women_odi_bat_team), len(top_10_women_odi_bat_rating))

# creating the dataframe
df_batswomen=pd.DataFrame({})
df_batswomen["Top 10 Women's ODI Batswomen Names"]=top_10_women_odi_bat_name
df_batswomen["Top 10 Women's ODI Batswomen Teams"]=top_10_women_odi_bat_team
df_batswomen["Top 10 Women's ODI Batswomen Ratings"]=top_10_women_odi_bat_rating
df_batswomen

<Response [200]>
Length of Top 10 Women ODI Batting players, team and ratings are: 10 10 10


Unnamed: 0,Top 10 Women's ODI Batswomen Names,Top 10 Women's ODI Batswomen Teams,Top 10 Women's ODI Batswomen Ratings
0,Mithali Raj,IND,762
1,Lizelle Lee,SA,758
2,Alyssa Healy,AUS,756
3,Tammy Beaumont,ENG,754
4,Stafanie Taylor,WI,736
5,Meg Lanning,AUS,723
6,Amy Satterthwaite,NZ,715
7,Natalie Sciver,ENG,706
8,Smriti Mandhana,IND,701
9,Laura Wolvaardt,SA,683


### iii) Top 10 women’s ODI all-rounder along with the records of their team and rating.

In [51]:
# top 10 odi bowler names in women
women_odi_bowler_name=soup2.find_all("div", class_="rankings-block__banner--name")
top_10_women_odi_bowler_name=[]
for i in women_odi_bowler_name[1:2]:
    top_10_women_odi_bowler_name.append(i.text)
women_odi_bowler_name1=soup2.find_all("td", class_="table-body__cell name")
for i in women_odi_bowler_name1[9:18]:
    top_10_women_odi_bowler_name.append(i.text.replace("\n",""))
    
# top 10 odi bowler teams in women
women_odi_bowler_team=soup2.find_all("div", class_="rankings-block__banner--nationality")
top_10_women_odi_bowler_teams=[]
for i in women_odi_bowler_team[1:2]:
    top_10_women_odi_bowler_teams.append(i.text.replace("\n",""))
top_10_women_odi_bowler_teams = list(map(lambda x: x.split(' ')[0], top_10_women_odi_bowler_teams))
    # Separating the team and rating inside the list
women_odi_bowler_team1=soup2.find_all("span", class_="table-body__logo-text")
for i in women_odi_bowler_team1[9:18]:
    top_10_women_odi_bowler_teams.append(i.text.replace("\n",""))

# top 10 odi bowler ratings in women
women_odi_bowler_rating=soup2.find_all("div", class_="rankings-block__banner--rating")
top_10_women_odi_bowler_ratings=[]
for i in women_odi_bowler_rating[1:2]:
    top_10_women_odi_bowler_ratings.append(i.text)
women_odi_bowler_rating1=soup2.find_all("td", class_="table-body__cell u-text-right rating")
for i in women_odi_bowler_rating1[9:18]:
    top_10_women_odi_bowler_ratings.append(i.text)
    
# checking the length of the columns
print(f"Length of Top 10 Women ODI Bowling players, team and ratings are:", len(top_10_women_odi_bowler_name), 
      len(top_10_women_odi_bowler_teams), len(top_10_women_odi_bowler_ratings))

# creating the dataframe
df_women_bowlers=pd.DataFrame({})
df_women_bowlers["Top 10 Women's ODI Bowler Names"]=top_10_women_odi_bowler_name
df_women_bowlers["Top 10 Women's ODI Bowler Teams"]=top_10_women_odi_bowler_teams
df_women_bowlers["Top 10 Women's ODI Bowler Ratings"]=top_10_women_odi_bowler_ratings
df_women_bowlers

Length of Top 10 Women ODI Bowling players, team and ratings are: 10 10 10


Unnamed: 0,Top 10 Women's ODI Bowler Names,Top 10 Women's ODI Bowler Teams,Top 10 Women's ODI Bowler Ratings
0,Jess Jonassen,AUS,808
1,Megan Schutt,AUS,762
2,Marizanne Kapp,SA,747
3,Shabnim Ismail,SA,717
4,Jhulan Goswami,IND,694
5,Sophie Ecclestone,ENG,668
6,Katherine Brunt,ENG,646
7,Ayabonga Khaka,SA,638
8,Poonam Yadav,IND,617
9,Ellyse Perry,AUS,616


## 7. Write a python program to scrape details of all the mobile phones under Rs. 20,000 listed on Amazon.in. The scraped data should include Product Name, Price, Image URL and Average Rating.

In [52]:
HEADERS=({'user-agent': 'your-own-user-agent/0.0.1'})
page=requests.get("https://www.amazon.in/s?k=smartphones+under+20000+rupees+only&crid=3QV1RABX4EPDF&sprefix=smartphones+under+20000+rupees%2Caps%2C369&ref=nb_sb_ss_ts-doa-p_3_30",headers=HEADERS)
print(page) # to check the response output

soup=BeautifulSoup(page.content)

# extract mobile names
mob_name=soup.find_all("span", class_="a-size-medium a-color-base a-text-normal")
product_names=[]
for i in mob_name[:10]:
    product_names.append(i.text)

# extract mobile prices
mob_price=soup.find_all("span", class_="a-price-whole")
product_price=[]
for i in mob_price[:10]:
    product_price.append(i.text)

# extract image urls
images=soup.find_all("div", class_="a-section aok-relative s-image-fixed-height")
images_url=[]
for i in images[:10]:
    for j in i.find_all("img", class_="s-image"):
        images_url.append(j.get("src"))

# extract product ratings
avg_rating=soup.find_all("i", class_="a-icon a-icon-star-small a-star-small-4 aok-align-bottom")
ratings=[]
for i in avg_rating[:10]:
    ratings.append(i.text)

# checking the length of the columns
print(f"Length of product name, price, image url and ratings", len(product_names), 
      len(product_price), len(images_url), len(ratings))

# creating the dataframe
df7=pd.DataFrame({})
df7["Product Names"]=product_names
df7["Product Prices"]=product_price
df7["Image Urls"]=images_url
df7["Average Rating"]=ratings
df7

<Response [200]>
Length of product name, price, image url and ratings 10 10 10 10


Unnamed: 0,Product Names,Product Prices,Image Urls,Average Rating
0,"iQOO Z3 5G (Cyber Blue, 8GB RAM, 128GB Storage...",20990,https://m.media-amazon.com/images/I/615CXlFtDD...,4.1 out of 5 stars
1,"Redmi Note 10S (Cosmic Purple, 6GB RAM, 64 GB ...",14999,https://m.media-amazon.com/images/I/81sZamLSPW...,4.2 out of 5 stars
2,"OPPO A74 5G (Fantastic Purple,6GB RAM,128GB St...",17990,https://m.media-amazon.com/images/I/71geVdy6-O...,4.2 out of 5 stars
3,"Redmi 9 (Sky Blue, 4GB RAM, 64GB Storage) | 2....",10499,https://m.media-amazon.com/images/I/71A9Vo1Bat...,4.1 out of 5 stars
4,"Samsung Galaxy M02s (Blue,4GB RAM, 64GB Storag...",6999,https://m.media-amazon.com/images/I/71wkpcIfqd...,4.2 out of 5 stars
5,"Redmi 9A (Nature Green, 2GB RAM, 32GB Storage)...",7299,https://m.media-amazon.com/images/I/71sxlhYhKW...,4.1 out of 5 stars
6,"realme C11 (2021) (Cool Grey, 2GB RAM, 32GB St...",13499,https://m.media-amazon.com/images/I/618UBhFmaQ...,4.2 out of 5 stars
7,"Redmi 9 Power (Blazing Blue, 6GB RAM, 128GB St...",7299,https://m.media-amazon.com/images/I/71hEzQGO5q...,4.1 out of 5 stars
8,"realme C11 (2021) (Cool Blue, 2GB RAM, 32GB St...",17990,https://m.media-amazon.com/images/I/71FYSKYFup...,4.2 out of 5 stars
9,"OPPO A74 5G (Fluid Black,6GB RAM,128GB Storage...",12490,https://m.media-amazon.com/images/I/71poFSdDs5...,4.2 out of 5 stars


## 8. Write a python program to extract information about the local weather from the National Weather Service website of USA, https://www.weather.gov/ for the city, San Francisco. You need to extract data about 7 day extended forecast display for the city. The data should include period, short description, temperature and description.

In [53]:
page=requests.get("https://forecast.weather.gov/MapClick.php?lat=37.777120000000025&lon=-122.41963999999996#.YSkWx44zbIV")
print(page) # to get the response output
soup=BeautifulSoup(page.content)

# get the period names
period=soup.find_all("p", class_="period-name")
period_names=[]
for i in period:
    period_names.append(i.text)

# get short description
short_desc=soup.find_all("p", class_="short-desc")
short_detail=[]
for i in short_desc:
    short_detail.append(i.text)

# get the temperature details
temp1=soup.find_all("p", class_="temp temp-low")
temp2=soup.find_all("p", class_="temp temp-high")
temperature=[]
for i in temp1:
    temperature.append(i.text)
for i in temp2:
    temperature.append(i.text)

# get the full description
long_desc=soup.find_all("div", class_="col-sm-10 forecast-text")
description=[]
for i in long_desc[:9]:
    description.append(i.text)

# checking the length of the columns
print(f"Length of period, short description, temperature and long description are:", len(period_names), 
      len(short_detail), len(temperature), len(description))

# creating the dataframe
df8=pd.DataFrame({})
df8['Period Names']=period_names
df8['Short Detail']=short_detail
df8['Temperature']=temperature
df8['Description']=description
df8

<Response [200]>
Length of period, short description, temperature and long description are: 9 9 9 9


Unnamed: 0,Period Names,Short Detail,Temperature,Description
0,Today,Patchy Fogthen Sunnyand Breezy,Low: 56 °F,"Patchy fog before 10am. Otherwise, partly sun..."
1,Tonight,IncreasingClouds,Low: 56 °F,"Increasing clouds, with a low around 56. West ..."
2,Monday,Mostly Sunnythen Sunnyand Breezy,Low: 54 °F,"Mostly cloudy, then gradually becoming sunny, ..."
3,MondayNight,Mostly Clear,Low: 54 °F,"Mostly clear, with a low around 56. West south..."
4,Tuesday,Sunny thenSunny andBreezy,High: 74 °F,"Sunny, with a high near 67. Breezy, with a wes..."
5,TuesdayNight,Mostly Clearand Breezythen PartlyCloudy,High: 68 °F,"Mostly clear, with a low around 54. Breezy."
6,Wednesday,Mostly Sunny,High: 67 °F,"Mostly sunny, with a high near 66."
7,WednesdayNight,Partly Cloudy,High: 66 °F,"Partly cloudy, with a low around 54."
8,Thursday,Mostly Sunny,High: 65 °F,"Mostly sunny, with a high near 65."


## 9. Write a python program to scrape fresher job listings from ‘https://internshala.com/’. It should include job title, company name, CTC, and apply date.

In [54]:
page=requests.get("https://internshala.com/fresher-jobs")
print(page) # to get the response output
soup=BeautifulSoup(page.content)

# get the job titles
job_title=soup.find_all("div", class_="heading_4_5 profile")
title=[]
for i in job_title:
    title.append(i.text.replace("\n","").strip())

# get the company names
company_names=soup.find_all("div", class_="heading_6 company_name")
company=[]
for i in company_names:
    company.append(i.text.replace("\n","").strip())

# get the CTC details
CTC_list=soup.find_all("div", class_="item_body")
CTC=[]
for i in CTC_list:
    for j in i.find_all("i"):
        CTC.append(i.text.replace("\n","").strip())

# get the apply dates for jobs
applying=soup.find_all("div", class_="item_body")
apply_date=[]
for i in applying:
    apply_date.append(i.text)
dates=[]
for i in range(2, len(apply_date), 3):
    dates.append(apply_date[i])

# checking the length of the columns
print(f"Lengths of Job title, Company name, CTC and Apply date are:", len(title), len(company), len(CTC), len(dates))

# creating the dataframe
df9=pd.DataFrame({})
df9['Job Title']=title
df9['Company Name']=company
df9['CTC']=CTC
df9['Apply Date']=dates
df9

<Response [200]>
Lengths of Job title, Company name, CTC and Apply date are: 40 40 40 40


Unnamed: 0,Job Title,Company Name,CTC,Apply Date
0,OMNI SPORT LEADER,Decathlon Sport India Private Limited,3 - 4 LPA,18 Sep' 21
1,Executive/Senior Executive - Partnerships,Freecharge Payments Technology Private Limited,3 - 4.2 LPA,11 Sep' 21
2,Executive - Sales,Freecharge Payments Technology Private Limited,3 - 3.5 LPA,11 Sep' 21
3,Junior Operations Executive,Freecharge Payments Technology Private Limited,3 - 4 LPA,4 Sep' 21
4,Inside Sales Executive,Neo91 (DSBL Private Limited),3.5 LPA,27 Sep' 21
5,Sales Client Onboarding Executive,Intesome,3 - 4 LPA,27 Sep' 21
6,B2B Digital Sales Executive,Intesome,3 - 4 LPA,27 Sep' 21
7,Business Development Executive,AAPC India Private Limited,3 - 4.5 LPA,27 Sep' 21
8,Customer Service Executive,InfyBytes AI Labs Private Limited,3 - 4 LPA,27 Sep' 21
9,Business Development Associate,XOOG EDLEARN PRIVATE LIMITED,4.5 - 7 LPA,27 Sep' 21


## 10. Write a python program to scrape house details from https://www.nobroker.in/ for any location. It should include house title, location, area, emi and price

In [55]:
page=requests.get("https://www.nobroker.in/property/sale/delhi/Karol%20Bagh?searchParam=W3sibGF0IjoyOC42NTUwNDU4LCJsb24iOjc3LjE4ODgyMDEsInBsYWNlSWQiOiJDaElKMHk1QVg1d0NEVGtSUnRtZFRDbDBJWlEiLCJwbGFjZU5hbWUiOiJLYXJvbCBCYWdoIn1d&radius=2.0")
print(page) # to check the response output
soup=BeautifulSoup(page.content)

# extract house titles
house_title=soup.find_all("h2", class_="heading-6 font-semi-bold nb__1AShY")
htitle=[]
for i in house_title:
    for j in i.find_all("span"):
        htitle.append(i.text)

# extract house locations
loc=soup.find_all("div", class_="nb__2CMjv")
location=[]
for i in loc:
    location.append(i.text)

# extract the area details of the house
builtup=soup.find_all("div", class_="nb__3oNyC")
area=[]
for i in builtup:
    area.append(i.text)

# extract the EMI details
estimated_emi=soup.find_all("div", class_="font-semi-bold heading-6")
EMI=[]
for i in estimated_emi:
    EMI.append(i.text)
emi_detail=[]
for i in range(1, len(EMI), 3):
    emi_detail.append(EMI[i])
emi_detail

# extract the price information
price=[]
for i in range(2, len(EMI), 3):
    price.append(EMI[i])

# checking the length of the columns
print(f"Lengths of house title, location, area, emi and price:", len(htitle), len(location), len(area), 
      len(emi_detail), len(price))

# creating the dataframe
df10=pd.DataFrame({})
df10["House Title"]=htitle
df10["Location"]=location
df10["Area"]=area
df10["EMI"]=emi_detail
df10["Price"]=price
df10

<Response [200]>
Lengths of house title, location, area, emi and price: 10 10 10 10 10


Unnamed: 0,House Title,Location,Area,EMI,Price
0,2 BHK Flat For Sale In Dori Walan Karol Bagh...,11005,600 sqft,"₹40,120/Month",₹70 Lacs
1,2 BHK Flat For Sale In Floor In Karol Bagh,karol bagh,600 sqft,"₹34,388/Month",₹60 Lacs
2,3 BHK Flat For Sale In Karol Bagh,"Standalone Building, Rohtak Rd, Near Delhi Jal...","1,008 sqft","₹54,448/Month",₹95 Lacs
3,3 BHK Flat For Sale In Karol Bagh,"Standalone Building, New Rohtak Road","1,350 sqft","₹80,240/Month",₹1.4 Crores
4,1 RK Flat For Sale In Karol Bagh,"Standalone Building, Tank Rd Block 1 B, Mata R...",270 sqft,"₹11,462/Month",₹20 Lacs
5,2 BHK In Independent House For Sale In Karo...,"Independent House, SD12, Deshbandhu Gupta Rd, ...",675 sqft,₹5.73 Lacs/Month,₹10 Crores
6,2 BHK Flat For Sale In Karol Bagh,"Standalone Building, Guru Ravi Das Marg, near ...",600 sqft,"₹45,851/Month",₹80 Lacs
7,1 RK Flat For Sale In Karol Bagh,"Standalone Building, deshbandhu gupta rd Beado...",234 sqft,"₹17,194/Month",₹30 Lacs
8,3 BHK Flat For Sale In Apartment In Karol Bagh,"Ramjas Road, near Peshawari Chicken Corner,","1,350 sqft",₹1.2 Lacs/Month,₹2.1 Crores
9,1 BHK Flat For Sale In Karol Bagh,"Standalone building, Desh Bandhu Gupta Rd, Dev...",540 sqft,"₹30,949/Month",₹54 Lacs
