## Web scrapping using beautifulsoup

In [1]:
import requests
from bs4 import BeautifulSoup
import lxml
import html5lib

### 1. Write a python program to display all the header tags from ‘en.wikipedia.org/wiki/Main_Page’.

In [3]:
url = "https://en.wikipedia.org/wiki/Main_Page"
web_page = requests.get(url) # making request to get the page downloaded

In [4]:
result = BeautifulSoup(web_page.text, "html5lib") #passing the downloaded page to beautifulSoup and parser 

In [5]:
headers= result.select(".mw-headline") # extracting the headers, .is used to represent the class 

In [6]:
header_name = [] # creating an empty list
for i in headers: # running for loop to get each header from the page 
    header_name.append(i.text.replace("\xa0...","")) # adding the header in the created header_name list, replaced \xa0 with no space

In [7]:
header_name

["From today's featured article",
 'Did you know',
 'In the news',
 'On this day',
 "Today's featured picture",
 'Other areas of Wikipedia',
 "Wikipedia's sister projects",
 'Wikipedia languages']

### 2. Write a python program to display IMDB’s Top rated 100 movies’ data (i.e. Name, IMDB rating, Year of release) and save it in form of a CSV file.

In [8]:
"""       creating 3 lists according to the ask and saving respective data into it.

      running for loop from 1 to 100  and step size is 50 as we have 50 movies on one page and adding that thing with 
      link. rest we are following the same procedure """

movie_name = []
Year = []
Ranking = []
for page in range(1,101,50):
    url = "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating" +str(page)
    IMDB_data = requests.get(url)
    IMDB_data_result = BeautifulSoup(IMDB_data.text, "lxml")
    top_movies = IMDB_data_result.select(".lister-item-content")
    
    for i in top_movies:
        name  = i.h3.a.text
        movie_name.append(name)
        
#         saving year in year col
        
        year = i.h3.find("span", class_ ="lister-item-year").text
        Year.append(year)
        
#         extracting rating

        rank = float(i.strong.text)
        Ranking.append(rank)

In [9]:
import pandas as pd  # importing pandas
movie_name = pd.DataFrame(movie_name,columns= ["Movie_title"]) # saving extracted information into pandas dataframe
Year = pd.DataFrame(Year,columns= ["release_year"])
Ranking = pd.DataFrame(Ranking,columns= ["Rating"])

top_100_movies = pd.concat([movie_name,Year,Ranking],axis =1) # joining all data
top_100_movies.head()

Unnamed: 0,Movie_title,release_year,Rating
0,Avengers: Endgame,(2019),8.4
1,The Godfather,(1972),9.2
2,Gisaengchung,(2019),8.6
3,The Shawshank Redemption,(1994),9.3
4,Avengers: Infinity War,(2018),8.4


In [10]:
"""we have parenthesis in year column, so taking only number from it, we have two method to correct it 1 using re library and 
                            2, str.extract and applying regex """ 

top_100_movies["release_year"] = top_100_movies["release_year"].str.extract('(\d+)') 

In [11]:
top_100_movies.to_csv("IMDB_Top_rated_100_movies.csv") # saving the file

### 3. Write a python program to display IMDB’s Top rated 100 Indian movies’ data (i.e. Name, IMDB rating, Year of release) and save it in form of a CSV file.

In [12]:
# extracting top 100 indian movie


url = "https://www.imdb.com/list/ls009997493/"# page contain all 100 movies in one page

load_page = requests.get(url)

movies_data = BeautifulSoup(load_page.text,"lxml")

indian_movies = movies_data.select(".lister-item-content")

Title = []
Year = []
Rating = []

for i in indian_movies:
    name = i.h3.a.text # extracting movies name
    Title.append(name)
    
    # extracting years
    
    year = i.h3.find("span", class_="lister-item-year").text
    Year.append(year)
    
    
    # Extracting Rating
    
    rank = i.find("span", class_ = "ipl-rating-star__rating").text
    Rating.append(rank)

In [13]:
ind_movie = pd.DataFrame({"Movie":Title,"Released_year":Year,"Ranking":Rating})
ind_movie.head()

Unnamed: 0,Movie,Released_year,Ranking
0,Rang De Basanti,(2006),8.1
1,3 Idiots,(2009),8.4
2,Taare Zameen Par,(2007),8.4
3,Dil Chahta Hai,(2001),8.1
4,"Swades: We, the People",(2004),8.2


In [14]:
ind_movie["Released_year"] = ind_movie["Released_year"].str.extract('(\d+)') # clening the year columns
ind_movie.head()

Unnamed: 0,Movie,Released_year,Ranking
0,Rang De Basanti,2006,8.1
1,3 Idiots,2009,8.4
2,Taare Zameen Par,2007,8.4
3,Dil Chahta Hai,2001,8.1
4,"Swades: We, the People",2004,8.2


In [15]:
ind_movie.to_csv("top_100_indian_movies.csv")

### 4. Write a python program to scrap book name, author name, genre and book review of any 5 books from ‘www.bookpage.com

In [16]:
# saving the URL in url variable 
url = "https://bookpage.com/reviews"

load_page = requests.get(url) # making request to load the page 

result = BeautifulSoup(load_page.text,"lxml") # passing the loaded page to beautiful and parser engien  




In [17]:
book1 = result.select(".bp-block.article-info") # selecting all the book from a page, it has 10 book

## Running a for loop for get book name one by one from the mentioned class and saving them into book.  
book = []  
for i in book1:
    book_name = i.h4.a.text
    book.append(book_name)
    
## as i mentioned that we have 10 different books and our requirement is only 5 so taking only first 5 books by slicing   
book = book[:5] 

In [18]:
book

[' ★ Facing the Mountain',
 'Lucky Girl',
 ' ★ On Juneteenth',
 'Pawcasso',
 'The Most Beautiful Girl in Cuba']

In [19]:
# selecting all the genre from a page

genre = result.select(".genre-links.hidden-phone")

## Running a for loop for get book genre one by one from the mentioned class and saving them into book_renre.

book_genre = []
for i in genre:
    book_genre.append(i.text.replace("\n",""))
    

## as i mentioned that we have 10 different books and our requirement is only 5 so taking only first 5 by slicing   
    
book_genre = book_genre[:5] 

In [20]:
# selecting all the authors from a page

author = result.select(".sans.bold")

## Running a for loop for get book author name one by one from the mentioned class and saving them into book_author_name.

book_author_name =[]

for i in author:
    book_author_name.append(i.text.replace("\n",""))
    

## as i mentioned that we have 10 different books and our requirement is only 5 so taking only first 5  by slicing   
 
book_author_name = book_author_name[:5]

In [21]:
# selecting all the review from a page
reviews=  result.select(".excerpt")

## Running a for loop for get book review one by one from the mentioned class and saving them into Reviews.

Reviews = []
for i in reviews:
    Review =i.text.strip()
    Reviews.append(Review)
    
## as i mentioned that we have 10 different books and our requirement is only 5 so taking only first 5  by slicing   

book_review = Reviews[:5]
book_review

['Most of the Japanese American patriots who formed the 442nd Infantry Regiment are gone, but their stories live on in this empathetic tribute to their courage.',
 '',
 'Gordon-Reed’s essays seamlessly merge history, memoir and family history into a complex portrait of her beloved, turbulent Texas.',
 '',
 'Chanel Cleeton delivers a sweeping story of love and courage, as well as a sobering reminder of the power and responsibility of the media.']

In [24]:
# creating dataframe

book_dict = {"book":book,"book_genre":book_genre,"book_author_name":book_author_name,"book_review":book_review}
book_df = pd.DataFrame(book_dict)
book_df.head()

Unnamed: 0,book,book_genre,book_author_name,book_review
0,★ Facing the Mountain,Nonfiction / History / American History,Daniel James Brown,Most of the Japanese American patriots who for...
1,Lucky Girl,YA Fiction / YA,Jamie Pacton,
2,★ On Juneteenth,Nonfiction / History / American History,Annette Gordon-Reed,"Gordon-Reed’s essays seamlessly merge history,..."
3,Pawcasso,Children's / Middle Grade,Remy Lai,
4,The Most Beautiful Girl in Cuba,Fiction / Historical Fiction,Chanel Cleeton,Chanel Cleeton delivers a sweeping story of lo...


## 5 Write a python program to scrape cricket rankings from ‘www.icc-cricket.com’. You have to scrape:
i) Top 10 ODI teams in men’s cricket along with the records for matches, points and 
rating. <br>
ii) Top 10 ODI Batsmen in men along with the records of their team and rating.<br>
iii)Top 10 ODI bowlers along with the records of their team and rating.

#### i) Top 10 ODI teams in men’s cricket along with the records for matches, points and rating.

In [25]:
 # loading page 

page_load_ODI = requests.get("https://www.icc-cricket.com/rankings/mens/team-rankings/odi")

# passing the loaded page to beautiful and parser engien 
soup_ODI = BeautifulSoup(page_load_ODI.text,"html5lib")

In [26]:
""" Extracting team name from the this, names are avilable under u-hide-phablet class. DOT(.) is represent the class when
we use select """

team_name = []
for i in soup_ODI.select(".u-hide-phablet"):
    team_name.append(i.text)
    
# while extracting the name, i have found couple of blank rows also, so sliced on filled one.
    
team_name = team_name[:20]

In [27]:
"""very important
   first rows of number one team's matches, points and rating tags are differrent and for other rest of the team tag are same
    thus i had to extract these information seprately. i will add them into final list later on  """ 

# Number of matches
First_matches = soup_ODI.find("td",class_="rankings-block__banner--matches").text

# Points

First_team_point = soup_ODI.find("td",class_="rankings-block__banner--points").text

# Rating

First_rating =soup_ODI.find("td",class_= "rankings-block__banner--rating u-text-right").text.replace("\n","").strip()

In [28]:
""" Extracting point and rating both, both have same tag thus, later on i will seprate them by using slicing
""" 

match = soup_ODI.select(".table-body__cell.u-center-text")
Record_for_matches = []
for i in match:
    Record_for_matches.append(i.text)
    
""" saving matches into Matches variable, out Record_for_matches list is like 0 index is match count and 1 index is for point.

            so i have sliced accoridnly """
Matches = Record_for_matches[::2]

# inserting the first team match count at 0 index because first team is at 0 index so that it matches

Matches.insert(0,First_matches)
Matches

['17',
 '25',
 '29',
 '27',
 '20',
 '24',
 '24',
 '27',
 '21',
 '17',
 '2',
 '18',
 '15',
 '7',
 '5',
 '5',
 '9',
 '6',
 '8',
 '5']

In [29]:
# matches points starts from 1st index in record_for_matches thus i have sliced accordingly

points = Record_for_matches[1::2]
# inserting the first team points at 0 index because first team is at 0 index so that it matches

points.insert(0,First_team_point)

In [30]:
# Extracting the rating from it.

rank = soup_ODI.select(".table-body__cell.u-text-right.rating")
Rating = []
for i in rank:
    Rating.append(i.text)
    
# inserting the first team rating at 0 index because first team is at 0 index so that it matches

Rating.insert(0,First_rating)


In [32]:
# creating dataframe

Match_dict = {"team_name":team_name,"Matches":Matches,"points":points,"Rating":Rating}
Match_df = pd.DataFrame(Match_dict)
Match_df.head()

Unnamed: 0,team_name,Matches,points,Rating
0,New Zealand,17,2054,121
1,Australia,25,2945,118
2,India,29,3344,115
3,England,27,3100,115
4,South Africa,20,2137,107


### ii) Top 10 ODI Batsmen in men along with the records of their team and rating

In [34]:
# requesting to upload page

url = "https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting"
ODI_bats_page = requests.get(url) 

# pasing loaded page through beautifulsoup and html parser engien

ODI_bats_soup = BeautifulSoup(ODI_bats_page.text,"lxml")


In [35]:
# batsman who is at no.1 position, his tag is different than others so that is why i am extracting it single
first_batsman_name = ODI_bats_soup.find("div",class_ ="rankings-block__banner--name-large").text

In [36]:
first_batsman_name

'Babar Azam'

In [37]:
# extracting first batsman team

first_batsman_team = ODI_bats_soup.find("div", class_="rankings-block__banner--nationality").text.replace("\n","")

In [38]:
first_batsman_team

'PAK                    '

In [39]:
# extracting first batsman Ranking

first_bats_Ranking = ODI_bats_soup.find("div", class_="rankings-block__banner--rating").text.replace("\n","")

In [40]:
first_bats_Ranking

'865'

In [41]:
# extracting other player name 

player = ODI_bats_soup.select(".table-body__cell.rankings-table__name.name")

player_name = []
for i in player:
    name = i.text.replace("\n","")
    player_name.append(name)

In [42]:
top_10_player_name = player_name[:9] # it gives entire list of player and our requirement in top 10
top_10_player_name.insert(0,first_batsman_name) # adding first player name in the list at 0 index position 
top_10_player_name

['Babar Azam',
 'Virat Kohli',
 'Rohit Sharma',
 'Ross Taylor',
 'Aaron Finch',
 'Jonny Bairstow',
 'Fakhar Zaman',
 'Francois du Plessis',
 'David Warner',
 'Shai Hope']

In [43]:
# extracting the team name from the this mentioned class
teams = ODI_bats_soup.select(".table-body__logo-text")

teams_name = []
for i in teams:
    team = i.text
    teams_name.append(team)
player_team_name = teams_name[:9] # it gives entire list of player and our requirement in top 10
player_team_name.insert(0,first_batsman_team)
player_team_name

['PAK                    ',
 'IND',
 'IND',
 'NZ',
 'AUS',
 'ENG',
 'PAK',
 'SA',
 'AUS',
 'WI']

In [44]:
# extracting the ranking of the player from the this mentioned class
Rating = ODI_bats_soup.select(".table-body__cell.rating")

#using list comprehension for extracting ranking one by one
teams_rating = [i.text for i in Rating] 

 # it gives entire list of player and our requirement in top 10
player_team_rating = teams_rating[:9]

# inserting first batsman rating in the list

player_team_rating.insert(0,first_bats_Ranking)

# priting the same 
player_team_rating

['865', '857', '825', '801', '791', '785', '778', '778', '773', '773']

In [45]:
# creating dataframe
batsman_dict = {"top_10_player_name":top_10_player_name,"player_team_name":player_team_name,"player_team_rating":player_team_rating}
batsman_df = pd.DataFrame(batsman_dict)
batsman_df.head()

Unnamed: 0,top_10_player_name,player_team_name,player_team_rating
0,Babar Azam,PAK,865
1,Virat Kohli,IND,857
2,Rohit Sharma,IND,825
3,Ross Taylor,NZ,801
4,Aaron Finch,AUS,791


### iii)Top 10 ODI bowlers along with the records of their team and rating.

In [47]:
# requesting to upload bowler page

url = "https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling"
ODI_bowl_page = requests.get(url) 

# pasing loaded page through beautifulsoup and html parser engien

ODI_bowl_soup = BeautifulSoup(ODI_bowl_page.text,"lxml")

In [48]:
# Bowler who is at no.1 position, his tag is different than others so that is why i am extracting it single
first_Bowler_name = ODI_bowl_soup.find("div",class_ ="rankings-block__banner--name-large").text
first_Bowler_name

'Trent Boult'

In [49]:
# extracting first bowler team

first_bowler_team = ODI_bowl_soup.find("div", class_="rankings-block__banner--nationality").text.replace("\n","")
first_bowler_team

'NZ                    '

In [50]:
# extracting first batsman Ranking

first_bowler_Ranking = ODI_bowl_soup.find("div", class_="rankings-block__banner--rating").text.replace("\n","")
first_bowler_Ranking

'737'

In [51]:
# extracting other player name 

bowler = ODI_bowl_soup.select(".table-body__cell.rankings-table__name.name")

bowler_name = []
for i in bowler:
    name = i.text.replace("\n","")
    bowler_name.append(name)
top_10_bowler_name = bowler_name[:9] # it gives entire list of player and our requirement in top 10
top_10_bowler_name.insert(0,first_Bowler_name) # adding first player name in the list at 0 index position 
top_10_bowler_name

['Trent Boult',
 'Mujeeb Ur Rahman',
 'Matt Henry',
 'Jasprit Bumrah',
 'Mehedi Hasan',
 'Kagiso Rabada',
 'Chris Woakes',
 'Josh Hazlewood',
 'Pat Cummins',
 'Mohammad Amir']

In [52]:
# extracting the team name from the this mentioned class
bowler_teams = ODI_bowl_soup.select(".table-body__logo-text")

bowler_teams_name = []
for i in bowler_teams:
    bowlteam = i.text
    bowler_teams_name.append(bowlteam)
bowl_team_name = bowler_teams_name[:9] # it gives entire list of player and our requirement in top 10
bowl_team_name.insert(0,first_bowler_team)
bowl_team_name

['NZ                    ',
 'AFG',
 'NZ',
 'IND',
 'BAN',
 'SA',
 'ENG',
 'AUS',
 'AUS',
 'PAK']

In [53]:
# extracting the ranking of the player from the this mentioned class
bowlRating = ODI_bowl_soup.select(".table-body__cell.rating")

#using list comprehension for extracting ranking one by one
bowl_rating = [i.text for i in bowlRating] 

 # it gives entire list of player and our requirement in top 10
bowl_team_rating = bowl_rating[:9]

# inserting first batsman rating in the list

bowl_team_rating.insert(0,first_bowler_Ranking)

# priting the same 
bowl_team_rating

['737', '708', '691', '690', '668', '666', '665', '660', '646', '638']

In [54]:
# creating dataframe
bowler_dict = {"top_10_bowler_name":top_10_bowler_name,"bowl_team_name":bowl_team_name,"bowl_team_rating":bowl_team_rating}
bowler_df = pd.DataFrame(bowler_dict)
bowler_df.head()

Unnamed: 0,top_10_bowler_name,bowl_team_name,bowl_team_rating
0,Trent Boult,NZ,737
1,Mujeeb Ur Rahman,AFG,708
2,Matt Henry,NZ,691
3,Jasprit Bumrah,IND,690
4,Mehedi Hasan,BAN,668


## 6.Write a python program to scrape cricket rankings from ‘www.icc-cricket.com’. You have to scrape:
i) Top 10 ODI teams in women’s cricket along with the records for matches, points 
and rating. <br>
ii) Top 10 women’s ODI players along with the records of their team and rating. <br>
iii)Top 10 women’s ODI all-rounder along with the records of their team and rating.<br>

#### i) Top 10 ODI teams in women’s cricket along with the records for matches, points and rating.

In [57]:
 # loading page 

Woman_ODI_page = requests.get("https://www.icc-cricket.com/rankings/womens/team-rankings/odi")

# passing the loaded page to beautiful and parser engien 
soup_ODI_woman = BeautifulSoup(Woman_ODI_page.text,"html5lib")    

## Teams

In [58]:
team = soup_ODI_woman.select(".u-hide-phablet") # selecting the class which contains the team name

woman_team = [] # creating empty list

# Running loop for extracting the team names
for i in team:   
    woman_team.append(i.text)

# extracing the top 10 team, i have used sliced top 10 team because there are a few empty string as well. 
    
woman_team= woman_team[:10]
woman_team

['Australia',
 'South Africa',
 'England',
 'India',
 'New Zealand',
 'West Indies',
 'Pakistan',
 'Bangladesh',
 'Sri Lanka',
 'Ireland']

## team matches

In [59]:
# first and rest of the macthes tags are different thus extracting it seprately 
first_team_matches = soup_ODI_woman.find("td", class_= "rankings-block__banner--matches").text

# extracting rest of the team's matches 

team_matches = soup_ODI_woman.select(".table-body__cell.u-center-text")

Woman_teams_matches = [] # creating list 

for i in team_matches:
    Woman_teams_matches.append(i.text)
    
""" saving matches into Matches variable, out Record_for_matches list is like 0 index is match count and 1 index is for point.

            so i have sliced accoridnly """

woman_top_10_team = Woman_teams_matches[::2]
woman_top_10_team.insert(0,first_team_matches)
woman_top_10_team

['18', '24', '17', '20', '21', '12', '15', '5', '11', '2']

### Women team points

In [60]:
# extracting first team points as it has different tag from other teams' point

first_team_points = soup_ODI_woman.find("td",class_ ="rankings-block__banner--points").text

# rest of the teams' points from woman_teams_matches

woman_team_points = Woman_teams_matches[1::2]
woman_team_points.insert(0,first_team_points)
woman_team_points

['2,955',
 '2,828',
 '1,993',
 '2,226',
 '1,947',
 '1,025',
 '1,101',
 '306',
 '519',
 '25']

### woman Team Ranting

In [61]:
# taking out first team rating 
first_team_rating = soup_ODI_woman.find("td",class_ ="rankings-block__banner--rating").text.replace("\n","").strip()

# extracting other teams' rating
Rating = soup_ODI_woman.select(".table-body__cell.u-text-right.rating")

woman_teams_rating = []
for i in Rating:
    woman_teams_rating.append(i.text)
    
woman_teams_rating.insert(0,first_team_rating)
woman_teams_rating

['164', '118', '117', '111', '93', '85', '73', '61', '47', '13']

In [62]:
# creating dataframe
woman_team_dict = {"woman_team":woman_team,"woman_top_10_team":woman_top_10_team,"woman_team_points":woman_team_points,"woman_teams_rating":woman_teams_rating}
woman_team_df = pd.DataFrame(woman_team_dict)
woman_team_df.head()

Unnamed: 0,woman_team,woman_top_10_team,woman_team_points,woman_teams_rating
0,Australia,18,2955,164
1,South Africa,24,2828,118
2,England,17,1993,117
3,India,20,2226,111
4,New Zealand,21,1947,93


### ii) Top 10 women’s ODI players along with the records of their team and rating.

In [63]:
# Extracting the batsman info, saving url in a variable and making request to load the page
url = "https://www.icc-cricket.com/rankings/womens/player-rankings/odi/batting"

woman_bats_page = requests.get(url)

# passing the woman_bats_page to beautiful soup and html parser

woman_batsman_info = BeautifulSoup(woman_bats_page.text, "lxml")

### Player name

In [64]:
# extracting first player name
first_batsman = woman_batsman_info.find("div", class_= "rankings-block__banner--name-large").text

# extracting rest all the

batsman_name  = woman_batsman_info.select(".table-body__cell.rankings-table__name.name")

woman_batsman_name = []
for i in batsman_name:
    woman_batsman_name.append(i.text.replace("\n",""))
    
woman_batsman_name = woman_batsman_name[:9]
woman_batsman_name.insert(0,first_batsman)
woman_batsman_name

['Tammy Beaumont',
 'Lizelle Lee',
 'Alyssa Healy',
 'Stafanie Taylor',
 'Meg Lanning',
 'Amy Satterthwaite',
 'Smriti Mandhana',
 'Mithali Raj',
 'Natalie Sciver',
 'Laura Wolvaardt']


### Woman team name

In [65]:
# Extracting first player

first_batsman_team = woman_batsman_info.find("div", class_="rankings-block__banner--nationality").text.replace("\n","").strip()

# extracting info of other teams.

teams = woman_batsman_info.select(".table-body__logo-text")

batsman_team  = []

for i in teams:
    batsman_team.append(i.text)

batsman_team = batsman_team[:9]  
batsman_team.insert(0,first_batsman_team)
batsman_team

['ENG', 'SA', 'AUS', 'WI', 'AUS', 'NZ', 'IND', 'IND', 'ENG', 'SA']

### woman team Rating

In [66]:
# Extracting first player rating
first_batsman_rating = woman_batsman_info.find("div",class_= "rankings-block__banner--rating").text 

# extracting other player rating 
rating = woman_batsman_info.select(".table-body__cell.rating")

player_rating = []
for i in rating:
    player_rating.append(i.text)

player_rating = player_rating[:9]
player_rating.insert(0,first_batsman_rating)
player_rating

['765', '758', '756', '746', '723', '715', '710', '709', '685', '683']

In [67]:
# creating dataframe
woman_batsman_dict = {"woman_batsman_name":woman_batsman_name,"batsman_team":batsman_team,"player_rating":player_rating}
woman_batsman_df = pd.DataFrame(woman_batsman_dict)
woman_batsman_df.head()

Unnamed: 0,woman_batsman_name,batsman_team,player_rating
0,Tammy Beaumont,ENG,765
1,Lizelle Lee,SA,758
2,Alyssa Healy,AUS,756
3,Stafanie Taylor,WI,746
4,Meg Lanning,AUS,723


### iii)Top 10 women’s ODI all-rounder along with the records of their team and rating.

In [68]:
# Extracting the batsman info, saving url in a variable and making request to load the page
url = "https://www.icc-cricket.com/rankings/womens/player-rankings/odi/all-rounder"

woman_all_page = requests.get(url)

# passing the woman_bats_page to beautiful soup and html parser

woman_allrounder_info = BeautifulSoup(woman_all_page.text, "lxml")

#### Player Name

In [69]:
# extracting first player name
first_allrounder = woman_allrounder_info.find("div", class_= "rankings-block__banner--name-large").text

# extracting rest all the

Allrounder_name  = woman_allrounder_info.select(".table-body__cell.rankings-table__name.name")

woman_allrounder_name = []

for i in Allrounder_name:
    woman_allrounder_name.append(i.text.replace("\n",""))
    
woman_allrounder_name = woman_allrounder_name[:9]
woman_allrounder_name.insert(0,first_allrounder)
woman_allrounder_name

['Marizanne Kapp',
 'Ellyse Perry',
 'Stafanie Taylor',
 'Natalie Sciver',
 'Deepti Sharma',
 'Jess Jonassen',
 'Ashleigh Gardner',
 'Dane van Niekerk',
 'Sophie Devine',
 'Amelia Kerr']

### Team name

In [70]:
# Extracting first player

first_allrounder_team = woman_allrounder_info.find("div", class_="rankings-block__banner--nationality").text.replace("\n","").strip()

# extracting info of other teams.

allrounder_team = woman_allrounder_info.select(".table-body__logo-text")

allrounder_team_name  = []

for i in allrounder_team:
    allrounder_team_name.append(i.text)

allrounder_team_name = allrounder_team_name[:9]  
allrounder_team_name.insert(0,first_allrounder_team)
allrounder_team_name

['SA', 'AUS', 'WI', 'ENG', 'IND', 'AUS', 'AUS', 'SA', 'NZ', 'NZ']

### Player Rating

In [71]:
# Extracting first player rating
first_allrounder_rating = woman_allrounder_info.find("div",class_= "rankings-block__banner--rating").text 

# extracting other player rating 
allrounder_rating = woman_allrounder_info.select(".table-body__cell.rating")

allrounder_player_rating = []
for i in allrounder_rating:
    allrounder_player_rating.append(i.text)

allrounder_player_rating = allrounder_player_rating[:9]
allrounder_player_rating.insert(0,first_allrounder_rating)
allrounder_player_rating

['418', '418', '410', '349', '343', '307', '252', '243', '242', '236']

In [72]:
# creating dataframe
allrounder_dict = {"woman_allrounder_name":woman_allrounder_name,"allrounder_team_name":allrounder_team_name,"allrounder_player_rating":allrounder_player_rating}
allrounder_df = pd.DataFrame(allrounder_dict)
allrounder_df.head()

Unnamed: 0,woman_allrounder_name,allrounder_team_name,allrounder_player_rating
0,Marizanne Kapp,SA,418
1,Ellyse Perry,AUS,418
2,Stafanie Taylor,WI,410
3,Natalie Sciver,ENG,349
4,Deepti Sharma,IND,343


### 7. Write a python program to extract information about the local weather from the National Weather Service website of USA, https://www.weather.gov/ for the city, San Francisco. You need to extract data about 7 day extended forecast display for the city. The data should include period, short description, temperature and description.


In [73]:
# extracting forecast information for San francisco city
url = "https://forecast.weather.gov/MapClick.php?lat=37.777120000000025&lon=-122.41963999999996#.YJbYuofivIV"

# making request and loading page
forecast_page = requests.get(url)

# beautifulsoup instance
forecast_info = BeautifulSoup(forecast_page.text, "lxml")


# Class selection where period are written in HTML
forecast_day = forecast_info.select(".col-sm-2.forecast-label")

# running loop for extracting period 
Days = []
for i in forecast_day:
    Days.append(i.text)
Days

# extracting detailed forecast
extended_forecast = forecast_info.select(".col-sm-10.forecast-text")

# running loop for extracting forecast details
forecast = []
for i in extended_forecast:
    forecast.append(i.text.split()) # with split it become nesting list,because i need short description seperately. 

# saving short_description and temperature_description in below list
short_description =[]
temperature_description = []
for i in forecast:
    short_description.append(i[0]) # extracting short description 
    temperature_description.append(i[1::]) # extracting short temperature and description seprately 

# converting nexting list into single list by joining all element in one sentence 
temperature_description_updated = []
for i in temperature_description:
    temperature_description_updated.append(" ".join(i))

temperature_description_updated

d = {"Days":Days,"temperature_description_updated":temperature_description_updated,"short_description":short_description}
import pandas as pd
df = pd.DataFrame(d)

df.head()

Unnamed: 0,Days,temperature_description_updated,short_description
0,Today,"with a high near 71. West wind 7 to 15 mph, wi...","Sunny,"
1,Tonight,"with a low around 52. West wind 6 to 14 mph, w...","Clear,"
2,Monday,with a high near 77. Light west northwest wind...,"Sunny,"
3,Monday Night,"clear, with a low around 52. West wind 5 to 14...",Mostly
4,Tuesday,with a high near 75. Light west northwest wind...,"Sunny,"
