<a href="https://colab.research.google.com/github/susmoy15/Web-Scraping/blob/main/beautifulsoup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [71]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [72]:
# Small in-memory HTML document used to try out the BeautifulSoup API
# without making a network request.
html = """
<html>
  <head><title>Test Page</title></head>
  <body>
    <h1 class="title">Hello, World!</h1>
    <p id="first">This is a <a href="http://example.com">link</a>.</p>
    <p id="second">This is another paragraph.</p>
  </body>
</html>
"""

In [73]:
# Parse the sample document; 'html.parser' selects the parser backend by name.
soup = BeautifulSoup(html, 'html.parser')

In [74]:
# Text content of the <title> element
page_title = soup.title.string
print("Title:", page_title)

# Every <p> element, in document order
paragraphs = soup.find_all('p')
print("Paragraphs:", paragraphs)

# href attribute of the first <a> element
first_link = soup.find('a')
print("Link:", first_link['href'])

# Text of the whole document with the markup stripped
document_text = soup.get_text()
print("All text:", document_text)

# Look up a single <p> by its id attribute
second_paragraph = soup.find('p', id='second')
print("Second Paragraph:", second_paragraph.get_text())

# Same kind of lookup, written as a CSS selector
heading = soup.select_one('h1.title')
print("Using select:", heading.text)

Title: Test Page
Paragraphs: [<p id="first">This is a <a href="http://example.com">link</a>.</p>, <p id="second">This is another paragraph.</p>]
Link: http://example.com
All text: 

Test Page

Hello, World!
This is a link.
This is another paragraph.



Second Paragraph: This is another paragraph.
Using select: Hello, World!


# Mini Project: Scraping Premier League Top Scorers

In [75]:
# BBC Sport page that lists the Premier League top scorers.
url = "https://www.bbc.com/sport/football/premier-league/top-scorers"
# requests applies no timeout by default, so a stalled connection would block
# the kernel indefinitely; give up after 10 seconds instead.
response = requests.get(url, timeout=10)

In [76]:
# Fail here on an HTTP error status rather than going on to parse an error page.
response.raise_for_status()

In [77]:
# raise_for_status() returns nothing when the request succeeded, so this prints None.
print(response.raise_for_status())

None


In [78]:
# Numeric HTTP status of the response (200 in the recorded run).
response.status_code

200

In [79]:
# Peek at the first 200 characters of the decoded response body.
response.text[:200]

'<!DOCTYPE html><html lang="en-GB" class="no-js"><head><meta charSet="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /><title data-rh="true">Premier League Top Scorers - B'

In [80]:
# .content exposes the response body as raw bytes.
type(response.content)

bytes

In [81]:
# Parse the downloaded page. This rebinds `soup`, replacing the parse of the
# sample document created earlier in the notebook.
soup = BeautifulSoup(response.content, "html.parser")

In [82]:
# Print the indented parse tree to inspect the page structure (the output is long).
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en-GB">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title data-rh="true">
   Premier League Top Scorers - BBC Sport
  </title>
  <meta content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target." data-rh="true" name="description"/>
  <meta content="#FFFFFF" data-rh="true" name="theme-color"/>
  <meta content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target." data-rh="true" property="og:description"/>
  <meta content="https://static.files.bbci.co.uk/core/website/assets/static/sport/bbc-sport-logo.0da9386782.png" data-rh="true" property="og:image"/>
  <meta content="BBC Sport" data-rh="true" property="og:site_name"/>
  <meta content="Premier League Top Scorers - BBC Sport" data-rh="true" property="og:title"/>
  <meta content="article" data-rh="true" property="og:type"/>
  <meta content="https://www.b

In [83]:
# One accumulator list per scraped column; every list starts out empty.
player_names, team_names, goals = [], [], []
assists, num_matches, shots = [], [], []

In [86]:
# Download the top-scorers page, pull one record per table row, and build
# `df_players` from the collected columns.
try:
    # The timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
except requests.RequestException as e:
    # Network or HTTP failure: report it and skip parsing.
    print(e)
else:
    # The accumulator lists live at notebook level. Empty them before filling,
    # otherwise running this cell again appends every player a second time and
    # the DataFrame ends up with duplicated rows.
    for column_values in (player_names, team_names, goals, assists, num_matches, shots):
        column_values.clear()

    soup = BeautifulSoup(response.content, "html.parser")

    table_body = soup.find('tbody')
    if table_body is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ValueError("No <tbody> element found; the page layout may have changed.")

    # NOTE(review): the class names below look auto-generated and may change
    # when the site is redeployed -- re-check them if no rows are found.
    players = table_body.find_all('tr', class_='ssrcss-qqhdqi-TableRowBody e1icz100')
    for player in players:
        player_name = player.find('div', class_='ssrcss-m6ah29-PlayerName e1n8xy5b1').get_text(strip=True)
        team_name = player.find('div', class_='ssrcss-qvpga1-TeamsSummary e1n8xy5b0').get_text(strip=True)
        goal_score = int(player.find('div', class_='ssrcss-18ap757-CellWrapper ef9ipf0').get_text(strip=True))

        # The remaining numeric cells of a row share one class and are picked
        # by position. NOTE(review): positions 0, 2 and -3 are assumed to be
        # assists, matches played and shots -- confirm against the live table.
        stats = player.find_all('div', class_='ssrcss-1vo7v3r-CellWrapper ef9ipf0')
        assists_made = int(stats[0].get_text(strip=True))
        matches_played = int(stats[2].get_text(strip=True))
        shots_taken = int(stats[-3].get_text(strip=True))

        player_names.append(player_name)
        team_names.append(team_name)
        goals.append(goal_score)
        assists.append(assists_made)
        num_matches.append(matches_played)
        shots.append(shots_taken)

    # Column label -> list of values, one entry per scraped row.
    data = {
        'Player Name': player_names,
        'Team Name': team_names,
        'Goals': goals,
        'Assists': assists,
        'Matches Played': num_matches,
        'Shots Taken': shots
    }
    df_players = pd.DataFrame(data)



In [87]:
# Display the scraped table as the cell's output.
df_players

Unnamed: 0,Player Name,Team Name,Goals,Assists,Matches Played,Shots Taken
0,Mohamed Salah,Liverpool,29,18,38,130
1,A. Isak,Newcastle,23,6,34,99
2,E. Haaland,Man City,22,3,31,108
3,B. Mbeumo,Brentford,20,7,38,85
4,C. Wood,Nottm Forest,20,3,36,68
5,Y. Wissa,Brentford,19,4,35,90
6,O. Watkins,Aston Villa,16,8,38,84
7,C. Palmer,Chelsea,15,8,37,126
8,Matheus Cunha,Wolves,15,6,33,110
9,J. Strand Larsen,Wolves,14,4,35,54


In [89]:
# Row count, column dtypes and non-null counts -- a quick check that every
# scraped column was filled and the numeric ones were parsed as integers.
df_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Player Name     25 non-null     object
 1   Team Name       25 non-null     object
 2   Goals           25 non-null     int64 
 3   Assists         25 non-null     int64 
 4   Matches Played  25 non-null     int64 
 5   Shots Taken     25 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 1.3+ KB


In [90]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df_players.describe()

Unnamed: 0,Goals,Assists,Matches Played,Shots Taken
count,25.0,25.0,25.0,25.0
mean,14.48,5.08,34.72,80.52
std,5.017636,3.340659,3.021037,24.556262
min,10.0,1.0,27.0,47.0
25%,11.0,3.0,33.0,60.0
50%,13.0,4.0,35.0,73.0
75%,16.0,6.0,37.0,96.0
max,29.0,18.0,38.0,130.0


In [91]:
# Save the table to a CSV file; index=False leaves the row index out of the file.
df_players.to_csv('premier_league_scorers.csv', index=False)