# Web Scraping Notebook

## Scraping the transfermarkt.com Website
Scraping [transfermarkt.com](https://www.transfermarkt.com/) for penalty shootouts in major tournaments. This website was chosen because its database includes player profiles (age, team, market value, etc.) for each season.

**Major tournaments:**
- **European Championship** *aka Euro Cup*
- **South American Championship** *aka Copa America*
- **Champions League/European Cup**
- **Europa League/UEFA Cup**

In [1]:
import requests
import re
from bs4 import BeautifulSoup

In [2]:
def get_soup(url, timeout=30):
    '''
    Function to request a web page and return a Beautiful Soup object

    Parameters:
        url: address of the page to download
        timeout: seconds to wait for the server before giving up. Without
            a timeout, requests can wait indefinitely on a stalled
            connection, which would hang a long scraping run.

    Returns:
        BeautifulSoup object built from the response body with the
        built-in 'html.parser'. The HTTP status code is not checked, so
        an error page is parsed like any other page.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when the timeout expires.
    '''

    # Browser-like User-Agent sent with every request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    }

    page = requests.get(url, headers=headers, timeout=timeout)

    return BeautifulSoup(page.content, 'html.parser')

## Scraping game page

In [3]:
# Sample game page (Italy vs England on 2021-07-11, per the recorded
# get_match_data output); the page is fetched once here and the resulting
# soup is reused by the example cells that follow
url = 'https://www.transfermarkt.com/spielbericht/index/spielbericht/3605575'
soup = get_soup(url)

### General match data

In [4]:
def get_match_data(soup):
    '''
    Function to retrieve general match data

    Besides reading the game page, this downloads the stadium page linked
    from the game page (one extra request via get_soup) to decide whether
    the match was played at a neutral venue.

    Parameters:
        soup: BeautifulSoup object of a game page

    Returns:
        dict with keys
            'home_team':  name of the team listed as home
            'away_team':  name of the team listed as away
            'neutral':    the string 'True' or 'False' (not a bool)
            'true_home':  the team matched to the stadium; for a neutral
                          venue, the name read from the stadium page
                          ('none' if it could not be read)
            'match_date': last 10 characters of the date link
                          (presumably YYYY-MM-DD, as in the sample output)

    Raises:
        AttributeError / TypeError / KeyError: if the team boxes, the
            stadium link or the date link are missing from the game page.
    '''

    base_url = 'https://www.transfermarkt.com'

    # Home and away teams: alt text of the first image in each team box
    home_team = soup.find(attrs={'class': 'sb-team sb-heim'}).find('img')['alt']
    away_team = soup.find(attrs={'class': 'sb-team sb-gast'}).find('img')['alt']

    # Stadium info
    stad = soup.find(attrs={'class': 'sb-zusatzinfos'}).find('a')
    stad_soup = get_soup(base_url + stad['href'])

    # Name shown on the stadium page (compared against the team names below)
    try:
        stad_home = stad_soup.find(attrs={'itemprop': 'name'}).find('span').text
    except (AttributeError, TypeError):
        # Expected element missing on the stadium page
        stad_home = 'none'

    # First four cells of the second table on the stadium page
    try:
        tds = stad_soup.find_all('table')[1].find_all('td')[:4]
        address = [td.text.replace(u'\xa0', u' ') for td in tds]
    except (IndexError, AttributeError):
        # Stadium page has fewer than two tables
        address = []

    # Neutral venue: neither team matches the stadium name or an address cell
    if (home_team == stad_home) or (home_team in address):
        neutral = 'False'
        true_home = home_team
    elif (away_team == stad_home) or (away_team in address):
        neutral = 'False'
        true_home = away_team
    else:
        neutral = 'True'
        true_home = stad_home

    # Match date: tail of the first link inside the match-details box.
    # The class is passed as an explicit dict, like every other lookup here.
    match_date = soup.find(attrs={'class': 'sb-spieldaten'}).find('a')['href'][-10:]

    return {
        'home_team': home_team,
        'away_team': away_team,
        'neutral': neutral,
        'true_home': true_home,
        'match_date': match_date
    }

In [5]:
# Example: general match data for the sample game page
get_match_data(soup)

{'home_team': 'Italy',
 'away_team': 'England',
 'neutral': 'False',
 'true_home': 'England',
 'match_date': '2021-07-11'}

### Penalty shootout data

In [6]:
def _first_int(text):
    '''
    Function to return the first run of digits in a string as an int
    '''

    found = re.search(r'\d+', text)
    if found is None:
        raise ValueError('no score found in {!r}'.format(text))
    return int(found.group())


def get_penalty_so(soup):
    '''
    Function to retrieve penalty shootout data

    Parameters:
        soup: BeautifulSoup object of a game page

    Returns:
        dict with keys
            'winner':   'home' or 'away'
            'shootout': list of dicts, one per kick in page order, with
                        keys 'team' ('home'/'away'), 'result' (title of
                        the kick's first span, e.g. 'Scored'), 'player'
                        and 'player_href'. The list is empty when the
                        shootout section could not be read.

    Raises:
        ValueError / IndexError / AttributeError: if the shootout section
            is unusable and the final score cannot be parsed either.
    '''

    try:
        # Penalty shootout section of game page
        shot_list = soup.find(attrs={'id': 'sb-elfmeterscheissen'}).find('ul').find_all('li')

        shootout = []
        goals = {'home': 0, 'away': 0}
        for shot in shot_list:
            result = shot.find('span')['title']
            player = shot.find(attrs={'class': 'sb-aktion-aktion'}).find('a')
            kick = {
                'team': 'home' if shot['class'][0] == 'sb-aktion-heim' else 'away',
                'result': result,
                'player': player['title'],
                'player_href': player['href']
            }
            if result == 'Scored':
                goals[kick['team']] += 1
            shootout.append(kick)
        home, away = goals['home'], goals['away']
    except (AttributeError, KeyError, TypeError, IndexError):
        # Missing (or unreadable) shootout section: decide the winner from
        # the final score shown on the page and report no individual kicks.
        # The first number on each side of the colon is used, so text
        # around the digits does not break the parse.
        parts = soup.find(attrs={'class': 'sb-endstand'}).text.strip().split(':')
        home = _first_int(parts[0])
        away = _first_int(parts[1])
        shootout = []

    # NOTE: equal totals resolve to 'away'
    if home > away:
        winner = 'home'
    else:
        winner = 'away'

    return {
        'winner': winner,
        'shootout': shootout
    }

In [7]:
# Example: shootout winner and per-kick details for the sample game page
shootout = get_penalty_so(soup)
print(shootout)

{'winner': 'home', 'shootout': [{'team': 'home', 'result': 'Scored', 'player': 'Domenico Berardi', 'player_href': '/domenico-berardi/leistungsdatendetails/spieler/177843/saison/2020/wettbewerb/EM20'}, {'team': 'away', 'result': 'Scored', 'player': 'Harry Kane', 'player_href': '/harry-kane/leistungsdatendetails/spieler/132098/saison/2020/wettbewerb/EM20'}, {'team': 'home', 'result': 'Saved', 'player': 'Andrea Belotti', 'player_href': '/andrea-belotti/leistungsdatendetails/spieler/167727/saison/2020/wettbewerb/EM20'}, {'team': 'away', 'result': 'Scored', 'player': 'Harry Maguire', 'player_href': '/harry-maguire/leistungsdatendetails/spieler/177907/saison/2020/wettbewerb/EM20'}, {'team': 'home', 'result': 'Scored', 'player': 'Leonardo Bonucci', 'player_href': '/leonardo-bonucci/leistungsdatendetails/spieler/39983/saison/2020/wettbewerb/EM20'}, {'team': 'away', 'result': 'Missed', 'player': 'Marcus Rashford', 'player_href': '/marcus-rashford/leistungsdatendetails/spieler/258923/saison/2020

### Substitution data
The website does not give the match-clock time of match events as a number. Instead, each event is shown at a position on a timeline, so the minute has to be derived from that position.

Example:
![](imgs/timeline.png)

In [8]:
def _sub_minutes(timeline, match_length):
    '''
    Function to convert the substitution markers of one team's timeline
    into match minutes
    '''

    minutes = []
    for marker in timeline.find_all(attrs={'class': 'sb-wechsel'}):
        # Extract position from style section of the parent element: the
        # number between 't: ' and '%; ' (presumably the CSS left offset)
        text = marker.parent['style']
        share = float(re.search(r'(?<=t: )(.*)(?=%; )', text)[0]) / 100

        # Convert to time in minutes
        minutes.append(round(share * match_length))
    return minutes


def get_subs(soup, match_length=120):
    '''
    Function to get subs

    The minute of each substitution is derived from the marker position
    on the team's timeline and paired, by index, with the entries of the
    substitution list further down the page.

    Parameters:
        soup: BeautifulSoup object of a game page
        match_length: number of minutes the full timeline is taken to
            represent (default 120, i.e. a match with extra time)

    Returns:
        list of dicts, home entries first then away, each with keys
            'player': name of the player coming on
            'time':   minutes left to play when he came on
                      (match_length minus the substitution minute)
            'team':   'home' or 'away'
        Entries that cannot be read (no incoming player, or no matching
        timeline marker) have 'player' and 'time' set to the string 'none'.

    Raises:
        AttributeError / TypeError: if the timeline or substitution
            sections are missing, or a marker has no readable position.
    '''

    # (team label, timeline class, substitution-list class)
    sides = (
        ('home', 'sb-leiste-heim', 'sb-aktion-heim'),
        ('away', 'sb-leiste-gast', 'sb-aktion-gast'),
    )

    # Subs from timeline section of page
    minutes = {
        team: _sub_minutes(soup.find(attrs={'class': bar}), match_length)
        for team, bar, _ in sides
    }

    # Find corresponding players in substitution section of page
    actions = soup.find(attrs={'id': 'sb-wechsel'}).find('ul')

    subs_list = []
    for team, _, action_class in sides:
        for i, sub in enumerate(actions.find_all(attrs={'class': action_class})):
            try:
                player = sub.find(attrs={'class': 'sb-aktion-wechsel-ein'}).find('a')['title']
                # If subbed out then player will not be needed
                play_time = match_length - minutes[team][i]
                p_dict = {'player': player, 'time': play_time, 'team': team}
            except (AttributeError, TypeError, KeyError, IndexError):
                # Missing player name or no timeline marker for this entry
                p_dict = {'player': 'none', 'time': 'none', 'team': team}
            subs_list.append(p_dict)

    return subs_list

In [9]:
# Example: substitutes of the sample game page with the minutes they had
# left to play when they came on
# Note: Jordan Henderson was subbed out before fulltime and subsequently did not take a penalty
subs = get_subs(soup)
subs

[{'player': 'Bryan Cristante', 'time': 66, 'team': 'home'},
 {'player': 'Domenico Berardi', 'time': 65, 'team': 'home'},
 {'player': 'Federico Bernardeschi', 'time': 34, 'team': 'home'},
 {'player': 'Andrea Belotti', 'time': 29, 'team': 'home'},
 {'player': 'Manuel Locatelli', 'time': 24, 'team': 'home'},
 {'player': 'Alessandro Florenzi', 'time': 2, 'team': 'home'},
 {'player': 'Bukayo Saka', 'time': 49, 'team': 'away'},
 {'player': 'Jordan Henderson', 'time': 46, 'team': 'away'},
 {'player': 'Jack Grealish', 'time': 21, 'team': 'away'},
 {'player': 'Marcus Rashford', 'time': 0, 'team': 'away'},
 {'player': 'Jadon Sancho', 'time': 0, 'team': 'away'}]

In [11]:
def get_match(soup):
    '''
    Function to build the full record of a match: general match data,
    shootout winner and the shootout kicks with minutes played
    '''

    match = get_match_data(soup)
    shootout = get_penalty_so(soup)
    subs = get_subs(soup)

    # Time recorded for each substitute, keyed by (name, team) so that a
    # same-named player on the other team is never picked up
    sub_time = {(sub['player'], sub['team']): sub['time'] for sub in subs}

    # Append time played to players in shootout (excludes end of period
    # added time); anyone not found among the subs gets the full 120
    for taker in shootout['shootout']:
        taker['time_played'] = sub_time.get((taker['player'], taker['team']), 120)

    match.update(shootout)

    return match

In [12]:
# Example: full record for the sample game page (match data, shootout
# winner and kicks with time played)
get_match(soup)

{'home_team': 'Italy',
 'away_team': 'England',
 'neutral': 'False',
 'true_home': 'England',
 'match_date': '2021-07-11',
 'winner': 'home',
 'shootout': [{'team': 'home',
   'result': 'Scored',
   'player': 'Domenico Berardi',
   'player_href': '/domenico-berardi/leistungsdatendetails/spieler/177843/saison/2020/wettbewerb/EM20',
   'time_played': 65},
  {'team': 'away',
   'result': 'Scored',
   'player': 'Harry Kane',
   'player_href': '/harry-kane/leistungsdatendetails/spieler/132098/saison/2020/wettbewerb/EM20',
   'time_played': 120},
  {'team': 'home',
   'result': 'Saved',
   'player': 'Andrea Belotti',
   'player_href': '/andrea-belotti/leistungsdatendetails/spieler/167727/saison/2020/wettbewerb/EM20',
   'time_played': 29},
  {'team': 'away',
   'result': 'Scored',
   'player': 'Harry Maguire',
   'player_href': '/harry-maguire/leistungsdatendetails/spieler/177907/saison/2020/wettbewerb/EM20',
   'time_played': 120},
  {'team': 'home',
   'result': 'Scored',
   'player': 'Leo

## Scraping competition pages
**NOTE:** Results written to JSON file.

In [13]:
import json

In [14]:
# Penalty shootout listing page of each competition, keyed by the short
# name that get_comp stores in the 'comp' field of every match
_SHOOTOUT_PAGES = {
    'euro_cup': 'https://www.transfermarkt.com/europameisterschaft-2020/elfmeterschiessen/pokalwettbewerb/EM20',
    'copa_am': 'https://www.transfermarkt.com/copa-america-2021/elfmeterschiessen/pokalwettbewerb/CAM2',
    'cl': 'https://www.transfermarkt.com/uefa-champions-league/elfmeterschiessen/pokalwettbewerb/CL',
    'el': 'https://www.transfermarkt.com/europa-league/elfmeterschiessen/pokalwettbewerb/EL',
}

# List of (short name, url) tuples in the order given above
COMPS = list(_SHOOTOUT_PAGES.items())

In [15]:
def get_comp(comp):
    '''
    Function to get all penalty shootouts from a competition

    Downloads the competition's shootout listing, then every match page
    linked from it (one request per match, plus whatever get_match
    requests itself).

    Parameters:
        comp: sequence whose first item is the competition's short name
            and whose second item is the URL of its shootout listing

    Returns:
        list of dicts, one per listed match, holding 'comp', 'season' and
        'match_round' followed by everything returned by get_match.

    Raises:
        AttributeError: if the listing page has no 'items' table.
    '''

    base_url = 'https://www.transfermarkt.com'

    comp_name = comp[0]
    url = comp[1]
    soup = get_soup(url)
    rows = soup.find('table', attrs={'class': 'items'}).find('tbody').find_all('tr')

    season = None
    pks = []
    for row in rows:
        if not row.has_attr('class'):
            # Rows without a class attribute are season headers; the
            # season applies to all match rows until the next header
            heading = row.find('a')
            if heading is not None:
                season = heading.text
            continue

        # Match row: first link is the round, fourth link is the match page
        links = row.find_all('a')
        link = links[3].get('href') if len(links) > 3 else None
        if link is None:
            # Malformed match row: skip it instead of letting it be
            # mistaken for a season header and overwrite the season
            continue
        pks.append({'season': season, 'match_round': links[0].text, 'link': link})

    matches = []
    for pk in pks:
        match_soup = get_soup(base_url + pk['link'])
        match = {
            'comp': comp_name,
            'season': pk['season'],
            'match_round': pk['match_round'],
            **get_match(match_soup)
        }
        matches.append(match)

    return matches

### Euro Cup and Copa America - No issues

In [16]:
# for comp in COMPS[:2]:
#     comp_name = comp[0]
#     comp_data = get_comp(comp)
#     with open(f'{comp_name}.json', 'w', encoding='utf-8') as f:
#         json.dump(comp_data, f, ensure_ascii=False, indent=4)

### Champions League
Some substitutions are missing the substitute's name, so the ```get_subs``` function was modified with a ```try```/```except``` block.

In [17]:
# for comp in COMPS[2:3]:
#     comp_name = comp[0]
#     comp_data = get_comp(comp)
#     with open(f'{comp_name}.json', 'w', encoding='utf-8') as f:
#         json.dump(comp_data, f, ensure_ascii=False, indent=4)

### Europa League
Some game pages are missing the shootout data, so the ```get_penalty_so``` function was modified with a ```try```/```except``` block.

In [18]:
# for comp in COMPS[3:]:
#     comp_name = comp[0]
#     comp_data = get_comp(comp)
#     with open(f'{comp_name}.json', 'w', encoding='utf-8') as f:
#         json.dump(comp_data, f, ensure_ascii=False, indent=4)