In [None]:
import datetime
import pickle
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Season schedule endpoint; the season's starting year is appended to this URL.
base_url = 'https://www.transfermarkt.com/uefa-champions-league/gesamtspielplan/pokalwettbewerb/CL/saison_id/'

# Browser-like User-Agent string sent with every request.
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
headers = {'User-Agent': ua}

In [7]:
# Function for defining latest season
def which_season(today=None):
    """Return the starting year of the latest season.

    A season is identified by the calendar year in which it begins. From
    September onward the current year is returned; from January through
    August the previous year is returned.

    Parameters
    ----------
    today : datetime.date, optional
        Date to evaluate. Defaults to the current local date. Exposed so the
        cutoff logic can be exercised deterministically.

    Returns
    -------
    int
        The year in which the latest season started.
    """
    if today is None:
        today = datetime.date.today()

    # Months after August belong to the season that starts in the same year.
    return today.year if today.month > 8 else today.year - 1

# Function to get match data
def get_match(match):
    """Pull team names, team links and the score out of one match row.

    The row's anchor tags are read positionally: index 0 is used for the home
    team, index 2 for the score and index 4 for the away team.

    Parameters
    ----------
    match : element exposing ``find_all('a')``
        The element wrapping a single fixture.

    Returns
    -------
    tuple
        ``(home_team, home_href, away_team, away_href, score)`` where the
        names and score are whitespace-stripped anchor texts and the hrefs
        are the anchors' raw ``href`` attribute values.
    """
    anchors = match.find_all('a')
    home_link = anchors[0]
    away_link = anchors[4]
    score_link = anchors[2]

    return (
        home_link.text.strip(),
        home_link['href'],
        away_link.text.strip(),
        away_link['href'],
        score_link.text.strip(),
    )

# Get player value
def get_value(player):
    """Convert a market-value string into a number expressed in millions.

    The unit is taken from the *end* of the string, so text that merely
    contains the letter "m" is no longer mistaken for a value, and the
    ``Th.`` suffix is matched literally rather than as a regex.

    Recognised suffixes and their scale relative to millions:

    * ``m``   -> 1
    * ``Th.`` -> 0.001
    * ``k``   -> 0.001 (treated as thousands, same as ``Th.``)
    * ``bn``  -> 1000

    Parameters
    ----------
    player : str
        Raw cell text such as ``'€25.00m'`` or ``'€500Th.'``. ``None`` and
        empty strings are accepted.

    Returns
    -------
    float
        The value in millions, or ``0.0`` when the text has no recognised
        suffix or its numeric part cannot be parsed.
    """
    text = (player or '').strip()

    # 'bn' is checked before the single-letter suffixes for clarity; none of
    # the suffixes is a tail of another, so the order does not affect results.
    units = (('bn', 1000.0), ('Th.', 0.001), ('m', 1.0), ('k', 0.001))
    for suffix, scale in units:
        if not text.endswith(suffix):
            continue
        # Drop only leading currency symbols instead of blindly cutting the
        # first character, so values without a symbol keep their first digit.
        number = text[:-len(suffix)].strip().lstrip('€$£').strip()
        try:
            return float(number) * scale
        except ValueError:
            # Text that ends in a unit letter but is not a number.
            return 0.0

    return 0.0

# Get team value
def get_t_value(team, teams):
    """Fetch a team's page and total the values listed in its table rows.

    Every ``<tr>`` with CSS class ``odd`` or ``even`` is inspected; the text
    of its last ``<td>`` is parsed with ``get_value`` and added to the total.

    Parameters
    ----------
    team : str
        Key into ``teams``.
    teams : dict
        Mapping of team key to a dict holding the page URL under ``'href'``.

    Returns
    -------
    float or int
        Sum of the parsed row values, rounded to two decimals. The unit is
        whatever ``get_value`` returns (millions).

    Raises
    ------
    KeyError
        If ``team`` is not present in ``teams``.
    requests.HTTPError
        If the server answers with an error status, instead of silently
        producing a total of 0 from an error page.
    """
    url = teams[team]['href']
    # A timeout keeps one stalled connection from hanging the whole scrape.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')

    total = 0
    # Same traversal order as before: all 'odd' rows first, then 'even' rows.
    for row_class in ('odd', 'even'):
        for row in soup.find_all('tr', class_=row_class):
            cells = row.find_all('td')
            if not cells:
                # Row without data cells: nothing to add.
                continue
            total += get_value(cells[-1].text.strip())

    return round(total, 2)

In [8]:
# Function to get seasonal data
def get_season(year):
    """Scrape all matches of one season and attach each side's team value.

    Parameters
    ----------
    year : int
        Starting year of the season; must lie between 2004 and the value
        returned by ``which_season()`` inclusive.

    Returns
    -------
    list of dict
        One dict per match with the keys ``home_team``, ``away_team``,
        ``score``, ``match_season``, ``home_value`` and ``away_value``.

    Raises
    ------
    AssertionError
        If ``year`` is outside the supported range.
    requests.HTTPError
        If the schedule page (or a team page) answers with an error status.
    """
    # Assert given year within CL seasons with transfer market data (2004 to present)
    min_season = 2004
    max_season = which_season()
    season_range = range(min_season, max_season + 1)
    assert year in season_range, f'Season (beginning) must be 2004 through {max_season}'

    # Get elements of match rows
    r = requests.get(base_url + str(year), headers=headers, timeout=30)
    r.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')
    match_reps = soup.find_all(attrs={'title': 'Match report'})
    # The element two levels above each "Match report" node is handed to get_match.
    match_eles = [match.parent.parent for match in match_reps]

    # Create season match list and the set of distinct teams
    matches = []
    teams = {}
    tm_url = 'https://www.transfermarkt.com'
    for match in match_eles:
        # Use get_match function to get match data
        home_team, home_href, away_team, away_href, score = get_match(match)

        # `teams` is keyed by team name, so membership must be tested with the
        # name (testing the href never matched an existing key).
        if home_team not in teams:
            teams[home_team] = {'href': tm_url + home_href}
        if away_team not in teams:
            teams[away_team] = {'href': tm_url + away_href}

        matches.append({
            'home_team': home_team,
            'away_team': away_team,
            'score': score,
            'match_season': year,
        })

    # Use helper 'get value' functions: one page fetch per distinct team
    team_values = {team: get_t_value(team, teams) for team in teams}

    # Combine data
    for match in matches:
        match['home_value'] = team_values[match['home_team']]
        match['away_value'] = team_values[match['away_team']]

    return matches

In [None]:
# Gather the matches of every season starting in 2004 through 2018.
cl_matches = []
for year in range(2004, 2019):
    # Extend in place: a bare `list + list` expression builds a new list and
    # throws it away, which would leave cl_matches empty.
    cl_matches.extend(get_season(year))

In [None]:
# Preview the first ten collected match records.
cl_matches[:10]

In [None]:
# Persist the collected matches to disk as a pickle file.
with open('cl_matches.p', 'wb') as out_file:
    pickle.dump(cl_matches, out_file)