# Crawling data on match results by team from Europe's top five leagues

We crawled ESPN and extracted the following information for each match: date, opponent, final score, league, match result, and whether the team played at home or away. The match date, opponent, final score, and league were read directly from the HTML of the ESPN site. Since the match result is not provided directly, the win/draw/loss outcome was derived by first determining whether the team played at home or away and then comparing the two sides of the final score.

In [None]:
# Import modules needed for crawling and modules needed to create csv files
import requests
from bs4 import BeautifulSoup
import pandas as pd
from openpyxl import Workbook
# HTTP headers sent with every request to the ESPN site.
# The User-Agent value is left blank here; fill in your own before running.
headers = {'User-Agent': ''}

In [None]:
# One independent, initially empty accumulator list per league
# (Premier League, LaLiga, Serie A, Bundesliga, Ligue 1).
pl_data, laliga_data, seria_data, bundes_data, ligue1_data = (
    [] for _ in range(5)
)

# Competition names of the five leagues, used to recognise league fixtures.
_TOP_FIVE_LEAGUES = (
    'English Premier League',
    'Spanish LALIGA',
    'Italian Serie A',
    'German Bundesliga',
    'French Ligue 1',
)


def _find_opponent(team, team_names):
    """Return ``(opponent, is_home)`` for one fixture, or ``None`` if no opponent is listed."""
    # The row lists the home side first, so an opponent found in the first
    # position means ``team`` played away.
    for position, name in enumerate(team_names):
        if team not in name:
            return name, position > 0
    return None


def _parse_score(score_text):
    """Return the two goal counts from text such as ``'2 - 1'``, or ``None`` if it is not a score."""
    first, _, second = score_text.partition(' - ')
    try:
        return int(first), int(second)
    except ValueError:
        return None


# Function to extract game results for each team
def get_team_results(team, team_url):
    """Scrape one team's results page and return its matches.

    Each ``<tr>`` with class ``'Table__TR Table__TR--sm Table__even'`` is
    treated as one match. Rows without an identifiable opponent or a
    parseable final score (for example a fixture that has no score yet)
    are skipped instead of raising.

    Args:
        team: Team name; it is matched as a substring against the team
            links in each row to tell the team apart from its opponent.
        team_url: URL of the team's results page.

    Returns:
        A list with one dict per match, with keys ``'Date'``, ``'Team'``,
        ``'Opponent'``, ``'Score'``, ``'Competition'``, ``'Result'``
        (``'Win'``/``'Draw'``/``'Lose'``) and ``'Home'``
        (``'Home'``/``'Away'``). ``'Competition'`` is ``'.'`` for matches
        outside the five leagues.

    Side effects:
        For every league match, appends ``{'Team', 'Opponent', 'Result'}``
        to the matching module-level list (``pl_data``, ``laliga_data``,
        ``seria_data``, ``bundes_data`` or ``ligue1_data``), where
        ``'Result'`` is the goal difference from ``team``'s point of view.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout)
        propagates to the caller.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(team_url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Looked up at call time so the lists currently bound at module level are used.
    league_buckets = {
        'English Premier League': pl_data,
        'Spanish LALIGA': laliga_data,
        'Italian Serie A': seria_data,
        'German Bundesliga': bundes_data,
        'French Ligue 1': ligue1_data,
    }

    match_results = []
    for match in soup.find_all('tr', class_='Table__TR Table__TR--sm Table__even'):
        # The first <div class="matchTeams"> in the row holds the match date.
        date_cell = match.find('div', class_='matchTeams')
        date = date_cell.text if date_cell else ''

        # Both sides share the same tag and class, so collect them all and
        # tell them apart by name.
        team_names = [
            link.text
            for link in match.find_all('a', class_='AnchorLink Table__Team')
        ]
        found = _find_opponent(team, team_names)
        if found is None:
            continue
        op, home = found

        # Several 'AnchorLink' anchors exist per row; the final score is the
        # one whose text contains ' - '.
        res = next(
            (
                link.text
                for link in match.find_all('a', class_='AnchorLink', href=True)
                if ' - ' in link.text
            ),
            None,
        )
        if res is None:
            continue

        # NOTE: the score used to be evaluated with eval(). The text comes
        # from a remote page, so it is parsed as two integers instead.
        goals = _parse_score(res)
        if goals is None:
            continue

        # Goal difference from the point of view of `team`; the score reads
        # first-listed side minus second-listed side, so flip it when away.
        goal_diff = goals[0] - goals[1]
        if not home:
            goal_diff = -goal_diff

        if goal_diff > 0:
            r = 'Win'
        elif goal_diff == 0:
            r = 'Draw'
        else:
            r = 'Lose'

        # League classification: the first cell naming one of the five
        # leagues wins; everything else (cups etc.) is recorded as '.'.
        competition = next(
            (
                cell.text
                for cell in match.find_all('td', class_='Table__TD')
                if any(league in cell.text for league in _TOP_FIVE_LEAGUES)
            ),
            '.',
        )

        league_rows = league_buckets.get(competition)
        if league_rows is not None:
            league_rows.append({'Team': team, 'Opponent': op, 'Result': goal_diff})

        match_results.append({
            'Date': date,
            'Team': team,
            'Opponent': op,
            # Leading apostrophe is kept exactly as in the original output format.
            'Score': '\'' + res,
            'Competition': competition,
            'Result': r,
            'Home': 'Home' if home else 'Away'
        })

    return match_results

# Teams in the five major leagues in the 2023-24 season.
# Maps each team name to the URL of its ESPN results page. The key is passed
# to get_team_results as `team` and is also used in the per-team CSV file name.
team_urls = {
    # EPL
    'AFC Bournemouth': 'https://www.espn.com/soccer/team/results/_/id/349/afc-bournemouth',
    'Arsenal': 'https://www.espn.com/soccer/team/results/_/id/359/arsenal',
    'Aston Villa': 'https://www.espn.com/soccer/team/results/_/id/362/aston-villa',
    'Brentford': 'https://www.espn.com/soccer/team/results/_/id/337/brentford',
    'Brighton & Hove Albion': 'https://www.espn.com/soccer/team/results/_/id/331/brighton-hove-albion',
    'Burnley': 'https://www.espn.com/soccer/team/results/_/id/379/burnley',
    'Chelsea': 'https://www.espn.com/soccer/team/results/_/id/363/chelsea',
    'Crystal Palace': 'https://www.espn.com/soccer/team/results/_/id/384/crystal-palace',
    'Everton': 'https://www.espn.com/soccer/team/results/_/id/368/everton',
    'Fulham': 'https://www.espn.com/soccer/team/results/_/id/370/fulham',
    'Liverpool': 'https://www.espn.com/soccer/team/results/_/id/364/liverpool',
    'Luton Town': 'https://www.espn.com/soccer/team/results/_/id/301/luton-town',
    'Manchester City': 'https://www.espn.com/soccer/team/results/_/id/382/manchester-city',
    'Manchester United': 'https://www.espn.com/soccer/team/results/_/id/360/manchester-united',
    'Newcastle United': 'https://www.espn.com/soccer/team/results/_/id/361/newcastle-united',
    'Nottingham Forest': 'https://www.espn.com/soccer/team/results/_/id/393/nottingham-forest',
    'Sheffield United': 'https://www.espn.com/soccer/team/results/_/id/398/sheffield-united',
    'Tottenham Hotspur': 'https://www.espn.com/soccer/team/results/_/id/367/tottenham-hotspur',
    'West Ham United': 'https://www.espn.com/soccer/team/results/_/id/371/west-ham-united',
    'Wolverhampton Wanderers': 'https://www.espn.com/soccer/team/results/_/id/380/wolverhampton-wanderers',

    # LALIGA
    'Alaves': 'https://www.espn.com/soccer/team/results/_/id/96/alaves',
    'Almeria': 'https://www.espn.com/soccer/team/results/_/id/6832/almeria',
    'Athletic Club': 'https://www.espn.com/soccer/team/results/_/id/93/athletic-club',
    'Atletico Madrid': 'https://www.espn.com/soccer/team/results/_/id/1068/atletico-madrid',
    'Barcelona': 'https://www.espn.com/soccer/team/results/_/id/83/barcelona',
    'Celta Vigo': 'https://www.espn.com/soccer/team/results/_/id/85/celta-vigo',
    'Cadiz': 'https://www.espn.com/soccer/team/results/_/id/3842/cadiz',
    'Getafe': 'https://www.espn.com/soccer/team/results/_/id/2922/getafe',
    'Girona': 'https://www.espn.com/soccer/team/results/_/id/9812/girona',
    'Granada': 'https://www.espn.com/soccer/team/results/_/id/3747/granada',
    'Las Palmas': 'https://www.espn.com/soccer/team/results/_/id/98/las-palmas',
    'Mallorca': 'https://www.espn.com/soccer/team/results/_/id/84/mallorca',
    'Osasuna': 'https://www.espn.com/soccer/team/results/_/id/97/osasuna',
    'Rayo Vallecano': 'https://www.espn.com/soccer/team/results/_/id/101/rayo-vallecano',
    'Real Betis': 'https://www.espn.com/soccer/team/results/_/id/244/real-betis',
    'Real Madrid': 'https://www.espn.com/soccer/team/results/_/id/86/real-madrid',
    'Real Sociedad': 'https://www.espn.com/soccer/team/results/_/id/89/real-sociedad',
    'Sevilla': 'https://www.espn.com/soccer/team/results/_/id/243/sevilla',
    'Valencia': 'https://www.espn.com/soccer/team/results/_/id/94/valencia',
    'Villarreal': 'https://www.espn.com/soccer/team/results/_/id/102/villarreal',

    # SERIE A
    'AC Milan': 'https://www.espn.com/soccer/team/results/_/id/103/ac-milan',
    'AS Roma': 'https://www.espn.com/soccer/team/results/_/id/104/as-roma',
    'Atalanta': 'https://www.espn.com/soccer/team/results/_/id/105/atalanta',
    'Bologna': 'https://www.espn.com/soccer/team/results/_/id/107/bologna',
    'Cagliari': 'https://www.espn.com/soccer/team/results/_/id/2925/cagliari',
    'Empoli': 'https://www.espn.com/soccer/team/results/_/id/2574/empoli',
    'Fiorentina': 'https://www.espn.com/soccer/team/results/_/id/109/fiorentina',
    'Frosinone': 'https://www.espn.com/soccer/team/results/_/id/4057/frosinone',
    'Genoa': 'https://www.espn.com/soccer/team/results/_/id/3263/genoa',
    'Hellas Verona': 'https://www.espn.com/soccer/team/results/_/id/119/hellas-verona',
    'Internazionale': 'https://www.espn.com/soccer/team/results/_/id/110/internazionale',
    'Juventus': 'https://www.espn.com/soccer/team/results/_/id/111/juventus',
    'Lazio': 'https://www.espn.com/soccer/team/results/_/id/112/lazio',
    'Lecce': 'https://www.espn.com/soccer/team/results/_/id/113/lecce',
    'Monza': 'https://www.espn.com/soccer/team/results/_/id/4007/monza',
    'Napoli': 'https://www.espn.com/soccer/team/results/_/id/114/napoli',
    'Salernitana': 'https://www.espn.com/soccer/team/results/_/id/3240/salernitana',
    'Sassuolo': 'https://www.espn.com/soccer/team/results/_/id/3997/sassuolo',
    'Torino': 'https://www.espn.com/soccer/team/results/_/id/239/torino',
    'Udinese': 'https://www.espn.com/soccer/team/results/_/id/118/udinese',

    # BUNDESLIGA
    '1. FC Heidenheim 1846': 'https://www.espn.com/soccer/team/results/_/id/6418/1-fc-heidenheim-1846',
    '1. FC Union Berlin': 'https://www.espn.com/soccer/team/results/_/id/598/1-fc-union-berlin',
    'Bayer Leverkusen': 'https://www.espn.com/soccer/team/results/_/id/131/bayer-leverkusen',
    'Bayern Munich': 'https://www.espn.com/soccer/team/results/_/id/132/bayern-munich',
    'Borussia Dortmund': 'https://www.espn.com/soccer/team/results/_/id/124/borussia-dortmund',
    'Borussia Monchengladbach': 'https://www.espn.com/soccer/team/results/_/id/268/borussia-monchengladbach',
    'Eintracht Frankfurt': 'https://www.espn.com/soccer/team/results/_/id/125/eintracht-frankfurt',
    'FC Augsburg': 'https://www.espn.com/soccer/team/results/_/id/3841/fc-augsburg',
    'FC Cologne': 'https://www.espn.com/soccer/team/results/_/id/122/fc-cologne',
    'Mainz': 'https://www.espn.com/soccer/team/results/_/id/2950/mainz',
    'RB Leipzig': 'https://www.espn.com/soccer/team/results/_/id/11420/rb-leipzig',
    'SC Freiburg': 'https://www.espn.com/soccer/team/results/_/id/126/sc-freiburg',
    'SV Darmstadt 98': 'https://www.espn.com/soccer/team/results/_/id/3812/sv-darmstadt-98',
    'TSG Hoffenheim': 'https://www.espn.com/soccer/team/results/_/id/7911/tsg-hoffenheim',
    'VfB Stuttgart': 'https://www.espn.com/soccer/team/results/_/id/134/vfb-stuttgart',
    'VfL Bochum': 'https://www.espn.com/soccer/team/results/_/id/121/vfl-bochum',
    'VfL Wolfsburg': 'https://www.espn.com/soccer/team/results/_/id/138/vfl-wolfsburg',
    'Werder Bremen': 'https://www.espn.com/soccer/team/results/_/id/137/werder-bremen',

    # LIGUE 1
    'AS Monaco': 'https://www.espn.com/soccer/team/results/_/id/174/as-monaco',
    'Brest': 'https://www.espn.com/soccer/team/results/_/id/6997/brest',
    'Clermont Foot': 'https://www.espn.com/soccer/team/results/_/id/3171/clermont-foot',
    'Le Havre AC': 'https://www.espn.com/soccer/team/results/_/id/3236/le-havre-ac',
    'Lens': 'https://www.espn.com/soccer/team/results/_/id/175/lens',
    'Lille': 'https://www.espn.com/soccer/team/results/_/id/166/lille',
    'Lorient': 'https://www.espn.com/soccer/team/results/_/id/273/lorient',
    'Lyon': 'https://www.espn.com/soccer/team/results/_/id/167/lyon',
    'Marseille': 'https://www.espn.com/soccer/team/results/_/id/176/marseille',
    'Metz': 'https://www.espn.com/soccer/team/results/_/id/177/metz',
    'Montpellier': 'https://www.espn.com/soccer/team/results/_/id/274/montpellier',
    'Nantes': 'https://www.espn.com/soccer/team/results/_/id/165/nantes',
    'Nice': 'https://www.espn.com/soccer/team/results/_/id/2502/nice',
    'Paris Saint-Germain': 'https://www.espn.com/soccer/team/results/_/id/160/paris-saint-germain',
    'Stade Rennais': 'https://www.espn.com/soccer/team/results/_/id/169/stade-rennais',
    'Stade de Reims': 'https://www.espn.com/soccer/team/results/_/id/3243/stade-de-reims',
    'Strasbourg': 'https://www.espn.com/soccer/team/results/_/id/180/strasbourg',
    'Toulouse': 'https://www.espn.com/soccer/team/results/_/id/179/toulouse'
}

# Scrape every team once; results are keyed by the team name from team_urls.
# A plain loop (not a comprehension) keeps the teams already scraped if a
# later request fails part-way through.
all_team_results = {}
for team, url in team_urls.items():
    all_team_results[team] = get_team_results(team, url)

# League-wide table of Premier League rows (Team / Opponent / Result)
# collected in pl_data while scraping.
pl_df = pd.DataFrame(pl_data)


In [None]:
# Column layout of the per-match dicts returned by get_team_results.
match_columns = ['Date', 'Team', 'Opponent', 'Score', 'Competition', 'Result', 'Home']

for team, results in all_team_results.items():
    # Naming the columns explicitly keeps 'Competition' present even when no
    # match was scraped for a team; otherwise the filter below raises KeyError
    # on an empty DataFrame.
    df = pd.DataFrame(results, columns=match_columns)
    # Exclude games other than league games and save them to csv file
    df = df[df['Competition'] != '.']
    file_name = f'{team}_2324.csv'
    df.to_csv(file_name, index=False, encoding="utf-8-sig")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def create_heatmap(team_results):
    """Draw a Team x Opponent heatmap of the 'Result' values and show it.

    Args:
        team_results: Records accepted by ``pd.DataFrame``; only the
            'Team', 'Opponent' and 'Result' columns are used.

    Multiple rows for the same Team/Opponent pair are combined by
    ``pivot_table``'s default aggregation.
    """
    wanted_columns = ['Team', 'Opponent', 'Result']
    frame = pd.DataFrame(team_results, columns=wanted_columns)
    result_matrix = pd.pivot_table(
        frame,
        values='Result',
        index='Team',
        columns='Opponent',
    )

    plt.figure(figsize=(12, 8))
    sns.heatmap(
        result_matrix,
        annot=True,
        cmap='coolwarm',
        cbar=True,
        linewidths=.5,
    )
    plt.title('Team Results Heatmap')
    plt.show()

# Build the remaining league tables from the rows collected while scraping.
laliga_df = pd.DataFrame(laliga_data)
seria_df = pd.DataFrame(seria_data)
bundes_df = pd.DataFrame(bundes_data)
ligue1_df = pd.DataFrame(ligue1_data)

# Write one CSV per league, in the same order as before.
league_exports = (
    (pl_df, 'pl_df_2324.csv'),
    (laliga_df, 'laliga_df_2324.csv'),
    (seria_df, 'seria_df_2324.csv'),
    (bundes_df, 'bundes_df_2324.csv'),
    (ligue1_df, 'ligue1_df_2324.csv'),
)
for league_frame, csv_name in league_exports:
    league_frame.to_csv(csv_name, index=False, encoding="utf-8-sig")