# Goals

- Scrape the Transfermarkt website for the market value of Premier League and Championship teams over the last 10 years
- Analyze the evolution of the market value
- Study the correlation between market value and final league standings. Identify which teams underperformed and overperformed.

In [1]:
from tqdm.notebook import tqdm

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# Scrape the Premier League table of every season in the range below.
# `df` maps a season id (the year used in the URL, e.g. 2010) to a DataFrame
# holding one row per team with its games, results, goals and points.
df = {}

# The request headers do not depend on the season, so they are built once.
# NOTE(review): a browser-like User-Agent is presumably required because the
# site refuses the default client one - confirm.
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

for year in tqdm(range(2010,2023), desc="Year"):
    page = f'https://www.transfermarkt.com/premier-league/tabelle/wettbewerb/GB1/saison_id/{year}'
    # A timeout keeps a stalled connection from hanging the whole run.
    pageTree = requests.get(page, headers=headers, timeout=30)
    # Fail here with a clear HTTP error instead of an obscure parsing error
    # further down when the server answers with 4xx/5xx.
    pageTree.raise_for_status()
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

    teams = pageSoup.find_all("td", {"class": "no-border-links hauptlink"})
    stats = pageSoup.find_all("td", {"class": "zentriert"})
    entries = []
    for i in range(20):
        # The code consumes 8 centred cells per team; the first one
        # (offset 0) is not used.
        # NOTE(review): offset 0 is presumably the club crest - confirm.
        base = 8 * i
        team_name = teams[i].get_text(strip=True)
        games = int(stats[base + 1].text)
        wins = int(stats[base + 2].text)
        draws = int(stats[base + 3].text)
        losses = int(stats[base + 4].text)
        # Goals come as a single "for:against" string; store both parts as
        # integers so GF/GA have the same dtype as the other numeric columns.
        goals = stats[base + 5].text
        goals_for, goals_against = (int(part) for part in goals.split(':'))
        plusminus = int(stats[base + 6].text)  # goal difference
        points = int(stats[base + 7].text)
        entries.append((team_name, games, wins, draws, losses, goals_for, goals_against, plusminus, points))
    df[year] = pd.DataFrame(entries, columns=['Team', 'Games', 'Wins', 'Draws', 'Losses', 'GF', 'GA', '+/-', 'Pts'])

Year:   0%|          | 0/13 [00:00<?, ?it/s]

In [7]:
# Write each season's table to raw_data/transfermarkt/premier_league<YY>-<YY>.csv.
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing the first file.
os.makedirs('raw_data/transfermarkt', exist_ok=True)
for year in range(2010,2023):
    # Two-digit suffixes of the season id and the following year,
    # e.g. 2010 -> "10" and "11".
    p1 = str(year)[-2:]
    p2 = str(year+1)[-2:]
    df[year].to_csv(f'raw_data/transfermarkt/premier_league{p1}-{p2}.csv', index=False)