### Az adatok letöltése 00/01 - 19/20
A vizsgált csapat: Liverpool

In [None]:
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from io import StringIO
from datetime import datetime

In [None]:
# Pool of browser identification strings; get_header() picks one at random.
USER_AGENTS = [
    # Chrome on Windows
    ('Mozilla/5.0 (Windows NT 6.2; Win64; x64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/32.0.1667.0 Safari/537.36'),
    # Firefox on Windows
    ('Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) '
     'Gecko/20100101 Firefox/25.0'),
    # Opera on Windows
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    # Safari on iPad
    ('Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) '
     'AppleWebKit/536.26 (KHTML, like Gecko) '
     'Version/6.0 Mobile/10A5355d Safari/8536.25'),
    # Internet Explorer on Windows
    ('Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; '
     'InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; '
     '.NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0'),
]

def get_header(agents):
    """Build request headers carrying a randomly selected user agent.

    Args:
        agents: Non-empty sequence of user-agent strings to choose from.

    Returns:
        A dict with the single key 'User-agent' mapped to one entry of
        ``agents``.
    """
    chosen_agent = random.choice(agents)
    headers = {'User-agent': chosen_agent}
    return headers

In [None]:
# Index page whose links are scraped for the per-season data files.
url = 'https://www.football-data.co.uk/englandm.php'
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here instead of letting an HTTP error page be parsed.
response = requests.get(url, headers=get_header(USER_AGENTS), timeout=30)
response.raise_for_status()

In [None]:
# Links on the page are joined onto this prefix to form absolute URLs.
base = 'https://www.football-data.co.uk/'
soup = BeautifulSoup(response.content, 'html.parser')
# Keep every anchor whose text mentions 'Premier League'. href=True skips
# anchors without an href attribute; for those .get('href') would be None and
# the string concatenation would raise TypeError.
urls = [
    base + link['href']
    for link in soup.find_all('a', href=True)
    if 'Premier League' in link.get_text()
]

In [None]:
# Maps column names of the downloaded CSV files to the names used in this
# notebook. 'H_' / 'A_' prefixes mark home-side and away-side statistics.
# Order matters: later cells slice off the first three (non-statistic) values.
names = {
    # match identification
    'Date': 'date',
    'HomeTeam': 'HOME',
    'AwayTeam': 'AWAY',
    # goals
    'FTHG': 'H_goals',
    'FTAG': 'A_goals',
    'HTHG': 'H_half_time_goals',
    'HTAG': 'A_half_time_goals',
    # shots
    'HS': 'H_shots',
    'AS': 'A_shots',
    'HST': 'H_shots_on_target',
    'AST': 'A_shots_on_target',
    # corners
    'HC': 'H_corners',
    'AC': 'A_corners',
    # fouls and free kicks
    'HF': 'H_fouls_committed',
    'AF': 'A_fouls_committed',
    'HFKC': 'H_free_kicks_conceded',
    'AFKC': 'A_free_kicks_conceded',
    # cards
    'HY': 'H_yellow_cards',
    'AY': 'A_yellow_cards',
    'HR': 'H_red_cards',
    'AR': 'A_red_cards',
}

In [None]:
# Working column list: 'year' and the two goal-difference columns first,
# followed by every renamed CSV column in the order given by `names`.
columns = ['year', 'goal_difference', 'half_time_goal_difference']
columns.extend(names.values())

In [None]:
# Column order of the per-season output frame. After the six fixed columns,
# every statistic from `names` (the first three values are skipped) is listed
# with its leading 'H' replaced by 'L' or its leading 'A' replaced by 'O'.
_final_prefix = {'H': 'L', 'A': 'O'}
final = ['year', 'match', 'home', 'OTHER', 'goal_difference', 'half_time_goal_difference']
final += [
    _final_prefix[stat[0]] + stat[1:]
    for stat in list(names.values())[3:]
    if stat[0] in _final_prefix
]

In [None]:
# Rename tables applied to the rows of a season:
#   replacing_home - for rows where the tracked club is the home side
#   replacing_away - for rows where the tracked club is the away side
# Each value is a pair (name in replacing_home, name in replacing_away).
_team_columns = {'HOME': ('L', 'OTHER'), 'AWAY': ('OTHER', 'L')}
_stat_prefixes = {'H': ('L', 'O'), 'A': ('O', 'L')}
_untouched = ('goal_difference', 'half_time_goal_difference')

replacing_home, replacing_away = {}, {}
for _name in columns:
    if _name in _untouched:
        continue
    # The team-name columns are checked first: 'HOME' and 'AWAY' also start
    # with 'H' / 'A' but must not go through the prefix swap.
    if _name in _team_columns:
        _as_home, _as_away = _team_columns[_name]
    elif _name[0] in _stat_prefixes:
        _home_prefix, _away_prefix = _stat_prefixes[_name[0]]
        _as_home = _home_prefix + _name[1:]
        _as_away = _away_prefix + _name[1:]
    else:
        # Any other column keeps its name in both tables.
        _as_home = _as_away = _name
    replacing_home[_name] = _as_home
    replacing_away[_name] = _as_away

In [None]:
# One processed frame per successfully handled season is collected here.
dfs = []

# Club whose matches are kept; its statistics end up in the 'L_*' columns.
TEAM = 'Liverpool'

# NOTE(review): urls[:20] keeps the first 20 links in page order; which seasons
# that covers depends on the page layout at download time - confirm it still
# matches the 00/01 - 19/20 range named in the notebook heading.
for url in tqdm(urls[:20]):
    try:
        # assumes the '/'-separated URL piece at index 4 is a four-character
        # season code (first two chars + '-' + next two) - TODO confirm
        season_code = url.split('/')[4]
        year = season_code[:2] + '-' + season_code[2:4]

        # Timeout and status check: a stalled or failed download is reported
        # by the handler below instead of hanging or being parsed as CSV.
        response = requests.get(url, headers=get_header(USER_AGENTS), timeout=30)
        response.raise_for_status()
        csv_text = response.text

        # Read the header row alone, then parse only the columns it names.
        # NOTE(review): presumably this protects against data rows that carry
        # more fields than the header - confirm.
        header = pd.read_csv(StringIO(csv_text), nrows=0)
        data = pd.read_csv(StringIO(csv_text), usecols=list(header.columns))

        # Keep the known columns present in this season and drop fully empty
        # rows; .copy() makes the result safe to assign into.
        columns_to_use = [col for col in names if col in data.columns]
        data = data[columns_to_use].dropna(axis=0, how='all').copy()

        # Columns missing from this season are added as all-NaN so every
        # season ends up with the same set of columns.
        for col in names:
            if col not in columns_to_use:
                data[col] = np.nan

        data = data.rename(columns=names)

        data['year'] = year
        data = data.loc[(data['HOME'] == TEAM) | (data['AWAY'] == TEAM)].copy()

        # 1 when the tracked club plays at home, 0 when it plays away.
        data['home'] = (data['HOME'] == TEAM).astype(int)
        home = data.loc[data['home'] == 1].rename(columns=replacing_home)
        away = data.loc[data['home'] == 0].rename(columns=replacing_away)

        # After renaming, both halves share the same column names; 'L' only
        # holds the tracked club's own name and is dropped.
        data = pd.concat([home, away], sort=False)
        data = data.drop(columns=['L'])

        data['goal_difference'] = data['L_goals'] - data['O_goals']
        data['half_time_goal_difference'] = data['L_half_time_goals'] - data['O_half_time_goals']

        # Dates use either a four-digit or a two-digit year; a ValueError from
        # the first format triggers a retry with the second.
        try:
            data['date'] = [datetime.strptime(d, '%d/%m/%Y') for d in data['date']]
        except ValueError:
            data['date'] = [datetime.strptime(d, '%d/%m/%y') for d in data['date']]

        # Chronological order defines the match number within the season.
        data = data.sort_values(by='date').reset_index(drop=True).drop(columns=['date'])
        data['match'] = data.index.values + 1

        data = data[final].copy()

        # Running points total: 3 for a win, 1 for a draw, 0 otherwise (a
        # missing goal difference adds nothing). Stored as float, matching the
        # column dtype produced by the previous row-by-row implementation.
        goal_diff = data['goal_difference']
        per_match = np.where(goal_diff > 0, 3, np.where(goal_diff == 0, 1, 0))
        data['points'] = per_match.cumsum().astype(float)

        dfs.append(data)
    except Exception as e:
        # Best effort: report the failing season and continue with the next.
        print(e)
        print(urls.index(url), url)

In [None]:
# Stack all collected seasons into one frame with a fresh consecutive index,
# then discard every column that holds no value in any row.
data = pd.concat(dfs, ignore_index=True).dropna(axis=1, how='all')

In [None]:
# Save into the current working directory. A plain relative file name is used
# because a '.\\' prefix is only a directory separator on Windows; elsewhere
# the backslash would become part of the file name.
data.to_csv('Liverpool_football_data.csv', index=False)