In [0]:
# Import everything
import time
import urllib
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from plotje import styler


In [0]:
# Make the season list of format 'YYYY-YY' (e.g. 1992 -> '1992-93').
# The two-digit suffix is the following year modulo 100, so the century
# rollover ('1999-00') comes out right for any start year instead of being
# patched in at a hard-coded list index.
seasons = np.arange(1992, 2019)
season_list = ['{}-{:02d}'.format(int(s), (int(s) + 1) % 100)
               for s in seasons]

# Parallel accumulators, one entry appended per kept fixture:
#   c  - competition name (logo alt text when the cell holds an <img>,
#        otherwise the cell's text)
#   r  - the two integers parsed from the 'x-y' score text
#   ha - text of the 'fixture-list-item-class' cell (compared against
#        'H' / 'A' when points are calculated)
# ses gets one sublist per season: the season label repeated once per kept
# fixture, so it lines up with the three lists above once flattened.
c = []
r = []
ha = []
ses = []
# Loop through each season and grab results
for season in season_list:
    page_url = ('https://www.liverpoolfc.com/match/' + season
                + '/first-team/fixtures-and-results')
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(page_url) as response:
        # Name the parser explicitly so the parse does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.read(), 'html.parser')

    # The rest of this just grabs the results
    fixlist = soup.find('ul', {'class': "fixture-list-main"})
    if fixlist is None:
        # Fail with a readable message rather than an AttributeError on None
        raise ValueError('No fixture list found for season ' + season)
    fixlist = fixlist.find_all('li')
    i = 0  # number of fixtures kept for this season
    for fix in fixlist:
        comp = fix.find('div', {'class': 'comp-mobile'})
        # List items without a competition cell are skipped
        if comp is None:
            continue

        # Postponed games return '-' result
        res = fix.find('div', {'class': 'fixture-list-item-score'}
                       ).get_text().replace(' ', '').replace('\n', '')
        if res == '-':
            continue

        logo = comp.find('img')
        if logo is not None:
            c.append(logo.attrs['alt'])
        else:
            c.append(comp.get_text())

        # Drop anything from the first '(' onwards (presumably a
        # penalties / extra-time note - TODO confirm), then split 'x-y'.
        res = res.split('(')[0]
        res = res.split('-')
        res = list(map(int, res))
        r.append(res)

        homeaway = fix.find(
            'div', {'class': 'fixture-list-item-class'}).get_text()
        ha.append(homeaway)
        i += 1

    ses.append([season] * i)
    # Don't overload the server and annoy anyone
    time.sleep(2)


In [0]:
# Format the results from above

# Flatten the per-season sublists into one long list (one label per game)
season = [label for labels in ses for label in labels]

# Make results a numpy array
r = np.array(r)

# Calculate the points for each game. Column 0 of r is treated as the home
# side's goals and column 1 as the away side's, so which column is "ours"
# depends on the venue marker.
points = []
for idx, venue in enumerate(ha):
    first, second = r[idx, 0], r[idx, 1]
    if venue == 'H':
        ours, theirs = first, second
    elif venue == 'A':
        ours, theirs = second, first
    else:
        # is only CL final which has N
        points.append(np.nan)
        continue

    if ours > theirs:
        points.append(3)
    elif ours < theirs:
        points.append(0)
    elif ours == theirs:
        points.append(1)


def _short_competition(name):
    """Collapse a scraped competition name into the short label used later.

    Checks run in a fixed order: the two named pre-season tournaments are
    matched exactly before the generic 'Premier League' substring test, so
    they end up as 'friendly' rather than 'pl'. Anything unmatched is just
    lower-cased.
    """
    if 'FA Cup' in name:
        return 'fa cup'
    if name in ('Premier League Asia Trophy', 'International Champions Cup'):
        return 'friendly'
    if 'Premiership' in name or 'Premier League' in name:
        return 'pl'
    return name.lower()


# Change some of the competition names
c_edited = [_short_competition(name) for name in c]


In [0]:
# Make dataframe: one row per kept game. The dict's insertion order sets the
# column order.
game_columns = {
    'season': season,
    'home_away': ha,
    'result-h': r[:, 0],
    'result-a': r[:, 1],
    'competition': c_edited,
    'points': points,
}
df = pd.DataFrame(game_columns)

# Peek at what competitions are left to make sure we've compressed them correctly
df['competition'].unique()


In [0]:
# Group the games by season label so each season can be summarised on its own
dfg = df.groupby(by='season')


In [0]:
# Go through each season and calculate the points per game (PPG):
#   f_ppg      - friendlies played before the season's first 'pl' game
#   start_ppg  - the first 5 'pl' games
#   league_ppg - every 'pl' game
# Series.mean() skips NaN points and returns NaN for an empty selection.
f_ppg = []
start_ppg = []
league_ppg = []
# Iterate the groups directly. The key is bound to its own name so this loop
# does not overwrite the `season` list the DataFrame cell is built from.
for season_label, frame in dfg:
    is_pl = (frame['competition'] == 'pl').values
    # Row position of the first league game. If the season has no league
    # game at all, treat every row as pre-season instead of raising
    # IndexError.
    first_pl = int(is_pl.argmax()) if is_pl.any() else len(frame)

    # only get friendly played before first pl game
    preseason = frame.iloc[:first_pl]
    league_points = frame.loc[is_pl, 'points']

    f_ppg.append(
        preseason.loc[preseason['competition'] == 'friendly', 'points'].mean())
    start_ppg.append(league_points.iloc[:5].mean())
    league_ppg.append(league_points.mean())


In [0]:
# Plot results: pre-season PPG on the x axis against league PPG over the
# first five games (left panel) and over the whole season (right panel).
fig, ax = plt.subplots(1, 2)
lab = ['PPG (First 5 Games)', 'PPG (Entire season)']
league_series = [start_ppg, league_ppg]
for ai, a in enumerate(ax):
    y_values = league_series[ai]
    a.scatter(f_ppg, y_values)
    # Re-draw the last three seasons in a highlight colour
    a.scatter(f_ppg[-3:], y_values[-3:], color='salmon')
    a.set_xlim(0, 3.1)
    a.set_ylim(0, 3.1)
    # Same integer ticks on both axes (the y ticks used to be set twice and
    # the x ticks not at all).
    a.set_xticks([0, 1, 2, 3])
    a.set_yticks([0, 1, 2, 3])
    styler(a, xlabel='Preseason PPG', ylabel=lab[ai], aspectsquare=True)

fig.tight_layout()