# Some more analyses

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.optimize import minimize
from statsmodels.stats.descriptivestats import sign_test
import math
import os
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

# load data
dirs = [f for f in os.listdir('data') if not f.startswith('.')]

data = pd.DataFrame()

for i in range(0,len(dirs)) :
    tempdata = pd.read_csv('data/'+dirs[i]+'/champs.csv')
    tempdata['year'] = int(dirs[i][0:4:1])
    data = data.append(tempdata)

data["home_goals"] = np.nan
data["away_goals"] = np.nan
data["sign"] = np.nan
data["leg2"] = "False"
data["agg_sign"] = np.nan
data.reset_index(inplace=True)

nrdatasets=len(data)

for i in range(0,nrdatasets) :
    middle = data.iloc[i]['FT'].find('-')
    
    if (data.iloc[i]['FT'].find('*')==-1) :
        end = len(data.iloc[i]['FT'])
    else :
        end = data.iloc[i]['FT'].find(' ')
    home_goals = int(data.iloc[i]['FT'][0: middle: 1])
    away_goals = int(data.iloc[i]['FT'][(middle+1): end: 1])
    data.at[i,"home_goals"] = home_goals
    data.at[i,"away_goals"] = away_goals
    
    if (home_goals > away_goals) :
        data.at[i,"results"] = "H"
        data.at[i,"sign"] = 1
    elif (home_goals == away_goals) :
        data.at[i,"results"] = "D"
        data.at[i,"sign"] = -1
    else :
        data.at[i,"results"] = "A"
        data.at[i,"sign"] = 0
    
    if (data.iloc[i]['Round'].find('Leg 2')!=-1) :
        data.at[i,"leg2"] = True
        
        if (data.iloc[i]['∑FT'].find('(a)')==-1) :
            middle = data.iloc[i]['∑FT'].find('-')
            end = data.iloc[i]['∑FT'].find(' ')
            home_goals = int(data.iloc[i]['∑FT'][0: middle: 1])
            away_goals = int(data.iloc[i]['∑FT'][(middle+1): end: 1])
            away_goals_win = 0
        else :
            middle = data.iloc[i]['∑FT'].find('-')
            end = data.iloc[i]['∑FT'].find(' ',middle)
            beginning = data.iloc[i]['∑FT'].find(' ')
            home_goals = int(data.iloc[i]['∑FT'][beginning: middle: 1])
            away_goals = int(data.iloc[i]['∑FT'][(middle+1): end: 1])
            
            if (data.iloc[i]['FT'].find('*')==-1) :
                middle = data.iloc[i]['FT'].find('-')
                end = len(data.iloc[i]['FT'])            
                home_goals_FT = int(data.iloc[i]['FT'][0: middle: 1])
                away_goals_FT = int(data.iloc[i]['FT'][(middle+1): end: 1])
            else :
                middle = data.iloc[i]['FT'].find('-')
                end = end = data.iloc[i]['∑FT'].find(' ')
                home_goals_FT = int(data.iloc[i]['FT'][0: middle: 1])
                away_goals_FT = int(data.iloc[i]['FT'][(middle+1): end: 1])
            
            away_goals_hometeam = home_goals-home_goals_FT
            away_goals_win = 1
            
        if (home_goals > away_goals) :
            data.at[i,"agg_results"] = "H"
            data.at[i,"agg_sign"] = 1
        elif (home_goals == away_goals) :
            if (away_goals_win == 0) :
                data.at[i,"agg_results"] = "D"
                data.at[i,"agg_sign"] = 0
            else :
                if (away_goals_hometeam>away_goals) :
                    data.at[i,"agg_results"] = "H"
                    data.at[i,"agg_sign"] = 1
                else :
                    data.at[i,"agg_results"] = "A"
                    data.at[i,"agg_sign"] = -1
        else :
            data.at[i,"agg_results"] = "A"
            data.at[i,"agg_sign"] = -1
        

Unsurprisingly, there is a clear home advantage in the Champions league.  

In [36]:
print('The home team wins ' + str(np.round(np.mean(data["results"]=="H")*100,1)) + '% of games.')
print('The away team wins ' + str(np.round(np.mean(data["results"]=="A")*100,1)) + '% of games.')
print('There is a draw in ' + str(np.round(np.mean(data["results"]=="D")*100,1)) + '% of games.')
print('The home team scores an average of ' + str(np.round(np.mean(data["home_goals"]),2)) + ' goals.')
print('The away team scores an average of ' + str(np.round(np.mean(data["away_goals"]),2)) + ' goals.')

The home team wins 52.5% of games.
The away team wins 25.1% of games.
There is a draw in 22.4% of games.
The home team scores an average of 1.69 goals.
The away team scores an average of 1.04 goals.


Interestingly, there appears to be no benefit for the team that plays at home during the second leg of a playoff.

In [7]:
print('In a Leg 2 game, the home team goes through on ' + str(np.round(np.mean(data[(data["leg2"]==True)]["agg_results"]=="H")*100,1)) + '% of games.')
print('In a Leg 2 game, the away team goes through on ' + str(np.round(np.mean(data[(data["leg2"]==True)]["agg_results"]=="A")*100,1)) + '% of games.')
print('In a Leg 2 game, extra time is needed on ' + str(np.round(np.mean(data[(data["leg2"]==True)]["agg_results"]=="D")*100,1)) + '% of games.')

results=sign_test(data[(data["leg2"]==True) & (data["agg_results"]!="D")]["agg_sign"])
print(' ')
print('There is no significant difference between the win percentage of home and away teams on leg 2,\nwith a sign-test giving a p-value of ' + str(np.round(results[1],2)) + '.')

In a Leg 2 game, the home team goes through on 48.4% of games.
In a Leg 2 game, the away team goes through on 46.2% of games.
In a Leg 2 game, extra time is needed on 5.5% of games.
 
There is no significant difference between the win percentage of home and away teams on leg 2,
with a sign-test giving a p-value of 0.34.


Here, I repeat the same analyses, but I exclude the Ro16:

In [6]:
print('In a Leg 2 game, the home team goes through on ' + str(np.round(np.mean(data[(data["leg2"]==True) & (data["Round"]!="Round of 16 | Leg 2")]["agg_results"]=="H")*100,1)) + '% of games.')
print('In a Leg 2 game, the away team goes through on ' + str(np.round(np.mean(data[(data["leg2"]==True) & (data["Round"]!="Round of 16 | Leg 2")]["agg_results"]=="A")*100,1)) + '% of games.')
print('In a Leg 2 game, extra time is needed on ' + str(np.round(np.mean(data[(data["leg2"]==True) & (data["Round"]!="Round of 16 | Leg 2")]["agg_results"]=="D")*100,1)) + '% of games.')

results=sign_test(data[(data["leg2"]==True) & (data["agg_results"]!="D") & (data["Round"]!="Round of 16 | Leg 2")]["agg_sign"])
print(' ')
print('There is no significant difference between the win percentage of home and away teams on leg 2,\nwith a sign-test giving a p-value of ' + str(np.round(results[1],2)) + '.')

In a Leg 2 game, the home team goes through on 48.3% of games.
In a Leg 2 game, the away team goes through on 46.7% of games.
In a Leg 2 game, extra time is needed on 5.0% of games.
 
There is no significant difference between the win percentage of home and away teams on leg 2,
with a sign-test giving a p-value of 0.52.
