# Initial data visualization

In this notebook, we do some exploratory data analysis. We explore how different factors, such as which team plays at home, the toss, and the relative rankings of the teams, influence the results. Apart from a simple binomial test in the final cell, we do not do any formal statistical analysis in this notebook.

In [None]:
# start by loading the key packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

## Impact of playing at home vs away

In the following we consider the impact of playing at home or away on the final result.

In [None]:
# Tally the match outcomes by comparing the `result` column with each label.
# NOTE(review): df_main is not created in any cell shown above — confirm it is
# loaded before this cell when running on a fresh kernel.
result_col = df_main["result"]
n_home_wins = int((result_col == "home").sum())
n_away_wins = int((result_col == "away").sum())
n_draws = int((result_col == "draw").sum())
n_tot = len(df_main)  # every row, whatever its result label

# report the tallies
for label, count in (
    ("Number of home wins = ", n_home_wins),
    ("Number of away wins = ", n_away_wins),
    ("Number of draws = ", n_draws),
):
    print(label, count)

In [None]:
# Grouped bar chart: observed outcome counts (blue) next to the count each
# outcome would have if all three were equally likely (red).
x_array = np.array([-1, 0, 1])  # arbitrary positions, one per outcome
xshift = 0.3  # horizontal offset of the red reference bars (reused by a later cell)
y_array = [n_home_wins, n_draws, n_away_wins]
y0_array = [n_tot / 3] * 3  # equal-probability reference level

fig, ax = plt.subplots()
ax.bar(x_array, y_array, width=0.3, color="b", alpha=0.5)
ax.bar(x_array + xshift, y0_array, width=0.3, color="r", alpha=0.5)

# put each tick midway between the blue bar and its red neighbour
xlocs = [-0.85, 0.15, 1.15]
xlabs = ["home wins", "draws", "away wins"]
ax.set_xticks(xlocs)
ax.set_xticklabels(xlabs)
ax.set_ylabel("N outcomes")

plt.show()

It might also be insightful to remove draws from the dataset to see how many matches are won / lost playing at home.

In [None]:
# Home vs away wins with draws removed, shown next to an even 50/50 split.
# Blue bars are the observed counts, red bars the equal-probability reference.
tot_no_draws = n_tot - n_draws  # all rows minus the drawn matches
x_arr = np.array([0, 1])
y_arr = np.array([n_home_wins, n_away_wins])
y0_arr = np.array([tot_no_draws / 2, tot_no_draws / 2])

plt.bar(x_arr, y_arr, width=0.3, color='b', alpha=0.5)
# xshift is the bar offset defined in the previous plotting cell
plt.bar(x_arr + xshift, y0_arr, width=0.3, color='r', alpha=0.5)

# labels for axes etc: ticks sit midway between each blue/red pair
xlocs = [0.15, 1.15]
xlabs = ["home wins", "away wins"]
plt.xticks(xlocs, xlabs)
plt.ylabel("N outcomes")

plt.show()

In [None]:
# Cross-count the match result against the toss column.
# Naming: n_<result>_<toss>, where hw/aw mean result == "home"/"away" and
# ht/at mean toss == "home"/"away".
# NOTE(review): presumably `toss` records which side won the toss — confirm.
home_won = df_main["result"] == "home"
away_won = df_main["result"] == "away"
toss_home = df_main["toss"] == "home"
toss_away = df_main["toss"] == "away"

n_hw_ht = int((home_won & toss_home).sum())
n_hw_at = int((home_won & toss_away).sum())
n_aw_ht = int((away_won & toss_home).sum())
n_aw_at = int((away_won & toss_away).sum())

In [None]:
# order: home win & home toss, home win & away toss, away win & home toss, away win & away toss
print(n_hw_ht, n_hw_at, n_aw_ht, n_aw_at)

In [None]:
# Collapse the four counts into two: result label differs from the toss label
# (n_mw_tl) and result label equals the toss label (n_mw_tw).
# NOTE(review): presumably "match won by toss loser / toss winner" — confirm
# against the meaning of the `toss` column.
n_mw_tl = n_hw_at + n_aw_ht
n_mw_tw = n_aw_at + n_hw_ht
print(n_mw_tw, n_mw_tl)

In [None]:
# Share of the counted wins that went to the side matching the toss label
# (bar 0) versus the other side (bar 1). Only matches with a home/away result
# and a home/away toss label enter these counts.
# NOTE(review): the tick labels assume `toss` names the side that won the toss — confirm.
toss_counts = np.array([n_mw_tw, n_mw_tl])
plt.bar([0, 1], toss_counts / toss_counts.sum(), width=0.5)
plt.xticks([0, 1], ["toss winner wins", "toss loser wins"])
plt.ylabel("fraction of decided matches")
plt.show()

In [None]:
# how many matches carry each toss label
toss_col = df_main["toss"]
num_ht = int((toss_col == "home").sum())
num_at = int((toss_col == "away").sum())
print(num_ht, num_at)

In [None]:
# Ratio of home wins to away wins: overall, then restricted to toss == "home",
# then restricted to toss == "away".
# The overall counts are n_home_wins / n_away_wins from the first tally cell.
print(n_home_wins / n_away_wins)
print(n_hw_ht / n_aw_ht)
print(n_hw_at / n_aw_at)

In [None]:
# distinct values of the home_team column, in order of first appearance
test_teams = df_main["home_team"].drop_duplicates().tolist()

In [None]:
# Home record per team: matches hosted, then home wins, draws and away wins
# among those hosted matches.
# Note: num_matches / num_draws / num_hws / num_aws stay bound after the loop
# and then hold the values for the LAST team only.
for team in test_teams:
    hosted = df_main[df_main["home_team"] == team]
    outcome = hosted["result"]
    num_matches = len(hosted)
    num_draws = int((outcome == "draw").sum())
    num_hws = int((outcome == "home").sum())
    num_aws = int((outcome == "away").sum())
    print(team, num_matches, num_hws, num_draws, num_aws)

In [None]:
# Indicator of which side has the numerically smaller rank:
# +1 where home_rank < away_rank, -1 otherwise (equal or missing ranks also give -1).
# NOTE(review): presumably a smaller rank number means the stronger side — confirm.
rank_gap = df_main["home_rank"] - df_main["away_rank"]
rank_diff = np.where(rank_gap < 0, 1, -1)

In [None]:
# Store the +1/-1 rank indicator as a new column (this mutates df_main in place).
df_main["rank_diff"]=rank_diff
# quick look at the first rows to check the new column
print(df_main.head())
# distinct indicator values present, in order of first appearance
all_rank_diffs = df_main.rank_diff.drop_duplicates().to_list()

In [None]:
# show the distinct rank-indicator values found in the data
print(all_rank_diffs)

In [None]:
# Number of matches for each rank-indicator value.
# The loop variable is deliberately NOT called `rank_diff`: that name holds the
# indicator array used to build the column, and rebinding it to a scalar here
# would break a re-run of the cell that assigns df_main["rank_diff"].
for diff_value in all_rank_diffs:
    num_matches = int((df_main["rank_diff"] == diff_value).sum())
    print(diff_value, num_matches)

In [None]:
# Wins split by the rank indicator (rank_diff == 1 where home_rank < away_rank,
# -1 otherwise, so equal ranks fall under -1).
# high_wins: the winner is the side with the smaller rank number.
# low_wins:  the winner is the other side.
home_won = df_main["result"] == "home"
away_won = df_main["result"] == "away"
home_smaller_rank = df_main["rank_diff"] == 1
away_smaller_rank = df_main["rank_diff"] == -1

high_wins = int(((home_won & home_smaller_rank) | (away_won & away_smaller_rank)).sum())
low_wins = int(((home_won & away_smaller_rank) | (away_won & home_smaller_rank)).sum())

# Use the overall draw count n_draws; `num_draws` is only the per-team value
# left behind by the per-team loop.
print(high_wins, low_wins, n_draws)

In [None]:
# Side-by-side comparison of the three factors. Within each cluster the bars
# are, left to right: rank, home/away, toss. The third cluster repeats the
# overall draw count for all three.
plt.bar([-1, 0, 1], np.array([high_wins, low_wins, n_draws]), width=0.2, label="rank")
plt.bar([-0.8, 0.2, 1.2], np.array([n_home_wins, n_away_wins, n_draws]), width=0.2, label="home / away")
plt.bar([-0.6, 0.4, 1.4], np.array([n_mw_tw, n_mw_tl, n_draws]), width=0.2, label="toss")

# Reference level: half of the non-drawn matches, drawn across the first cluster.
num_wl = n_tot - n_draws
expt_w = num_wl / 2
plt.plot([-1.1, -0.5], [expt_w, expt_w], color='k', ls='--')

plt.ylabel("N outcomes")
plt.legend()
plt.show()

In [None]:
# Two-sided binomial tests against a 50/50 split of the non-drawn matches for:
# home wins, wins by the toss-matching side, and wins by the smaller-rank side.
# `stats.binomtest(...).pvalue` replaces `stats.binom_test`, which has been
# removed from SciPy. `stats` is imported with the other packages at the top.
num_wl = n_tot - n_draws  # recomputed here so this cell does not rely on the plotting cell
p_home = stats.binomtest(n_home_wins, num_wl, 0.5).pvalue
p_toss = stats.binomtest(n_mw_tw, num_wl, 0.5).pvalue
p_high_rank = stats.binomtest(high_wins, num_wl, 0.5).pvalue
print(p_home, p_toss, p_high_rank)