In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import cufflinks
import plotly
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
%matplotlib inline

In [2]:
# Load the 2017/18 match results (one row per match) from the local data folder.
league_18 = pd.read_csv(filepath_or_buffer="data/season-1718_csv.csv")

In [3]:
# Per-team totals of half-time and full-time goals, computed separately for
# matches played at home and matches played away.
home_sums = league_18.groupby("HomeTeam")[["HTHG", "FTHG"]].sum()
away_sums = league_18.groupby("AwayTeam")[["HTAG", "FTAG"]].sum()

In [4]:
# Give both tables the same index name and column labels so that they can be
# combined team-by-team in the next step.
for goal_table in (home_sums, away_sums):
    goal_table.index.name = "Team"
    goal_table.columns = ["HalfTime", "FullTime"]

In [5]:
# Combined totals per team = home goals + away goals.  DataFrame.add with
# fill_value=0 keeps a team that appears in only one of the two tables; a plain
# `+` would align on the index and turn that team's row into NaN.
full_sums = home_sums.add(away_sums, fill_value=0)
# Goals scored after the break: full-time total minus half-time total.
full_sums["SecondHalf"] = full_sums["FullTime"] - full_sums["HalfTime"]

In [6]:
# FullTime is redundant once SecondHalf exists.  Reassigning (instead of
# inplace=True) and ignoring a missing column keeps this cell safe to re-run:
# the in-place version raises KeyError the second time it is executed.
full_sums = full_sums.drop(columns="FullTime", errors="ignore")

In [7]:
# Fraction of each team's goals that were scored in the second half.
full_sums["Ratio"] = full_sums["SecondHalf"].div(
    full_sums["HalfTime"].add(full_sums["SecondHalf"])
)

In [8]:
# Move the team names from the index into a regular "Team" column.  The guard
# makes the cell idempotent: re-running an unconditional reset_index would
# insert a spurious "index" column holding the old row numbers.
if full_sums.index.name == "Team":
    full_sums = full_sums.reset_index()

In [9]:
def wins(df):
    """Count the matches won by each team.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match, with columns ``HomeTeam``, ``AwayTeam``, ``FTHG``
        and ``FTAG``.  The home side wins a row when ``FTHG > FTAG`` and the
        away side wins when ``FTAG > FTHG``; rows with equal values count for
        neither side.

    Returns
    -------
    dict
        Maps every team that appears in ``HomeTeam`` or ``AwayTeam`` to its
        number of wins as a plain ``int``.  Teams without a win map to ``0``.
        Keys follow order of first appearance, home column first.
    """
    home_won = df["FTHG"] > df["FTAG"]
    away_won = df["FTAG"] > df["FTHG"]

    # One entry per victory, labelled with the winning team; counting the
    # labels replaces the original teams x rows nested loop.
    winners = pd.concat([df.loc[home_won, "HomeTeam"], df.loc[away_won, "AwayTeam"]])
    win_counts = winners.value_counts()

    # Include teams that only ever appear as the away side, so that they are
    # not silently missing from the result.
    teams = pd.concat([df["HomeTeam"], df["AwayTeam"]]).unique()
    return {team: int(win_counts.get(team, 0)) for team in teams}

In [10]:
# Team -> number of wins, computed from the match table.
team_wins = wins(df=league_18)

In [11]:
# Look up each team's win count.  Series.map is the dictionary-lookup
# operation: a team missing from team_wins becomes NaN, whereas
# Series.replace would leave the team name sitting in the numeric Wins column
# and relies on implicit object -> int downcasting.
full_sums["Wins"] = full_sums["Team"].map(team_wins)

In [12]:
# One HTML-formatted tooltip string per team, in the same row order as full_sums.
hover_text = [
    f"Team: {team}<br>Wins: {n_wins}<br>First Half: {first_half}<br>Second Half: {second_half}"
    for team, n_wins, first_half, second_half in full_sums[
        ["Team", "Wins", "HalfTime", "SecondHalf"]
    ].itertuples(index=False, name=None)
]

In [13]:
# Horizontal jitter in [-1, 1] so that teams with the same first-half total do
# not sit exactly on top of each other.  The offsets are a shuffled, evenly
# spaced grid with one value per team: sized from the data rather than a
# hard-coded 20, and drawn from a seeded generator so the figure is identical
# on every run.
jitter = np.random.RandomState(42).permutation(np.linspace(-1, 1, len(full_sums)))

trace = [go.Scatter(
    x=full_sums["HalfTime"] + jitter,
    y=full_sums["SecondHalf"].values,
    text=hover_text,
    hoverinfo="text",  # show only the custom tooltip text
    mode="markers",
    marker=dict(
        # Marker size is wins raised to the power 1.2, so the size differences
        # between teams are exaggerated relative to a linear scale.
        size=full_sums["Wins"]**1.2,
        color="Green"))]

In [14]:
# Figure title and axis labels for the goals scatter plot.
layout = go.Layout(
    title="Second Half Goals vs Wins for 2018 Season",
    xaxis={"title": "First Half Goals"},
    yaxis={"title": "Second Half Goals"},
)

In [15]:
# Render the figure through the plotly.plotly interface under the name
# "GoalComparison".
py.iplot({"data": trace, "layout": layout}, filename="GoalComparison")


Consider using IPython.display.IFrame instead



In [15]:
# Render the same figure with plotly's offline iplot.
iplot({"data": trace, "layout": layout})

# Discussion

The scatter plot above shows the relationship between total first-half goals, total second-half goals, and wins in the 2017/18 Premier League season. Each axis corresponds to one half, and the size of each point reflects the number of wins the team earned over the season. As we would expect, the teams that scored more goals in both halves won more games; however, in some cases a team won more games without scoring as many goals as other teams. For example, Man United had the second most wins this season (25), but were outscored in either half by four teams that finished with fewer wins. This suggests that Man United played more defensively this season, scoring fewer goals but also conceding fewer, and winning more as a result. A similar situation can be seen in Burnley's performance. On the other hand, Liverpool scored more goals in both the first and second half but had only 21 wins, so we can assume that Liverpool probably conceded more goals and either lost or tied games because of it. Note that goals conceded are not shown in the plot, so these are inferences rather than direct observations. With this reasoning, we can tease out the attitudes with which some teams approached their games.