In [1]:
import pandas as pd 
from sklearn.externals import joblib
import numpy as np


In [2]:
# Load the match table used by every later cell. The path is relative, so it
# resolves against the directory the notebook kernel was started in.
# Later cells read these columns from it: HomeTeam, AwayTeam, FTHG, FTAG,
# HS, AS, HST, AST, HC, AC.
df_week = pd.read_csv("../data/16-17/week2.csv")

In [3]:
def get_teams(df, n_matches=10):
    """Collect team names from the first ``n_matches`` rows of ``df``.

    For each row the ``HomeTeam`` value is appended first and the
    ``AwayTeam`` value second, so the result holds two names per match in
    row order. Names are not de-duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with ``HomeTeam`` and ``AwayTeam`` columns.
    n_matches : int, optional
        Maximum number of leading rows to read; expected to be
        non-negative. Defaults to 10, the value that used to be
        hard-coded.

    Returns
    -------
    list
        Team names. A list is always returned: when ``df`` has fewer than
        ``n_matches`` rows (or none at all) the names found so far are
        returned, where the previous implementation fell off the end of
        the loop and returned ``None``.
    """
    first_rows = df.head(n_matches)

    teams = []
    # Walk the two columns in lockstep to keep the home-then-away ordering.
    for home, away in zip(first_rows["HomeTeam"], first_rows["AwayTeam"]):
        teams.append(home)
        teams.append(away)
    return teams


In [4]:
# Home/away team names taken from the leading fixtures of df_week, in file order.
teams_1617 = get_teams(df_week)

# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(teams_1617)

['Juventus', 'Fiorentina', 'Roma', 'Udinese', 'Atalanta', 'Lazio', 'Bologna', 'Crotone', 'Chievo', 'Inter', 'Empoli', 'Sampdoria', 'Genoa', 'Cagliari', 'Milan', 'Torino', 'Palermo', 'Sassuolo', 'Pescara', 'Napoli']


In [5]:
def build_features(df, teams):
    """Sum shot and corner statistics per team over all rows of ``df``.

    Each team's matches are split into those where it appears in
    ``HomeTeam`` and those where it appears in ``AwayTeam``. A statistic
    "made" by the team is its own side's column (``H*`` at home, ``A*``
    away); a statistic "conceded" is the opposite side's column.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with columns ``HomeTeam``, ``AwayTeam``, ``HS``, ``AS``,
        ``HST``, ``AST``, ``HC`` and ``AC``.
    teams : iterable
        Team names; one output row is produced per entry, in this order.

    Returns
    -------
    list of list
        One row per team:
        ``[shots, shots conceded, shots on target,
        shots on target conceded, corners, corners conceded]``.
    """

    def _made_and_conceded(as_home, as_away, home_col, away_col):
        # (total for the team, total for its opponents) for one statistic.
        made = as_away[away_col].sum() + as_home[home_col].sum()
        conceded = as_away[home_col].sum() + as_home[away_col].sum()
        return made, conceded

    rows = []
    for name in teams:
        as_home = df[df['HomeTeam'] == name]
        as_away = df[df['AwayTeam'] == name]

        shots, shots_against = _made_and_conceded(as_home, as_away, "HS", "AS")
        on_target, on_target_against = _made_and_conceded(as_home, as_away, "HST", "AST")
        corners, corners_against = _made_and_conceded(as_home, as_away, "HC", "AC")

        rows.append([
            shots, shots_against,
            on_target, on_target_against,
            corners, corners_against,
        ])

    return rows

In [6]:
def build_target(df, teams):
    """Compute the points each team earned in the matches listed in ``df``.

    A win is worth 3 points and a draw 1. A home win is a row where the
    team is ``HomeTeam`` and ``FTHG > FTAG``; an away win is a row where it
    is ``AwayTeam`` and ``FTAG > FTHG``; a draw is any row involving the
    team where ``FTAG == FTHG``.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table with columns ``HomeTeam``, ``AwayTeam``, ``FTHG`` and
        ``FTAG``.
    teams : iterable
        Team names; one total is produced per entry, in this order.

    Returns
    -------
    list of int
        Points per team.
    """
    points = []

    for name in teams:
        is_home = df['HomeTeam'] == name
        is_away = df['AwayTeam'] == name

        home_games = df[is_home]
        away_games = df[is_away]
        all_games = df[is_home | is_away]

        wins_home = len(home_games[home_games['FTHG'] > home_games['FTAG']])
        wins_away = len(away_games[away_games['FTAG'] > away_games['FTHG']])
        draws = len(all_games[all_games['FTAG'] == all_games['FTHG']])

        points.append(3 * wins_away + 3 * wins_home + draws)

    return points


In [7]:
# Points per team over the matches in df_week (3 per win, 1 per draw),
# in the same order as teams_1617.
targ_1617 = build_target(df_week, teams_1617)
# Per-team rows of [shots, shots conceded, shots on target,
# shots on target conceded, corners, corners conceded], same team order.
feat_1617 = build_features(df_week, teams_1617)

In [10]:
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code, so only load model files that come from a trusted source.
# NOTE(review): the `sklearn.externals.joblib` import used by this notebook is
# no longer shipped by recent scikit-learn releases; a newer environment needs
# the standalone `joblib` package instead.
model_1 = joblib.load('../data/linreg_model.pkl')

# assumes the pickled model was fitted on features in the column order that
# build_features produces; TODO confirm against the training notebook.
pred_1617 = model_1.predict(feat_1617)

In [11]:
# Pair every team with its predicted value, keeping the order of teams_1617.
ranking = list(zip(teams_1617, pred_1617))

In [12]:
# Order teams by predicted value, highest first (sorts the list in place).
ranking.sort(key=lambda pair: pair[1], reverse=True)
for team, predicted in ranking:
    # Formatting both values with %s prints the same "name value" line that the
    # Python 2 statement `print t,p` produced, and is also valid on Python 3.
    print("%s %s" % (team, predicted))

Roma 5.61077852595
Torino 4.68689890086
Genoa 4.64998250775
Napoli 4.34760480607
Sassuolo 3.90708817407
Juventus 3.79296392636
Inter 3.44670595045
Bologna 2.81170041242
Sampdoria 2.80962757449
Pescara 2.79262250505
Atalanta 2.33999287994
Chievo 2.2796843525
Empoli 2.20741464882
Milan 2.17008875932
Cagliari 2.13436581672
Fiorentina 1.7815215445
Lazio 1.65869880867
Udinese 1.64523539414
Palermo 1.1333146753
Crotone 0.967736261906


In [13]:
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(targ_1617)
# Score reported by the model itself: 1 is a perfect prediction.
# NOTE(review): for scikit-learn regressors .score() is documented as the
# coefficient of determination (R^2); confirm the type of model_1 before
# calling this an explained-variance score.
print('Variance score: %.2f' % model_1.score(feat_1617, targ_1617))
# NOTE(review): np.mean of the squared residuals is the *mean* squared error,
# not a residual *sum* of squares. The label is left unchanged here so that it
# still matches the recorded output; rename it when the notebook is re-run.
print("Residual sum of squares: %.2f"
      % np.mean((model_1.predict(feat_1617) - targ_1617) ** 2))

[6, 3, 4, 3, 0, 3, 3, 0, 3, 1, 0, 6, 6, 1, 3, 3, 1, 6, 1, 4]
Variance score: 0.32
Residual sum of squares: 2.75


In [14]:
# Rebinds model_1 to the object stored in regtree_model.pkl; whatever model_1
# referred to before this cell is no longer reachable under that name.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code, so only load model files that come from a trusted source.
model_1 = joblib.load('../data/regtree_model.pkl')

# Overwrites the predictions previously stored in pred_1617.
pred_1617 = model_1.predict(feat_1617)

In [15]:
# Show the predictions next to the observed points for comparison.
# NOTE(review): the recorded output has the same value (43.357...) for every
# team, far above the observed points. Presumably this model was fitted on
# targets/features of a different scale than feat_1617; verify before relying
# on these predictions.
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(pred_1617)
print(targ_1617)

[ 43.35714286  43.35714286  43.35714286  43.35714286  43.35714286
  43.35714286  43.35714286  43.35714286  43.35714286  43.35714286
  43.35714286  43.35714286  43.35714286  43.35714286  43.35714286
  43.35714286  43.35714286  43.35714286  43.35714286  43.35714286]
[6, 3, 4, 3, 0, 3, 3, 0, 3, 1, 0, 6, 6, 1, 3, 3, 1, 6, 1, 4]
