In [1]:
import pandas as pd


In [2]:
def init_pl_teams():
    teams = []
    teamstring = """
     Bournemouth
     Arsenal
     Burnley
     Chelsea
     Crystal Palace
     Everton
     Hull City
     Leicester City
     Liverpool
     Manchester City
     Manchester United
     Middlesbrough
     Southampton
     Stoke City
     Sunderland
     Swansea City
     Tottenham Hotspur
     Watford
     West Bromwich Albion
     West Ham United"""

    for team in teamstring.split('\n'):
        if len(team.strip()) > 1:
            teams.append(team.strip())

    return teams
pl_teams = init_pl_teams()

def is_pl_team(team):
    global pl_teams
    return team in pl_teams
        
pl_teams
    

['Bournemouth',
 'Arsenal',
 'Burnley',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Hull City',
 'Leicester City',
 'Liverpool',
 'Manchester City',
 'Manchester United',
 'Middlesbrough',
 'Southampton',
 'Stoke City',
 'Sunderland',
 'Swansea City',
 'Tottenham Hotspur',
 'Watford',
 'West Bromwich Albion',
 'West Ham United']

In [3]:


all_players = pd.read_csv(filepath_or_buffer='17500_players.csv')
players = all_players[all_players['team'].map(is_pl_team)]
players

#conveniently, the players are sorted for us. 


Unnamed: 0,name,team,fifa_OVA,fifa_POT
5,De Gea,Manchester United,90,92
9,Z. Ibrahimović,Manchester United,90,90
10,T. Courtois,Chelsea,89,93
12,M. Özil,Arsenal,89,89
15,S. Agüero,Manchester City,89,89
16,P. Pogba,Manchester United,88,94
18,K. De Bruyne,Manchester City,88,91
20,E. Hazard,Chelsea,88,90
23,H. Lloris,Tottenham Hotspur,88,88
28,P. Čech,Arsenal,88,88


In [4]:
#print the two best players in each team
for team in pl_teams:
    print team
    print players[players['team'] == team].head(n=2)

Bournemouth
            name         team  fifa_OVA  fifa_POT
221  J. Wilshere  Bournemouth        82        85
625    M. Gradel  Bournemouth        79        79
Arsenal
       name     team  fifa_OVA  fifa_POT
12  M. Özil  Arsenal        89        89
28  P. Čech  Arsenal        88        88
Burnley
          name     team  fifa_OVA  fifa_POT
648  S. Defour  Burnley        79        79
894  T. Heaton  Burnley        78        78
Chelsea
           name     team  fifa_OVA  fifa_POT
10  T. Courtois  Chelsea        89        93
20    E. Hazard  Chelsea        88        90
Crystal Palace
            name            team  fifa_OVA  fifa_POT
118  S. Mandanda  Crystal Palace        84        84
333   C. Benteke  Crystal Palace        81        83
Everton
            name     team  fifa_OVA  fifa_POT
93     R. Lukaku  Everton        84        90
173  A. Williams  Everton        83        83
Hull City
            name       team  fifa_OVA  fifa_POT
633   D. Mbokani  Hull City        79        7

In [5]:
# teams_from_players = set()
# for row in all_players.iterrows():
#     teams_from_players.add(row[1]['team'])
# This creates a set from the teams in the all_players.csv. 

In [6]:
#player names might be different across different datasets. For example, D. Payet instead of Dmitriy Payet or Dmitri Payet..
#if necessary, this function should have some logic to resolve similar names (D. Payet -> Dmitriy Payet) within the context of a team
def get_player_fifa_OVA(name, team=None):
    if (team == None):
        score = players[players['name'] == name].iloc[0]['fifa_OVA']
    else:
        score = players[(players['team'] == team) & (players['name'] == name)].iloc[0]['fifa_OVA']
    return int(score)

print get_player_fifa_OVA('D. Payet')
print get_player_fifa_OVA('J. Defoe')

    

86
80


## Creating match history dataframes
Here we will read in the .csv files of the match histories. Each csv file contains information in the following format:
date, home, away, home_goals, away_goals, result, league

In [7]:
seasons = ['2016','2015','2014','2013','2012']
season_histories = {season:None for season in seasons}
for season in seasons:
    df = pd.read_csv('premier-league-' + season +'.csv')
    season_histories[season] = df
    
season_histories['2015']
len(season_histories['2015'].index)
df = season_histories['2014']
home_games = df[(df['home'] == 'Leicester City') | (df['away'] == 'Leicester City') & (df['league'] == 'Premier League')]
home_games

Unnamed: 0,date,home,away,home_goals,away_goals,result,league
14,26 Oct 2013,Leicester City,AFC Bournemouth,2,1,W,League Championship
129,14 Dec 2013,Leicester City,Burnley,1,1,D,League Championship
349,11 Aug 2013,Leicester City,Leeds United,0,0,D,League Championship
351,24 Aug 2013,Leicester City,Birmingham City,3,2,W,League Championship
354,14 Sep 2013,Leicester City,Wigan Athletic,2,0,W,League Championship
355,17 Sep 2013,Leicester City,Blackburn Rovers,2,1,W,League Championship
357,24 Sep 2013,Leicester City,Derby County,2,1,W,League Cup
358,28 Sep 2013,Leicester City,Barnsley,2,1,W,League Championship
361,19 Oct 2013,Leicester City,Huddersfield Town,2,1,W,League Championship
362,26 Oct 2013,Leicester City,AFC Bournemouth,2,1,W,League Championship


# Baseline Predictor
The baseline predictor will attempt to predict the outcome of a football match (either a home win, draw, or home loss), by considering the following factors:

    1) The overall win/loss ratio of each of the two teams
    2) The win/loss ratio of the two teams specifically against each other
    3) The sum of the FIFA_OVA ratings for the players on each team (more involved because we need the roster for each team for each game we want to predict).

In [41]:
#home is name of home team, away is name of away team, seasons is a list of seasons (['2016','2015',...]) to take into account
def predict(home,away,seasons,home_roster=None, away_roster=None):
    #home overall win count, home overall draw count, home overall loss count, home overall number of games
    home_ow, home_od, home_ol, home_og = 0,0,0,0
    #away overall win count, away overall draw count, away overall loss count, away oveall number of games
    away_ow, away_od, away_ol, away_og = 0,0,0,0
    
    #count of home wins/draw/loss/ number of total games against this specific team, no need for another set of variables for the away team,
    #for they are complementary
    home_sw, home_sd, home_sl, home_sg = 0,0,0,0
    
    #iterate through the seasons
    for season in seasons:
        #change the name for ease of use
        df = season_histories[season]
        
        #get a dataframe of all the games from this season that the home team played in (either as home or away)
        home_games = df[((df['home'] == home) | (df['away'] == home)) & (df['league'] == 'Premier League')]
        
        #same for the away team
        away_games = df[((df['home'] == away) | (df['away'] == away)) & (df['league'] == 'Premier League')]
        
        
        # add on total number of overall games for each    
        home_og += len(home_games.index)
        away_og += len(away_games.index)
        
        # add on total number of games against each other
        against_each_other_games = home_games[(home_games['home'] == away) | (home_games['away'] == away)]
        home_sg += len(against_each_other_games.index)
        
        # lets count how many games the home team won, drew, and lost (overall)
        home_ow += len(home_games[home_games['result'] == 'W'].index)
        home_od += len(home_games[home_games['result'] == 'D'].index)
        home_ol += len(home_games[home_games['result'] == 'L'].index)
        
        # lets count how many games the home team won, drew, and lost (overall)
        away_ow += len(away_games[away_games['result'] == 'W'].index)
        away_od += len(away_games[away_games['result'] == 'D'].index)
        away_ol += len(away_games[away_games['result'] == 'L'].index)
        
        # lets count how many games the home team won, drew, and lost against the away team. 
        home_sw += len(home_games[((home_games['home'] == away) | (home_games['away'] == away)) & (home_games['result'] == 'W')].index)
        home_sd += len(home_games[((home_games['home'] == away) | (home_games['away'] == away)) & (home_games['result'] == 'D')].index)
        home_sl += len(home_games[((home_games['home'] == away) | (home_games['away'] == away)) & (home_games['result'] == 'L')].index)
               
        print home, "won", len(home_games[home_games['result'] == 'W'].index), "games during the",season,'season.'
        print away, "won", len(away_games[away_games['result'] == 'W'].index), "games during the",season,'season.'
        print '\n'
        
    #divide by total number of games to create the ratio
    home_ow_rat = home_ow / float(home_og)
    home_od_rat = home_od / float(home_og)
    home_ol_rat = home_ol / float(home_og)

    #divide by the total number of games to create the ratio
    away_ow_rat = away_ow / float(away_og)
    away_od_rat = away_od / float(away_og)
    away_ol_rat = away_ol / float(away_og)
    
    #divide by the total number of games to create the ratio
    home_sw_rat = home_sw / float(home_sg)
    home_sd_rat = home_sd / float(home_sg)
    home_sl_rat = home_sl / float(home_sg)
    
    print '-----------------------------------'
    print home, 'stats for season range:',seasons
    print "overall win ratio:",'\t',home_ow_rat
    print "overall draw ratio:",'\t',home_od_rat
    print "overall loss ratio:",'\t',home_ol_rat
    print ''
    
    print away, 'stats for season range:',seasons
    print "overall win ratio:",'\t',away_ow_rat
    print "overall draw ratio:",'\t',away_od_rat
    print "overall loss ratio:",'\t',away_ol_rat
    print ''
    
    print 'Rivalry stats:'
    print home, 'beats',   '\t',away, "{0:.2f}".format(home_sw_rat*100) + '%', 'of the time.'
    print home, 'draws',   '\t',away, "{0:.2f}".format(home_sd_rat*100) + '%', 'of the time.'
    print home, 'loses to','\t',away, "{0:.2f}".format(home_sl_rat*100) + '%', 'of the time.'
    
    print home, 'has played', home_sg, 'games against',away,','
    print 'winning',home_sw,'of them,','drawing',home_sd,'of them,','and losing',home_sl,'of them.'
    print '-----------------------------------'
    
    
    ########
    ## Calculate average player FIFA rating of top 11 players
    ## Note: 
    
    num_players = 11
    
    home_ratings = (players[players['team'] == home]['fifa_OVA'])
    home_avg_rating = home_ratings[:num_players].sum(axis=0) / float(num_players)
    
    away_ratings = (players[players['team'] == away]['fifa_OVA'])
    away_avg_rating = away_ratings[:num_players].sum(axis=0) / float(num_players)
    
    print '{0: <20}'.format(home),'average rating of top', str(num_players) + ":",'{0:.2f}'.format(home_avg_rating)
    print '{0: <20}'.format(away),'average rating of top', str(num_players) + ":",'{0:.2f}'.format(away_avg_rating)
    
    

            
predict('Manchester United','Leicester City',['2016','2015','2014','2012'])

    

Manchester United won 39 games during the 2016 season.
Leicester City won 25 games during the 2016 season.


Manchester United won 35 games during the 2015 season.
Leicester City won 32 games during the 2015 season.


Manchester United won 25 games during the 2014 season.
Leicester City won 0 games during the 2014 season.


Manchester United won 27 games during the 2012 season.
Leicester City won 0 games during the 2012 season.


-----------------------------------
Manchester United stats for season range: ['2016', '2015', '2014', '2012']
overall win ratio: 	0.477272727273
overall draw ratio: 	0.212121212121
overall loss ratio: 	0.310606060606

Leicester City stats for season range: ['2016', '2015', '2014', '2012']
overall win ratio: 	0.407142857143
overall draw ratio: 	0.278571428571
overall loss ratio: 	0.314285714286

Rivalry stats:
Manchester United beats 	Leicester City 50.00% of the time.
Manchester United draws 	Leicester City 50.00% of the time.
Manchester United loses to 	Leic

84.81818181818181

In [9]:
players['fifa_OVA'].sum(axis=0)

46986L

In [11]:
print pl_teams

['Bournemouth', 'Arsenal', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton', 'Hull City', 'Leicester City', 'Liverpool', 'Manchester City', 'Manchester United', 'Middlesbrough', 'Southampton', 'Stoke City', 'Sunderland', 'Swansea City', 'Tottenham Hotspur', 'Watford', 'West Bromwich Albion', 'West Ham United']


In [12]:
for team in pl_teams:
    if len(players[players['team'] == team].index) == 0:
        print team

Unnamed: 0,name,team,fifa_OVA,fifa_POT
625,M. Gradel,Bournemouth,79,79
