In [1]:
import pandas as pd


In [2]:
def init_pl_teams():
    """Return the list of Premier League club names, in the order written below.

    The names live in a multi-line string (one club per line); each line is
    stripped of its surrounding whitespace before being returned.
    """
    roster_text = """
     AFC Bournemouth
     Arsenal
     Burnley
     Chelsea
     Crystal Palace
     Everton
     Hull City
     Leicester City
     Liverpool
     Manchester City
     Manchester United
     Middlesbrough
     Southampton
     Stoke City
     Sunderland
     Swansea City
     Tottenham Hotspur
     Watford
     West Bromwich Albion
     West Ham United"""

    # Strip the indentation from every line first, then drop anything that is
    # one character or shorter (this removes the blank line after the opening quotes).
    stripped_lines = (line.strip() for line in roster_text.split('\n'))
    return [club for club in stripped_lines if len(club) > 1]
# Module-level list of club names; built once when this cell runs.
pl_teams = init_pl_teams()


def is_pl_team(team):
    """Return True when `team` is one of the names in the module-level `pl_teams` list."""
    # Read-only access to a module-level name needs no `global` declaration.
    return team in pl_teams


# Last expression of the cell: display the list.
pl_teams
    

['AFC Bournemouth',
 'Arsenal',
 'Burnley',
 'Chelsea',
 'Crystal Palace',
 'Everton',
 'Hull City',
 'Leicester City',
 'Liverpool',
 'Manchester City',
 'Manchester United',
 'Middlesbrough',
 'Southampton',
 'Stoke City',
 'Sunderland',
 'Swansea City',
 'Tottenham Hotspur',
 'Watford',
 'West Bromwich Albion',
 'West Ham United']

In [3]:


all_players = pd.read_csv(filepath_or_buffer='17500_players.csv')

# Keep only the rows whose 'team' value is one of the names in pl_teams.
# Series.isin does the membership test in one vectorized call instead of
# invoking a Python function once per row.
players = all_players[all_players['team'].isin(pl_teams)]

# Conveniently, the players are sorted for us (the recorded output lists them
# with fifa_OVA in non-increasing order), so the first rows are the top-rated ones.
# Show a preview rather than dumping the whole frame into the notebook.
players.head(10)


Unnamed: 0,name,team,fifa_OVA,fifa_POT
5,De Gea,Manchester United,90,92
9,Z. Ibrahimović,Manchester United,90,90
10,T. Courtois,Chelsea,89,93
12,M. Özil,Arsenal,89,89
15,S. Agüero,Manchester City,89,89
16,P. Pogba,Manchester United,88,94
18,K. De Bruyne,Manchester City,88,91
20,E. Hazard,Chelsea,88,90
23,H. Lloris,Tottenham Hotspur,88,88
28,P. Čech,Arsenal,88,88


In [4]:
# Print the first two rows of `players` for each team. Because `players` is
# already sorted best-first (see the cell that builds it), these are the two
# best players of the team.
#
# The loop iterates the module-level `pl_teams`; the name `teams` only exists as
# a local variable inside init_pl_teams(), so using it here fails on a fresh kernel.
#
# NOTE(review): the recorded output shows an empty frame for 'AFC Bournemouth',
# so that club is presumably spelled differently in the CSV - confirm.
#
# print(...) with a single argument behaves identically on Python 2 and Python 3.
for team in pl_teams:
    print(team)
    print(players[players['team'] == team].head(n=2))

AFC Bournemouth
Empty DataFrame
Columns: [name, team, fifa_OVA, fifa_POT]
Index: []
Arsenal
       name     team  fifa_OVA  fifa_POT
12  M. Özil  Arsenal        89        89
28  P. Čech  Arsenal        88        88
Burnley
          name     team  fifa_OVA  fifa_POT
648  S. Defour  Burnley        79        79
894  T. Heaton  Burnley        78        78
Chelsea
           name     team  fifa_OVA  fifa_POT
10  T. Courtois  Chelsea        89        93
20    E. Hazard  Chelsea        88        90
Crystal Palace
            name            team  fifa_OVA  fifa_POT
118  S. Mandanda  Crystal Palace        84        84
333   C. Benteke  Crystal Palace        81        83
Everton
            name     team  fifa_OVA  fifa_POT
93     R. Lukaku  Everton        84        90
173  A. Williams  Everton        83        83
Hull City
            name       team  fifa_OVA  fifa_POT
633   D. Mbokani  Hull City        79        79
1186   C. Davies  Hull City        77        77
Leicester City
           na

In [5]:
# teams_from_players = set()
# for row in all_players.iterrows():
#     teams_from_players.add(row[1]['team'])
# This creates a set from the teams in the all_players.csv. 

In [6]:
# Player names might be spelled differently across datasets, for example
# "D. Payet" instead of "Dmitriy Payet" or "Dmitri Payet".
# TODO: if necessary, add logic here to resolve similar names
# (D. Payet -> Dmitriy Payet) within the context of a team.
def get_player_fifa_OVA(name, team=None):
    """Return the 'fifa_OVA' rating of the first matching row in `players`, as an int.

    Parameters
    ----------
    name : str
        Value to match exactly against the 'name' column.
    team : str, optional
        When given, the row must also match this value in the 'team' column.
        When None (the default), only the name is used.

    Raises
    ------
    IndexError
        If no row of the module-level `players` frame matches.
    """
    matches = players['name'] == name
    if team is not None:
        matches = matches & (players['team'] == team)

    ratings = players.loc[matches, 'fifa_OVA']
    if ratings.empty:
        # Same exception type that .iloc[0] on an empty selection raises, but
        # with a message that says which lookup failed.
        raise IndexError('no player named %r found (team=%r)' % (name, team))

    # Several rows may match (e.g. same name at different clubs); the first one wins.
    return int(ratings.iloc[0])

# Example lookups by name only. print(...) with a single argument produces the
# same output on Python 2 and Python 3, unlike the bare print statement.
print(get_player_fifa_OVA('D. Payet'))
print(get_player_fifa_OVA('J. Defoe'))

    

86
80


## Creating match history dataframes
Here we will read in the .csv files of the match histories. Each csv file contains information in the following format:
date, home, away, home_goals, away_goals, result, league

In [36]:
seasons = ['2016', '2015', '2014', '2013', '2012']

# One DataFrame per season, keyed by the season string, read from
# 'premier-league-<season>.csv' in the working directory.
season_histories = {}
for season in seasons:
    season_histories[season] = pd.read_csv('premier-league-' + season + '.csv')

# Spot check: every game in which Arsenal appears as either the home or the away side.
# `df` is bound explicitly to the last season in `seasons` - the same frame the
# original loop variable was left pointing at - instead of relying on a leaked
# loop variable.
# NOTE(review): the name `home_games` is kept for compatibility, but the frame
# holds Arsenal's home AND away games.
df = season_histories[seasons[-1]]
home_games = df[(df['home'] == 'Arsenal') | (df['away'] == 'Arsenal')]
home_games

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

# Baseline Predictor
The baseline predictor will attempt to predict the outcome of a football match (either a home win, draw, or home loss), by considering the following factors:

    1) The overall win/loss ratio of each of the two teams
    2) The win/loss ratio of the two teams specifically against each other
    3) The sum of the FIFA_OVA ratings for the players on each team (more involved because we need the roster for each team for each game we want to predict).

In [48]:
def _games_involving(games, team):
    """Return the rows of `games` where `team` is in the 'home' or the 'away' column."""
    return games[(games['home'] == team) | (games['away'] == team)]


def _tally_results(games):
    """Return a (W, D, L) tuple counting the rows of `games` by their 'result' value."""
    outcome = games['result']
    return tuple(int((outcome == code).sum()) for code in ('W', 'D', 'L'))


def _ratio(count, total):
    """Return count / total as a float, or NaN when total is 0 (no games found)."""
    if not total:
        return float('nan')
    return count / float(total)


def predict(home, away, seasons, home_roster=None, away_roster=None):
    """Tally and print W/D/L statistics for two teams over the given seasons.

    Despite its name, this function does not yet produce a prediction: it
    prints per-season 'W' counts and the overall W/D/L ratios of both teams,
    and returns None.

    Parameters
    ----------
    home : str
        Name of the home team, as it appears in the 'home'/'away' columns.
    away : str
        Name of the away team, as it appears in the 'home'/'away' columns.
    seasons : list of str
        Seasons to take into account, e.g. ['2016', '2015']. Each entry is
        used as a key into the module-level `season_histories` dict.
    home_roster, away_roster : optional
        Accepted but currently unused.

    Raises
    ------
    KeyError
        If a season is not a key of `season_histories`.

    Notes
    -----
    A team with no games in the selected seasons gets NaN ratios instead of
    raising ZeroDivisionError.

    NOTE(review): the 'result' column is counted as-is for every game a team
    appears in, whether it played at home or away. If 'result' is recorded from
    the home side's point of view, the away games are attributed to the wrong
    side - confirm against the CSV files before trusting these numbers.
    """
    # Overall 'W' / 'D' / 'L' counts and number of games, per team.
    home_ow, home_od, home_ol, home_og = 0, 0, 0, 0
    away_ow, away_od, away_ol, away_og = 0, 0, 0, 0

    # Same tallies restricted to games between the two teams. One set is enough:
    # the away team's head-to-head numbers are complementary.
    # TODO: these are accumulated but not yet printed or used.
    home_sw, home_sd, home_sl, home_sg = 0, 0, 0, 0

    for season in seasons:
        df = season_histories[season]

        # Games of this season each team played in (either as home or away).
        home_games = _games_involving(df, home)
        away_games = _games_involving(df, away)
        # Subset of the home team's games in which the other side was `away`.
        against_each_other_games = _games_involving(home_games, away)

        home_og += len(home_games.index)
        away_og += len(away_games.index)
        home_sg += len(against_each_other_games.index)

        # Per-season counts for the home team, the away team, and head-to-head.
        home_w, home_d, home_l = _tally_results(home_games)
        away_w, away_d, away_l = _tally_results(away_games)
        h2h_w, h2h_d, h2h_l = _tally_results(against_each_other_games)

        home_ow += home_w
        home_od += home_d
        home_ol += home_l

        away_ow += away_w
        away_od += away_d
        away_ol += away_l

        home_sw += h2h_w
        home_sd += h2h_d
        home_sl += h2h_l

        # Single-argument print(...) gives the same text on Python 2 and Python 3.
        print('%s won %s games during the %s season.' % (home, home_w, season))
        print('%s won %s games during the %s season.' % (away, away_w, season))
        print('\n')

    # Divide by the total number of games to turn the counts into ratios.
    home_ow_rat = _ratio(home_ow, home_og)
    home_od_rat = _ratio(home_od, home_og)
    home_ol_rat = _ratio(home_ol, home_og)

    away_ow_rat = _ratio(away_ow, away_og)
    away_od_rat = _ratio(away_od, away_og)
    away_ol_rat = _ratio(away_ol, away_og)

    print('%s stats for season range: %s' % (home, seasons))
    print('Home overall win ratio: \t%s' % home_ow_rat)
    print('Home overall draw ratio: \t%s' % home_od_rat)
    print('Home overall loss ratio: \t%s' % home_ol_rat)

    print('%s stats for season range: %s' % (away, seasons))
    print('away overall win ratio: \t%s' % away_ow_rat)
    print('away overall draw ratio: \t%s' % away_od_rat)
    print('away overall loss ratio: \t%s' % away_ol_rat)
    
    
       
            
# Example run: print the per-season and overall statistics for one fixture.
# NOTE(review): '2013' is loaded into season_histories but is not in the list
# passed here - confirm whether leaving it out is intentional.
predict('Manchester United','Leicester City',['2016','2015','2014','2012'])

    

Manchester United won 50 games during the 2016 season.
Leicester City won 27 games during the 2016 season.


Manchester United won 37 games during the 2015 season.
Leicester City won 34 games during the 2015 season.


Manchester United won 39 games during the 2014 season.
Leicester City won 27 games during the 2014 season.


Manchester United won 32 games during the 2012 season.
Leicester City won 32 games during the 2012 season.


Manchester United stats for season range: ['2016', '2015', '2014', '2012']
Home overall win ratio: 	0.463343108504
Home overall draw ratio: 	0.208211143695
Home overall loss ratio: 	0.328445747801
Leicester City stats for season range: ['2016', '2015', '2014', '2012']
away overall win ratio: 	0.421052631579
away overall draw ratio: 	0.249122807018
away overall loss ratio: 	0.329824561404
