In [1]:
import numpy as np
import pandas as pd

import itertools

In [2]:
# Match results table; later cells read its date, tournament, home_team,
# away_team, home_score, away_score, country and neutral columns.
matches = pd.read_csv('data/matches/results.csv')
# NOTE(review): this reads the same results.csv as `matches`; presumably a
# dedicated shootouts file was intended -- confirm the path before relying on it.
shootouts = pd.read_csv('data/matches/results.csv')
# Ranking table; later cells read its country_full, rank_date and rank columns.
ranking = pd.read_csv('data/ranking/fifa_ranking-2021-05-27.csv')

In [3]:
# Keep only matches dated after 1 January 1993 (dates are compared as strings).
played_after_cutoff = matches['date'] > '1993-01-01'
matches = matches.loc[played_after_cutoff]

In [4]:
# Teams whose matches are kept regardless of the tournament they were played in.
exceptions = ['Comoros', 'Gambia', 'Equatorial Guinea']

# A match is kept when it belongs to the African Cup of Nations or when either
# side is one of the exception teams.
in_cup = matches['tournament'].isin(['African Cup of Nations'])
home_is_exception = matches['home_team'].isin(exceptions)
away_is_exception = matches['away_team'].isin(exceptions)
matches = matches.loc[in_cup | home_is_exception | away_is_exception]

In [5]:
GAMES = len(matches)


def _latest_rank(team, match_date, default=155):
    """Return the most recent rank of ``team`` published on or before ``match_date``.

    Looks the team up in the module-level ``ranking`` table (``country_full``,
    ``rank_date`` and ``rank`` columns). ``default`` is returned when the team has
    no listing at or before that date, or when the stored rank is missing.
    """
    listing = ranking[(ranking.country_full == team) &
                      (ranking.rank_date <= match_date)]
    if listing.empty:
        return default
    latest = listing.sort_values(by='rank_date', ascending=False).iloc[0]['rank']
    return default if pd.isna(latest) else int(latest)


# Builtin int/float replace the np.int/np.float aliases, which newer NumPy
# releases no longer provide; the resulting dtypes are the same.
home_rank = np.zeros(GAMES, dtype=int)
away_rank = np.zeros(GAMES, dtype=int)
home_total_points = np.zeros(GAMES, dtype=float)
away_total_points = np.zeros(GAMES, dtype=float)

# One rank per side and per match, aligned positionally with the rows of `matches`.
for i, (home, away, played_on) in enumerate(
        zip(matches['home_team'], matches['away_team'], matches['date'])):
    home_rank[i] = _latest_rank(home, played_on)
    away_rank[i] = _latest_rank(away, played_on)

In [6]:
# Build the modelling frame in one expression instead of writing into the
# filtered slice in place: attach both rank columns, drop the columns the model
# does not use, and store the `neutral` flag as an integer.
matches = (
    matches
    .assign(home_rank=home_rank, away_rank=away_rank)
    .drop(columns=['tournament', 'date', 'country'])
    .assign(neutral=lambda frame: frame['neutral'].astype(int))
)

In [7]:
# Raw features and the two regression targets (goals scored by each side).
X = matches[['home_team', 'away_team', 'neutral', 'home_rank', 'away_rank']]
y1 = matches['home_score']
y2 = matches['away_score']

# One-hot encode both team columns. dtype=int keeps the dummy columns integer
# on every pandas version, so `match_onehot.values` is a numeric array rather
# than an object array (recent pandas defaults the dummies to bool).
onehot_columns = ['home_team', 'away_team']
onehot_df = pd.get_dummies(X[onehot_columns], columns=onehot_columns, dtype=int)

# Final layout: neutral, home_rank, away_rank, followed by the dummy columns.
match_onehot_drop = X.drop(columns=onehot_columns)
match_onehot = pd.concat([match_onehot_drop, onehot_df], axis=1)

In [8]:
from xgboost import XGBRegressor

# Both regressors are fitted on the same feature matrix.
train_features = match_onehot.values

# Regressor for the goals scored by the home side.
hmodel = XGBRegressor()
hmodel.fit(train_features, y1.values)

# Regressor for the goals scored by the away side.
amodel = XGBRegressor()
amodel.fit(train_features, y2.values)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [9]:
import uuid
# Log of simulated matches: get_match_result adds one entry per call, keyed by
# a random UUID, holding both team names and the predicted goals.
all_games = {}

def predict(h_country, a_country, neutral=True, rank_date='2021-05-27', default_rank=155):
    """Predict the score of one match with the fitted home/away regressors.

    Builds a single feature row laid out as ``neutral, home_rank, away_rank``
    followed by the columns of the module-level ``onehot_df``, then evaluates the
    module-level ``hmodel`` and ``amodel`` on it.

    Parameters
    ----------
    h_country, a_country : str
        Home and away team names; ``'home_team_' + h_country`` and
        ``'away_team_' + a_country`` must be columns of ``onehot_df``.
    neutral : bool, default True
        Value of the ``neutral`` feature (stored as 1 or 0).
    rank_date : str, default '2021-05-27'
        ``rank_date`` value of the rows of ``ranking`` used for both ranks.
    default_rank : int, default 155
        Rank used when a team has no usable entry for ``rank_date``.

    Returns
    -------
    tuple
        ``(h_country, a_country, [hscore, ascore], [hval, aval])`` where
        ``hval``/``aval`` are the raw model outputs and ``hscore``/``ascore``
        are those outputs truncated with ``int``.

    Raises
    ------
    KeyError
        If either team has no matching one-hot column.
    """
    def _rank_of(country):
        # Rank published for `country` on `rank_date`, or the fallback value.
        found = ranking.loc[(ranking.rank_date == rank_date) &
                            (ranking.country_full == country), 'rank']
        if found.empty or pd.isna(found.iloc[0]):
            return default_rank
        return int(found.iloc[0])

    cols = ['neutral', 'home_rank', 'away_rank'] + onehot_df.columns.to_list()
    position = {name: idx for idx, name in enumerate(cols)}

    # Fill a plain integer row directly; this avoids chained assignment on a
    # DataFrame and keeps the row width equal to the number of feature columns.
    row = np.zeros((1, len(cols)), dtype=int)
    row[0, position['neutral']] = 1 if neutral else 0
    row[0, position['home_rank']] = _rank_of(h_country)
    row[0, position['away_rank']] = _rank_of(a_country)
    # Unknown teams raise KeyError here instead of silently adding a feature.
    row[0, position['home_team_' + h_country]] = 1
    row[0, position['away_team_' + a_country]] = 1

    # Evaluate each model once and derive the integer score from the raw value.
    hval = hmodel.predict(row)[0]
    aval = amodel.predict(row)[0]
    hscore = int(hval)
    ascore = int(aval)

    return h_country, a_country, [hscore, ascore], [hval, aval]

def get_match_result(foot_model, team1, team2, elimination=False):
    """Simulate one match and report winner, loser and score.

    The prediction comes from ``predict(team1, team2)``; ``foot_model`` is
    accepted but not used. Every call is logged in ``all_games`` under a fresh
    UUID with the goals in ``team1``/``team2`` order.

    Returns ``(winner, loser, score)`` where ``score`` lists the winner's goals
    first. When the integer scores are level, the result is
    ``('draw', 'draw', score)`` unless ``elimination`` is true, in which case
    the raw model outputs decide (``team1`` advances on equal raw outputs).
    """
    _, _, score, scored = predict(team1, team2)
    goals1, goals2 = score[0], score[1]

    all_games[uuid.uuid4()] = {'team' : team1, 'opponent' : team2, 'goals_team':goals1, 'goals_opponent':goals2}

    # Clear winner on the integer scoreline.
    if goals1 > goals2:
        return team1, team2, score
    if goals1 < goals2:
        # Put the winner's goals first.
        score.reverse()
        return team2, team1, score

    # Level scoreline from here on.
    if not elimination:
        return 'draw', 'draw', score

    # Knockout tie-break on the raw model outputs.
    if scored[0] >= scored[1]:
        return team1, team2, score
    return team2, team1, score

In [10]:
# Group-stage draw used by the simulation, keyed by group letter.
_draw = {
    'A': ['Burkina Faso', 'Cape Verde', 'Cameroon', 'Ethiopia'],
    'B': ['Guinea', 'Malawi', 'Senegal', 'Zimbabwe'],
    'C': ['Comoros', 'Gabon', 'Ghana', 'Morocco'],
    'D': ['Egypt', 'Guinea-Bissau', 'Nigeria', 'Sudan'],
    'E': ['Algeria', 'Ivory Coast', 'Equatorial Guinea', 'Sierra Leone'],
    'F': ['Gambia', 'Mali', 'Mauritania', 'Tunisia'],
}

# Parallel lists: groups[i] holds the teams of the group named groups_names[i].
groups_names = list(_draw)
groups = [_draw[letter] for letter in groups_names]
groupA, groupB, groupC, groupD, groupE, groupF = groups

In [11]:
# Round-of-16 fixture list (semicolon-separated); get_final_result reads the
# home_team / away_team slot codes of its first eight rows.
round_16 = pd.read_csv('data/round_16/round_16.csv', sep=';')
# Lookup table with a `combination` column; a later cell selects the row for
# the current combination and get_team uses it to resolve the slot codes that
# are not two characters long.
round_16_possibilities = pd.read_csv('data/round_16/round_16_possibilities.csv', sep=';')

In [12]:
def get_group_result(foot_model, group):
    """Play every pairing of ``group`` once and return the group standings.

    Parameters
    ----------
    foot_model
        Forwarded unchanged to ``get_match_result``.
    group : list of str
        Team names; any group size is accepted.

    Returns
    -------
    pandas.DataFrame
        Indexed by team with ``points``, ``diff`` and ``goals`` columns, sorted
        in descending order by points, then goal difference, then goals scored.
        A win is worth 3 points and a draw 1 point to each side.
    """
    # Sized from the group itself; named `table` so that the module-level
    # `ranking` DataFrame is not shadowed.
    table = pd.DataFrame(0, index=group, columns=['points', 'diff', 'goals'])

    for team1, team2 in itertools.combinations(group, 2):
        result, looser, score = get_match_result(foot_model, team1, team2)
        if result == 'draw':
            # Both sides scored the same number of goals, so the difference is unchanged.
            table.loc[[team1, team2], 'points'] += 1
            table.loc[[team1, team2], 'goals'] += score[0]
        else:
            # `score` lists the winner's goals first.
            margin = score[0] - score[1]
            table.loc[result, 'points'] += 3
            table.loc[result, 'goals'] += score[0]
            table.loc[looser, 'goals'] += score[1]
            table.loc[result, 'diff'] += margin
            table.loc[looser, 'diff'] -= margin

    return table.sort_values(by=['points', 'diff', 'goals'], ascending=False)

def get_team(code, gr, r16p):
    """Resolve a bracket slot code to a team name.

    A two-character ``code`` is used directly as a row label of ``gr``; any
    other code is first translated through the first row of ``r16p`` (column
    ``code``) into such a label. The ``name`` value of that row is returned.
    """
    slot = code if len(code) == 2 else r16p.iloc[0][code]
    return gr.loc[slot]['name']

def get_final_result(foot_model, groups_result, r16=round_16, r16p=round_16_possibilities):
    """Simulate the knockout stage from the round of 16 to the final.

    Each of the first eight rows of ``r16`` gives the ``home_team`` and
    ``away_team`` slot codes of one round-of-16 tie; the codes are resolved to
    team names with ``get_team(code, groups_result, r16p)``. Every tie is played
    with ``get_match_result(..., elimination=True)``.

    Returns ``(round_of_16, quarter_finals, semi_finals, little_final, final)``:
    the first three are lists of match results, the last two are single match
    results (third-place match and final).
    """
    def play(home, away):
        # Knockout tie between two already-resolved team names.
        return get_match_result(foot_model, home, away, elimination=True)

    # ROUND OF 16: fixtures come from the first eight rows of r16.
    round_of_16 = []
    for idx in range(8):
        fixture = r16.iloc[idx]
        home = get_team(fixture['home_team'], groups_result, r16p)
        away = get_team(fixture['away_team'], groups_result, r16p)
        round_of_16.append(play(home, away))

    # QUARTER FINALS: winners (element 0) of the paired round-of-16 ties.
    quarter_finals = [
        play(round_of_16[first][0], round_of_16[second][0])
        for first, second in ((0, 2), (1, 3), (4, 6), (5, 7))
    ]

    # SEMI FINALS: winners of the paired quarter finals.
    semi_finals = [
        play(quarter_finals[first][0], quarter_finals[second][0])
        for first, second in ((0, 2), (1, 3))
    ]

    # THIRD-PLACE MATCH: the two semi-final losers (element 1).
    little_final = play(semi_finals[0][1], semi_finals[1][1])

    # FINAL: the two semi-final winners.
    final = play(semi_finals[0][0], semi_finals[1][0])

    return round_of_16, quarter_finals, semi_finals, little_final, final

In [13]:
# Simulate the group stage. Every finishing position gets an entry keyed by
# group letter + position (for example 'A1'), which is the label format that
# two-character bracket codes are looked up with.
groupe_stage = {}
for group_name, group in zip(groups_names, groups):
    standings = get_group_result(None, group)
    for position, (team, row) in enumerate(standings.iterrows(), start=1):
        groupe_stage[group_name + str(position)] = {'group' : group_name,
                                                    'name' : team,
                                                    'points' : row['points'],
                                                    # Goal difference comes from the 'diff' column,
                                                    # not from the goals scored.
                                                    'goals_difference' : row['diff'],
                                                    'goals_scored' : row['goals'],
                                                    'rank' : position}

groups_ranking = pd.DataFrame.from_dict(groupe_stage, orient='index')

# Group letters of the four best third-placed teams, joined in alphabetical order.
best_thirds = (groups_ranking
               .query('rank==3')
               .sort_values(by=['points', 'goals_difference', 'goals_scored'], ascending=False)
               ['group']
               .iloc[0:4]
               .sort_values())
combination = ''.join(list(best_thirds))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
# Keep only the lookup rows whose `combination` matches the groups of the
# best third-placed teams computed for this run.
third_place_lookup = round_16_possibilities.query('combination=="{0}"'.format(combination))

round_of_16, quarter_finals, semi_finals, little_final, final = get_final_result(
    None,
    groups_ranking,
    round_16,
    third_place_lookup,
)

In [49]:
# Ad-hoc knockout-style prediction for a single pairing. Like every call to
# get_match_result, it also adds one more entry to `all_games`.
get_match_result(None, 'Egypt', 'Tunisia', elimination=True)

('Egypt', 'Tunisia', [1, 0])

In [16]:
# Show the logged matches as a table: one row per get_match_result call,
# indexed by the UUID under which the match was recorded.
pd.DataFrame.from_dict(all_games, orient='index')

Unnamed: 0,team,opponent,goals_team,goals_opponent
01dbbc06-7ae3-444e-8cac-72ba6f169720,Guinea,Malawi,1,1
0afff280-43ac-48ad-ad8c-d4643861fee1,Egypt,Ghana,1,0
0fef6d24-ea51-44ab-adf4-7152581697bf,Gabon,Morocco,1,1
110ab8d7-3b01-488d-a672-bc09af54fd1c,Guinea-Bissau,Sudan,1,0
1d0225fa-50c7-45d2-bbe2-02d3ada9d187,Comoros,Ghana,0,1
20a3c019-b09c-4464-8050-f489ad70d30c,Senegal,Zimbabwe,1,0
29d7d057-8ee0-4662-a1f7-225cab58349e,Ivory Coast,Equatorial Guinea,2,0
2a700905-2d36-406e-97f5-ba17e595da23,Cameroon,Morocco,1,0
2a8cee43-045f-4f61-81e0-0442f0a14be4,Burkina Faso,Ethiopia,1,0
3429dc88-0b12-4904-90a4-3c8152c3dca3,Nigeria,Sudan,1,0
