In [1]:
import pandas as pd
import pickle
from scipy.stats import poisson

In [2]:
dict_table = pickle.load(open('dict_table', 'rb'))
df_historical_data = pd.read_csv('clean_fifa_worldcup_matches.csv')
df_fixtures = pd.read_csv('clean_fifa_worldcup_fixture.csv')

1. Calculating Team Strength

Spliting data in to df_home and df_away data frames

In [3]:
df_home = df_historical_data[['HomeTeam', 'HomeGoals', 'AwayGoals']]
df_away = df_historical_data[['AwayTeam', 'HomeGoals', 'AwayGoals']]

Renaming the HOME and AWAY TEAMS as just TEAMS for better understanding

In [4]:
df_home = df_home.rename(columns={'HomeTeam': 'Team', 'HomeGoals': 'GoalsScored', 'AwayGoals': 'GoalsConceded'})
df_away = df_away.rename(columns={'AwayTeam': 'Team', 'HomeGoals': 'GoalsConceded', 'AwayGoals': 'GoalsScored'})

Concatenate home and away data for getting the team strength

In [6]:
df_team_strength = pd.concat([df_home, df_away], ignore_index=True).groupby('Team').mean()
df_team_strength

Unnamed: 0_level_0,GoalsScored,GoalsConceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,1.000000,1.461538
Angola,0.333333,0.666667
Argentina,1.691358,1.148148
Australia,0.812500,1.937500
Austria,1.482759,1.620690
...,...,...
Uruguay,1.553571,1.321429
Wales,0.800000,0.800000
West Germany,2.112903,1.241935
Yugoslavia,1.666667,1.272727


Create Function to give points to the team
>> Choosing distrubution of points
>>> 1. The number of events can be counted - for example -> number of goals scored
>>> 2. the occurrence of event are independent - for example -> the goals scored by team is independent
>>> 3. the rate at witch event occur is constant - for example -> the probability of score in one match is can be similar to goal scored in previous match
>>> 4. two events cannot occur at exactly the same instant in time - for example -> both team can not score goals at same exact time

In [7]:
def predict_points(home, away):
    if home in df_team_strength.index and away in df_team_strength.index:
        # goals_scored * goals_conceded
        # for Poisson Distribution in this case -> lamda is median of goals in 90 minutes for both teams and X is the no. of goals that can be scored in a match
        lamb_home = df_team_strength.at[home, 'GoalsScored'] * df_team_strength.at[away, 'GoalsConceded']
        lamb_away = df_team_strength.at[away, 'GoalsScored'] * df_team_strength.at[home, 'GoalsConceded']
        prob_home, prob_away, prob_draw = 0,0,0
        for x in range(0,11): # number of goals home team
            for y in range(0,11): # number of goals away team
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p

        points_home = 3 * prob_home + prob_draw
        points_away = 3 * prob_away + prob_draw
        return points_home, points_away
    else:
        return 0, 0


2. Testing the above Function

In [8]:
# checking with Argentina - Mexico match prediction

predict_points('Argentina', 'Mexico')

(2.3129151525530505, 0.5378377125059863)

<h1> 3. Predicting the Worldcup 2022 </h1>

Building Group Stage from fixtures dataset

In [78]:
df_fixture_group_48 = df_fixtures[:48].copy()
df_fixture_knockout = df_fixtures[48:56].copy()
df_fixture_quarter = df_fixtures[56:60].copy()
df_fixture_semi = df_fixtures[60:62].copy()
df_fixture_final = df_fixtures[62:].copy()

In [79]:
df_fixture_group_48

Unnamed: 0,Home Team,Match Scores,Away Team,year
0,Qatar,0–2,Ecuador,2022
1,Senegal,0–2,Netherlands,2022
2,Qatar,1–3,Senegal,2022
3,Netherlands,1–1,Ecuador,2022
4,Ecuador,1–2,Senegal,2022
5,Netherlands,2–0,Qatar,2022
6,England,6–2,Iran,2022
7,United States,1–1,Wales,2022
8,Wales,0–2,Iran,2022
9,England,0–0,United States,2022


Running all the matches in the group stage and updating the table

In [80]:
for group in dict_table:
    print(dict_table[group]['Team'].values)

['Netherlands' 'Senegal' 'Ecuador' 'Qatar (H)']
['England' 'Wales' 'United States' 'Iran']
['Argentina' 'Poland' 'Mexico' 'Saudi Arabia']
['France' 'Denmark' 'Tunisia' 'Australia']
['Germany' 'Spain' 'Japan' 'Costa Rica']
['Croatia' 'Belgium' 'Morocco' 'Canada']
['Brazil' 'Switzerland' 'Serbia' 'Cameroon']
['Portugal' 'Uruguay' 'Ghana' 'South Korea']


In [81]:
for group in dict_table:
    teams_in_group = dict_table[group]['Team'].values
    df_fixture_group_6 = df_fixture_group_48[df_fixture_group_48['Home Team'].isin(teams_in_group)]
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['Home Team'], row['Away Team']
        points_home, points_away = predict_points(home, away)
        dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += points_home
        dict_table[group].loc[dict_table[group]['Team'] == away, 'Pts'] += points_away

    dict_table[group] = dict_table[group].sort_values('Pts', ascending=False).reset_index()
    dict_table[group] = dict_table[group][['Team', 'Pts']]
    dict_table[group] = dict_table[group].round(0)

In [82]:
# checking Updated table
dict_table['Group A']

Unnamed: 0,Team,Pts
0,Netherlands,31.0
1,Senegal,18.0
2,Ecuador,16.0
3,Qatar (H),0.0


In [83]:
df_fixture_knockout

Unnamed: 0,Home Team,Match Scores,Away Team,year
48,Netherlands,3–1,United States,2022
49,Argentina,2–1,Australia,2022
50,France,3–1,Poland,2022
51,England,3–0,Senegal,2022
52,Japan,1–1,Croatia,2022
53,Brazil,4–1,South Korea,2022
54,Morocco,0–0,Spain,2022
55,Portugal,6–1,Switzerland,2022


In [84]:
# update the knockout fixture with group winner and runner-up
for group in dict_table:
    group_winner = dict_table[group].loc[0, 'Team']
    runner_up = dict_table[group].loc[1, 'Team']
    df_fixture_knockout.replace({f'Winners{group}': group_winner, f'Runner-up{group}': runner_up}, inplace=True)
df_fixture_knockout['winner'] = '?'
df_fixture_knockout


Unnamed: 0,Home Team,Match Scores,Away Team,year,winner
48,Netherlands,3–1,United States,2022,?
49,Argentina,2–1,Australia,2022,?
50,France,3–1,Poland,2022,?
51,England,3–0,Senegal,2022,?
52,Japan,1–1,Croatia,2022,?
53,Brazil,4–1,South Korea,2022,?
54,Morocco,0–0,Spain,2022,?
55,Portugal,6–1,Switzerland,2022,?



Knockout stage predicton

In [89]:
# create get_winner Function
def get_winner(df_fixture_updated):
    for index, row in df_fixture_updated.iterrows():
        home, away  = row['Home Team'], row['Away Team']
        points_home, points_away = predict_points(home, away)
        if points_home > points_away:
            winner = home
        else:
            winner = away
    df_fixture_updated.loc[index, 'winner'] = winner
    return df_fixture_updated

In [90]:
get_winner(df_fixture_knockout)

Unnamed: 0,Home Team,Match Scores,Away Team,year,winner
48,Netherlands,3–1,United States,2022,?
49,Argentina,2–1,Australia,2022,?
50,France,3–1,Poland,2022,?
51,England,3–0,Senegal,2022,?
52,Japan,1–1,Croatia,2022,Croatia
53,Brazil,4–1,South Korea,2022,?
54,Morocco,0–0,Spain,2022,Spain
55,Portugal,6–1,Switzerland,2022,Portugal


Quarter Final Prediction

In [93]:
def update_table(df_fixture_round_1, df_fixture_round_2):
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'winner']
        match = df_fixture_round_1.loc[index, 'winner']
        df_fixture_round_2.replace({f'Winners {match}':winner}, inplace=True)
    df_fixture_round_2['winner'] = '?'
    return df_fixture_round_2

In [94]:
update_table(df_fixture_knockout, df_fixture_quarter)

Unnamed: 0,Home Team,Match Scores,Away Team,year,winner
56,Croatia,1–1,Brazil,2022,?
57,Netherlands,2–2,Argentina,2022,?
58,Morocco,1–0,Portugal,2022,?
59,England,1–2,France,2022,?


In [95]:
get_winner(df_fixture_quarter)

Unnamed: 0,Home Team,Match Scores,Away Team,year,winner
56,Croatia,1–1,Brazil,2022,?
57,Netherlands,2–2,Argentina,2022,?
58,Morocco,1–0,Portugal,2022,?
59,England,1–2,France,2022,France


Semi Final prediction

In [96]:
update_table(df_fixture_quarter, df_fixture_semi)

Unnamed: 0,Home Team,Match Scores,Away Team,year,winner
60,Argentina,3–0,Croatia,2022,?
61,France,2–0,Morocco,2022,?


In [97]:
get_winner(df_fixture_semi)

Unnamed: 0,Home Team,Match Scores,Away Team,year,winner
60,Argentina,3–0,Croatia,2022,?
61,France,2–0,Morocco,2022,France


In [98]:
update_table(df_fixture_semi, df_fixture_final)

Unnamed: 0,Home Team,Match Scores,Away Team,year,winner
62,Croatia,2–1,Morocco,2022,?
63,Argentina,3–3 (a.e.t.),France,2022,?


In [77]:
get_winner(df_fixture_final)

Unnamed: 0,Home Team,Match Scores,Away Team,year,winner
62,Croatia,2–1,Morocco,2022,?
63,Argentina,3–3 (a.e.t.),France,2022,France
