In [76]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mathan/fifa-2018-match-statistics")

print("Path to dataset files:", path)

Path to dataset files: /Users/jeongho/.cache/kagglehub/datasets/mathan/fifa-2018-match-statistics/versions/20


In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split


from sklearn.neural_network import MLPClassifier
import tensorflow as tf

import os

df = pd.read_csv(os.path.join(path, "FIFA 2018 Statistics.csv"))

In [78]:
df

Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,14-06-2018,Russia,Saudi Arabia,5,40,13,7,3,3,6,...,0,0,0,Yes,12.0,Group Stage,No,0,,
1,14-06-2018,Saudi Arabia,Russia,0,60,6,0,3,3,2,...,0,0,0,No,,Group Stage,No,0,,
2,15-06-2018,Egypt,Uruguay,0,43,8,3,3,2,0,...,2,0,0,No,,Group Stage,No,0,,
3,15-06-2018,Uruguay,Egypt,1,57,14,4,6,4,5,...,0,0,0,Yes,89.0,Group Stage,No,0,,
4,15-06-2018,Morocco,Iran,0,64,13,3,6,4,5,...,1,0,0,No,,Group Stage,No,0,1.0,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,11-07-2018,England,Croatia,1,46,11,1,6,4,4,...,1,0,0,No,5.0,Semi- Finals,No,0,,
124,14-07-2018,Belgium,England,2,43,12,4,3,5,4,...,1,0,0,Yes,4.0,3rd Place,No,0,,
125,14-07-2018,England,Belgium,0,57,15,5,7,3,5,...,2,0,0,No,,3rd Place,No,0,,
126,15-07-2018,France,Croatia,4,39,8,6,1,1,2,...,2,0,0,Yes,18.0,Final,No,0,1.0,18.0


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date                    128 non-null    object 
 1   Team                    128 non-null    object 
 2   Opponent                128 non-null    object 
 3   Goal Scored             128 non-null    int64  
 4   Ball Possession %       128 non-null    int64  
 5   Attempts                128 non-null    int64  
 6   On-Target               128 non-null    int64  
 7   Off-Target              128 non-null    int64  
 8   Blocked                 128 non-null    int64  
 9   Corners                 128 non-null    int64  
 10  Offsides                128 non-null    int64  
 11  Free Kicks              128 non-null    int64  
 12  Saves                   128 non-null    int64  
 13  Pass Accuracy %         128 non-null    int64  
 14  Passes                  128 non-null    in

In [80]:
np.sum(df.isnull())

  return reduction(axis=axis, out=out, **passkwargs)


Date                        0
Team                        0
Opponent                    0
Goal Scored                 0
Ball Possession %           0
Attempts                    0
On-Target                   0
Off-Target                  0
Blocked                     0
Corners                     0
Offsides                    0
Free Kicks                  0
Saves                       0
Pass Accuracy %             0
Passes                      0
Distance Covered (Kms)      0
Fouls Committed             0
Yellow Card                 0
Yellow & Red                0
Red                         0
Man of the Match            0
1st Goal                   34
Round                       0
PSO                         0
Goals in PSO                0
Own goals                 116
Own goal Time             116
dtype: int64

In [81]:
df = df.drop(["Own goals", "Own goal Time", "Date"], axis=1)

In [82]:
np.sum(df["1st Goal"]) / len(df["1st Goal"])

28.9765625

In [83]:
df["1st Goal"] = df["1st Goal"].fillna(df["1st Goal"].mean())

In [84]:
for col in df.select_dtypes("object").columns:
    print(f"{col}", df[col].unique())

Team ['Russia' 'Saudi Arabia' 'Egypt' 'Uruguay' 'Morocco' 'Iran' 'Portugal'
 'Spain' 'France' 'Australia' 'Argentina' 'Iceland' 'Peru' 'Denmark'
 'Croatia' 'Nigeria' 'Costa Rica' 'Serbia' 'Germany' 'Mexico' 'Brazil'
 'Switzerland' 'Sweden' 'Korea Republic' 'Belgium' 'Panama' 'Tunisia'
 'England' 'Colombia' 'Japan' 'Poland' 'Senegal']
Opponent ['Saudi Arabia' 'Russia' 'Uruguay' 'Egypt' 'Iran' 'Morocco' 'Spain'
 'Portugal' 'Australia' 'France' 'Iceland' 'Argentina' 'Denmark' 'Peru'
 'Nigeria' 'Croatia' 'Serbia' 'Costa Rica' 'Mexico' 'Germany'
 'Switzerland' 'Brazil' 'Korea Republic' 'Sweden' 'Panama' 'Belgium'
 'England' 'Tunisia' 'Japan' 'Colombia' 'Senegal' 'Poland']
Man of the Match ['Yes' 'No']
Round ['Group Stage' 'Round of 16' 'Quarter Finals' 'Semi- Finals' '3rd Place'
 'Final']
PSO ['No' 'Yes']


In [85]:
round_values = list(df["Round"].unique())

print(round_values)

round_mappings = {label: index for index, label in enumerate(round_values)}
print(round_mappings)

df["Round"] = df["Round"].apply(lambda x: round_mappings[x])

['Group Stage', 'Round of 16', 'Quarter Finals', 'Semi- Finals', '3rd Place', 'Final']
{'Group Stage': 0, 'Round of 16': 1, 'Quarter Finals': 2, 'Semi- Finals': 3, '3rd Place': 4, 'Final': 5}


In [86]:
encoder = LabelEncoder()

df["PSO"] = encoder.fit_transform(df["PSO"])
pso_mappings = {label: index for index, label in enumerate(encoder.classes_)}

In [87]:
df["Man of the Match"] = encoder.fit_transform(df["Man of the Match"])
motm_mappings = {label: index for index, label in enumerate(encoder.classes_)}

In [88]:
print(pso_mappings)
print(motm_mappings)

{'No': 0, 'Yes': 1}
{'No': 0, 'Yes': 1}


In [91]:
pd.get_dummies(df["Team"], dtype=int)

Unnamed: 0,Argentina,Australia,Belgium,Brazil,Colombia,Costa Rica,Croatia,Denmark,Egypt,England,...,Portugal,Russia,Saudi Arabia,Senegal,Serbia,Spain,Sweden,Switzerland,Tunisia,Uruguay
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
124,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
125,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
126,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
pd.get_dummies(df["Opponent"], dtype=int)

Unnamed: 0,Argentina,Australia,Belgium,Brazil,Colombia,Costa Rica,Croatia,Denmark,Egypt,England,...,Portugal,Russia,Saudi Arabia,Senegal,Serbia,Spain,Sweden,Switzerland,Tunisia,Uruguay
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
125,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
126,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
pd.get_dummies(df["Opponent"].apply(lambda x: "opp_" + x), dtype=int)

Unnamed: 0,opp_Argentina,opp_Australia,opp_Belgium,opp_Brazil,opp_Colombia,opp_Costa Rica,opp_Croatia,opp_Denmark,opp_Egypt,opp_England,...,opp_Portugal,opp_Russia,opp_Saudi Arabia,opp_Senegal,opp_Serbia,opp_Spain,opp_Sweden,opp_Switzerland,opp_Tunisia,opp_Uruguay
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
125,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
126,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
df["Opponent"] = df["Opponent"].apply(lambda x: "opp_" + x)

In [98]:
df_concat = pd.concat(
    [df, pd.get_dummies(df["Team"]), pd.get_dummies(df["Opponent"])], axis=1
)

In [101]:
df_concat.drop(["Team", "Opponent"], axis=1, inplace=True)

In [102]:
df_concat

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,opp_Portugal,opp_Russia,opp_Saudi Arabia,opp_Senegal,opp_Serbia,opp_Spain,opp_Sweden,opp_Switzerland,opp_Tunisia,opp_Uruguay
0,5,40,13,7,3,3,6,3,11,0,...,False,False,True,False,False,False,False,False,False,False
1,0,60,6,0,3,3,2,1,25,2,...,False,True,False,False,False,False,False,False,False,False
2,0,43,8,3,3,2,0,1,7,3,...,False,False,False,False,False,False,False,False,False,True
3,1,57,14,4,6,4,5,1,13,3,...,False,False,False,False,False,False,False,False,False,False
4,0,64,13,3,6,4,5,0,14,2,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,1,46,11,1,6,4,4,3,24,5,...,False,False,False,False,False,False,False,False,False,False
124,2,43,12,4,3,5,4,1,5,5,...,False,False,False,False,False,False,False,False,False,False
125,0,57,15,5,7,3,5,0,12,2,...,False,False,False,False,False,False,False,False,False,False
126,4,39,8,6,1,1,2,1,14,1,...,False,False,False,False,False,False,False,False,False,False


In [104]:
np.sum(df_concat.dtypes == "object")

0

In [105]:
y = df_concat["Man of the Match"]
X = df_concat.drop(["Man of the Match"], axis=1)

In [124]:
y

0      1
1      0
2      0
3      1
4      0
      ..
123    0
124    1
125    0
126    1
127    0
Name: Man of the Match, Length: 128, dtype: int64

In [107]:
scaler = RobustScaler()

X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [108]:
X

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,opp_Portugal,opp_Russia,opp_Saudi Arabia,opp_Senegal,opp_Serbia,opp_Spain,opp_Sweden,opp_Switzerland,opp_Tunisia,opp_Uruguay
0,2.0,-0.6250,0.166667,1.166667,-0.666667,0.000000,0.333333,1.0,-0.571429,-0.666667,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.5,0.6250,-1.000000,-1.166667,-0.666667,0.000000,-1.000000,0.0,1.428571,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.5,-0.4375,-0.666667,-0.166667,-0.666667,-0.444444,-1.666667,0.0,-1.142857,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.4375,0.333333,0.166667,0.333333,0.444444,0.000000,0.0,-0.285714,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.5,0.8750,0.166667,-0.166667,0.333333,0.444444,0.000000,-0.5,-0.142857,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0.0,-0.2500,-0.166667,-0.833333,0.333333,0.444444,-0.333333,1.0,1.285714,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124,0.5,-0.4375,0.000000,0.166667,-0.666667,0.888889,-0.333333,0.0,-1.428571,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125,-0.5,0.4375,0.500000,0.500000,0.666667,0.000000,0.000000,-0.5,-0.428571,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,1.5,-0.6875,-0.666667,0.833333,-1.333333,-0.888889,-1.000000,0.0,-0.142857,-0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True)

In [110]:
sk_model = MLPClassifier(hidden_layer_sizes=(32, 32))
sk_model.fit(X_train, y_train)



In [112]:
inputs = tf.keras.Input(shape=(85,))
x = tf.keras.layers.Dense(32, activation=tf.nn.relu)(inputs)
x = tf.keras.layers.Dense(32, activation=tf.nn.relu)(x)
outputs = tf.keras.layers.Dense(2, activation=tf.nn.softmax)(x)

tf_model = tf.keras.Model(inputs=inputs, outputs=outputs)

tf_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

In [113]:
tf_model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=16,
    epochs=200,
)

Epoch 1/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.4789 - loss: 0.7420 - val_accuracy: 0.3889 - val_loss: 0.7718
Epoch 2/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4678 - loss: 0.7302 - val_accuracy: 0.3889 - val_loss: 0.7464
Epoch 3/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6395 - loss: 0.6754 - val_accuracy: 0.3889 - val_loss: 0.7272
Epoch 4/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5362 - loss: 0.6756 - val_accuracy: 0.4444 - val_loss: 0.7115
Epoch 5/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6203 - loss: 0.6320 - val_accuracy: 0.5556 - val_loss: 0.7005
Epoch 6/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7030 - loss: 0.6211 - val_accuracy: 0.5556 - val_loss: 0.6913
Epoch 7/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x32e3149a0>

In [122]:
print(sk_model.score(X_test, y_test))
print(tf_model.evaluate(X_test, y_test, verbose=False)[1])

0.5897435897435898
0.5641025900840759


In [126]:
from pycaret.classification import *

setup(df_concat, target=df_concat["Man of the Match"], train_size=0.7, session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Man of the Match_y
2,Target type,Binary
3,Original data shape,"(128, 87)"
4,Transformed data shape,"(128, 87)"
5,Transformed train set shape,"(89, 87)"
6,Transformed test set shape,"(39, 87)"
7,Numeric features,22
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x331ef2fd0>

In [127]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.575,0.5875,0.58,0.5267,0.5474,0.1297,0.1366,0.043
catboost,CatBoost Classifier,0.5625,0.6025,0.68,0.5879,0.6156,0.1248,0.1331,0.146
rf,Random Forest Classifier,0.5514,0.6138,0.615,0.6012,0.5803,0.1025,0.1162,0.023
qda,Quadratic Discriminant Analysis,0.55,0.545,0.595,0.5614,0.5655,0.0972,0.108,0.007
xgboost,Extreme Gradient Boosting,0.5292,0.5738,0.575,0.5705,0.5445,0.0425,0.0539,0.015
et,Extra Trees Classifier,0.5278,0.5163,0.56,0.5875,0.5418,0.0625,0.0583,0.029
lr,Logistic Regression,0.5194,0.585,0.555,0.5656,0.5176,0.0338,0.0361,0.264
dt,Decision Tree Classifier,0.4958,0.4925,0.615,0.5112,0.5497,-0.0189,-0.0142,0.006
nb,Naive Bayes,0.4944,0.475,0.455,0.5352,0.4635,-0.0008,-0.0027,0.005
svm,SVM - Linear Kernel,0.4944,0.44,0.52,0.3444,0.3608,0.0182,0.0316,0.005


In [128]:
setup(df, target=df["Man of the Match"], train_size=0.7, session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Man of the Match_y
2,Target type,Binary
3,Original data shape,"(128, 25)"
4,Transformed data shape,"(128, 25)"
5,Transformed train set shape,"(89, 25)"
6,Transformed test set shape,"(39, 25)"
7,Numeric features,22
8,Categorical features,2
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x3341d8610>

In [129]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.55,0.6175,0.625,0.5667,0.5411,0.1284,0.151,0.008
dt,Decision Tree Classifier,0.5264,0.5325,0.41,0.6167,0.4399,0.0704,0.0991,0.01
lda,Linear Discriminant Analysis,0.5264,0.51,0.555,0.5548,0.524,0.0555,0.0781,0.007
lr,Logistic Regression,0.5208,0.595,0.52,0.5262,0.502,0.0492,0.044,0.019
ridge,Ridge Classifier,0.5194,0.6,0.585,0.5187,0.5354,0.046,0.0064,0.009
ada,Ada Boost Classifier,0.5167,0.545,0.555,0.5333,0.5244,0.0303,0.0442,0.013
et,Extra Trees Classifier,0.5167,0.4975,0.555,0.5567,0.5264,0.0365,0.0532,0.019
svm,SVM - Linear Kernel,0.4944,0.445,0.52,0.3444,0.3608,0.0182,0.0316,0.019
gbc,Gradient Boosting Classifier,0.4944,0.4638,0.535,0.5516,0.4985,-0.0105,-0.0013,0.013
rf,Random Forest Classifier,0.4833,0.5062,0.495,0.5469,0.4696,-0.0238,-0.001,0.023
