In [1]:
# Command to reload all modules before executing the code
%load_ext autoreload 
%autoreload 2

In [2]:
# ML libraries
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import xgboost as xgb
# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Utils
import os
import sys
from collections import defaultdict
from tqdm import tqdm
import warnings
from typing import List, Dict, Any, Tuple, Union, Optional, Callable, TypeVar
# Project modules
from src.data_loading import load_dataframe_teamfeatures, load_dataframe_playersfeatures

# Silence every warning category from this point on so cell outputs stay
# readable. This only affects warnings raised after this line runs; warnings
# emitted while the imports above execute are still shown.
# NOTE(review): a blanket 'ignore' also hides DeprecationWarning/FutureWarning
# from pandas and other libraries, so upcoming API breakages go unnoticed —
# consider restricting the filter to specific categories or modules.
warnings.filterwarnings('ignore')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [27]:
def _report_shape(label, frame):
    """Print `label` followed by the `.shape` of `frame`."""
    print(label, frame.shape)


# Team-level features: one frame per split, loaded then reported in turn.
print("Teamfeatures :")
df_teamfeatures_train = load_dataframe_teamfeatures("train")
_report_shape("Team features train shape: ", df_teamfeatures_train)
df_teamfeatures_test = load_dataframe_teamfeatures("test")
_report_shape("Team features test shape: ", df_teamfeatures_test)

# Player-level features: the loader's result is unpacked into a home frame
# and an away frame for each split.
print()
print("Playersfeatures :")
(
    df_playersfeatures_train_home,
    df_playersfeatures_train_away,
) = load_dataframe_playersfeatures("train")
_report_shape("Players features train home shape: ", df_playersfeatures_train_home)
_report_shape("Players features train away shape: ", df_playersfeatures_train_away)
(
    df_playersfeatures_test_home,
    df_playersfeatures_test_away,
) = load_dataframe_playersfeatures("test")
_report_shape("Players features test home shape: ", df_playersfeatures_test_home)
_report_shape("Players features test away shape: ", df_playersfeatures_test_away)


Teamfeatures :
Team features train shape:  (12303, 286)
Team features test shape:  (25368, 282)

Playersfeatures :
Players features train home shape:  (237079, 307)
Players features train away shape:  (236132, 307)
Players features test home shape:  (509816, 304)
Players features test away shape:  (504626, 304)


In [23]:
# Missing-value report for the train team features: prints the row count,
# then one line per column with its number of nulls and the share of rows
# that represents, ordered from most to fewest nulls.
n_rows = df_teamfeatures_train.shape[0]
print("Number of data for train teamfeatures: ", n_rows)
# NOTE(review): the header below says "for non-zero values", but no filter is
# applied — every column is printed, including those without any missing
# value. Confirm whether a `serie[serie > 0]` filter was intended.
print("Number of missing values per column for train teamfeatures, for non-zero values, sorted by number of missing values in descending order")
serie = df_teamfeatures_train.isnull().sum().sort_values(ascending=False)
# Percentages are computed on the whole Series so division follows pandas
# semantics (NaN/inf rather than an exception if the frame has no rows).
percentages = serie / n_rows * 100
# Iterate over (label, count, percentage) instead of `serie[i]`: the Series is
# indexed by column name, and integer keys on a non-integer index rely on a
# positional fallback that pandas has deprecated.
for column, n_missing, pct in zip(serie.index, serie, percentages):
    print(f"{column}: {n_missing}, i.e. {pct:.2f}%")

Number of data for train teamfeatures:  12303
Number of missing values per column for train teamfeatures, for non-zero values, sorted by number of missing values in descending order
HOME_TEAM_INJURIES_5_last_match_std: 3318, i.e. 26.97%
AWAY_TEAM_INJURIES_5_last_match_std: 3313, i.e. 26.93%
AWAY_TEAM_INJURIES_5_last_match_sum: 2933, i.e. 23.84%
AWAY_TEAM_INJURIES_5_last_match_average: 2932, i.e. 23.83%
HOME_TEAM_INJURIES_5_last_match_sum: 2932, i.e. 23.83%
HOME_TEAM_INJURIES_5_last_match_average: 2931, i.e. 23.82%
AWAY_TEAM_INJURIES_season_std: 2926, i.e. 23.78%
HOME_TEAM_INJURIES_season_std: 2924, i.e. 23.77%
AWAY_TEAM_INJURIES_season_sum: 1961, i.e. 15.94%
AWAY_TEAM_INJURIES_season_average: 1960, i.e. 15.93%
HOME_TEAM_INJURIES_season_sum: 1960, i.e. 15.93%
HOME_TEAM_INJURIES_season_average: 1959, i.e. 15.92%
AWAY_TEAM_BALL_SAFE_5_last_match_std: 1836, i.e. 14.92%
HOME_TEAM_BALL_SAFE_5_last_match_std: 1795, i.e. 14.59%
HOME_TEAM_PASSES_5_last_match_std: 1589, i.e. 12.92%
HOME_TEAM_SUC

In [24]:
# Missing-value report for the test team features: prints the row count,
# then one line per column with its number of nulls and the share of rows
# that represents, ordered from most to fewest nulls.
n_rows = df_teamfeatures_test.shape[0]
print("Number of data for test teamfeatures : ", n_rows)
print("Number of missing values per column for test teamfeatures, sorted by number of missing values in descending order")
serie = df_teamfeatures_test.isnull().sum().sort_values(ascending=False)
# Percentages are computed on the whole Series so division follows pandas
# semantics (NaN/inf rather than an exception if the frame has no rows).
percentages = serie / n_rows * 100
# Iterate over (label, count, percentage) instead of `serie[i]`: the Series is
# indexed by column name, and integer keys on a non-integer index rely on a
# positional fallback that pandas has deprecated.
for column, n_missing, pct in zip(serie.index, serie, percentages):
    print(f"{column}: {n_missing}, i.e. {pct:.2f}%")

Number of data for test teamfeatures :  25368
Number of missing values per column for test teamfeatures, sorted by number of missing values in descending order
AWAY_TEAM_INJURIES_5_last_match_std: 3676, i.e. 14.49%
HOME_TEAM_INJURIES_5_last_match_std: 3634, i.e. 14.33%
HOME_TEAM_BALL_SAFE_5_last_match_std: 2893, i.e. 11.40%
AWAY_TEAM_BALL_SAFE_5_last_match_std: 2890, i.e. 11.39%
HOME_TEAM_INJURIES_5_last_match_sum: 2705, i.e. 10.66%
HOME_TEAM_INJURIES_5_last_match_average: 2700, i.e. 10.64%
AWAY_TEAM_INJURIES_5_last_match_sum: 2692, i.e. 10.61%
AWAY_TEAM_INJURIES_5_last_match_average: 2687, i.e. 10.59%
AWAY_TEAM_INJURIES_season_std: 2481, i.e. 9.78%
HOME_TEAM_INJURIES_season_std: 2473, i.e. 9.75%
HOME_TEAM_BALL_SAFE_5_last_match_sum: 2254, i.e. 8.89%
HOME_TEAM_BALL_SAFE_5_last_match_average: 2254, i.e. 8.89%
AWAY_TEAM_BALL_SAFE_5_last_match_average: 2250, i.e. 8.87%
AWAY_TEAM_BALL_SAFE_5_last_match_sum: 2250, i.e. 8.87%
HOME_TEAM_BALL_SAFE_season_std: 2091, i.e. 8.24%
AWAY_TEAM_BALL_SA

In [25]:
# Missing-value report for the train home-player features: prints the row
# count, then one line per column with its number of nulls and the share of
# rows that represents, ordered from most to fewest nulls.
n_rows = df_playersfeatures_train_home.shape[0]
print("Number of data for train playersfeatures home: ", n_rows)
# NOTE(review): the header below says "for non-zero values", but no filter is
# applied — every column is printed, including those without any missing
# value. Confirm whether a `serie[serie > 0]` filter was intended.
print("Number of missing values per column for train playersfeatures home, for non-zero values, sorted by number of missing values in descending order")
serie = df_playersfeatures_train_home.isnull().sum().sort_values(ascending=False)
# Percentages are computed on the whole Series so division follows pandas
# semantics (NaN/inf rather than an exception if the frame has no rows).
percentages = serie / n_rows * 100
# Iterate over (label, count, percentage) instead of `serie[i]`: the Series is
# indexed by column name, and integer keys on a non-integer index rely on a
# positional fallback that pandas has deprecated.
for column, n_missing, pct in zip(serie.index, serie, percentages):
    print(f"{column}: {n_missing}, i.e. {pct:.2f}%")

Number of data for train playersfeatures home:  237079
Number of missing values per column for train playersfeatures home, for non-zero values, sorted by number of missing values in descending order
PLAYER_LONG_BALLS_season_std: 237079, i.e. 100.00%
PLAYER_LONG_BALLS_5_last_match_sum: 237079, i.e. 100.00%
PLAYER_SHOTS_OFF_TARGET_season_sum: 237079, i.e. 100.00%
PLAYER_CAPTAIN_season_average: 237079, i.e. 100.00%
PLAYER_LONG_BALLS_season_average: 237079, i.e. 100.00%
PLAYER_LONG_BALLS_WON_season_average: 237079, i.e. 100.00%
PLAYER_SHOTS_OFF_TARGET_season_average: 237079, i.e. 100.00%
PLAYER_CAPTAIN_season_std: 237079, i.e. 100.00%
PLAYER_LONG_BALLS_WON_season_std: 237079, i.e. 100.00%
PLAYER_SHOTS_OFF_TARGET_season_std: 237079, i.e. 100.00%
PLAYER_LONG_BALLS_WON_5_last_match_sum: 237079, i.e. 100.00%
PLAYER_LONG_BALLS_season_sum: 237079, i.e. 100.00%
PLAYER_SHOTS_OFF_TARGET_5_last_match_sum: 237079, i.e. 100.00%
PLAYER_CAPTAIN_5_last_match_average: 237079, i.e. 100.00%
PLAYER_LONG_BALL

In [26]:
# Missing-value report for the test home-player features: prints the row
# count, then one line per column with its number of nulls and the share of
# rows that represents, ordered from most to fewest nulls.
n_rows = df_playersfeatures_test_home.shape[0]
print("Number of data for test playersfeatures home: ", n_rows)
print("Number of missing values per column for test playersfeatures home, sorted by number of missing values in descending order")
serie = df_playersfeatures_test_home.isnull().sum().sort_values(ascending=False)
# Percentages are computed on the whole Series so division follows pandas
# semantics (NaN/inf rather than an exception if the frame has no rows).
percentages = serie / n_rows * 100
# Iterate over (label, count, percentage) instead of `serie[i]`: the Series is
# indexed by column name, and integer keys on a non-integer index rely on a
# positional fallback that pandas has deprecated.
for column, n_missing, pct in zip(serie.index, serie, percentages):
    print(f"{column}: {n_missing}, i.e. {pct:.2f}%")

Number of data for test playersfeatures home:  509816
Number of missing values per column for test playersfeatures home, sorted by number of missing values in descending order
PLAYER_CAPTAIN_5_last_match_std: 509816, i.e. 100.00%
PLAYER_CAPTAIN_season_average: 509816, i.e. 100.00%
PLAYER_CAPTAIN_5_last_match_average: 509816, i.e. 100.00%
PLAYER_CAPTAIN_season_std: 509816, i.e. 100.00%
PLAYER_CAPTAIN_5_last_match_sum: 471607, i.e. 92.51%
PLAYER_CAPTAIN_season_sum: 455890, i.e. 89.42%
PLAYER_LONG_BALLS_WON_season_std: 341389, i.e. 66.96%
PLAYER_LONG_BALLS_WON_5_last_match_std: 341389, i.e. 66.96%
PLAYER_LONG_BALLS_5_last_match_std: 341389, i.e. 66.96%
PLAYER_LONG_BALLS_season_std: 341389, i.e. 66.96%
PLAYER_LONG_BALLS_season_average: 338923, i.e. 66.48%
PLAYER_LONG_BALLS_WON_season_average: 338923, i.e. 66.48%
PLAYER_LONG_BALLS_WON_season_sum: 338923, i.e. 66.48%
PLAYER_LONG_BALLS_5_last_match_sum: 338923, i.e. 66.48%
PLAYER_LONG_BALLS_season_sum: 338923, i.e. 66.48%
PLAYER_LONG_BALLS_WO