In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats

import pickle
import os

In [8]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("beridzeg45/top-league-footballer-stats-2000-2023-seasons")

print("Path to dataset files:", path)

Path to dataset files: /Users/tirdodbehbehani/.cache/kagglehub/datasets/beridzeg45/top-league-footballer-stats-2000-2023-seasons/versions/10


In [9]:
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns=100
plt.style.use('ggplot')

In [10]:
# List of all files to load
files = [
    # Tirdod files 
    'standard_stats.csv',
    'goal_and_shot_creation.csv',
    'defensive_actions.csv',
    'goalkeeping.csv',
    'passing.csv',
    # Me files 
    'miscellaneous_stats.csv',
    'advanced_goalkeeping.csv',
    'shooting.csv',
    'possession.csv',
    'playing_time.csv'
]

# Load each file into a variable
for file in files:
    # Generate variable name from file name (e.g., 'standard_stats' from 'standard_stats.csv')
    var_name = file.split('.')[0]
    file_path = os.path.join(path, file)
    
    # Dynamically assign to a variable
    globals()[var_name] = pd.read_csv(file_path)


standard_stats.head()

Unnamed: 0,Season,League,Team,Unnamed: 0_level_0_Player,Unnamed: 1_level_0_Nation,Unnamed: 2_level_0_Pos,Unnamed: 3_level_0_Age,Unnamed: 4_level_0_MP,Playing Time_Starts,Playing Time_Min,Playing Time_90s,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Performance_PKatt,Performance_CrdY,Performance_CrdR,Progression_PrgC,Progression_PrgP,Progression_PrgR,Per 90 Minutes_Gls,Per 90 Minutes_Ast,Per 90 Minutes_G+A,Per 90 Minutes_G-PK,Per 90 Minutes_G+A-PK,Unnamed: 24_level_0_Matches,Playing Time_MP,Expected_xG,Expected_npxG,Expected_xAG,Expected_npxG+xAG,Per 90 Minutes_xG,Per 90 Minutes_xAG,Per 90 Minutes_xG+xAG,Per 90 Minutes_npxG,Per 90 Minutes_npxG+xAG,Unnamed: 33_level_0_Matches,S_e,L_e,T_e,U_n,U_n.1,U_n.2,U_n.3,P_l,P_l.1,P_l.2,P_l.3,P_e,P_e.1,P_e.2,P_e.3,P_e.4,P_e.5,P_e.6,P_e.7,E_x,E_x.1,E_x.2,E_x.3,P_r,P_r.1,P_r.2,P_e.8,P_e.9,P_e.10,P_e.11,P_e.12,P_e.13,P_e.14,P_e.15,P_e.16,P_e.17,U_n.4,U_n.5
0,2000-2001,EPL,Manchester Utd,Gary Neville,eng ENG,DF,25.0,32.0,32.0,2849.0,31.7,1.0,1.0,2.0,1.0,0.0,0.0,4.0,0.0,,,,0.03,0.03,0.06,0.03,0.06,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2000-2001,EPL,Manchester Utd,Fabien Barthez,fr FRA,GK,29.0,30.0,30.0,2675.0,29.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2000-2001,EPL,Manchester Utd,David Beckham,eng ENG,MF,25.0,31.0,29.0,2648.0,29.4,9.0,12.0,21.0,8.0,1.0,1.0,3.0,0.0,,,,0.31,0.41,0.71,0.27,0.68,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2000-2001,EPL,Manchester Utd,Paul Scholes,eng ENG,MF,25.0,32.0,28.0,2450.0,27.2,6.0,5.0,11.0,6.0,0.0,1.0,3.0,0.0,,,,0.22,0.18,0.4,0.22,0.4,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2000-2001,EPL,Manchester Utd,Roy Keane,ie IRL,MF,28.0,28.0,28.0,2380.0,26.4,2.0,7.0,9.0,2.0,0.0,0.0,2.0,1.0,,,,0.08,0.26,0.34,0.08,0.34,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
# Select the desired seasons and leagues 
desired_seasons = ['2019-2020', '2020-2021', '2021-2022', '2022-2023']
desired_leagues = ['EPL', 'Bundesliga', 'LaLiga', 'Ligue1', 'SeriaA']

In [13]:
# Remove the totals rows 
standard_stats=standard_stats.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')
goalkeeping=goalkeeping.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')
advanced_goalkeeping=advanced_goalkeeping.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')
shooting=shooting.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')
passing=passing.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')
goal_and_shot_creation=goal_and_shot_creation.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')
defensive_actions=defensive_actions.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')
possession=possession.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')
playing_time=playing_time.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')
miscellaneous_stats=miscellaneous_stats.query('`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False')

SyntaxError: Failed to parse backticks in '`Unnamed: 0_level_0_Player`.isin(["Squad Total","Opponent Total"])==False'. (<string>)

In [748]:
# Create a list storing all our datasets 
dataframes = [
    standard_stats,
    goal_and_shot_creation,
    defensive_actions,
    goalkeeping,
    passing,
    advanced_goalkeeping,
    shooting,
    possession,
    playing_time,
    miscellaneous_stats
]

In [749]:
# Filter for top 5 leagues, and desired seasons 
for df in dataframes:
    if 'Season' in df.columns and 'League' in df.columns:
        df.query(
            '`Unnamed: 0_level_0_Player`.isin(["Squad Total", "Opponent Total"]) == False and '
            'Season in @desired_seasons and '
            'League in @desired_leagues',
            inplace=True
        )
    else:
        print("Columns 'Season' and/or 'League' not found in the DataFrame.")

for i, df in enumerate(dataframes, start=1):
    print(f"DataFrame {i}:")
    print(df[['Unnamed: 0_level_0_Player', 'Season', 'League', 'Team']].drop_duplicates())
    print("-" * 40)

DataFrame 1:
       Unnamed: 0_level_0_Player     Season  League       Team
76825            Virgil van Dijk  2019-2020     EPL  Liverpool
76826     Trent Alexander-Arnold  2019-2020     EPL  Liverpool
76827        Georginio Wijnaldum  2019-2020     EPL  Liverpool
76828           Andrew Robertson  2019-2020     EPL  Liverpool
76829            Roberto Firmino  2019-2020     EPL  Liverpool
...                          ...        ...     ...        ...
100436            Arttu Lötjönen  2022-2023  SeriaA  Sampdoria
100437          Elia Tantalocchi  2022-2023  SeriaA  Sampdoria
100438           Simone Trimboli  2022-2023  SeriaA  Sampdoria
100439             Lorenzo Villa  2022-2023  SeriaA  Sampdoria
100440            Federico Zorzi  2022-2023  SeriaA  Sampdoria

[13808 rows x 4 columns]
----------------------------------------
DataFrame 2:
      Unnamed: 0_level_0_Player     Season  League       Team
6829            Virgil van Dijk  2019-2020     EPL  Liverpool
6830     Trent Alexander-Ar

In [750]:
# Check shape of all data 
print('-------------------------')
print(standard_stats.shape)
print('-------------------------')
print(goalkeeping.shape)
print('-------------------------')
print(advanced_goalkeeping.shape)
print('-------------------------')
print(shooting.shape)
print('-------------------------')
print(passing.shape)
print('-------------------------')
print(goal_and_shot_creation.shape)
print('-------------------------')
print(defensive_actions.shape)
print('-------------------------')
print(possession.shape)
print('-------------------------')
print(playing_time.shape)
print('-------------------------')
print(miscellaneous_stats.shape)
print('-------------------------')

-------------------------
(13808, 77)
-------------------------
(839, 54)
-------------------------
(839, 68)
-------------------------
(11364, 53)
-------------------------
(11364, 66)
-------------------------
(11364, 50)
-------------------------
(11364, 50)
-------------------------
(11364, 62)
-------------------------
(13808, 63)
-------------------------
(11364, 51)
-------------------------


In [751]:
# Check columns of all data 
print('-------------------------')
print(standard_stats.columns)
print('-------------------------')
print(goalkeeping.columns)
print('-------------------------')
print(shooting.columns)
print('-------------------------')
print(passing.columns)
print('-------------------------')
print(goal_and_shot_creation.columns)
print('-------------------------')
print(defensive_actions.columns)
print('-------------------------')
print(possession.columns)
print('-------------------------')
print(playing_time.columns)
print('-------------------------')
print(miscellaneous_stats.columns)
print('-------------------------')

-------------------------
Index(['Season', 'League', 'Team', 'Unnamed: 0_level_0_Player',
       'Unnamed: 1_level_0_Nation', 'Unnamed: 2_level_0_Pos',
       'Unnamed: 3_level_0_Age', 'Unnamed: 4_level_0_MP',
       'Playing Time_Starts', 'Playing Time_Min', 'Playing Time_90s',
       'Performance_Gls', 'Performance_Ast', 'Performance_G+A',
       'Performance_G-PK', 'Performance_PK', 'Performance_PKatt',
       'Performance_CrdY', 'Performance_CrdR', 'Progression_PrgC',
       'Progression_PrgP', 'Progression_PrgR', 'Per 90 Minutes_Gls',
       'Per 90 Minutes_Ast', 'Per 90 Minutes_G+A', 'Per 90 Minutes_G-PK',
       'Per 90 Minutes_G+A-PK', 'Unnamed: 24_level_0_Matches',
       'Playing Time_MP', 'Expected_xG', 'Expected_npxG', 'Expected_xAG',
       'Expected_npxG+xAG', 'Per 90 Minutes_xG', 'Per 90 Minutes_xAG',
       'Per 90 Minutes_xG+xAG', 'Per 90 Minutes_npxG',
       'Per 90 Minutes_npxG+xAG', 'Unnamed: 33_level_0_Matches', 'S_e', 'L_e',
       'T_e', 'U_n', 'U_n.1', 'U_n.2',

In [752]:
# Define a function to clean column names
def clean_columns(df):
    df.columns = (
        df.columns
        .str.replace('Unnamed: \d+_level_0_', '', regex=True)  # Remove 'Unnamed: n_level_0_' prefixes
        .str.replace(' ', '_')  # Replace spaces with underscores
        .str.replace('-', '_')  # Replace hyphens with underscores
        .str.lower()  # Convert to lowercase
    )
    return df

# Apply this function to all datasets
datasets = [clean_columns(df) for df in dataframes]

### Specific Dataset Analysis

## 1. Shooting

In [753]:
shooting.head()

Unnamed: 0,season,league,team,player,nation,pos,age,90s,standard_gls,standard_sh,standard_sot,standard_sot%,standard_sh/90,standard_sot/90,standard_g/sh,standard_g/sot,standard_dist,standard_pk,standard_pkatt,matches,standard_fk,expected_xg,expected_npxg,expected_npxg/sh,expected_g_xg,expected_np:g_xg,matches.1,s_e,l_e,t_e,u_n,u_n.1,u_n.2,u_n.3,u_n.4,s_t,s_t.1,s_t.2,s_t.3,s_t.4,s_t.5,s_t.6,s_t.7,s_t.8,s_t.9,s_t.10,s_t.11,e_x,e_x.1,e_x.2,e_x.3,e_x.4,u_n.5
73654,2019-2020,EPL,Liverpool,Virgil van Dijk,nl NED,DF,28.0,38.0,5.0,31.0,14.0,45.2,0.82,0.37,0.16,0.36,11.8,0.0,0.0,,0.0,2.9,2.9,0.09,2.1,2.1,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,
73655,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,eng ENG,DF,20.0,35.3,4.0,44.0,13.0,29.5,1.25,0.37,0.09,0.31,22.6,0.0,0.0,,13.0,2.8,2.8,0.06,1.2,1.2,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,
73656,2019-2020,EPL,Liverpool,Georginio Wijnaldum,nl NED,MF,28.0,32.6,4.0,36.0,15.0,41.7,1.1,0.46,0.11,0.27,16.2,0.0,0.0,,0.0,2.6,2.6,0.07,1.4,1.4,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,
73657,2019-2020,EPL,Liverpool,Andrew Robertson,sct SCO,DF,25.0,34.6,2.0,22.0,6.0,27.3,0.64,0.17,0.09,0.33,18.6,0.0,0.0,,0.0,1.8,1.8,0.08,0.2,0.2,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,
73658,2019-2020,EPL,Liverpool,Roberto Firmino,br BRA,FW,27.0,33.2,9.0,99.0,38.0,38.4,2.98,1.14,0.09,0.24,13.7,0.0,0.0,,0.0,14.0,14.0,0.14,-5.0,-5.0,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,


In [754]:
def clean_columns(df): 
    # Get the index of the first occurrence of 'matches'
    matches_index = df.columns.get_indexer_for(['matches'])[1]
    
    # Slice the DataFrame to include columns up to (but not including) the first 'matches'
    df = df.iloc[:, :matches_index]
    
    # Verify the columns
    print("Remaining columns:", list(df.columns))

    return df 

In [755]:
shooting_cleaned = clean_columns(shooting)
shooting_cleaned.head()

Remaining columns: ['season', 'league', 'team', 'player', 'nation', 'pos', 'age', '90s', 'standard_gls', 'standard_sh', 'standard_sot', 'standard_sot%', 'standard_sh/90', 'standard_sot/90', 'standard_g/sh', 'standard_g/sot', 'standard_dist', 'standard_pk', 'standard_pkatt', 'matches', 'standard_fk', 'expected_xg', 'expected_npxg', 'expected_npxg/sh', 'expected_g_xg', 'expected_np:g_xg']


Unnamed: 0,season,league,team,player,nation,pos,age,90s,standard_gls,standard_sh,standard_sot,standard_sot%,standard_sh/90,standard_sot/90,standard_g/sh,standard_g/sot,standard_dist,standard_pk,standard_pkatt,matches,standard_fk,expected_xg,expected_npxg,expected_npxg/sh,expected_g_xg,expected_np:g_xg
73654,2019-2020,EPL,Liverpool,Virgil van Dijk,nl NED,DF,28.0,38.0,5.0,31.0,14.0,45.2,0.82,0.37,0.16,0.36,11.8,0.0,0.0,,0.0,2.9,2.9,0.09,2.1,2.1
73655,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,eng ENG,DF,20.0,35.3,4.0,44.0,13.0,29.5,1.25,0.37,0.09,0.31,22.6,0.0,0.0,,13.0,2.8,2.8,0.06,1.2,1.2
73656,2019-2020,EPL,Liverpool,Georginio Wijnaldum,nl NED,MF,28.0,32.6,4.0,36.0,15.0,41.7,1.1,0.46,0.11,0.27,16.2,0.0,0.0,,0.0,2.6,2.6,0.07,1.4,1.4
73657,2019-2020,EPL,Liverpool,Andrew Robertson,sct SCO,DF,25.0,34.6,2.0,22.0,6.0,27.3,0.64,0.17,0.09,0.33,18.6,0.0,0.0,,0.0,1.8,1.8,0.08,0.2,0.2
73658,2019-2020,EPL,Liverpool,Roberto Firmino,br BRA,FW,27.0,33.2,9.0,99.0,38.0,38.4,2.98,1.14,0.09,0.24,13.7,0.0,0.0,,0.0,14.0,14.0,0.14,-5.0,-5.0


In [756]:
shooting_cleaned.columns

Index(['season', 'league', 'team', 'player', 'nation', 'pos', 'age', '90s',
       'standard_gls', 'standard_sh', 'standard_sot', 'standard_sot%',
       'standard_sh/90', 'standard_sot/90', 'standard_g/sh', 'standard_g/sot',
       'standard_dist', 'standard_pk', 'standard_pkatt', 'matches',
       'standard_fk', 'expected_xg', 'expected_npxg', 'expected_npxg/sh',
       'expected_g_xg', 'expected_np:g_xg'],
      dtype='object')

In [757]:
import pandas as pd
import numpy as np

def generate_per90_and_total(df):
    """
    Generates two datasets: per90 and total.

    Parameters:
        df (pd.DataFrame): The input DataFrame with all columns.

    Returns:
        tuple: A tuple containing the per90 DataFrame and the total DataFrame.
    """
    # Identify columns in "per90" format or containing '/sh', '/sot', etc.
    per_90_existing_columns = [col for col in df.columns if '/90' in col.lower()]

    # Create the total dataset by dropping per90 columns
    df = df.drop(columns=per_90_existing_columns)
    
    exclude_columns = [col for col in df.columns if '/sh' in col.lower() or '/sot' in col.lower() or '%' in col.lower()]
    
    # Columns that should not be divided
    non_divisible_columns = ['season', 'league', 'team', 'player', 'nation', 'pos', 'age', '90s', 'matches', 'standard_dist'] + exclude_columns
    
    # Columns to divide by '90s'
    columns_to_divide = [col for col in df.columns if col not in non_divisible_columns]
    
    # Replace zeros in '90s' with NaN to avoid ZeroDivisionError
    df['90s'] = df['90s'].replace(0, np.nan)
    
    # Create the per90 dataset
    per90_df = df.copy()
    for col in columns_to_divide:
        per90_df[col] = per90_df[col] / per90_df['90s']

    # Add '_per90' suffix to transformed columns
    rename_columns = {col: f"{col}_per90" for col in columns_to_divide}
    per90_df.rename(columns=rename_columns, inplace=True)
    
    # Round the newly created _per90 columns to 2 decimal places
    per90_df[list(rename_columns.values())] = per90_df[list(rename_columns.values())].round(2)
    
    # Create the total dataset by dropping per90 columns
    total_df = df
    
    return per90_df, total_df

def drop_redundant_columns(df):
    # Drop redundant columns
    df.drop(columns=['nation', 'pos', 'age', '90s', 'matches'], inplace=True, axis=1) 
    return df

In [758]:
shooting_per90, shooting_total = generate_per90_and_total(shooting_cleaned)
shooting_per90 = drop_redundant_columns(shooting_per90)
shooting_total = drop_redundant_columns(shooting_total)

In [759]:
shooting_per90.head()

Unnamed: 0,season,league,team,player,standard_gls_per90,standard_sh_per90,standard_sot_per90,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk_per90,standard_pkatt_per90,standard_fk_per90,expected_xg_per90,expected_npxg_per90,expected_npxg/sh,expected_g_xg_per90,expected_np:g_xg_per90
73654,2019-2020,EPL,Liverpool,Virgil van Dijk,0.13,0.82,0.37,45.2,0.16,0.36,11.8,0.0,0.0,0.0,0.08,0.08,0.09,0.06,0.06
73655,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,0.11,1.25,0.37,29.5,0.09,0.31,22.6,0.0,0.0,0.37,0.08,0.08,0.06,0.03,0.03
73656,2019-2020,EPL,Liverpool,Georginio Wijnaldum,0.12,1.1,0.46,41.7,0.11,0.27,16.2,0.0,0.0,0.0,0.08,0.08,0.07,0.04,0.04
73657,2019-2020,EPL,Liverpool,Andrew Robertson,0.06,0.64,0.17,27.3,0.09,0.33,18.6,0.0,0.0,0.0,0.05,0.05,0.08,0.01,0.01
73658,2019-2020,EPL,Liverpool,Roberto Firmino,0.27,2.98,1.14,38.4,0.09,0.24,13.7,0.0,0.0,0.0,0.42,0.42,0.14,-0.15,-0.15


In [760]:
shooting_total.head()

Unnamed: 0,season,league,team,player,standard_gls,standard_sh,standard_sot,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk,standard_pkatt,standard_fk,expected_xg,expected_npxg,expected_npxg/sh,expected_g_xg,expected_np:g_xg
73654,2019-2020,EPL,Liverpool,Virgil van Dijk,5.0,31.0,14.0,45.2,0.16,0.36,11.8,0.0,0.0,0.0,2.9,2.9,0.09,2.1,2.1
73655,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,4.0,44.0,13.0,29.5,0.09,0.31,22.6,0.0,0.0,13.0,2.8,2.8,0.06,1.2,1.2
73656,2019-2020,EPL,Liverpool,Georginio Wijnaldum,4.0,36.0,15.0,41.7,0.11,0.27,16.2,0.0,0.0,0.0,2.6,2.6,0.07,1.4,1.4
73657,2019-2020,EPL,Liverpool,Andrew Robertson,2.0,22.0,6.0,27.3,0.09,0.33,18.6,0.0,0.0,0.0,1.8,1.8,0.08,0.2,0.2
73658,2019-2020,EPL,Liverpool,Roberto Firmino,9.0,99.0,38.0,38.4,0.09,0.24,13.7,0.0,0.0,0.0,14.0,14.0,0.14,-5.0,-5.0


## 2. Possession

In [761]:
possession.head()

Unnamed: 0,season,league,team,player,nation,pos,age,90s,touches_touches,touches_def_pen,touches_def_3rd,touches_mid_3rd,touches_att_3rd,touches_att_pen,touches_live,take_ons_att,take_ons_succ,take_ons_succ%,take_ons_tkld,take_ons_tkld%,carries_carries,carries_totdist,carries_prgdist,carries_prgc,carries_1/3,carries_cpa,carries_mis,carries_dis,receiving_rec,receiving_prgr,matches,s_e,l_e,t_e,u_n,u_n.1,u_n.2,u_n.3,u_n.4,t_o,t_o.1,t_o.2,t_o.3,t_o.4,t_o.5,t_o.6,t_a,t_a.1,t_a.2,t_a.3,t_a.4,c_a,c_a.1,c_a.2,c_a.3,c_a.4,c_a.5,c_a.6,c_a.7,r_e,r_e.1,u_n.5
6829,2019-2020,EPL,Liverpool,Virgil van Dijk,nl NED,DF,28.0,38.0,3588.0,315.0,1365.0,2126.0,102.0,49.0,3588.0,4.0,3.0,75.0,1.0,25.0,2484.0,9365.0,4594.0,21.0,19.0,3.0,2.0,2.0,2580.0,14.0,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6830,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,eng ENG,DF,20.0,35.3,3582.0,99.0,825.0,1590.0,1193.0,54.0,3582.0,51.0,21.0,41.2,30.0,58.8,2136.0,9096.0,5332.0,98.0,113.0,16.0,47.0,12.0,2281.0,226.0,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6831,2019-2020,EPL,Liverpool,Georginio Wijnaldum,nl NED,MF,28.0,32.6,1947.0,41.0,267.0,1168.0,530.0,74.0,1947.0,74.0,41.0,55.4,33.0,44.6,1410.0,6609.0,3045.0,74.0,67.0,5.0,43.0,30.0,1439.0,99.0,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6832,2019-2020,EPL,Liverpool,Andrew Robertson,sct SCO,DF,25.0,34.6,3296.0,89.0,622.0,1575.0,1129.0,83.0,3296.0,44.0,22.0,50.0,22.0,50.0,2152.0,10659.0,6603.0,149.0,115.0,20.0,50.0,9.0,2317.0,277.0,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6833,2019-2020,EPL,Liverpool,Roberto Firmino,br BRA,FW,27.0,33.2,1558.0,16.0,103.0,697.0,775.0,211.0,1558.0,90.0,52.0,57.8,38.0,42.2,1082.0,5483.0,2646.0,90.0,62.0,35.0,92.0,39.0,1203.0,218.0,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [762]:
def clean_columns(df): 
    # Get the index of the first occurrence of 'matches'
    matches_index = df.columns.get_indexer_for(['matches'])[0]
    
    # Slice the DataFrame to include columns up to (but not including) the first 'matches'
    df = df.iloc[:, :matches_index]
    
    # Verify the columns
    print("Remaining columns:", list(df.columns))

    return df 

In [763]:
possession_cleaned = clean_columns(possession)

Remaining columns: ['season', 'league', 'team', 'player', 'nation', 'pos', 'age', '90s', 'touches_touches', 'touches_def_pen', 'touches_def_3rd', 'touches_mid_3rd', 'touches_att_3rd', 'touches_att_pen', 'touches_live', 'take_ons_att', 'take_ons_succ', 'take_ons_succ%', 'take_ons_tkld', 'take_ons_tkld%', 'carries_carries', 'carries_totdist', 'carries_prgdist', 'carries_prgc', 'carries_1/3', 'carries_cpa', 'carries_mis', 'carries_dis', 'receiving_rec', 'receiving_prgr']


In [764]:
possession_cleaned.head()

Unnamed: 0,season,league,team,player,nation,pos,age,90s,touches_touches,touches_def_pen,touches_def_3rd,touches_mid_3rd,touches_att_3rd,touches_att_pen,touches_live,take_ons_att,take_ons_succ,take_ons_succ%,take_ons_tkld,take_ons_tkld%,carries_carries,carries_totdist,carries_prgdist,carries_prgc,carries_1/3,carries_cpa,carries_mis,carries_dis,receiving_rec,receiving_prgr
6829,2019-2020,EPL,Liverpool,Virgil van Dijk,nl NED,DF,28.0,38.0,3588.0,315.0,1365.0,2126.0,102.0,49.0,3588.0,4.0,3.0,75.0,1.0,25.0,2484.0,9365.0,4594.0,21.0,19.0,3.0,2.0,2.0,2580.0,14.0
6830,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,eng ENG,DF,20.0,35.3,3582.0,99.0,825.0,1590.0,1193.0,54.0,3582.0,51.0,21.0,41.2,30.0,58.8,2136.0,9096.0,5332.0,98.0,113.0,16.0,47.0,12.0,2281.0,226.0
6831,2019-2020,EPL,Liverpool,Georginio Wijnaldum,nl NED,MF,28.0,32.6,1947.0,41.0,267.0,1168.0,530.0,74.0,1947.0,74.0,41.0,55.4,33.0,44.6,1410.0,6609.0,3045.0,74.0,67.0,5.0,43.0,30.0,1439.0,99.0
6832,2019-2020,EPL,Liverpool,Andrew Robertson,sct SCO,DF,25.0,34.6,3296.0,89.0,622.0,1575.0,1129.0,83.0,3296.0,44.0,22.0,50.0,22.0,50.0,2152.0,10659.0,6603.0,149.0,115.0,20.0,50.0,9.0,2317.0,277.0
6833,2019-2020,EPL,Liverpool,Roberto Firmino,br BRA,FW,27.0,33.2,1558.0,16.0,103.0,697.0,775.0,211.0,1558.0,90.0,52.0,57.8,38.0,42.2,1082.0,5483.0,2646.0,90.0,62.0,35.0,92.0,39.0,1203.0,218.0


In [765]:
def drop_redundant_columns(df):
    # Drop redundant columns
    df.drop(columns=['nation', 'pos', 'age'], inplace=True, axis=1) 
    return df

possession_cleaned = drop_redundant_columns(possession_cleaned)
possession_cleaned

Unnamed: 0,season,league,team,player,90s,touches_touches,touches_def_pen,touches_def_3rd,touches_mid_3rd,touches_att_3rd,touches_att_pen,touches_live,take_ons_att,take_ons_succ,take_ons_succ%,take_ons_tkld,take_ons_tkld%,carries_carries,carries_totdist,carries_prgdist,carries_prgc,carries_1/3,carries_cpa,carries_mis,carries_dis,receiving_rec,receiving_prgr
6829,2019-2020,EPL,Liverpool,Virgil van Dijk,38.0,3588.0,315.0,1365.0,2126.0,102.0,49.0,3588.0,4.0,3.0,75.0,1.0,25.0,2484.0,9365.0,4594.0,21.0,19.0,3.0,2.0,2.0,2580.0,14.0
6830,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,35.3,3582.0,99.0,825.0,1590.0,1193.0,54.0,3582.0,51.0,21.0,41.2,30.0,58.8,2136.0,9096.0,5332.0,98.0,113.0,16.0,47.0,12.0,2281.0,226.0
6831,2019-2020,EPL,Liverpool,Georginio Wijnaldum,32.6,1947.0,41.0,267.0,1168.0,530.0,74.0,1947.0,74.0,41.0,55.4,33.0,44.6,1410.0,6609.0,3045.0,74.0,67.0,5.0,43.0,30.0,1439.0,99.0
6832,2019-2020,EPL,Liverpool,Andrew Robertson,34.6,3296.0,89.0,622.0,1575.0,1129.0,83.0,3296.0,44.0,22.0,50.0,22.0,50.0,2152.0,10659.0,6603.0,149.0,115.0,20.0,50.0,9.0,2317.0,277.0
6833,2019-2020,EPL,Liverpool,Roberto Firmino,33.2,1558.0,16.0,103.0,697.0,775.0,211.0,1558.0,90.0,52.0,57.8,38.0,42.2,1082.0,5483.0,2646.0,90.0,62.0,35.0,92.0,39.0,1203.0,218.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26675,2022-2023,SeriaA,Sampdoria,Manuel De Luca,0.5,14.0,0.0,0.0,9.0,5.0,2.0,14.0,0.0,0.0,,0.0,,14.0,52.0,7.0,0.0,0.0,0.0,2.0,2.0,14.0,4.0
26676,2022-2023,SeriaA,Sampdoria,Andrea Conti,0.3,11.0,0.0,0.0,8.0,3.0,0.0,11.0,0.0,0.0,,0.0,,4.0,10.0,5.0,0.0,0.0,0.0,0.0,1.0,6.0,2.0
26677,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26678,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,0.0,2.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,100.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0


In [766]:
def generate_per90_and_total_possession(df):
    """
    Generates two datasets: per90 and total.

    Parameters:
        df (pd.DataFrame): The input DataFrame with all columns.

    Returns:
        tuple: A tuple containing the per90 DataFrame and the total DataFrame.
    """
    # Identify columns in "per90" format or containing '/sh', '/sot', etc.
    per_90_existing_columns = [col for col in df.columns if '/90' in col.lower()]

    # Create the total dataset by dropping per90 columns
    df = df.drop(columns=per_90_existing_columns)
    
    exclude_columns = [col for col in df.columns if '%' in col.lower()]
    
    # Columns that should not be divided
    non_divisible_columns = ['season', 'league', 'team', 'player', '90s',] + exclude_columns
    
    # Columns to divide by '90s'
    columns_to_divide = [col for col in df.columns if col not in non_divisible_columns]
    
    # Replace zeros in '90s' with NaN to avoid ZeroDivisionError
    df['90s'] = df['90s'].replace(0, np.nan)
    
    # Create the per90 dataset
    per90_df = df.copy()
    for col in columns_to_divide:
        per90_df[col] = per90_df[col] / per90_df['90s']

    # Add '_per90' suffix to transformed columns
    rename_columns = {col: f"{col}_per90" for col in columns_to_divide}
    per90_df.rename(columns=rename_columns, inplace=True)
    
    # Round the newly created _per90 columns to 2 decimal places
    per90_df[list(rename_columns.values())] = per90_df[list(rename_columns.values())].round(2)
    
    # Create the total dataset by dropping per90 columns
    total_df = df

    total_df.drop(columns='90s', inplace=True)
    per90_df.drop(columns='90s', inplace=True)
    
    return per90_df, total_df

In [767]:
possession_per90, possession_total = generate_per90_and_total_possession(possession_cleaned)

In [768]:
possession_per90

Unnamed: 0,season,league,team,player,touches_touches_per90,touches_def_pen_per90,touches_def_3rd_per90,touches_mid_3rd_per90,touches_att_3rd_per90,touches_att_pen_per90,touches_live_per90,take_ons_att_per90,take_ons_succ_per90,take_ons_succ%,take_ons_tkld_per90,take_ons_tkld%,carries_carries_per90,carries_totdist_per90,carries_prgdist_per90,carries_prgc_per90,carries_1/3_per90,carries_cpa_per90,carries_mis_per90,carries_dis_per90,receiving_rec_per90,receiving_prgr_per90
6829,2019-2020,EPL,Liverpool,Virgil van Dijk,94.42,8.29,35.92,55.95,2.68,1.29,94.42,0.11,0.08,75.0,0.03,25.0,65.37,246.45,120.89,0.55,0.50,0.08,0.05,0.05,67.89,0.37
6830,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,101.47,2.80,23.37,45.04,33.80,1.53,101.47,1.44,0.59,41.2,0.85,58.8,60.51,257.68,151.05,2.78,3.20,0.45,1.33,0.34,64.62,6.40
6831,2019-2020,EPL,Liverpool,Georginio Wijnaldum,59.72,1.26,8.19,35.83,16.26,2.27,59.72,2.27,1.26,55.4,1.01,44.6,43.25,202.73,93.40,2.27,2.06,0.15,1.32,0.92,44.14,3.04
6832,2019-2020,EPL,Liverpool,Andrew Robertson,95.26,2.57,17.98,45.52,32.63,2.40,95.26,1.27,0.64,50.0,0.64,50.0,62.20,308.06,190.84,4.31,3.32,0.58,1.45,0.26,66.97,8.01
6833,2019-2020,EPL,Liverpool,Roberto Firmino,46.93,0.48,3.10,20.99,23.34,6.36,46.93,2.71,1.57,57.8,1.14,42.2,32.59,165.15,79.70,2.71,1.87,1.05,2.77,1.17,36.23,6.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26675,2022-2023,SeriaA,Sampdoria,Manuel De Luca,28.00,0.00,0.00,18.00,10.00,4.00,28.00,0.00,0.00,,0.00,,28.00,104.00,14.00,0.00,0.00,0.00,4.00,4.00,28.00,8.00
26676,2022-2023,SeriaA,Sampdoria,Andrea Conti,36.67,0.00,0.00,26.67,10.00,0.00,36.67,0.00,0.00,,0.00,,13.33,33.33,16.67,0.00,0.00,0.00,0.00,3.33,20.00,6.67
26677,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,,,,,,,,,,,,,,,,,,,,,,
26678,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,,,,,,,,,,100.0,,0.0,,,,,,,,,,


In [769]:
possession_total

Unnamed: 0,season,league,team,player,touches_touches,touches_def_pen,touches_def_3rd,touches_mid_3rd,touches_att_3rd,touches_att_pen,touches_live,take_ons_att,take_ons_succ,take_ons_succ%,take_ons_tkld,take_ons_tkld%,carries_carries,carries_totdist,carries_prgdist,carries_prgc,carries_1/3,carries_cpa,carries_mis,carries_dis,receiving_rec,receiving_prgr
6829,2019-2020,EPL,Liverpool,Virgil van Dijk,3588.0,315.0,1365.0,2126.0,102.0,49.0,3588.0,4.0,3.0,75.0,1.0,25.0,2484.0,9365.0,4594.0,21.0,19.0,3.0,2.0,2.0,2580.0,14.0
6830,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,3582.0,99.0,825.0,1590.0,1193.0,54.0,3582.0,51.0,21.0,41.2,30.0,58.8,2136.0,9096.0,5332.0,98.0,113.0,16.0,47.0,12.0,2281.0,226.0
6831,2019-2020,EPL,Liverpool,Georginio Wijnaldum,1947.0,41.0,267.0,1168.0,530.0,74.0,1947.0,74.0,41.0,55.4,33.0,44.6,1410.0,6609.0,3045.0,74.0,67.0,5.0,43.0,30.0,1439.0,99.0
6832,2019-2020,EPL,Liverpool,Andrew Robertson,3296.0,89.0,622.0,1575.0,1129.0,83.0,3296.0,44.0,22.0,50.0,22.0,50.0,2152.0,10659.0,6603.0,149.0,115.0,20.0,50.0,9.0,2317.0,277.0
6833,2019-2020,EPL,Liverpool,Roberto Firmino,1558.0,16.0,103.0,697.0,775.0,211.0,1558.0,90.0,52.0,57.8,38.0,42.2,1082.0,5483.0,2646.0,90.0,62.0,35.0,92.0,39.0,1203.0,218.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26675,2022-2023,SeriaA,Sampdoria,Manuel De Luca,14.0,0.0,0.0,9.0,5.0,2.0,14.0,0.0,0.0,,0.0,,14.0,52.0,7.0,0.0,0.0,0.0,2.0,2.0,14.0,4.0
26676,2022-2023,SeriaA,Sampdoria,Andrea Conti,11.0,0.0,0.0,8.0,3.0,0.0,11.0,0.0,0.0,,0.0,,4.0,10.0,5.0,0.0,0.0,0.0,0.0,1.0,6.0,2.0
26677,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26678,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,2.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,100.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0


## 3. Playing Time

In [770]:
playing_time = playing_time[playing_time['playing_time_min'] > 0]
playing_time

Unnamed: 0,season,league,team,player,nation,pos,age,mp,playing_time_min,playing_time_mn/mp,playing_time_min%,playing_time_90s,starts_starts,starts_mn/start,starts_compl,subs_subs,subs_mn/sub,subs_unsub,team_success_ppm,team_success_ong,team_success_onga,team_success_+/_,team_success_+/_90,team_success_on_off,team_success_(xg)_onxg,team_success_(xg)_onxga,team_success_(xg)_xg+/_,team_success_(xg)_xg+/_90,team_success_(xg)_on_off,matches,playing_time_mp,matches.1,s_e,l_e,t_e,u_n,u_n.1,u_n.2,u_n.3,p_l,p_l.1,p_l.2,p_l.3,p_l.4,s_t,s_t.1,s_t.2,s_u,s_u.1,s_u.2,t_e.1,t_e.2,t_e.3,t_e.4,t_e.5,t_e.6,t_e.7,t_e.8,t_e.9,t_e.10,t_e.11,u_n.4,u_n.5
9131,2019-2020,EPL,Liverpool,Virgil van Dijk,nl NED,DF,28.0,38.0,3420.0,90.0,100.0,38.0,38.0,90.0,38.0,0.0,,0.0,2.61,85.0,33.0,52.0,1.37,,68.9,37.7,31.2,0.82,,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9132,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,eng ENG,DF,20.0,38.0,3175.0,84.0,92.8,35.3,35.0,88.0,31.0,3.0,26.0,0.0,2.61,80.0,32.0,48.0,1.36,-0.11,65.7,36.7,29.0,0.82,0.04,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9133,2019-2020,EPL,Liverpool,Georginio Wijnaldum,nl NED,MF,28.0,37.0,2935.0,79.0,85.8,32.6,35.0,82.0,20.0,2.0,28.0,0.0,2.59,74.0,28.0,46.0,1.41,0.30,57.9,34.1,23.8,0.73,-0.64,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9134,2019-2020,EPL,Liverpool,Andrew Robertson,sct SCO,DF,25.0,36.0,3111.0,86.0,91.0,34.6,34.0,89.0,30.0,2.0,39.0,0.0,2.64,80.0,31.0,49.0,1.42,0.54,64.7,33.3,31.5,0.91,1.01,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9135,2019-2020,EPL,Liverpool,Roberto Firmino,br BRA,FW,27.0,38.0,2988.0,79.0,87.4,33.2,34.0,84.0,14.0,4.0,33.0,0.0,2.61,75.0,26.0,49.0,1.48,0.85,62.2,31.9,30.3,0.91,0.74,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32735,2022-2023,SeriaA,Sampdoria,Manuel De Luca,it ITA,"FW,MF",24.0,2.0,44.0,22.0,1.3,0.5,0.0,,0.0,2.0,22.0,7.0,0.50,0.0,1.0,-1.0,-2.05,-0.82,0.6,0.8,-0.2,-0.37,0.48,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32736,2022-2023,SeriaA,Sampdoria,Andrea Conti,it ITA,DF,28.0,1.0,23.0,23.0,0.7,0.3,0.0,,0.0,1.0,23.0,7.0,0.00,0.0,1.0,-1.0,-3.91,-2.69,0.0,0.2,-0.2,-0.71,0.14,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32737,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,rs SRB,FW,17.0,1.0,4.0,4.0,0.1,0.0,0.0,,0.0,1.0,4.0,1.0,0.00,0.0,0.0,0.0,0.00,1.24,0.1,0.1,0.0,0.00,0.85,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32738,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,be BEL,MF,17.0,1.0,4.0,4.0,0.1,0.0,0.0,,0.0,1.0,4.0,0.0,0.00,0.0,0.0,0.0,0.00,1.24,0.1,0.1,0.0,0.00,0.85,Matches,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [771]:
def clean_columns(df): 
    # Get the index of the first occurrence of 'matches'
    matches_index = df.columns.get_indexer_for(['matches'])[0]
    
    # Slice the DataFrame to include columns up to (but not including) the first 'matches'
    df = df.iloc[:, :matches_index]

    # Drop matches column
    df.drop(columns=['nation', 'pos', 'age'], inplace=True)
    
    # Verify the columns
    print("Remaining columns:", list(df.columns))

    return df 

In [772]:
playing_time_cleaned = clean_columns(playing_time)
playing_time_cleaned

Remaining columns: ['season', 'league', 'team', 'player', 'mp', 'playing_time_min', 'playing_time_mn/mp', 'playing_time_min%', 'playing_time_90s', 'starts_starts', 'starts_mn/start', 'starts_compl', 'subs_subs', 'subs_mn/sub', 'subs_unsub', 'team_success_ppm', 'team_success_ong', 'team_success_onga', 'team_success_+/_', 'team_success_+/_90', 'team_success_on_off', 'team_success_(xg)_onxg', 'team_success_(xg)_onxga', 'team_success_(xg)_xg+/_', 'team_success_(xg)_xg+/_90', 'team_success_(xg)_on_off']


Unnamed: 0,season,league,team,player,mp,playing_time_min,playing_time_mn/mp,playing_time_min%,playing_time_90s,starts_starts,starts_mn/start,starts_compl,subs_subs,subs_mn/sub,subs_unsub,team_success_ppm,team_success_ong,team_success_onga,team_success_+/_,team_success_+/_90,team_success_on_off,team_success_(xg)_onxg,team_success_(xg)_onxga,team_success_(xg)_xg+/_,team_success_(xg)_xg+/_90,team_success_(xg)_on_off
9131,2019-2020,EPL,Liverpool,Virgil van Dijk,38.0,3420.0,90.0,100.0,38.0,38.0,90.0,38.0,0.0,,0.0,2.61,85.0,33.0,52.0,1.37,,68.9,37.7,31.2,0.82,
9132,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,38.0,3175.0,84.0,92.8,35.3,35.0,88.0,31.0,3.0,26.0,0.0,2.61,80.0,32.0,48.0,1.36,-0.11,65.7,36.7,29.0,0.82,0.04
9133,2019-2020,EPL,Liverpool,Georginio Wijnaldum,37.0,2935.0,79.0,85.8,32.6,35.0,82.0,20.0,2.0,28.0,0.0,2.59,74.0,28.0,46.0,1.41,0.30,57.9,34.1,23.8,0.73,-0.64
9134,2019-2020,EPL,Liverpool,Andrew Robertson,36.0,3111.0,86.0,91.0,34.6,34.0,89.0,30.0,2.0,39.0,0.0,2.64,80.0,31.0,49.0,1.42,0.54,64.7,33.3,31.5,0.91,1.01
9135,2019-2020,EPL,Liverpool,Roberto Firmino,38.0,2988.0,79.0,87.4,33.2,34.0,84.0,14.0,4.0,33.0,0.0,2.61,75.0,26.0,49.0,1.48,0.85,62.2,31.9,30.3,0.91,0.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32735,2022-2023,SeriaA,Sampdoria,Manuel De Luca,2.0,44.0,22.0,1.3,0.5,0.0,,0.0,2.0,22.0,7.0,0.50,0.0,1.0,-1.0,-2.05,-0.82,0.6,0.8,-0.2,-0.37,0.48
32736,2022-2023,SeriaA,Sampdoria,Andrea Conti,1.0,23.0,23.0,0.7,0.3,0.0,,0.0,1.0,23.0,7.0,0.00,0.0,1.0,-1.0,-3.91,-2.69,0.0,0.2,-0.2,-0.71,0.14
32737,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,1.0,4.0,4.0,0.1,0.0,0.0,,0.0,1.0,4.0,1.0,0.00,0.0,0.0,0.0,0.00,1.24,0.1,0.1,0.0,0.00,0.85
32738,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,1.0,4.0,4.0,0.1,0.0,0.0,,0.0,1.0,4.0,0.0,0.00,0.0,0.0,0.0,0.00,1.24,0.1,0.1,0.0,0.00,0.85


In [773]:
def generate_per90_and_total_playing_time_stats(df):
    """
    Generates two datasets: per90 and total.

    Parameters:
        df (pd.DataFrame): The input DataFrame with all columns.

    Returns:
        tuple: A tuple containing the per90 DataFrame and the total DataFrame.
    """
    # Identify columns in "per90" format or containing '/sh', '/sot', etc.
    per_90_existing_columns = [col for col in df.columns if '/90' in col.lower()]

    # Create the total dataset by dropping per90 columns
    df = df.drop(columns=per_90_existing_columns)
    
    # Don't convert to 90s 
    exclude_columns = ['mp', 'playing_time_min', 'playing_time_mn/mp', 'playing_time_min%', 'PPM', 'subs_mn/sub', 'starts_mn/start', 'team_success_on_off', 'team_success_(xg)_on_off']
    
    # Columns that should not be divided
    non_divisible_columns = ['season', 'league', 'team', 'player', 'playing_time_90s'] + exclude_columns
    
    # Columns to divide by '90s'
    columns_to_divide = [col for col in df.columns if col not in non_divisible_columns]
    
    # Replace zeros in '90s' with NaN to avoid ZeroDivisionError
    df['playing_time_90s'] = df['playing_time_90s'].replace(0, np.nan)
    
    # Create the per90 dataset
    per90_df = df.copy()
    for col in columns_to_divide:
        per90_df[col] = per90_df[col] / per90_df['playing_time_90s']

    # Add '_per90' suffix to transformed columns
    rename_columns = {col: f"{col}_per90" for col in columns_to_divide}
    per90_df.rename(columns=rename_columns, inplace=True)
    
    # Round the newly created _per90 columns to 2 decimal places
    per90_df[list(rename_columns.values())] = per90_df[list(rename_columns.values())].round(2)
    
    # Create the total dataset by dropping per90 columns
    total_df = df

    total_df.drop(columns='playing_time_90s', inplace=True)
    
    return per90_df, total_df

In [774]:
playing_time_per90, playing_time_total = generate_per90_and_total_playing_time_stats(playing_time_cleaned)

In [775]:
playing_time_per90

Unnamed: 0,season,league,team,player,mp,playing_time_min,playing_time_mn/mp,playing_time_min%,playing_time_90s,starts_starts_per90,starts_mn/start,starts_compl_per90,subs_subs_per90,subs_mn/sub,subs_unsub_per90,team_success_ppm_per90,team_success_ong_per90,team_success_onga_per90,team_success_+/__per90,team_success_+/_90_per90,team_success_on_off,team_success_(xg)_onxg_per90,team_success_(xg)_onxga_per90,team_success_(xg)_xg+/__per90,team_success_(xg)_xg+/_90_per90,team_success_(xg)_on_off
9131,2019-2020,EPL,Liverpool,Virgil van Dijk,38.0,3420.0,90.0,100.0,38.0,1.00,90.0,1.00,0.00,,0.00,0.07,2.24,0.87,1.37,0.04,,1.81,0.99,0.82,0.02,
9132,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,38.0,3175.0,84.0,92.8,35.3,0.99,88.0,0.88,0.08,26.0,0.00,0.07,2.27,0.91,1.36,0.04,-0.11,1.86,1.04,0.82,0.02,0.04
9133,2019-2020,EPL,Liverpool,Georginio Wijnaldum,37.0,2935.0,79.0,85.8,32.6,1.07,82.0,0.61,0.06,28.0,0.00,0.08,2.27,0.86,1.41,0.04,0.30,1.78,1.05,0.73,0.02,-0.64
9134,2019-2020,EPL,Liverpool,Andrew Robertson,36.0,3111.0,86.0,91.0,34.6,0.98,89.0,0.87,0.06,39.0,0.00,0.08,2.31,0.90,1.42,0.04,0.54,1.87,0.96,0.91,0.03,1.01
9135,2019-2020,EPL,Liverpool,Roberto Firmino,38.0,2988.0,79.0,87.4,33.2,1.02,84.0,0.42,0.12,33.0,0.00,0.08,2.26,0.78,1.48,0.04,0.85,1.87,0.96,0.91,0.03,0.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32735,2022-2023,SeriaA,Sampdoria,Manuel De Luca,2.0,44.0,22.0,1.3,0.5,0.00,,0.00,4.00,22.0,14.00,1.00,0.00,2.00,-2.00,-4.10,-0.82,1.20,1.60,-0.40,-0.74,0.48
32736,2022-2023,SeriaA,Sampdoria,Andrea Conti,1.0,23.0,23.0,0.7,0.3,0.00,,0.00,3.33,23.0,23.33,0.00,0.00,3.33,-3.33,-13.03,-2.69,0.00,0.67,-0.67,-2.37,0.14
32737,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,1.0,4.0,4.0,0.1,,,,,,4.0,,,,,,,1.24,,,,,0.85
32738,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,1.0,4.0,4.0,0.1,,,,,,4.0,,,,,,,1.24,,,,,0.85


In [776]:
playing_time_total

Unnamed: 0,season,league,team,player,mp,playing_time_min,playing_time_mn/mp,playing_time_min%,starts_starts,starts_mn/start,starts_compl,subs_subs,subs_mn/sub,subs_unsub,team_success_ppm,team_success_ong,team_success_onga,team_success_+/_,team_success_+/_90,team_success_on_off,team_success_(xg)_onxg,team_success_(xg)_onxga,team_success_(xg)_xg+/_,team_success_(xg)_xg+/_90,team_success_(xg)_on_off
9131,2019-2020,EPL,Liverpool,Virgil van Dijk,38.0,3420.0,90.0,100.0,38.0,90.0,38.0,0.0,,0.0,2.61,85.0,33.0,52.0,1.37,,68.9,37.7,31.2,0.82,
9132,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,38.0,3175.0,84.0,92.8,35.0,88.0,31.0,3.0,26.0,0.0,2.61,80.0,32.0,48.0,1.36,-0.11,65.7,36.7,29.0,0.82,0.04
9133,2019-2020,EPL,Liverpool,Georginio Wijnaldum,37.0,2935.0,79.0,85.8,35.0,82.0,20.0,2.0,28.0,0.0,2.59,74.0,28.0,46.0,1.41,0.30,57.9,34.1,23.8,0.73,-0.64
9134,2019-2020,EPL,Liverpool,Andrew Robertson,36.0,3111.0,86.0,91.0,34.0,89.0,30.0,2.0,39.0,0.0,2.64,80.0,31.0,49.0,1.42,0.54,64.7,33.3,31.5,0.91,1.01
9135,2019-2020,EPL,Liverpool,Roberto Firmino,38.0,2988.0,79.0,87.4,34.0,84.0,14.0,4.0,33.0,0.0,2.61,75.0,26.0,49.0,1.48,0.85,62.2,31.9,30.3,0.91,0.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32735,2022-2023,SeriaA,Sampdoria,Manuel De Luca,2.0,44.0,22.0,1.3,0.0,,0.0,2.0,22.0,7.0,0.50,0.0,1.0,-1.0,-2.05,-0.82,0.6,0.8,-0.2,-0.37,0.48
32736,2022-2023,SeriaA,Sampdoria,Andrea Conti,1.0,23.0,23.0,0.7,0.0,,0.0,1.0,23.0,7.0,0.00,0.0,1.0,-1.0,-3.91,-2.69,0.0,0.2,-0.2,-0.71,0.14
32737,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,1.0,4.0,4.0,0.1,0.0,,0.0,1.0,4.0,1.0,0.00,0.0,0.0,0.0,0.00,1.24,0.1,0.1,0.0,0.00,0.85
32738,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,1.0,4.0,4.0,0.1,0.0,,0.0,1.0,4.0,0.0,0.00,0.0,0.0,0.0,0.00,1.24,0.1,0.1,0.0,0.00,0.85


## 4. Miscellaneous Stats

In [777]:
def clean_columns(df): 
    # Get the index of the first occurrence of 'matches'
    matches_index = df.columns.get_indexer_for(['matches'])[1]
    
    # Slice the DataFrame to include columns up to (but not including) the first 'matches'
    df = df.iloc[:, :matches_index]

    # Drop matches column
    df.drop(columns=['matches', 'nation', 'pos', 'age'], inplace=True)
    
    # Verify the columns
    print("Remaining columns:", list(df.columns))

    return df 

In [778]:
miscellaneous_stats_cleaned = clean_columns(miscellaneous_stats)

Remaining columns: ['season', 'league', 'team', 'player', '90s', 'performance_crdy', 'performance_crdr', 'performance_2crdy', 'performance_fls', 'performance_fld', 'performance_off', 'performance_crs', 'performance_int', 'performance_tklw', 'performance_pkwon', 'performance_pkcon', 'performance_og', 'performance_recov', 'aerial_duels_won', 'aerial_duels_lost', 'aerial_duels_won%']


In [779]:
miscellaneous_stats_cleaned

Unnamed: 0,season,league,team,player,90s,performance_crdy,performance_crdr,performance_2crdy,performance_fls,performance_fld,performance_off,performance_crs,performance_int,performance_tklw,performance_pkwon,performance_pkcon,performance_og,performance_recov,aerial_duels_won,aerial_duels_lost,aerial_duels_won%
73654,2019-2020,EPL,Liverpool,Virgil van Dijk,38.0,1.0,0.0,0.0,16.0,25.0,1.0,0.0,40.0,12.0,0.0,0.0,0.0,220.0,188.0,56.0,77.0
73655,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,35.3,5.0,0.0,0.0,22.0,9.0,3.0,382.0,45.0,34.0,0.0,0.0,0.0,221.0,15.0,30.0,33.3
73656,2019-2020,EPL,Liverpool,Georginio Wijnaldum,32.6,0.0,0.0,0.0,15.0,32.0,1.0,5.0,17.0,20.0,0.0,0.0,0.0,189.0,30.0,51.0,37.0
73657,2019-2020,EPL,Liverpool,Andrew Robertson,34.6,2.0,0.0,0.0,18.0,13.0,2.0,210.0,38.0,31.0,0.0,0.0,0.0,210.0,33.0,34.0,49.3
73658,2019-2020,EPL,Liverpool,Roberto Firmino,33.2,0.0,0.0,0.0,29.0,16.0,6.0,9.0,7.0,19.0,0.0,0.0,0.0,136.0,26.0,74.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93500,2022-2023,SeriaA,Sampdoria,Manuel De Luca,0.5,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0
93501,2022-2023,SeriaA,Sampdoria,Andrea Conti,0.3,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,
93502,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
93503,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [780]:
def generate_per90_and_total_miscellaneous_stats(df):
    """
    Generates two datasets: per90 and total.

    Parameters:
        df (pd.DataFrame): The input DataFrame with all columns.

    Returns:
        tuple: A tuple containing the per90 DataFrame and the total DataFrame.
    """
    
    exclude_columns = [col for col in df.columns if '%' in col.lower()]
    
    # Columns that should not be divided
    non_divisible_columns = ['season', 'league', 'team', 'player', '90s',] + exclude_columns
    
    # Columns to divide by '90s'
    columns_to_divide = [col for col in df.columns if col not in non_divisible_columns]
    
    # Replace zeros in '90s' with NaN to avoid ZeroDivisionError
    df['90s'] = df['90s'].replace(0, np.nan)
    
    # Create the per90 dataset
    per90_df = df.copy()
    for col in columns_to_divide:
        per90_df[col] = per90_df[col] / per90_df['90s']

    # Add '_per90' suffix to transformed columns
    rename_columns = {col: f"{col}_per90" for col in columns_to_divide}
    per90_df.rename(columns=rename_columns, inplace=True)
    
    # Round the newly created _per90 columns to 2 decimal places
    per90_df[list(rename_columns.values())] = per90_df[list(rename_columns.values())].round(2)
    
    # Create the total dataset by dropping per90 columns
    total_df = df

    total_df.drop(columns='90s', inplace=True)
    per90_df.drop(columns='90s', inplace=True)
    
    return per90_df, total_df

In [781]:
misc_per90, misc_total = generate_per90_and_total_miscellaneous_stats(miscellaneous_stats_cleaned)

In [782]:
misc_per90

Unnamed: 0,season,league,team,player,performance_crdy_per90,performance_crdr_per90,performance_2crdy_per90,performance_fls_per90,performance_fld_per90,performance_off_per90,performance_crs_per90,performance_int_per90,performance_tklw_per90,performance_pkwon_per90,performance_pkcon_per90,performance_og_per90,performance_recov_per90,aerial_duels_won_per90,aerial_duels_lost_per90,aerial_duels_won%
73654,2019-2020,EPL,Liverpool,Virgil van Dijk,0.03,0.0,0.0,0.42,0.66,0.03,0.00,1.05,0.32,0.0,0.0,0.0,5.79,4.95,1.47,77.0
73655,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,0.14,0.0,0.0,0.62,0.25,0.08,10.82,1.27,0.96,0.0,0.0,0.0,6.26,0.42,0.85,33.3
73656,2019-2020,EPL,Liverpool,Georginio Wijnaldum,0.00,0.0,0.0,0.46,0.98,0.03,0.15,0.52,0.61,0.0,0.0,0.0,5.80,0.92,1.56,37.0
73657,2019-2020,EPL,Liverpool,Andrew Robertson,0.06,0.0,0.0,0.52,0.38,0.06,6.07,1.10,0.90,0.0,0.0,0.0,6.07,0.95,0.98,49.3
73658,2019-2020,EPL,Liverpool,Roberto Firmino,0.00,0.0,0.0,0.87,0.48,0.18,0.27,0.21,0.57,0.0,0.0,0.0,4.10,0.78,2.23,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93500,2022-2023,SeriaA,Sampdoria,Manuel De Luca,0.00,0.0,0.0,6.00,4.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0,2.00,0.00,6.00,0.0
93501,2022-2023,SeriaA,Sampdoria,Andrea Conti,0.00,0.0,0.0,0.00,3.33,3.33,0.00,0.00,0.00,0.0,0.0,0.0,3.33,0.00,0.00,
93502,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,,,,,,,,,,,,,,,,
93503,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,,,,,,,,,,,,,,,,


In [783]:
misc_total

Unnamed: 0,season,league,team,player,performance_crdy,performance_crdr,performance_2crdy,performance_fls,performance_fld,performance_off,performance_crs,performance_int,performance_tklw,performance_pkwon,performance_pkcon,performance_og,performance_recov,aerial_duels_won,aerial_duels_lost,aerial_duels_won%
73654,2019-2020,EPL,Liverpool,Virgil van Dijk,1.0,0.0,0.0,16.0,25.0,1.0,0.0,40.0,12.0,0.0,0.0,0.0,220.0,188.0,56.0,77.0
73655,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,5.0,0.0,0.0,22.0,9.0,3.0,382.0,45.0,34.0,0.0,0.0,0.0,221.0,15.0,30.0,33.3
73656,2019-2020,EPL,Liverpool,Georginio Wijnaldum,0.0,0.0,0.0,15.0,32.0,1.0,5.0,17.0,20.0,0.0,0.0,0.0,189.0,30.0,51.0,37.0
73657,2019-2020,EPL,Liverpool,Andrew Robertson,2.0,0.0,0.0,18.0,13.0,2.0,210.0,38.0,31.0,0.0,0.0,0.0,210.0,33.0,34.0,49.3
73658,2019-2020,EPL,Liverpool,Roberto Firmino,0.0,0.0,0.0,29.0,16.0,6.0,9.0,7.0,19.0,0.0,0.0,0.0,136.0,26.0,74.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93500,2022-2023,SeriaA,Sampdoria,Manuel De Luca,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0
93501,2022-2023,SeriaA,Sampdoria,Andrea Conti,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,
93502,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
93503,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


## Merging standard stats, shooting, playing time and misc stats 

In [784]:
# Note: We don't consider goalkeepers 
standard_stats.columns

Index(['season', 'league', 'team', 'player', 'nation', 'pos', 'age', 'mp',
       'playing_time_starts', 'playing_time_min', 'playing_time_90s',
       'performance_gls', 'performance_ast', 'performance_g+a',
       'performance_g_pk', 'performance_pk', 'performance_pkatt',
       'performance_crdy', 'performance_crdr', 'progression_prgc',
       'progression_prgp', 'progression_prgr', 'per_90_minutes_gls',
       'per_90_minutes_ast', 'per_90_minutes_g+a', 'per_90_minutes_g_pk',
       'per_90_minutes_g+a_pk', 'matches', 'playing_time_mp', 'expected_xg',
       'expected_npxg', 'expected_xag', 'expected_npxg+xag',
       'per_90_minutes_xg', 'per_90_minutes_xag', 'per_90_minutes_xg+xag',
       'per_90_minutes_npxg', 'per_90_minutes_npxg+xag', 'matches', 's_e',
       'l_e', 't_e', 'u_n', 'u_n.1', 'u_n.2', 'u_n.3', 'p_l', 'p_l.1', 'p_l.2',
       'p_l.3', 'p_e', 'p_e.1', 'p_e.2', 'p_e.3', 'p_e.4', 'p_e.5', 'p_e.6',
       'p_e.7', 'e_x', 'e_x.1', 'e_x.2', 'e_x.3', 'p_r', 'p_r.1', 'p_r

In [785]:
# Only if playing time is greater than 0
standard_stats = standard_stats[standard_stats['playing_time_min'] > 0]

# Subset to relevant columns 
standard_stats = standard_stats[['season', 'league', 'team', 'player', 'nation', 'pos', 'age']]

# Split the 'nation' column and keep only the second part
standard_stats['nation'] = standard_stats['nation'].str.split().str[1]

# Verify the result
standard_stats

Unnamed: 0,season,league,team,player,nation,pos,age
76825,2019-2020,EPL,Liverpool,Virgil van Dijk,NED,DF,28.0
76826,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,ENG,DF,20.0
76827,2019-2020,EPL,Liverpool,Georginio Wijnaldum,NED,MF,28.0
76828,2019-2020,EPL,Liverpool,Andrew Robertson,SCO,DF,25.0
76829,2019-2020,EPL,Liverpool,Roberto Firmino,BRA,FW,27.0
...,...,...,...,...,...,...,...
100429,2022-2023,SeriaA,Sampdoria,Manuel De Luca,ITA,"FW,MF",24.0
100430,2022-2023,SeriaA,Sampdoria,Andrea Conti,ITA,DF,28.0
100431,2022-2023,SeriaA,Sampdoria,Mihailo Ivanović,SRB,FW,17.0
100432,2022-2023,SeriaA,Sampdoria,Samuel Ntanda,BEL,MF,17.0


### Per90 Merge

In [786]:
playing_and_shooting_per90 = playing_time_per90.merge(shooting_per90, how='left', on=['season','league', 'team', 'player'])

In [787]:
playing_shooting_misc_per90 = playing_and_shooting_per90.merge(misc_per90, how='left', on=['season','league', 'team', 'player'])

In [788]:
drop_cols = ['subs_mn/sub', 'subs_unsub_per90', 'team_success_+/__per90',
       'team_success_+/_90_per90', 'team_success_on_off',
       'team_success_(xg)_onxg_per90', 'team_success_(xg)_onxga_per90',
       'team_success_(xg)_xg+/__per90', 'team_success_(xg)_xg+/_90_per90',
       'team_success_(xg)_on_off', 'starts_mn/start', 'mp']

In [789]:
playing_shooting_misc_per90.drop(columns=drop_cols, inplace=True)

In [790]:
final_df_per90 = standard_stats.merge(playing_shooting_misc_per90, how='left', on=['season','league', 'team', 'player'])

### Totals Mergre

In [791]:
playing_and_shooting_total = playing_time_total.merge(shooting_total, how='left', on=['season','league', 'team', 'player'])

In [792]:
playing_shooting_misc_total = playing_and_shooting_total.merge(misc_total, how='left', on=['season','league', 'team', 'player'])

In [793]:
drop_cols = ['subs_mn/sub', 'subs_unsub', 'team_success_+/_',
       'team_success_+/_90', 'team_success_on_off',
       'team_success_(xg)_onxg', 'team_success_(xg)_onxga',
       'team_success_(xg)_xg+/_', 'team_success_(xg)_xg+/_90',
       'team_success_(xg)_on_off', 'starts_mn/start', 'mp']

In [794]:
playing_shooting_misc_total.drop(columns=drop_cols, inplace=True)

In [795]:
final_df_total = standard_stats.merge(playing_shooting_misc_total, how='left', on=['season','league', 'team', 'player'])

In [796]:
# Replace NaN values in the 'nation' and 'pos' column with "unknown"
final_df_per90['nation'] = final_df_per90['nation'].fillna('unknown')
final_df_per90['pos'] = final_df_per90['pos'].fillna('unknown')

In [797]:
# Replace NaN values in the 'nation' and 'pos' column with "unknown"
final_df_total['nation'] = final_df_total['nation'].fillna('unknown')
final_df_total['pos'] = final_df_total['pos'].fillna('unknown')

In [798]:
# List of columns to fill NaN with 0
columns_to_fill = [
    'playing_time_min', 'playing_time_mn/mp', 'playing_time_min%', 'playing_time_90s',
    'starts_starts_per90', 'starts_compl_per90', 'subs_subs_per90',
    'team_success_ppm_per90', 'team_success_ong_per90', 'team_success_onga_per90',
    'standard_gls_per90', 'standard_sh_per90', 'standard_sot_per90', 'standard_sot%',
    'standard_g/sh', 'standard_g/sot', 'standard_dist', 'standard_pk_per90',
    'standard_pkatt_per90', 'standard_fk_per90', 'expected_xg_per90',
    'expected_npxg_per90', 'expected_npxg/sh', 'expected_g_xg_per90',
    'expected_np:g_xg_per90', 'performance_crdy_per90', 'performance_crdr_per90',
    'performance_2crdy_per90', 'performance_fls_per90', 'performance_fld_per90',
    'performance_off_per90', 'performance_crs_per90', 'performance_int_per90',
    'performance_tklw_per90', 'performance_pkwon_per90', 'performance_pkcon_per90',
    'performance_og_per90', 'performance_recov_per90', 'aerial_duels_won_per90',
    'aerial_duels_lost_per90', 'aerial_duels_won%'
]

# Fill NaN with 0 for the specified columns
final_df_per90[columns_to_fill] = final_df_per90[columns_to_fill].fillna(0)

# Verify that NaN values have been replaced
print(final_df_per90[columns_to_fill].isna().sum())  # Should print 0 for all columns

playing_time_min           0
playing_time_mn/mp         0
playing_time_min%          0
playing_time_90s           0
starts_starts_per90        0
starts_compl_per90         0
subs_subs_per90            0
team_success_ppm_per90     0
team_success_ong_per90     0
team_success_onga_per90    0
standard_gls_per90         0
standard_sh_per90          0
standard_sot_per90         0
standard_sot%              0
standard_g/sh              0
standard_g/sot             0
standard_dist              0
standard_pk_per90          0
standard_pkatt_per90       0
standard_fk_per90          0
expected_xg_per90          0
expected_npxg_per90        0
expected_npxg/sh           0
expected_g_xg_per90        0
expected_np:g_xg_per90     0
performance_crdy_per90     0
performance_crdr_per90     0
performance_2crdy_per90    0
performance_fls_per90      0
performance_fld_per90      0
performance_off_per90      0
performance_crs_per90      0
performance_int_per90      0
performance_tklw_per90     0
performance_pk

## Merge Everything

In [799]:
# 3 other datasets from Tirdod
combined_per_90_remaining_df = pd.read_csv('tirdod_data/combined_per_90.csv')
combined_totals_remaining_df = pd.read_csv('tirdod_data/combined_totals.csv')

In [800]:
cols_to_drop = ['xA_per90', 'ast_minus_xA_per90']

In [801]:
combined_per_90_remaining_df.isna().sum()

season                               0
league                               0
team                                 0
player                               0
sca_sca90                           12
gca_gca90                           12
tackles_tkl90                      194
tackles_tklw90                     190
tackles_def_3rd90                  204
tackles_mid_3rd90                  209
tackles_att_3rd90                  211
challenges_tkl90                   206
challenges_att90                   193
challenges_tkl_perc               1351
challenges_lost90                  201
blocks_blocks90                    205
blocks_sh90                        213
blocks_pass90                      207
int90                              196
tklint90                           187
clr90                              204
err90                              215
total_cmp_per90                     98
total_att_per90                     73
total_cmp_perc                      83
total_att_per90.1        

In [802]:
# List of additional columns to fill NaN with 0
columns_to_fill = [
    'sca_sca90', 'gca_gca90', 'tackles_tkl90', 'tackles_tklw90', 'tackles_def_3rd90',
    'tackles_mid_3rd90', 'tackles_att_3rd90', 'challenges_tkl90', 'challenges_att90',
    'challenges_tkl_perc', 'challenges_lost90', 'blocks_blocks90', 'blocks_sh90',
    'blocks_pass90', 'int90', 'tklint90', 'clr90', 'err90', 'total_cmp_per90',
    'total_att_per90', 'total_cmp_perc', 'total_att_per90.1', 'short_cmp_per90',
    'short_att_per90', 'short_cmp_perc', 'short_att_per90.1', 'medium_cmp_per90',
    'medium_att_per90', 'medium_cmp_perc', 'medium_att_per90.1', 'long_cmp_per90',
    'long_att_per90', 'long_cmp_perc', 'long_att_per90.1', 'ast_per90', 'xA_per90',
    'ast_minus_xA_per90', 'kp_per90', 'passes_into_final_third_per90',
    'passes_into_penalty_area_per90', 'progressive_passes_per90'
]

# Fill NaN with 0 for the specified additional columns
combined_per_90_remaining_df[columns_to_fill] = combined_per_90_remaining_df[columns_to_fill].fillna(0)

# Verify that NaN values have been replaced
print(combined_per_90_remaining_df[columns_to_fill].isna().sum())  # Should print 0 for all columns

sca_sca90                         0
gca_gca90                         0
tackles_tkl90                     0
tackles_tklw90                    0
tackles_def_3rd90                 0
tackles_mid_3rd90                 0
tackles_att_3rd90                 0
challenges_tkl90                  0
challenges_att90                  0
challenges_tkl_perc               0
challenges_lost90                 0
blocks_blocks90                   0
blocks_sh90                       0
blocks_pass90                     0
int90                             0
tklint90                          0
clr90                             0
err90                             0
total_cmp_per90                   0
total_att_per90                   0
total_cmp_perc                    0
total_att_per90.1                 0
short_cmp_per90                   0
short_att_per90                   0
short_cmp_perc                    0
short_att_per90.1                 0
medium_cmp_per90                  0
medium_att_per90            

In [803]:
final_all_cols_df_per90 = final_df_per90.merge(combined_per_90_remaining_df, how='left', on=['season','league', 'team', 'player'])

In [804]:
final_all_cols_df_per90.columns

Index(['season', 'league', 'team', 'player', 'nation', 'pos', 'age',
       'playing_time_min', 'playing_time_mn/mp', 'playing_time_min%',
       'playing_time_90s', 'starts_starts_per90', 'starts_compl_per90',
       'subs_subs_per90', 'team_success_ppm_per90', 'team_success_ong_per90',
       'team_success_onga_per90', 'standard_gls_per90', 'standard_sh_per90',
       'standard_sot_per90', 'standard_sot%', 'standard_g/sh',
       'standard_g/sot', 'standard_dist', 'standard_pk_per90',
       'standard_pkatt_per90', 'standard_fk_per90', 'expected_xg_per90',
       'expected_npxg_per90', 'expected_npxg/sh', 'expected_g_xg_per90',
       'expected_np:g_xg_per90', 'performance_crdy_per90',
       'performance_crdr_per90', 'performance_2crdy_per90',
       'performance_fls_per90', 'performance_fld_per90',
       'performance_off_per90', 'performance_crs_per90',
       'performance_int_per90', 'performance_tklw_per90',
       'performance_pkwon_per90', 'performance_pkcon_per90',
       'pe

In [805]:
league_distribution = final_all_cols_df_per90['league'].value_counts(normalize=True) * 100
print("\nLeague Distribution (%):")
print(league_distribution)


League Distribution (%):
SeriaA        21.594509
LaLiga        20.811334
Ligue1        20.547342
EPL           19.086589
Bundesliga    17.960225
Name: league, dtype: float64


In [806]:
top_players = final_all_cols_df_per90[['player', 'standard_gls_per90']].sort_values(by='standard_gls_per90', ascending=False).head(10)
print("\nTop 10 Players by Goals Per 90:")
print(top_players)


Top 10 Players by Goals Per 90:
                    player  standard_gls_per90
9700                 Pascu               10.00
1496                Losada               10.00
2101            Jack Lahne               10.00
10508         Sacha Delaye                5.00
7268         Pablo Sarabia                5.00
6325       Delano Burgzorg                3.33
2215           Amad Diallo                3.33
8188   Fernando Forestieri                3.33
10723       Ante Palaversa                3.33
1962         Ferris N'Goma                3.33


In [807]:
top_players = final_all_cols_df_per90[['player', 'standard_gls_per90']].sort_values(by='standard_gls_per90', ascending=False).head(10)
print("\nTop 10 Players by Goals Per 90:")
print(top_players)


Top 10 Players by Goals Per 90:
                    player  standard_gls_per90
9700                 Pascu               10.00
1496                Losada               10.00
2101            Jack Lahne               10.00
10508         Sacha Delaye                5.00
7268         Pablo Sarabia                5.00
6325       Delano Burgzorg                3.33
2215           Amad Diallo                3.33
8188   Fernando Forestieri                3.33
10723       Ante Palaversa                3.33
1962         Ferris N'Goma                3.33


In [808]:
goals_by_league = final_df_per90.groupby('league')['standard_gls_per90'].mean().sort_values(ascending=False)
print("\nAverage Goals Per 90 by League:")
print(goals_by_league)


Average Goals Per 90 by League:
league
Bundesliga    0.125081
Ligue1        0.116296
SeriaA        0.107673
LaLiga        0.106000
EPL           0.105334
Name: standard_gls_per90, dtype: float64


## Total DF

In [809]:
combined_totals_remaining_df.head()

Unnamed: 0,season,league,team,player,sca_sca,gca_gca,tackles_tkl,tackles_tklw,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenges_tkl,challenges_att,challenges_tkl_perc,challenges_lost,blocks_blocks,blocks_sh,blocks_pass,int,tklint,clr,err,total_cmp,total_att,total_cmp_perc,short_cmp,short_att,short_cmp_perc,medium_cmp,medium_att,medium_cmp_perc,long_cmp,long_att,long_cmp_perc,ast,xA,ast_minus_xA,kp,passes_into_final_third,passes_into_penalty_area,progressive_passes
0,2019-2020,EPL,Liverpool,Virgil van Dijk,35.0,7.0,23.0,12.0,18.0,5.0,0.0,6.0,13.0,46.2,7.0,20.0,15.0,5.0,40.0,63.0,166.0,2.0,2919.0,3276.0,89.1,914.0,988.0,92.5,1638.0,1740.0,94.1,331.0,487.0,68.0,1.0,1.5,-0.5,7.0,182.0,9.0,170.0
1,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,155.0,24.0,55.0,34.0,35.0,17.0,3.0,36.0,78.0,46.2,42.0,27.0,9.0,18.0,45.0,100.0,84.0,3.0,2278.0,3230.0,70.5,993.0,1095.0,90.7,912.0,1250.0,73.0,315.0,700.0,45.0,13.0,9.7,3.3,87.0,190.0,88.0,247.0
2,2019-2020,EPL,Liverpool,Georginio Wijnaldum,55.0,4.0,38.0,20.0,12.0,21.0,5.0,10.0,40.0,25.0,30.0,18.0,4.0,14.0,17.0,55.0,34.0,3.0,1492.0,1646.0,90.6,849.0,914.0,92.9,511.0,544.0,93.9,67.0,79.0,84.8,0.0,1.6,-1.6,18.0,131.0,23.0,137.0
3,2019-2020,EPL,Liverpool,Andrew Robertson,130.0,22.0,54.0,31.0,17.0,25.0,12.0,27.0,61.0,44.3,34.0,29.0,7.0,22.0,38.0,92.0,72.0,1.0,2391.0,2996.0,79.8,1298.0,1441.0,90.1,874.0,1072.0,81.5,153.0,313.0,48.9,12.0,5.9,6.1,60.0,166.0,56.0,219.0
4,2019-2020,EPL,Liverpool,Roberto Firmino,114.0,20.0,37.0,19.0,8.0,14.0,15.0,10.0,43.0,23.3,33.0,34.0,3.0,31.0,7.0,44.0,11.0,1.0,941.0,1189.0,79.1,568.0,680.0,83.5,245.0,293.0,83.6,39.0,47.0,83.0,8.0,4.0,4.0,52.0,74.0,31.0,111.0


In [810]:
cols_to_drop = ['xA', 'ast_minus_xA']

In [811]:
combined_totals_remaining_df.isna().sum()

season                         0
league                         0
team                           0
player                         0
sca_sca                       12
gca_gca                       12
tackles_tkl                   12
tackles_tklw                   0
tackles_def_3rd               12
tackles_mid_3rd               12
tackles_att_3rd               12
challenges_tkl                12
challenges_att                12
challenges_tkl_perc         1351
challenges_lost               12
blocks_blocks                 12
blocks_sh                     12
blocks_pass                   12
int                            0
tklint                        12
clr                           12
err                           12
total_cmp                     12
total_att                     12
total_cmp_perc                83
short_cmp                     12
short_att                     12
short_cmp_perc               191
medium_cmp                    12
medium_att                    12
medium_cmp

In [812]:
final_df_all_cols_totals = final_df_total.merge(combined_totals_remaining_df, how='left', on=['season', 'league', 'team', 'player'])

In [813]:
# Fill NaN values with zero
final_df_all_cols_totals = final_df_all_cols_totals.fillna(0)

In [814]:
final_df_all_cols_totals.isna().sum()

season                      0
league                      0
team                        0
player                      0
nation                      0
                           ..
ast_minus_xA                0
kp                          0
passes_into_final_third     0
passes_into_penalty_area    0
progressive_passes          0
Length: 84, dtype: int64

In [815]:
filtered_totals_df = final_df_all_cols_totals[final_df_all_cols_totals['playing_time_min'] >= 500]
filtered_per90_df = final_all_cols_df_per90[final_all_cols_df_per90['playing_time_min'] >= 500]

In [816]:
# Identify duplicates
duplicates = filtered_totals_df[filtered_totals_df.duplicated(subset=['player', 'season'], keep=False)]

# Display duplicates
duplicates.sort_values(by='player')

Unnamed: 0,season,league,team,player,nation,pos,age,playing_time_min,playing_time_mn/mp,playing_time_min%,starts_starts,starts_compl,subs_subs,team_success_ppm,team_success_ong,team_success_onga,standard_gls,standard_sh,standard_sot,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk,standard_pkatt,standard_fk,expected_xg,expected_npxg,expected_npxg/sh,expected_g_xg,expected_np:g_xg,performance_crdy,performance_crdr,performance_2crdy,performance_fls,performance_fld,performance_off,performance_crs,performance_int,performance_tklw,performance_pkwon,performance_pkcon,performance_og,performance_recov,aerial_duels_won,aerial_duels_lost,aerial_duels_won%,sca_sca,gca_gca,tackles_tkl,tackles_tklw,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenges_tkl,challenges_att,challenges_tkl_perc,challenges_lost,blocks_blocks,blocks_sh,blocks_pass,int,tklint,clr,err,total_cmp,total_att,total_cmp_perc,short_cmp,short_att,short_cmp_perc,medium_cmp,medium_att,medium_cmp_perc,long_cmp,long_att,long_cmp_perc,ast,xA,ast_minus_xA,kp,passes_into_final_third,passes_into_penalty_area,progressive_passes
1975,2019-2020,Ligue1,Metz,Adama Traoré,MLI,"FW,MF",24.0,1007.0,67.0,40.0,12.0,5.0,3.0,1.20,11.0,11.0,1.0,17.0,4.0,23.5,0.06,0.25,25.3,0.0,0.0,7.0,1.3,1.3,0.08,-0.3,-0.3,1.0,0.0,0.0,7.0,33.0,0.0,44.0,3.0,1.0,0.0,0.0,0.0,55.0,9.0,18.0,33.3,61.0,4.0,3.0,1.0,1.0,2.0,0.0,3.0,12.0,25.0,9.0,7.0,2.0,5.0,3.0,6.0,5.0,0.0,394.0,484.0,81.4,209.0,235.0,88.9,136.0,166.0,81.9,36.0,57.0,63.2,4.0,2.4,1.6,35.0,38.0,12.0,60.0
166,2019-2020,EPL,Wolves,Adama Traoré,ESP,"FW,DF",23.0,2605.0,70.0,76.2,27.0,18.0,10.0,1.57,44.0,33.0,4.0,43.0,12.0,27.9,0.09,0.33,19.8,0.0,0.0,0.0,3.8,3.8,0.09,0.2,0.2,1.0,0.0,0.0,39.0,77.0,6.0,183.0,13.0,21.0,0.0,0.0,0.0,118.0,45.0,52.0,46.4,111.0,15.0,33.0,21.0,6.0,14.0,13.0,8.0,23.0,34.8,15.0,19.0,3.0,16.0,13.0,46.0,14.0,2.0,614.0,952.0,64.5,335.0,425.0,78.8,202.0,311.0,65.0,41.0,97.0,42.3,9.0,6.5,2.5,48.0,55.0,41.0,88.0
10193,2022-2023,Ligue1,Lens,Adrien Thomasson,FRA,"MF,FW",28.0,1278.0,64.0,37.4,16.0,3.0,4.0,2.15,31.0,14.0,5.0,21.0,9.0,42.9,0.24,0.56,14.5,0.0,0.0,0.0,3.0,3.0,0.14,2.0,2.0,1.0,0.0,0.0,21.0,18.0,5.0,13.0,4.0,17.0,1.0,0.0,0.0,79.0,12.0,18.0,40.0,42.0,9.0,24.0,17.0,8.0,11.0,5.0,12.0,45.0,26.7,33.0,25.0,2.0,23.0,4.0,28.0,17.0,0.0,440.0,559.0,78.7,223.0,267.0,83.5,166.0,193.0,86.0,28.0,48.0,58.3,5.0,0.0,0.0,19.0,44.0,15.0,49.0
10587,2022-2023,Ligue1,Strasbourg,Adrien Thomasson,FRA,MF,28.0,937.0,62.0,27.4,10.0,6.0,5.0,0.73,7.0,18.0,0.0,17.0,5.0,29.4,0.00,0.00,12.1,0.0,0.0,0.0,1.5,1.5,0.09,-1.5,-1.5,2.0,0.0,0.0,16.0,20.0,4.0,8.0,3.0,9.0,0.0,0.0,0.0,80.0,7.0,6.0,53.8,38.0,3.0,19.0,9.0,8.0,6.0,5.0,9.0,30.0,30.0,21.0,18.0,0.0,18.0,3.0,22.0,2.0,0.0,429.0,524.0,81.9,214.0,251.0,85.3,161.0,191.0,84.3,29.0,41.0,70.7,2.0,0.0,0.0,17.0,25.0,13.0,41.0
2263,2019-2020,SeriaA,Roma,Alessandro Florenzi,ITA,"DF,FW",28.0,997.0,71.0,29.2,12.0,5.0,2.0,1.79,21.0,15.0,0.0,14.0,2.0,14.3,0.00,0.00,25.7,0.0,0.0,0.0,0.5,0.5,0.04,-0.5,-0.5,5.0,0.0,0.0,11.0,12.0,2.0,29.0,16.0,11.0,0.0,0.0,0.0,68.0,12.0,14.0,46.2,30.0,2.0,14.0,11.0,8.0,4.0,2.0,7.0,20.0,35.0,13.0,9.0,3.0,6.0,16.0,30.0,19.0,0.0,551.0,683.0,80.7,258.0,284.0,90.8,214.0,253.0,84.6,73.0,113.0,64.6,1.0,1.9,-0.9,12.0,55.0,13.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1511,2019-2020,LaLiga,Leganés,Youssef En-Nesyri,MAR,FW,22.0,1385.0,77.0,40.5,15.0,13.0,3.0,0.61,14.0,27.0,4.0,40.0,17.0,42.5,0.10,0.24,15.0,0.0,0.0,0.0,5.0,5.0,0.13,-1.0,-1.0,2.0,0.0,0.0,21.0,19.0,17.0,9.0,5.0,7.0,1.0,0.0,0.0,30.0,70.0,102.0,40.7,30.0,6.0,9.0,7.0,1.0,2.0,6.0,3.0,17.0,17.6,14.0,15.0,2.0,13.0,5.0,14.0,22.0,0.0,153.0,270.0,56.7,88.0,142.0,62.0,35.0,59.0,59.3,13.0,25.0,52.0,2.0,0.5,1.5,8.0,7.0,7.0,16.0
1823,2019-2020,Ligue1,Monaco,Youssouf Fofana,FRA,MF,20.0,610.0,87.0,24.2,7.0,5.0,0.0,1.57,8.0,7.0,0.0,7.0,2.0,28.6,0.00,0.00,26.5,0.0,0.0,0.0,0.3,0.3,0.04,-0.3,-0.3,1.0,0.0,0.0,12.0,7.0,1.0,6.0,8.0,9.0,0.0,0.0,0.0,49.0,8.0,8.0,50.0,16.0,2.0,16.0,9.0,6.0,10.0,0.0,8.0,15.0,53.3,7.0,12.0,2.0,10.0,8.0,24.0,9.0,0.0,303.0,361.0,83.9,137.0,155.0,88.4,127.0,139.0,91.4,31.0,45.0,68.9,2.0,0.6,1.4,5.0,38.0,5.0,47.0
1847,2019-2020,Ligue1,Strasbourg,Youssouf Fofana,FRA,MF,20.0,1008.0,78.0,41.5,12.0,9.0,1.0,1.69,17.0,13.0,1.0,14.0,4.0,28.6,0.07,0.25,21.4,0.0,0.0,0.0,0.8,0.8,0.06,0.2,0.2,0.0,0.0,0.0,11.0,6.0,2.0,8.0,14.0,17.0,0.0,0.0,0.0,56.0,17.0,8.0,68.0,20.0,0.0,25.0,17.0,10.0,12.0,3.0,13.0,30.0,43.3,17.0,11.0,0.0,11.0,14.0,39.0,7.0,0.0,362.0,439.0,82.5,166.0,189.0,87.8,144.0,165.0,87.3,36.0,48.0,75.0,0.0,0.4,-0.4,8.0,29.0,4.0,56.0
9718,2022-2023,LaLiga,Betis,Álex Moreno,ESP,DF,29.0,1350.0,90.0,39.5,15.0,15.0,0.0,1.80,19.0,13.0,0.0,13.0,4.0,30.8,0.00,0.00,17.9,0.0,0.0,0.0,1.3,1.3,0.10,-1.3,-1.3,1.0,0.0,0.0,17.0,19.0,1.0,28.0,17.0,22.0,0.0,1.0,0.0,78.0,9.0,18.0,33.3,20.0,4.0,29.0,22.0,19.0,10.0,0.0,18.0,36.0,50.0,18.0,17.0,4.0,13.0,17.0,46.0,39.0,0.0,525.0,695.0,75.5,298.0,332.0,89.8,180.0,247.0,72.9,31.0,69.0,44.9,3.0,0.0,0.0,9.0,38.0,13.0,44.0


In [817]:
valuations_df.team.unique()

array(['Werder Bremen', 'Augsburg', 'Liverpool', 'Milan', 'Schalke 04',
       'Montpellier', 'Juventus', 'Dijon', 'Marseille', 'Torino',
       'Leicester City', 'Sassuolo', 'Brescia', "M'Gladbach", 'Betis',
       'Cádiz', 'Aston Villa', 'Lazio', 'Villarreal', 'Hertha BSC',
       'Espanyol', 'Osasuna', 'Manchester Utd', 'Hoffenheim', 'Monaco',
       'Bologna', 'Paderborn 07', 'Düsseldorf', 'Strasbourg', 'Angers',
       'Sevilla', 'Bayern Munich', 'Venezia', 'Norwich City', 'Genoa',
       'Cagliari', 'Monza', 'Parma', 'Everton', 'Sheffield Utd',
       'Watford', 'Bournemouth', 'Inter', 'Burnley', 'Manchester City',
       'West Ham', 'Barcelona', 'Sampdoria', 'Nice', 'Real Madrid',
       'Crotone', 'Union Berlin', 'Spezia', 'Fiorentina', 'SPAL',
       'Atlético Madrid', 'Tottenham', 'Valladolid', 'Bordeaux', 'Lille',
       'Ajaccio', 'Lyon', 'Metz', 'Hellas Verona', 'Chelsea',
       'Southampton', 'Atalanta', 'Eint Frankfurt', 'Toulouse', 'Lens',
       'Benevento', 'Köln', '

In [818]:
# Mapping team names between dataframes
team_name_mappings = {
    'Sportverein Werder Bremen von 1899': 'Werder Bremen',
    'FC Augsburg 1907': 'Augsburg',
    'FC Schalke 04': 'Schalke 04',
    'Borussia Verein für Leibesübungen 1900 Mönchengladbach': "M'Gladbach",
    'Hertha BSC': 'Hertha BSC',
    'TSG 1899 Hoffenheim Fußball-Spielbetriebs GmbH': 'Hoffenheim',
    'SC Paderborn 07': 'Paderborn 07',
    'Fortuna Düsseldorf': 'Düsseldorf',
    'FC Bayern München': 'Bayern Munich',
    '1. FC Union Berlin': 'Union Berlin',
    'Eintracht Frankfurt Fußball AG': 'Eint Frankfurt',
    '1.FC Köln': 'Köln',
    'RasenBallsport Leipzig': 'RB Leipzig',
    'Borussia Dortmund': 'Dortmund',
    'Verein für Bewegungsspiele Stuttgart 1893': 'Stuttgart',
    'Arminia Bielefeld': 'Arminia',
    'Bayer 04 Leverkusen Fußball': 'Leverkusen',
    'Sport-Club Freiburg': 'Freiburg',
    'Verein für Leibesübungen Wolfsburg': 'Wolfsburg',
    'SpVgg Greuther Fürth': 'Greuther Fürth',
    '1. Fußball- und Sportverein Mainz 05': 'Mainz 05',
    'Verein für Leibesübungen Bochum 1848 Fußballgemeinschaft': 'Bochum',
    'SV Darmstadt 98': 'Darmstadt',
    'Real Betis Balompié S.A.D.': 'Betis',
    'Cádiz CF': 'Cádiz',
    'Villarreal Club de Fútbol S.A.D.': 'Villarreal',
    'Reial Club Deportiu Espanyol de Barcelona S.A.D.': 'Espanyol',
    'Club Atlético Osasuna': 'Osasuna',
    'Sevilla Fútbol Club S.A.D.': 'Sevilla',
    'Futbol Club Barcelona': 'Barcelona',
    'Real Madrid Club de Fútbol': 'Real Madrid',
    'Club Atlético de Madrid S.A.D.': 'Atlético Madrid',
    'Real Valladolid Club de Fútbol S.A.D.': 'Valladolid',
    'SD Huesca': 'Huesca',
    'Real Sociedad de Fútbol S.A.D.': 'Real Sociedad',
    'SD Eibar': 'Eibar',
    'Valencia Club de Fútbol S. A. D.': 'Valencia',
    'Elche CF': 'Elche',
    'Club Deportivo Leganés S.A.D.': 'Leganés',
    'Real Club Deportivo Mallorca S.A.D.': 'Mallorca',
    'Athletic Club Bilbao': 'Athletic Club',
    'Rayo Vallecano de Madrid S.A.D.': 'Rayo Vallecano',
    'Granada CF': 'Granada',
    'Levante UD': 'Levante',
    'Deportivo Alavés S.A.D.': 'Alavés',
    'Getafe Club de Fútbol S.A.D. Team Dubai': 'Getafe',
    'Real Club Celta de Vigo S. A. D.': 'Celta Vigo',
    'Girona Fútbol Club S. A. D.': 'Girona',
    'UD Almería': 'Almería',
    'Unión Deportiva Las Palmas S.A.D.': 'Las Palmas',
    'Liverpool Football Club': 'Liverpool',
    'Leicester City Football Club': 'Leicester City',
    'Aston Villa Football Club': 'Aston Villa',
    'Manchester United Football Club': 'Manchester Utd',
    'Norwich City': 'Norwich City',
    'Everton Football Club': 'Everton',
    'Sheffield United': 'Sheffield Utd',
    'Watford FC': 'Watford',
    'Association Football Club Bournemouth': 'Bournemouth',
    'Burnley FC': 'Burnley',
    'Manchester City Football Club': 'Manchester City',
    'West Ham United Football Club': 'West Ham',
    'Tottenham Hotspur Football Club': 'Tottenham',
    'Chelsea Football Club': 'Chelsea',
    'Southampton Football Club': 'Southampton',
    'West Bromwich Albion': 'West Brom',
    'Leeds United': 'Leeds United',
    'Crystal Palace Football Club': 'Crystal Palace',
    'Wolverhampton Wanderers Football Club': 'Wolves',
    'Arsenal Football Club': 'Arsenal',
    'Brighton and Hove Albion Football Club': 'Brighton',
    'Nottingham Forest Football Club': "Nott'ham Forest",
    'Newcastle United Football Club': 'Newcastle Utd',
    'Brentford Football Club': 'Brentford',
    'Fulham Football Club': 'Fulham',
    'Montpellier Hérault Sport Club': 'Montpellier',
    'Dijon FCO': 'Dijon',
    'Olympique de Marseille': 'Marseille',
    'Association sportive de Monaco Football Club': 'Monaco',
    'Racing Club de Strasbourg Alsace': 'Strasbourg',
    "Angers Sporting Club de l'Ouest": 'Angers',
    "Olympique Gymnaste Club Nice Côte d'Azur": 'Nice',
    'FC Girondins Bordeaux': 'Bordeaux',
    'Lille Olympique Sporting Club Lille Métropole': 'Lille',
    'AC Ajaccio': 'Ajaccio',
    'Olympique Lyonnais': 'Lyon',
    'FC Metz': 'Metz',
    'Toulouse Football Club': 'Toulouse',
    'Racing Club de Lens': 'Lens',
    'Association sportive de Saint-Étienne Loire': 'Saint-Étienne',
    'Stade Rennais Football Club': 'Rennes',
    'FC Lorient': 'Lorient',
    'Paris Saint-Germain Football Club': 'Paris S-G',
    'Amiens SC': 'Amiens',
    'Football Club de Nantes': 'Nantes',
    'ESTAC Troyes': 'Troyes',
    'Stade brestois 29': 'Brest',
    'Stade de Reims': 'Reims',
    'Nîmes Olympique': 'Nîmes',
    'Clermont Foot 63': 'Clermont Foot',
    'Association de la Jeunesse auxerroise': 'Auxerre',
    'Associazione Calcio Milan': 'Milan',
    'Juventus Football Club': 'Juventus',
    'Torino Calcio': 'Torino',
    'US Sassuolo': 'Sassuolo',
    'Brescia Calcio': 'Brescia',
    'Società Sportiva Lazio S.p.A.': 'Lazio',
    'Bologna Football Club 1909': 'Bologna',
    'Venezia Football Club': 'Venezia',
    'Genoa Cricket and Football Club': 'Genoa',
    'Cagliari Calcio': 'Cagliari',
    'Associazione Calcio Monza': 'Monza',
    'Parma Calcio 1913': 'Parma',
    'Football Club Internazionale Milano S.p.A.': 'Inter',
    'UC Sampdoria': 'Sampdoria',
    'FC Crotone': 'Crotone',
    'Spezia Calcio': 'Spezia',
    'Associazione Calcio Fiorentina': 'Fiorentina',
    'SPAL': 'SPAL',
    'Verona Hellas Football Club': 'Hellas Verona',
    'Atalanta Bergamasca Calcio S.p.a.': 'Atalanta',
    'Benevento Calcio': 'Benevento',
    'US Salernitana 1919': 'Salernitana',
    'Associazione Sportiva Roma': 'Roma',
    'Udinese Calcio': 'Udinese',
    'US Cremonese': 'Cremonese',
    'Società Sportiva Calcio Napoli': 'Napoli',
    'Empoli Football Club S.r.l.': 'Empoli',
    'Unione Sportiva Lecce': 'Lecce',
}

# Display the mapping dictionary
team_name_mappings

{'Sportverein Werder Bremen von 1899': 'Werder Bremen',
 'FC Augsburg 1907': 'Augsburg',
 'FC Schalke 04': 'Schalke 04',
 'Borussia Verein für Leibesübungen 1900 Mönchengladbach': "M'Gladbach",
 'Hertha BSC': 'Hertha BSC',
 'TSG 1899 Hoffenheim Fußball-Spielbetriebs GmbH': 'Hoffenheim',
 'SC Paderborn 07': 'Paderborn 07',
 'Fortuna Düsseldorf': 'Düsseldorf',
 'FC Bayern München': 'Bayern Munich',
 '1. FC Union Berlin': 'Union Berlin',
 'Eintracht Frankfurt Fußball AG': 'Eint Frankfurt',
 '1.FC Köln': 'Köln',
 'RasenBallsport Leipzig': 'RB Leipzig',
 'Borussia Dortmund': 'Dortmund',
 'Verein für Bewegungsspiele Stuttgart 1893': 'Stuttgart',
 'Arminia Bielefeld': 'Arminia',
 'Bayer 04 Leverkusen Fußball': 'Leverkusen',
 'Sport-Club Freiburg': 'Freiburg',
 'Verein für Leibesübungen Wolfsburg': 'Wolfsburg',
 'SpVgg Greuther Fürth': 'Greuther Fürth',
 '1. Fußball- und Sportverein Mainz 05': 'Mainz 05',
 'Verein für Leibesübungen Bochum 1848 Fußballgemeinschaft': 'Bochum',
 'SV Darmstadt 98'

### Load the valuations data

In [821]:
valuations_df = pd.read_csv('output_data/final_valuations_data.csv', index_col=0)

In [822]:
# Update the team names in the valuations DataFrame using the consolidated mapping
valuations_df['team'] = valuations_df['team'].replace(team_name_mappings)

# Display the updated DataFrame to confirm changes
valuations_df.head()

Unnamed: 0,season,league,team,player,market_value_in_eur
0,2019-2020,Bundesliga,Werder Bremen,Claudio Pizarro,400000
1,2019-2020,Bundesliga,Augsburg,Stephan Lichtsteiner,800000
2,2019-2020,EPL,Liverpool,James Milner,6500000
3,2020-2021,EPL,Liverpool,James Milner,3000000
4,2021-2022,EPL,Liverpool,James Milner,2000000


### Totals Combined DF

In [823]:
from rapidfuzz import process, fuzz

def fuzzy_match(left_df, right_df, left_key, right_key, threshold=80):
    """
    Improved fuzzy matching with debugging and token penalty for short matches.
    """
    left_df['normalized'] = left_df[left_key].apply(lambda x: unidecode(x.strip().lower()))
    right_df['normalized'] = right_df[right_key].apply(lambda x: unidecode(x.strip().lower()))

    matches = {}
    for value in left_df['normalized'].unique():
        # Direct match check
        if value in right_df['normalized'].values:
            matches[value] = value
        else:
            # Fuzzy match
            best_match = process.extractOne(value, right_df['normalized'], scorer=fuzz.token_set_ratio)
            if best_match and best_match[1] >= threshold:
                matches[value] = best_match[0]

    # Debugging: Print mismatches
    for key, match in matches.items():
        if key != match:
            print(f"Mismatch: {key} -> {match}")

    # Apply the matches
    left_df['fuzzy_match'] = left_df['normalized'].map(matches)

    return left_df

# Use the updated function
result_df = fuzzy_match(filtered_totals_df, valuations_df, 'player', 'player', threshold=75)

# Step 2: Merge the dataframes based on fuzzy matches and other keys
merged = filtered_totals_df.merge(
    valuations_df,
    how='left',
    left_on=['season', 'league', 'team', 'fuzzy_match'],
    right_on=['season', 'league', 'team', 'normalized']
)

# Step 3: Clean up and rename columns
merged = merged.drop(columns=['fuzzy_match', 'normalized_x', 'normalized_y']).rename(columns={'player_x': 'player'})

# Display the result
merged

Mismatch: emerson palmieri -> emerson
Mismatch: son heung-min -> heung-min son
Mismatch: martinelli -> gabriel martinelli
Mismatch: oliver mcburnie -> oli mcburnie
Mismatch: pierre hojbjerg -> pierre-emile hojbjerg
Mismatch: matthew longstaff -> matty longstaff
Mismatch: trezeguet -> mahmoud trezeguet
Mismatch: jose holebas -> jose cholevas
Mismatch: emi buendia -> emiliano buendia
Mismatch: moanes dabbur -> munas dabbur
Mismatch: joao victor santos sa -> victor sa
Mismatch: kwon chang-hoon -> chang-hoon kwon
Mismatch: leandro barreiro martins -> leandro barreiro
Mismatch: ohis felix uduokhai -> felix uduokhai
Mismatch: cauly oliveira souza -> cauly
Mismatch: dani carvajal -> daniel carvajal
Mismatch: andre-frank zambo anguissa -> frank anguissa
Mismatch: javier ontiveros -> javi ontiveros
Mismatch: yan brice eteki -> yan eteki
Mismatch: jose martinez -> jose antonio martinez
Mismatch: xabier etxeita -> xabi etxeita
Mismatch: daniel parejo -> dani parejo
Mismatch: jose luis gaya -> jos

Unnamed: 0,season,league,team,player,nation,pos,age,playing_time_min,playing_time_mn/mp,playing_time_min%,starts_starts,starts_compl,subs_subs,team_success_ppm,team_success_ong,team_success_onga,standard_gls,standard_sh,standard_sot,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk,standard_pkatt,standard_fk,expected_xg,expected_npxg,expected_npxg/sh,expected_g_xg,expected_np:g_xg,performance_crdy,performance_crdr,performance_2crdy,performance_fls,performance_fld,performance_off,performance_crs,performance_int,performance_tklw,performance_pkwon,performance_pkcon,performance_og,performance_recov,aerial_duels_won,aerial_duels_lost,aerial_duels_won%,sca_sca,gca_gca,tackles_tkl,tackles_tklw,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenges_tkl,challenges_att,challenges_tkl_perc,challenges_lost,blocks_blocks,blocks_sh,blocks_pass,int,tklint,clr,err,total_cmp,total_att,total_cmp_perc,short_cmp,short_att,short_cmp_perc,medium_cmp,medium_att,medium_cmp_perc,long_cmp,long_att,long_cmp_perc,ast,xA,ast_minus_xA,kp,passes_into_final_third,passes_into_penalty_area,progressive_passes,player_y,market_value_in_eur
0,2019-2020,EPL,Liverpool,Virgil van Dijk,NED,DF,28.0,3420.0,90.0,100.0,38.0,38.0,0.0,2.61,85.0,33.0,5.0,31.0,14.0,45.2,0.16,0.36,11.8,0.0,0.0,0.0,2.9,2.9,0.09,2.1,2.1,1.0,0.0,0.0,16.0,25.0,1.0,0.0,40.0,12.0,0.0,0.0,0.0,220.0,188.0,56.0,77.0,35.0,7.0,23.0,12.0,18.0,5.0,0.0,6.0,13.0,46.2,7.0,20.0,15.0,5.0,40.0,63.0,166.0,2.0,2919.0,3276.0,89.1,914.0,988.0,92.5,1638.0,1740.0,94.1,331.0,487.0,68.0,1.0,1.5,-0.5,7.0,182.0,9.0,170.0,Virgil van Dijk,80000000.0
1,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,ENG,DF,20.0,3175.0,84.0,92.8,35.0,31.0,3.0,2.61,80.0,32.0,4.0,44.0,13.0,29.5,0.09,0.31,22.6,0.0,0.0,13.0,2.8,2.8,0.06,1.2,1.2,5.0,0.0,0.0,22.0,9.0,3.0,382.0,45.0,34.0,0.0,0.0,0.0,221.0,15.0,30.0,33.3,155.0,24.0,55.0,34.0,35.0,17.0,3.0,36.0,78.0,46.2,42.0,27.0,9.0,18.0,45.0,100.0,84.0,3.0,2278.0,3230.0,70.5,993.0,1095.0,90.7,912.0,1250.0,73.0,315.0,700.0,45.0,13.0,9.7,3.3,87.0,190.0,88.0,247.0,Trent Alexander-Arnold,99000000.0
2,2019-2020,EPL,Liverpool,Georginio Wijnaldum,NED,MF,28.0,2935.0,79.0,85.8,35.0,20.0,2.0,2.59,74.0,28.0,4.0,36.0,15.0,41.7,0.11,0.27,16.2,0.0,0.0,0.0,2.6,2.6,0.07,1.4,1.4,0.0,0.0,0.0,15.0,32.0,1.0,5.0,17.0,20.0,0.0,0.0,0.0,189.0,30.0,51.0,37.0,55.0,4.0,38.0,20.0,12.0,21.0,5.0,10.0,40.0,25.0,30.0,18.0,4.0,14.0,17.0,55.0,34.0,3.0,1492.0,1646.0,90.6,849.0,914.0,92.9,511.0,544.0,93.9,67.0,79.0,84.8,0.0,1.6,-1.6,18.0,131.0,23.0,137.0,Georginio Wijnaldum,40000000.0
3,2019-2020,EPL,Liverpool,Andrew Robertson,SCO,DF,25.0,3111.0,86.0,91.0,34.0,30.0,2.0,2.64,80.0,31.0,2.0,22.0,6.0,27.3,0.09,0.33,18.6,0.0,0.0,0.0,1.8,1.8,0.08,0.2,0.2,2.0,0.0,0.0,18.0,13.0,2.0,210.0,38.0,31.0,0.0,0.0,0.0,210.0,33.0,34.0,49.3,130.0,22.0,54.0,31.0,17.0,25.0,12.0,27.0,61.0,44.3,34.0,29.0,7.0,22.0,38.0,92.0,72.0,1.0,2391.0,2996.0,79.8,1298.0,1441.0,90.1,874.0,1072.0,81.5,153.0,313.0,48.9,12.0,5.9,6.1,60.0,166.0,56.0,219.0,Andrew Robertson,64000000.0
4,2019-2020,EPL,Liverpool,Roberto Firmino,BRA,FW,27.0,2988.0,79.0,87.4,34.0,14.0,4.0,2.61,75.0,26.0,9.0,99.0,38.0,38.4,0.09,0.24,13.7,0.0,0.0,0.0,14.0,14.0,0.14,-5.0,-5.0,0.0,0.0,0.0,29.0,16.0,6.0,9.0,7.0,19.0,0.0,0.0,0.0,136.0,26.0,74.0,26.0,114.0,20.0,37.0,19.0,8.0,14.0,15.0,10.0,43.0,23.3,33.0,34.0,3.0,31.0,7.0,44.0,11.0,1.0,941.0,1189.0,79.1,568.0,680.0,83.5,245.0,293.0,83.6,39.0,47.0,83.0,8.0,4.0,4.0,52.0,74.0,31.0,111.0,Roberto Firmino,72000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7865,2022-2023,SeriaA,Sampdoria,Koray Günter,GER,DF,27.0,776.0,78.0,22.7,8.0,7.0,2.0,0.60,8.0,20.0,0.0,1.0,0.0,0.0,0.00,0.00,9.6,0.0,0.0,0.0,0.1,0.1,0.07,-0.1,-0.1,2.0,0.0,0.0,7.0,3.0,0.0,1.0,11.0,8.0,0.0,1.0,0.0,47.0,24.0,9.0,72.7,3.0,0.0,10.0,8.0,6.0,4.0,0.0,6.0,9.0,66.7,3.0,10.0,5.0,5.0,11.0,21.0,46.0,0.0,344.0,420.0,81.9,138.0,157.0,87.9,172.0,198.0,86.9,32.0,55.0,58.2,0.0,0.0,0.0,2.0,21.0,2.0,21.0,Koray Günter,3500000.0
7866,2022-2023,SeriaA,Sampdoria,Gonzalo Villar,ESP,MF,24.0,696.0,46.0,20.4,8.0,1.0,7.0,0.40,5.0,14.0,0.0,1.0,0.0,0.0,0.00,0.00,26.1,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,2.0,0.0,0.0,17.0,18.0,0.0,7.0,4.0,7.0,0.0,1.0,0.0,55.0,9.0,5.0,64.3,9.0,0.0,13.0,7.0,4.0,8.0,1.0,7.0,15.0,46.7,8.0,11.0,3.0,8.0,4.0,17.0,9.0,1.0,317.0,381.0,83.2,136.0,162.0,84.0,145.0,165.0,87.9,27.0,41.0,65.9,0.0,0.0,0.0,0.0,25.0,1.0,26.0,Gonzalo Villar,2800000.0
7867,2022-2023,SeriaA,Sampdoria,Fabio Quagliarella,ITA,FW,39.0,800.0,35.0,23.4,7.0,0.0,16.0,0.39,6.0,21.0,1.0,36.0,6.0,16.7,0.03,0.17,16.5,0.0,0.0,0.0,2.8,2.8,0.08,-1.8,-1.8,2.0,0.0,0.0,4.0,19.0,6.0,14.0,0.0,1.0,0.0,0.0,0.0,29.0,9.0,28.0,24.3,23.0,3.0,2.0,1.0,1.0,1.0,0.0,1.0,7.0,14.3,6.0,5.0,1.0,4.0,0.0,2.0,9.0,0.0,159.0,245.0,64.9,78.0,100.0,78.0,49.0,72.0,68.1,18.0,37.0,48.6,1.0,0.0,0.0,7.0,15.0,7.0,25.0,Fabio Quagliarella,500000.0
7868,2022-2023,SeriaA,Sampdoria,Valerio Verre,ITA,MF,28.0,677.0,38.0,19.8,6.0,0.0,12.0,0.50,3.0,12.0,0.0,14.0,3.0,21.4,0.00,0.00,25.3,0.0,0.0,0.0,0.6,0.6,0.04,-0.6,-0.6,5.0,0.0,0.0,14.0,13.0,2.0,33.0,9.0,7.0,0.0,0.0,0.0,61.0,12.0,12.0,50.0,30.0,0.0,14.0,7.0,6.0,5.0,3.0,8.0,17.0,47.1,9.0,9.0,1.0,8.0,9.0,23.0,3.0,0.0,282.0,398.0,70.9,136.0,161.0,84.5,99.0,135.0,73.3,41.0,74.0,55.4,0.0,0.0,0.0,13.0,30.0,14.0,45.0,Valerio Verre,1800000.0


In [824]:
# Remove Goalkeepers
merged = merged[merged['pos'] != 'GK']

In [825]:
merged[merged['market_value_in_eur'].isna()]

Unnamed: 0,season,league,team,player,nation,pos,age,playing_time_min,playing_time_mn/mp,playing_time_min%,starts_starts,starts_compl,subs_subs,team_success_ppm,team_success_ong,team_success_onga,standard_gls,standard_sh,standard_sot,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk,standard_pkatt,standard_fk,expected_xg,expected_npxg,expected_npxg/sh,expected_g_xg,expected_np:g_xg,performance_crdy,performance_crdr,performance_2crdy,performance_fls,performance_fld,performance_off,performance_crs,performance_int,performance_tklw,performance_pkwon,performance_pkcon,performance_og,performance_recov,aerial_duels_won,aerial_duels_lost,aerial_duels_won%,sca_sca,gca_gca,tackles_tkl,tackles_tklw,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenges_tkl,challenges_att,challenges_tkl_perc,challenges_lost,blocks_blocks,blocks_sh,blocks_pass,int,tklint,clr,err,total_cmp,total_att,total_cmp_perc,short_cmp,short_att,short_cmp_perc,medium_cmp,medium_att,medium_cmp_perc,long_cmp,long_att,long_cmp_perc,ast,xA,ast_minus_xA,kp,passes_into_final_third,passes_into_penalty_area,progressive_passes,player_y,market_value_in_eur
121,2019-2020,EPL,Wolves,Jonny Castro,ESP,"MF,DF",25.0,2887.0,82.0,84.4,33.0,27.0,2.0,1.57,42.0,35.0,2.0,18.0,5.0,27.8,0.11,0.40,19.0,0.0,0.0,0.0,1.6,1.6,0.09,0.4,0.4,4.0,0.0,0.0,34.0,28.0,2.0,54.0,53.0,55.0,1.0,0.0,0.0,158.0,35.0,50.0,41.2,64.0,7.0,91.0,55.0,54.0,26.0,11.0,29.0,82.0,35.4,53.0,45.0,10.0,35.0,53.0,144.0,45.0,2.0,1177.0,1451.0,81.1,731.0,810.0,90.2,333.0,426.0,78.2,38.0,68.0,55.9,2.0,1.3,0.7,23.0,47.0,16.0,97.0,,
135,2019-2020,EPL,Arsenal,Nicolas Pépé,CIV,FW,24.0,2010.0,65.0,58.8,22.0,11.0,9.0,1.55,38.0,30.0,5.0,49.0,17.0,34.7,0.08,0.24,20.0,1.0,1.0,7.0,4.9,4.1,0.08,0.1,-0.1,4.0,0.0,0.0,14.0,37.0,4.0,109.0,8.0,17.0,0.0,0.0,0.0,104.0,21.0,41.0,33.9,72.0,9.0,32.0,17.0,15.0,13.0,4.0,8.0,16.0,50.0,8.0,15.0,2.0,13.0,8.0,40.0,5.0,0.0,610.0,824.0,74.0,364.0,415.0,87.7,171.0,222.0,77.0,45.0,101.0,44.6,6.0,4.0,2.0,39.0,28.0,30.0,67.0,,
328,2019-2020,EPL,Aston Villa,Mbwana Samatta,TAN,FW,26.0,943.0,67.0,27.6,11.0,5.0,3.0,0.71,7.0,18.0,1.0,17.0,3.0,17.6,0.06,0.33,10.8,0.0,0.0,0.0,2.3,2.3,0.14,-1.3,-1.3,1.0,0.0,0.0,9.0,18.0,8.0,3.0,1.0,4.0,0.0,0.0,0.0,24.0,40.0,72.0,35.7,17.0,0.0,6.0,4.0,0.0,3.0,3.0,1.0,8.0,12.5,7.0,7.0,0.0,7.0,1.0,7.0,6.0,0.0,119.0,176.0,67.6,76.0,99.0,76.8,27.0,50.0,54.0,9.0,10.0,90.0,0.0,0.2,-0.2,10.0,8.0,2.0,11.0,,
553,2019-2020,Bundesliga,Eint Frankfurt,Obite N'Dicka,FRA,DF,19.0,1721.0,78.0,56.2,20.0,15.0,2.0,1.09,28.0,35.0,1.0,10.0,2.0,20.0,0.10,0.50,13.0,0.0,0.0,0.0,0.7,0.7,0.07,0.3,0.3,3.0,0.0,0.0,17.0,8.0,0.0,10.0,29.0,13.0,0.0,1.0,1.0,145.0,39.0,28.0,58.2,23.0,2.0,27.0,13.0,13.0,11.0,3.0,15.0,28.0,53.6,13.0,28.0,6.0,22.0,29.0,56.0,57.0,0.0,866.0,1069.0,81.0,449.0,499.0,90.0,355.0,419.0,84.7,44.0,99.0,44.4,1.0,0.8,0.2,8.0,62.0,5.0,75.0,,
722,2019-2020,Bundesliga,Düsseldorf,Kasim Nuhu,GHA,DF,24.0,999.0,77.0,32.6,11.0,10.0,2.0,0.77,10.0,28.0,1.0,6.0,3.0,50.0,0.17,0.33,7.4,0.0,0.0,0.0,1.2,1.2,0.20,-0.2,-0.2,0.0,0.0,0.0,9.0,9.0,0.0,0.0,25.0,7.0,0.0,0.0,0.0,61.0,16.0,16.0,50.0,9.0,2.0,14.0,7.0,12.0,1.0,1.0,9.0,19.0,47.4,10.0,16.0,12.0,4.0,25.0,39.0,54.0,0.0,417.0,519.0,80.3,121.0,143.0,84.6,219.0,250.0,87.6,75.0,116.0,64.7,0.0,0.2,-0.2,2.0,35.0,1.0,44.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7596,2022-2023,SeriaA,Fiorentina,Arthur,BRA,FW,24.0,1432.0,51.0,41.9,15.0,5.0,13.0,1.50,25.0,17.0,8.0,52.0,20.0,38.5,0.12,0.30,13.1,2.0,2.0,0.0,7.9,6.3,0.12,0.1,-0.3,4.0,0.0,0.0,26.0,18.0,2.0,4.0,1.0,2.0,0.0,1.0,0.0,39.0,49.0,49.0,50.0,36.0,2.0,9.0,2.0,1.0,4.0,4.0,4.0,8.0,50.0,4.0,10.0,1.0,9.0,1.0,10.0,12.0,0.0,183.0,271.0,67.5,118.0,155.0,76.1,41.0,52.0,78.8,9.0,14.0,64.3,1.0,0.0,0.0,14.0,3.0,7.0,21.0,,
7598,2022-2023,SeriaA,Fiorentina,Nicolás González,ARG,"FW,MF",24.0,1354.0,56.0,39.6,13.0,7.0,11.0,1.54,23.0,16.0,6.0,58.0,17.0,29.3,0.07,0.24,17.4,2.0,2.0,1.0,6.7,5.4,0.09,-0.7,-1.4,3.0,0.0,0.0,16.0,55.0,4.0,31.0,4.0,10.0,0.0,1.0,0.0,60.0,64.0,36.0,64.0,62.0,3.0,17.0,10.0,3.0,10.0,4.0,5.0,16.0,31.3,11.0,14.0,0.0,14.0,4.0,21.0,6.0,0.0,381.0,571.0,66.7,154.0,225.0,68.4,178.0,218.0,81.7,37.0,72.0,51.4,1.0,0.0,0.0,19.0,41.0,21.0,70.0,,
7647,2022-2023,SeriaA,Monza,Carlos,BRA,DF,23.0,3083.0,88.0,90.1,35.0,31.0,0.0,1.40,45.0,46.0,6.0,42.0,16.0,38.1,0.14,0.38,15.0,0.0,0.0,0.0,4.4,4.4,0.11,1.6,1.6,4.0,0.0,0.0,41.0,19.0,6.0,54.0,24.0,38.0,0.0,0.0,0.0,157.0,57.0,33.0,63.3,56.0,9.0,73.0,38.0,42.0,23.0,8.0,33.0,54.0,61.1,21.0,24.0,6.0,18.0,24.0,97.0,85.0,1.0,1430.0,1719.0,83.2,743.0,811.0,91.6,566.0,665.0,85.1,82.0,133.0,61.7,5.0,0.0,0.0,26.0,75.0,27.0,101.0,,
7660,2022-2023,SeriaA,Monza,Pepín,EQG,MF,25.0,1123.0,45.0,32.8,11.0,3.0,14.0,1.36,12.0,19.0,0.0,10.0,3.0,30.0,0.00,0.00,23.2,0.0,0.0,1.0,0.5,0.5,0.05,-0.5,-0.5,4.0,0.0,0.0,19.0,12.0,0.0,11.0,13.0,19.0,0.0,0.0,0.0,66.0,6.0,7.0,46.2,23.0,3.0,28.0,19.0,10.0,11.0,7.0,9.0,27.0,33.3,18.0,14.0,3.0,11.0,13.0,41.0,17.0,0.0,554.0,666.0,83.2,263.0,298.0,88.3,240.0,269.0,89.2,38.0,64.0,59.4,2.0,0.0,0.0,9.0,67.0,18.0,74.0,,


In [826]:
# Drop non-merged 
merged_totals = merged[merged['market_value_in_eur'].notna()]
merged_totals

# Drop irrelevant columns
merged_totals.drop(columns=['player_y'], inplace=True)

# Re-do positions
merged_totals['pos'] = merged_totals['pos'].str[:2]

In [827]:
country_to_continent = {
    # Europe
    "NED": "Europe", "ENG": "Europe", "SCO": "Europe", "CRO": "Europe", "BEL": "Europe", 
    "ESP": "Europe", "POR": "Europe", "GER": "Europe", "FRA": "Europe", "UKR": "Europe", 
    "SWE": "Europe", "WAL": "Europe", "SRB": "Europe", "ITA": "Europe", "DEN": "Europe", 
    "NIR": "Europe", "AUT": "Europe", "BIH": "Europe", "GRE": "Europe", "NOR": "Europe", 
    "CZE": "Europe", "ISL": "Europe", "POL": "Europe", "FIN": "Europe", "SVK": "Europe", 
    "HUN": "Europe", "SVN": "Europe", "LUX": "Europe", "KVX": "Europe", # KVX likely Kosovo
    "MNE": "Europe", "ROU": "Europe", "EST": "Europe", "MDA": "Europe", "BUL": "Europe", 
    "CYP": "Europe", "MKD": "Europe", "RUS": "Europe", # Russia considered UEFA
    "TUR": "Europe", # Turkey plays in UEFA
    "ARM": "Europe", # Armenia in UEFA
    "ALB": "Europe"
    
    # South America
    ,"BRA": "South America", "ARG": "South America", "COL": "South America", "URU": "South America", 
    "PAR": "South America", "CHI": "South America", "ECU": "South America", "PER": "South America", 
    "VEN": "South America", "GUF": "South America" # French Guiana (CONCACAF in football, but geographically in S. America)

    # Africa
    ,"EGY": "Africa", "SEN": "Africa", "GUI": "Africa", # Likely Guinea
    "CMR": "Africa", "ALG": "Africa", "NGA": "Africa", "CIV": "Africa", 
    "MAR": "Africa", "GAB": "Africa", "MLI": "Africa", "TUN": "Africa",
    "GHA": "Africa", "COD": "Africa", # DR Congo
    "ZIM": "Africa", "BFA": "Africa", "TOG": "Africa", "EQG": "Africa", # Equatorial Guinea
    "MAD": "Africa", "MOZ": "Africa", "GNB": "Africa", # Guinea-Bissau
    "RSA": "Africa", # South Africa
    "CPV": "Africa", # Cape Verde
    "CHA": "Africa", # Chad
    "ZAM": "Africa", # Zambia
    "ANG": "Africa", "GAM": "Africa", # Gambia
    "BEN": "Africa", "MTN": "Africa", # Mauritania
    "LBY": "Africa", # Libya
    "SLE": "Africa", # Sierra Leone
    "BDI": "Africa", # Burundi
    "COM": "Africa", # Comoros
    "CGO": "Africa"  # Republic of the Congo

    # Asia
    ,"KOR": "Asia", # South Korea
    "JPN": "Asia", "CHN": "Asia", "IRN": "Asia", "ISR": "Asia", 
    "PHI": "Asia", "UZB": "Asia" # Uzbekistan

    # North & Central America / Caribbean
    ,"USA": "North America", "MEX": "North America", "JAM": "North America",
    "CUB": "North America", "CAN": "North America", "MTQ": "North America", # Martinique (Caribbean)
    "GLP": "North America", # Guadeloupe (Caribbean)
    "NCL": "Oceania", # New Caledonia actually in Oceania
    "SKN": "North America", # Saint Kitts and Nevis
    "DOM": "North America", # Dominican Republic
    "CRC": "North America", # Costa Rica
    "HAI": "North America", # Haiti
    "GRN": "North America", # Grenada
    "PAN": "North America", "HON": "North America"

    # Oceania
    ,"NZL": "Oceania", "AUS": "Oceania"

    # South America or North America for Suriname (geographically in South America but competes in CONCACAF)
    ,"SUR": "South America" 

    # Ambiguous/Unknown codes:
    # 'GEO' (Georgia) is considered Europe (UEFA).
    ,"GEO": "Europe"

    # 'CTA', 'KOR' mentioned, etc.
    # CTA is unclear. Possibly 'Other' or assign based on known data.
    ,"CTA": "Other"
}

merged_totals['continent'] = merged_totals['nation'].map(country_to_continent).fillna("Other")

In [828]:
# Create dummies for categorical variables 
#threshold = 50  # Minimum number of players per country
#nation_counts = merged_totals['continent'].value_counts()
#merged_totals['country_grouped'] = merged_totals['nation'].apply(lambda x: x if nation_counts[x] > threshold else 'Other')

#Create dummy variables for country
country_dummies = pd.get_dummies(merged_totals['continent'], drop_first=True, prefix='continent')

# Create dummy variables for subposition 
pos_dummies = pd.get_dummies(merged_totals['pos'], drop_first=True, prefix='is')

# Create dummy variables for league 
league_dummies = pd.get_dummies(merged_totals['league'], drop_first=True, prefix='league')

# Create dummy for season 
season_dummies = pd.get_dummies(merged_totals['season'], drop_first=True, prefix='season')

# Combine dummy variables with the original DataFrame
merged_totals = pd.concat([merged_totals, pos_dummies, season_dummies, league_dummies, country_dummies], axis=1)

In [829]:
merged_totals

Unnamed: 0,season,league,team,player,nation,pos,age,playing_time_min,playing_time_mn/mp,playing_time_min%,starts_starts,starts_compl,subs_subs,team_success_ppm,team_success_ong,team_success_onga,standard_gls,standard_sh,standard_sot,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk,standard_pkatt,standard_fk,expected_xg,expected_npxg,expected_npxg/sh,expected_g_xg,expected_np:g_xg,performance_crdy,performance_crdr,performance_2crdy,performance_fls,performance_fld,performance_off,performance_crs,performance_int,performance_tklw,performance_pkwon,performance_pkcon,performance_og,performance_recov,aerial_duels_won,aerial_duels_lost,aerial_duels_won%,sca_sca,gca_gca,tackles_tkl,...,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenges_tkl,challenges_att,challenges_tkl_perc,challenges_lost,blocks_blocks,blocks_sh,blocks_pass,int,tklint,clr,err,total_cmp,total_att,total_cmp_perc,short_cmp,short_att,short_cmp_perc,medium_cmp,medium_att,medium_cmp_perc,long_cmp,long_att,long_cmp_perc,ast,xA,ast_minus_xA,kp,passes_into_final_third,passes_into_penalty_area,progressive_passes,market_value_in_eur,continent,is_FW,is_MF,season_2020-2021,season_2021-2022,season_2022-2023,league_EPL,league_LaLiga,league_Ligue1,league_SeriaA,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_Other,continent_South America
0,2019-2020,EPL,Liverpool,Virgil van Dijk,NED,DF,28.0,3420.0,90.0,100.0,38.0,38.0,0.0,2.61,85.0,33.0,5.0,31.0,14.0,45.2,0.16,0.36,11.8,0.0,0.0,0.0,2.9,2.9,0.09,2.1,2.1,1.0,0.0,0.0,16.0,25.0,1.0,0.0,40.0,12.0,0.0,0.0,0.0,220.0,188.0,56.0,77.0,35.0,7.0,23.0,...,18.0,5.0,0.0,6.0,13.0,46.2,7.0,20.0,15.0,5.0,40.0,63.0,166.0,2.0,2919.0,3276.0,89.1,914.0,988.0,92.5,1638.0,1740.0,94.1,331.0,487.0,68.0,1.0,1.5,-0.5,7.0,182.0,9.0,170.0,80000000.0,Europe,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,ENG,DF,20.0,3175.0,84.0,92.8,35.0,31.0,3.0,2.61,80.0,32.0,4.0,44.0,13.0,29.5,0.09,0.31,22.6,0.0,0.0,13.0,2.8,2.8,0.06,1.2,1.2,5.0,0.0,0.0,22.0,9.0,3.0,382.0,45.0,34.0,0.0,0.0,0.0,221.0,15.0,30.0,33.3,155.0,24.0,55.0,...,35.0,17.0,3.0,36.0,78.0,46.2,42.0,27.0,9.0,18.0,45.0,100.0,84.0,3.0,2278.0,3230.0,70.5,993.0,1095.0,90.7,912.0,1250.0,73.0,315.0,700.0,45.0,13.0,9.7,3.3,87.0,190.0,88.0,247.0,99000000.0,Europe,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,2019-2020,EPL,Liverpool,Georginio Wijnaldum,NED,MF,28.0,2935.0,79.0,85.8,35.0,20.0,2.0,2.59,74.0,28.0,4.0,36.0,15.0,41.7,0.11,0.27,16.2,0.0,0.0,0.0,2.6,2.6,0.07,1.4,1.4,0.0,0.0,0.0,15.0,32.0,1.0,5.0,17.0,20.0,0.0,0.0,0.0,189.0,30.0,51.0,37.0,55.0,4.0,38.0,...,12.0,21.0,5.0,10.0,40.0,25.0,30.0,18.0,4.0,14.0,17.0,55.0,34.0,3.0,1492.0,1646.0,90.6,849.0,914.0,92.9,511.0,544.0,93.9,67.0,79.0,84.8,0.0,1.6,-1.6,18.0,131.0,23.0,137.0,40000000.0,Europe,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0
3,2019-2020,EPL,Liverpool,Andrew Robertson,SCO,DF,25.0,3111.0,86.0,91.0,34.0,30.0,2.0,2.64,80.0,31.0,2.0,22.0,6.0,27.3,0.09,0.33,18.6,0.0,0.0,0.0,1.8,1.8,0.08,0.2,0.2,2.0,0.0,0.0,18.0,13.0,2.0,210.0,38.0,31.0,0.0,0.0,0.0,210.0,33.0,34.0,49.3,130.0,22.0,54.0,...,17.0,25.0,12.0,27.0,61.0,44.3,34.0,29.0,7.0,22.0,38.0,92.0,72.0,1.0,2391.0,2996.0,79.8,1298.0,1441.0,90.1,874.0,1072.0,81.5,153.0,313.0,48.9,12.0,5.9,6.1,60.0,166.0,56.0,219.0,64000000.0,Europe,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,2019-2020,EPL,Liverpool,Roberto Firmino,BRA,FW,27.0,2988.0,79.0,87.4,34.0,14.0,4.0,2.61,75.0,26.0,9.0,99.0,38.0,38.4,0.09,0.24,13.7,0.0,0.0,0.0,14.0,14.0,0.14,-5.0,-5.0,0.0,0.0,0.0,29.0,16.0,6.0,9.0,7.0,19.0,0.0,0.0,0.0,136.0,26.0,74.0,26.0,114.0,20.0,37.0,...,8.0,14.0,15.0,10.0,43.0,23.3,33.0,34.0,3.0,31.0,7.0,44.0,11.0,1.0,941.0,1189.0,79.1,568.0,680.0,83.5,245.0,293.0,83.6,39.0,47.0,83.0,8.0,4.0,4.0,52.0,74.0,31.0,111.0,72000000.0,South America,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7865,2022-2023,SeriaA,Sampdoria,Koray Günter,GER,DF,27.0,776.0,78.0,22.7,8.0,7.0,2.0,0.60,8.0,20.0,0.0,1.0,0.0,0.0,0.00,0.00,9.6,0.0,0.0,0.0,0.1,0.1,0.07,-0.1,-0.1,2.0,0.0,0.0,7.0,3.0,0.0,1.0,11.0,8.0,0.0,1.0,0.0,47.0,24.0,9.0,72.7,3.0,0.0,10.0,...,6.0,4.0,0.0,6.0,9.0,66.7,3.0,10.0,5.0,5.0,11.0,21.0,46.0,0.0,344.0,420.0,81.9,138.0,157.0,87.9,172.0,198.0,86.9,32.0,55.0,58.2,0.0,0.0,0.0,2.0,21.0,2.0,21.0,3500000.0,Europe,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0
7866,2022-2023,SeriaA,Sampdoria,Gonzalo Villar,ESP,MF,24.0,696.0,46.0,20.4,8.0,1.0,7.0,0.40,5.0,14.0,0.0,1.0,0.0,0.0,0.00,0.00,26.1,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,2.0,0.0,0.0,17.0,18.0,0.0,7.0,4.0,7.0,0.0,1.0,0.0,55.0,9.0,5.0,64.3,9.0,0.0,13.0,...,4.0,8.0,1.0,7.0,15.0,46.7,8.0,11.0,3.0,8.0,4.0,17.0,9.0,1.0,317.0,381.0,83.2,136.0,162.0,84.0,145.0,165.0,87.9,27.0,41.0,65.9,0.0,0.0,0.0,0.0,25.0,1.0,26.0,2800000.0,Europe,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0
7867,2022-2023,SeriaA,Sampdoria,Fabio Quagliarella,ITA,FW,39.0,800.0,35.0,23.4,7.0,0.0,16.0,0.39,6.0,21.0,1.0,36.0,6.0,16.7,0.03,0.17,16.5,0.0,0.0,0.0,2.8,2.8,0.08,-1.8,-1.8,2.0,0.0,0.0,4.0,19.0,6.0,14.0,0.0,1.0,0.0,0.0,0.0,29.0,9.0,28.0,24.3,23.0,3.0,2.0,...,1.0,1.0,0.0,1.0,7.0,14.3,6.0,5.0,1.0,4.0,0.0,2.0,9.0,0.0,159.0,245.0,64.9,78.0,100.0,78.0,49.0,72.0,68.1,18.0,37.0,48.6,1.0,0.0,0.0,7.0,15.0,7.0,25.0,500000.0,Europe,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0
7868,2022-2023,SeriaA,Sampdoria,Valerio Verre,ITA,MF,28.0,677.0,38.0,19.8,6.0,0.0,12.0,0.50,3.0,12.0,0.0,14.0,3.0,21.4,0.00,0.00,25.3,0.0,0.0,0.0,0.6,0.6,0.04,-0.6,-0.6,5.0,0.0,0.0,14.0,13.0,2.0,33.0,9.0,7.0,0.0,0.0,0.0,61.0,12.0,12.0,50.0,30.0,0.0,14.0,...,6.0,5.0,3.0,8.0,17.0,47.1,9.0,9.0,1.0,8.0,9.0,23.0,3.0,0.0,282.0,398.0,70.9,136.0,161.0,84.5,99.0,135.0,73.3,41.0,74.0,55.4,0.0,0.0,0.0,13.0,30.0,14.0,45.0,1800000.0,Europe,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0


In [830]:
REFERENCE_DATA = merged_totals.sort_values(by=['player', 'season', 'league', 'team']) 


In [831]:
merged_totals.drop(columns=['player', 'league', 'season', 'pos', 'nation', 'continent'], inplace=True)

In [832]:
merged_totals

Unnamed: 0,team,age,playing_time_min,playing_time_mn/mp,playing_time_min%,starts_starts,starts_compl,subs_subs,team_success_ppm,team_success_ong,team_success_onga,standard_gls,standard_sh,standard_sot,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk,standard_pkatt,standard_fk,expected_xg,expected_npxg,expected_npxg/sh,expected_g_xg,expected_np:g_xg,performance_crdy,performance_crdr,performance_2crdy,performance_fls,performance_fld,performance_off,performance_crs,performance_int,performance_tklw,performance_pkwon,performance_pkcon,performance_og,performance_recov,aerial_duels_won,aerial_duels_lost,aerial_duels_won%,sca_sca,gca_gca,tackles_tkl,tackles_tklw,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenges_tkl,challenges_att,challenges_tkl_perc,challenges_lost,blocks_blocks,blocks_sh,blocks_pass,int,tklint,clr,err,total_cmp,total_att,total_cmp_perc,short_cmp,short_att,short_cmp_perc,medium_cmp,medium_att,medium_cmp_perc,long_cmp,long_att,long_cmp_perc,ast,xA,ast_minus_xA,kp,passes_into_final_third,passes_into_penalty_area,progressive_passes,market_value_in_eur,is_FW,is_MF,season_2020-2021,season_2021-2022,season_2022-2023,league_EPL,league_LaLiga,league_Ligue1,league_SeriaA,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_Other,continent_South America
0,Liverpool,28.0,3420.0,90.0,100.0,38.0,38.0,0.0,2.61,85.0,33.0,5.0,31.0,14.0,45.2,0.16,0.36,11.8,0.0,0.0,0.0,2.9,2.9,0.09,2.1,2.1,1.0,0.0,0.0,16.0,25.0,1.0,0.0,40.0,12.0,0.0,0.0,0.0,220.0,188.0,56.0,77.0,35.0,7.0,23.0,12.0,18.0,5.0,0.0,6.0,13.0,46.2,7.0,20.0,15.0,5.0,40.0,63.0,166.0,2.0,2919.0,3276.0,89.1,914.0,988.0,92.5,1638.0,1740.0,94.1,331.0,487.0,68.0,1.0,1.5,-0.5,7.0,182.0,9.0,170.0,80000000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,Liverpool,20.0,3175.0,84.0,92.8,35.0,31.0,3.0,2.61,80.0,32.0,4.0,44.0,13.0,29.5,0.09,0.31,22.6,0.0,0.0,13.0,2.8,2.8,0.06,1.2,1.2,5.0,0.0,0.0,22.0,9.0,3.0,382.0,45.0,34.0,0.0,0.0,0.0,221.0,15.0,30.0,33.3,155.0,24.0,55.0,34.0,35.0,17.0,3.0,36.0,78.0,46.2,42.0,27.0,9.0,18.0,45.0,100.0,84.0,3.0,2278.0,3230.0,70.5,993.0,1095.0,90.7,912.0,1250.0,73.0,315.0,700.0,45.0,13.0,9.7,3.3,87.0,190.0,88.0,247.0,99000000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,Liverpool,28.0,2935.0,79.0,85.8,35.0,20.0,2.0,2.59,74.0,28.0,4.0,36.0,15.0,41.7,0.11,0.27,16.2,0.0,0.0,0.0,2.6,2.6,0.07,1.4,1.4,0.0,0.0,0.0,15.0,32.0,1.0,5.0,17.0,20.0,0.0,0.0,0.0,189.0,30.0,51.0,37.0,55.0,4.0,38.0,20.0,12.0,21.0,5.0,10.0,40.0,25.0,30.0,18.0,4.0,14.0,17.0,55.0,34.0,3.0,1492.0,1646.0,90.6,849.0,914.0,92.9,511.0,544.0,93.9,67.0,79.0,84.8,0.0,1.6,-1.6,18.0,131.0,23.0,137.0,40000000.0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0
3,Liverpool,25.0,3111.0,86.0,91.0,34.0,30.0,2.0,2.64,80.0,31.0,2.0,22.0,6.0,27.3,0.09,0.33,18.6,0.0,0.0,0.0,1.8,1.8,0.08,0.2,0.2,2.0,0.0,0.0,18.0,13.0,2.0,210.0,38.0,31.0,0.0,0.0,0.0,210.0,33.0,34.0,49.3,130.0,22.0,54.0,31.0,17.0,25.0,12.0,27.0,61.0,44.3,34.0,29.0,7.0,22.0,38.0,92.0,72.0,1.0,2391.0,2996.0,79.8,1298.0,1441.0,90.1,874.0,1072.0,81.5,153.0,313.0,48.9,12.0,5.9,6.1,60.0,166.0,56.0,219.0,64000000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,Liverpool,27.0,2988.0,79.0,87.4,34.0,14.0,4.0,2.61,75.0,26.0,9.0,99.0,38.0,38.4,0.09,0.24,13.7,0.0,0.0,0.0,14.0,14.0,0.14,-5.0,-5.0,0.0,0.0,0.0,29.0,16.0,6.0,9.0,7.0,19.0,0.0,0.0,0.0,136.0,26.0,74.0,26.0,114.0,20.0,37.0,19.0,8.0,14.0,15.0,10.0,43.0,23.3,33.0,34.0,3.0,31.0,7.0,44.0,11.0,1.0,941.0,1189.0,79.1,568.0,680.0,83.5,245.0,293.0,83.6,39.0,47.0,83.0,8.0,4.0,4.0,52.0,74.0,31.0,111.0,72000000.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7865,Sampdoria,27.0,776.0,78.0,22.7,8.0,7.0,2.0,0.60,8.0,20.0,0.0,1.0,0.0,0.0,0.00,0.00,9.6,0.0,0.0,0.0,0.1,0.1,0.07,-0.1,-0.1,2.0,0.0,0.0,7.0,3.0,0.0,1.0,11.0,8.0,0.0,1.0,0.0,47.0,24.0,9.0,72.7,3.0,0.0,10.0,8.0,6.0,4.0,0.0,6.0,9.0,66.7,3.0,10.0,5.0,5.0,11.0,21.0,46.0,0.0,344.0,420.0,81.9,138.0,157.0,87.9,172.0,198.0,86.9,32.0,55.0,58.2,0.0,0.0,0.0,2.0,21.0,2.0,21.0,3500000.0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0
7866,Sampdoria,24.0,696.0,46.0,20.4,8.0,1.0,7.0,0.40,5.0,14.0,0.0,1.0,0.0,0.0,0.00,0.00,26.1,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,2.0,0.0,0.0,17.0,18.0,0.0,7.0,4.0,7.0,0.0,1.0,0.0,55.0,9.0,5.0,64.3,9.0,0.0,13.0,7.0,4.0,8.0,1.0,7.0,15.0,46.7,8.0,11.0,3.0,8.0,4.0,17.0,9.0,1.0,317.0,381.0,83.2,136.0,162.0,84.0,145.0,165.0,87.9,27.0,41.0,65.9,0.0,0.0,0.0,0.0,25.0,1.0,26.0,2800000.0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0
7867,Sampdoria,39.0,800.0,35.0,23.4,7.0,0.0,16.0,0.39,6.0,21.0,1.0,36.0,6.0,16.7,0.03,0.17,16.5,0.0,0.0,0.0,2.8,2.8,0.08,-1.8,-1.8,2.0,0.0,0.0,4.0,19.0,6.0,14.0,0.0,1.0,0.0,0.0,0.0,29.0,9.0,28.0,24.3,23.0,3.0,2.0,1.0,1.0,1.0,0.0,1.0,7.0,14.3,6.0,5.0,1.0,4.0,0.0,2.0,9.0,0.0,159.0,245.0,64.9,78.0,100.0,78.0,49.0,72.0,68.1,18.0,37.0,48.6,1.0,0.0,0.0,7.0,15.0,7.0,25.0,500000.0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0
7868,Sampdoria,28.0,677.0,38.0,19.8,6.0,0.0,12.0,0.50,3.0,12.0,0.0,14.0,3.0,21.4,0.00,0.00,25.3,0.0,0.0,0.0,0.6,0.6,0.04,-0.6,-0.6,5.0,0.0,0.0,14.0,13.0,2.0,33.0,9.0,7.0,0.0,0.0,0.0,61.0,12.0,12.0,50.0,30.0,0.0,14.0,7.0,6.0,5.0,3.0,8.0,17.0,47.1,9.0,9.0,1.0,8.0,9.0,23.0,3.0,0.0,282.0,398.0,70.9,136.0,161.0,84.5,99.0,135.0,73.3,41.0,74.0,55.4,0.0,0.0,0.0,13.0,30.0,14.0,45.0,1800000.0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0


In [833]:
# Calculate the global mean
global_mean = merged_totals['market_value_in_eur'].mean()

# Group by 'team' and calculate count and mean
agg = merged_totals.groupby('team')['market_value_in_eur'].agg(['count', 'mean'])
counts = agg['count']
means = agg['mean']

# Apply smoothing
smoothing_factor = 3
smoothed_means = (counts * means + smoothing_factor * global_mean) / (counts + smoothing_factor)

# Create the encoding dictionary
encoding_dict = smoothed_means.to_dict()

# Map the smoothed means back to the DataFrame
merged_totals['team_encoded'] = merged_totals['team'].map(encoding_dict)

In [834]:
merged_totals.drop(columns = 'team', inplace=True)

In [835]:
merged_totals

Unnamed: 0,age,playing_time_min,playing_time_mn/mp,playing_time_min%,starts_starts,starts_compl,subs_subs,team_success_ppm,team_success_ong,team_success_onga,standard_gls,standard_sh,standard_sot,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk,standard_pkatt,standard_fk,expected_xg,expected_npxg,expected_npxg/sh,expected_g_xg,expected_np:g_xg,performance_crdy,performance_crdr,performance_2crdy,performance_fls,performance_fld,performance_off,performance_crs,performance_int,performance_tklw,performance_pkwon,performance_pkcon,performance_og,performance_recov,aerial_duels_won,aerial_duels_lost,aerial_duels_won%,sca_sca,gca_gca,tackles_tkl,tackles_tklw,tackles_def_3rd,tackles_mid_3rd,tackles_att_3rd,challenges_tkl,challenges_att,challenges_tkl_perc,challenges_lost,blocks_blocks,blocks_sh,blocks_pass,int,tklint,clr,err,total_cmp,total_att,total_cmp_perc,short_cmp,short_att,short_cmp_perc,medium_cmp,medium_att,medium_cmp_perc,long_cmp,long_att,long_cmp_perc,ast,xA,ast_minus_xA,kp,passes_into_final_third,passes_into_penalty_area,progressive_passes,market_value_in_eur,is_FW,is_MF,season_2020-2021,season_2021-2022,season_2022-2023,league_EPL,league_LaLiga,league_Ligue1,league_SeriaA,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_Other,continent_South America,team_encoded
0,28.0,3420.0,90.0,100.0,38.0,38.0,0.0,2.61,85.0,33.0,5.0,31.0,14.0,45.2,0.16,0.36,11.8,0.0,0.0,0.0,2.9,2.9,0.09,2.1,2.1,1.0,0.0,0.0,16.0,25.0,1.0,0.0,40.0,12.0,0.0,0.0,0.0,220.0,188.0,56.0,77.0,35.0,7.0,23.0,12.0,18.0,5.0,0.0,6.0,13.0,46.2,7.0,20.0,15.0,5.0,40.0,63.0,166.0,2.0,2919.0,3276.0,89.1,914.0,988.0,92.5,1638.0,1740.0,94.1,331.0,487.0,68.0,1.0,1.5,-0.5,7.0,182.0,9.0,170.0,80000000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,3.739573e+07
1,20.0,3175.0,84.0,92.8,35.0,31.0,3.0,2.61,80.0,32.0,4.0,44.0,13.0,29.5,0.09,0.31,22.6,0.0,0.0,13.0,2.8,2.8,0.06,1.2,1.2,5.0,0.0,0.0,22.0,9.0,3.0,382.0,45.0,34.0,0.0,0.0,0.0,221.0,15.0,30.0,33.3,155.0,24.0,55.0,34.0,35.0,17.0,3.0,36.0,78.0,46.2,42.0,27.0,9.0,18.0,45.0,100.0,84.0,3.0,2278.0,3230.0,70.5,993.0,1095.0,90.7,912.0,1250.0,73.0,315.0,700.0,45.0,13.0,9.7,3.3,87.0,190.0,88.0,247.0,99000000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,3.739573e+07
2,28.0,2935.0,79.0,85.8,35.0,20.0,2.0,2.59,74.0,28.0,4.0,36.0,15.0,41.7,0.11,0.27,16.2,0.0,0.0,0.0,2.6,2.6,0.07,1.4,1.4,0.0,0.0,0.0,15.0,32.0,1.0,5.0,17.0,20.0,0.0,0.0,0.0,189.0,30.0,51.0,37.0,55.0,4.0,38.0,20.0,12.0,21.0,5.0,10.0,40.0,25.0,30.0,18.0,4.0,14.0,17.0,55.0,34.0,3.0,1492.0,1646.0,90.6,849.0,914.0,92.9,511.0,544.0,93.9,67.0,79.0,84.8,0.0,1.6,-1.6,18.0,131.0,23.0,137.0,40000000.0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,3.739573e+07
3,25.0,3111.0,86.0,91.0,34.0,30.0,2.0,2.64,80.0,31.0,2.0,22.0,6.0,27.3,0.09,0.33,18.6,0.0,0.0,0.0,1.8,1.8,0.08,0.2,0.2,2.0,0.0,0.0,18.0,13.0,2.0,210.0,38.0,31.0,0.0,0.0,0.0,210.0,33.0,34.0,49.3,130.0,22.0,54.0,31.0,17.0,25.0,12.0,27.0,61.0,44.3,34.0,29.0,7.0,22.0,38.0,92.0,72.0,1.0,2391.0,2996.0,79.8,1298.0,1441.0,90.1,874.0,1072.0,81.5,153.0,313.0,48.9,12.0,5.9,6.1,60.0,166.0,56.0,219.0,64000000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,3.739573e+07
4,27.0,2988.0,79.0,87.4,34.0,14.0,4.0,2.61,75.0,26.0,9.0,99.0,38.0,38.4,0.09,0.24,13.7,0.0,0.0,0.0,14.0,14.0,0.14,-5.0,-5.0,0.0,0.0,0.0,29.0,16.0,6.0,9.0,7.0,19.0,0.0,0.0,0.0,136.0,26.0,74.0,26.0,114.0,20.0,37.0,19.0,8.0,14.0,15.0,10.0,43.0,23.3,33.0,34.0,3.0,31.0,7.0,44.0,11.0,1.0,941.0,1189.0,79.1,568.0,680.0,83.5,245.0,293.0,83.6,39.0,47.0,83.0,8.0,4.0,4.0,52.0,74.0,31.0,111.0,72000000.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,3.739573e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7865,27.0,776.0,78.0,22.7,8.0,7.0,2.0,0.60,8.0,20.0,0.0,1.0,0.0,0.0,0.00,0.00,9.6,0.0,0.0,0.0,0.1,0.1,0.07,-0.1,-0.1,2.0,0.0,0.0,7.0,3.0,0.0,1.0,11.0,8.0,0.0,1.0,0.0,47.0,24.0,9.0,72.7,3.0,0.0,10.0,8.0,6.0,4.0,0.0,6.0,9.0,66.7,3.0,10.0,5.0,5.0,11.0,21.0,46.0,0.0,344.0,420.0,81.9,138.0,157.0,87.9,172.0,198.0,86.9,32.0,55.0,58.2,0.0,0.0,0.0,2.0,21.0,2.0,21.0,3500000.0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,4.197264e+06
7866,24.0,696.0,46.0,20.4,8.0,1.0,7.0,0.40,5.0,14.0,0.0,1.0,0.0,0.0,0.00,0.00,26.1,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,2.0,0.0,0.0,17.0,18.0,0.0,7.0,4.0,7.0,0.0,1.0,0.0,55.0,9.0,5.0,64.3,9.0,0.0,13.0,7.0,4.0,8.0,1.0,7.0,15.0,46.7,8.0,11.0,3.0,8.0,4.0,17.0,9.0,1.0,317.0,381.0,83.2,136.0,162.0,84.0,145.0,165.0,87.9,27.0,41.0,65.9,0.0,0.0,0.0,0.0,25.0,1.0,26.0,2800000.0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,4.197264e+06
7867,39.0,800.0,35.0,23.4,7.0,0.0,16.0,0.39,6.0,21.0,1.0,36.0,6.0,16.7,0.03,0.17,16.5,0.0,0.0,0.0,2.8,2.8,0.08,-1.8,-1.8,2.0,0.0,0.0,4.0,19.0,6.0,14.0,0.0,1.0,0.0,0.0,0.0,29.0,9.0,28.0,24.3,23.0,3.0,2.0,1.0,1.0,1.0,0.0,1.0,7.0,14.3,6.0,5.0,1.0,4.0,0.0,2.0,9.0,0.0,159.0,245.0,64.9,78.0,100.0,78.0,49.0,72.0,68.1,18.0,37.0,48.6,1.0,0.0,0.0,7.0,15.0,7.0,25.0,500000.0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,4.197264e+06
7868,28.0,677.0,38.0,19.8,6.0,0.0,12.0,0.50,3.0,12.0,0.0,14.0,3.0,21.4,0.00,0.00,25.3,0.0,0.0,0.0,0.6,0.6,0.04,-0.6,-0.6,5.0,0.0,0.0,14.0,13.0,2.0,33.0,9.0,7.0,0.0,0.0,0.0,61.0,12.0,12.0,50.0,30.0,0.0,14.0,7.0,6.0,5.0,3.0,8.0,17.0,47.1,9.0,9.0,1.0,8.0,9.0,23.0,3.0,0.0,282.0,398.0,70.9,136.0,161.0,84.5,99.0,135.0,73.3,41.0,74.0,55.4,0.0,0.0,0.0,13.0,30.0,14.0,45.0,1800000.0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,4.197264e+06


In [836]:
merged_totals.to_csv('output_data/merged_totals_final_df.csv')

### Per90s Combined DF

In [837]:
filtered_per90_df

Unnamed: 0,season,league,team,player,nation,pos,age,playing_time_min,playing_time_mn/mp,playing_time_min%,playing_time_90s,starts_starts_per90,starts_compl_per90,subs_subs_per90,team_success_ppm_per90,team_success_ong_per90,team_success_onga_per90,standard_gls_per90,standard_sh_per90,standard_sot_per90,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk_per90,standard_pkatt_per90,standard_fk_per90,expected_xg_per90,expected_npxg_per90,expected_npxg/sh,expected_g_xg_per90,expected_np:g_xg_per90,performance_crdy_per90,performance_crdr_per90,performance_2crdy_per90,performance_fls_per90,performance_fld_per90,performance_off_per90,performance_crs_per90,performance_int_per90,performance_tklw_per90,performance_pkwon_per90,performance_pkcon_per90,performance_og_per90,performance_recov_per90,aerial_duels_won_per90,aerial_duels_lost_per90,aerial_duels_won%,sca_sca90,gca_gca90,tackles_tkl90,tackles_tklw90,tackles_def_3rd90,tackles_mid_3rd90,tackles_att_3rd90,challenges_tkl90,challenges_att90,challenges_tkl_perc,challenges_lost90,blocks_blocks90,blocks_sh90,blocks_pass90,int90,tklint90,clr90,err90,total_cmp_per90,total_att_per90,total_cmp_perc,total_att_per90.1,short_cmp_per90,short_att_per90,short_cmp_perc,short_att_per90.1,medium_cmp_per90,medium_att_per90,medium_cmp_perc,medium_att_per90.1,long_cmp_per90,long_att_per90,long_cmp_perc,long_att_per90.1,ast_per90,xA_per90,ast_minus_xA_per90,kp_per90,passes_into_final_third_per90,passes_into_penalty_area_per90,progressive_passes_per90
0,2019-2020,EPL,Liverpool,Virgil van Dijk,NED,DF,28.0,3420.0,90.0,100.0,38.0,1.00,1.00,0.00,0.07,2.24,0.87,0.13,0.82,0.37,45.2,0.16,0.36,11.8,0.0,0.0,0.00,0.08,0.08,0.09,0.06,0.06,0.03,0.0,0.0,0.42,0.66,0.03,0.00,1.05,0.32,0.0,0.00,0.0,5.79,4.95,1.47,77.0,0.92,0.18,0.605263,0.315789,0.473684,0.131579,0.000000,0.157895,0.342105,46.2,0.184211,0.526316,0.394737,0.131579,1.052632,1.657895,4.368421,0.052632,76.815789,86.210526,89.1,86.210526,24.052632,26.000000,92.5,26.000000,43.105263,45.789474,94.1,45.789474,8.710526,12.815789,68.0,12.815789,0.026316,0.039474,-0.013158,0.184211,4.789474,0.236842,4.473684
1,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,ENG,DF,20.0,3175.0,84.0,92.8,35.3,0.99,0.88,0.08,0.07,2.27,0.91,0.11,1.25,0.37,29.5,0.09,0.31,22.6,0.0,0.0,0.37,0.08,0.08,0.06,0.03,0.03,0.14,0.0,0.0,0.62,0.25,0.08,10.82,1.27,0.96,0.0,0.00,0.0,6.26,0.42,0.85,33.3,4.39,0.68,1.558074,0.963173,0.991501,0.481586,0.084986,1.019830,2.209632,46.2,1.189802,0.764873,0.254958,0.509915,1.274788,2.832861,2.379603,0.084986,64.532578,91.501416,70.5,91.501416,28.130312,31.019830,90.7,31.019830,25.835694,35.410765,73.0,35.410765,8.923513,19.830028,45.0,19.830028,0.368272,0.274788,0.093484,2.464589,5.382436,2.492918,6.997167
2,2019-2020,EPL,Liverpool,Georginio Wijnaldum,NED,MF,28.0,2935.0,79.0,85.8,32.6,1.07,0.61,0.06,0.08,2.27,0.86,0.12,1.10,0.46,41.7,0.11,0.27,16.2,0.0,0.0,0.00,0.08,0.08,0.07,0.04,0.04,0.00,0.0,0.0,0.46,0.98,0.03,0.15,0.52,0.61,0.0,0.00,0.0,5.80,0.92,1.56,37.0,1.69,0.12,1.165644,0.613497,0.368098,0.644172,0.153374,0.306748,1.226994,25.0,0.920245,0.552147,0.122699,0.429448,0.521472,1.687117,1.042945,0.092025,45.766871,50.490798,90.6,50.490798,26.042945,28.036810,92.9,28.036810,15.674847,16.687117,93.9,16.687117,2.055215,2.423313,84.8,2.423313,0.000000,0.049080,-0.049080,0.552147,4.018405,0.705521,4.202454
3,2019-2020,EPL,Liverpool,Andrew Robertson,SCO,DF,25.0,3111.0,86.0,91.0,34.6,0.98,0.87,0.06,0.08,2.31,0.90,0.06,0.64,0.17,27.3,0.09,0.33,18.6,0.0,0.0,0.00,0.05,0.05,0.08,0.01,0.01,0.06,0.0,0.0,0.52,0.38,0.06,6.07,1.10,0.90,0.0,0.00,0.0,6.07,0.95,0.98,49.3,3.76,0.64,1.560694,0.895954,0.491329,0.722543,0.346821,0.780347,1.763006,44.3,0.982659,0.838150,0.202312,0.635838,1.098266,2.658960,2.080925,0.028902,69.104046,86.589595,79.8,86.589595,37.514451,41.647399,90.1,41.647399,25.260116,30.982659,81.5,30.982659,4.421965,9.046243,48.9,9.046243,0.346821,0.170520,0.176301,1.734104,4.797688,1.618497,6.329480
4,2019-2020,EPL,Liverpool,Roberto Firmino,BRA,FW,27.0,2988.0,79.0,87.4,33.2,1.02,0.42,0.12,0.08,2.26,0.78,0.27,2.98,1.14,38.4,0.09,0.24,13.7,0.0,0.0,0.00,0.42,0.42,0.14,-0.15,-0.15,0.00,0.0,0.0,0.87,0.48,0.18,0.27,0.21,0.57,0.0,0.00,0.0,4.10,0.78,2.23,26.0,3.43,0.60,1.114458,0.572289,0.240964,0.421687,0.451807,0.301205,1.295181,23.3,0.993976,1.024096,0.090361,0.933735,0.210843,1.325301,0.331325,0.030120,28.343373,35.813253,79.1,35.813253,17.108434,20.481928,83.5,20.481928,7.379518,8.825301,83.6,8.825301,1.174699,1.415663,83.0,1.415663,0.240964,0.120482,0.120482,1.566265,2.228916,0.933735,3.343373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11344,2022-2023,SeriaA,Sampdoria,Koray Günter,GER,DF,27.0,776.0,78.0,22.7,8.6,0.93,0.81,0.23,0.07,0.93,2.33,0.00,0.12,0.00,0.0,0.00,0.00,9.6,0.0,0.0,0.00,0.01,0.01,0.07,-0.01,-0.01,0.23,0.0,0.0,0.81,0.35,0.00,0.12,1.28,0.93,0.0,0.12,0.0,5.47,2.79,1.05,72.7,0.35,0.00,1.162791,0.930233,0.697674,0.465116,0.000000,0.697674,1.046512,66.7,0.348837,1.162791,0.581395,0.581395,1.279070,2.441860,5.348837,0.000000,40.000000,48.837209,81.9,48.837209,16.046512,18.255814,87.9,18.255814,20.000000,23.023256,86.9,23.023256,3.720930,6.395349,58.2,6.395349,0.000000,0.000000,0.000000,0.232558,2.441860,0.232558,2.441860
11345,2022-2023,SeriaA,Sampdoria,Gonzalo Villar,ESP,MF,24.0,696.0,46.0,20.4,7.7,1.04,0.13,0.91,0.05,0.65,1.82,0.00,0.13,0.00,0.0,0.00,0.00,26.1,0.0,0.0,0.00,0.00,0.00,0.03,0.00,0.00,0.26,0.0,0.0,2.21,2.34,0.00,0.91,0.52,0.91,0.0,0.13,0.0,7.14,1.17,0.65,64.3,1.17,0.00,1.688312,0.909091,0.519481,1.038961,0.129870,0.909091,1.948052,46.7,1.038961,1.428571,0.389610,1.038961,0.519481,2.207792,1.168831,0.129870,41.168831,49.480519,83.2,49.480519,17.662338,21.038961,84.0,21.038961,18.831169,21.428571,87.9,21.428571,3.506494,5.324675,65.9,5.324675,0.000000,0.000000,0.000000,0.000000,3.246753,0.129870,3.376623
11346,2022-2023,SeriaA,Sampdoria,Fabio Quagliarella,ITA,FW,39.0,800.0,35.0,23.4,8.9,0.79,0.00,1.80,0.04,0.67,2.36,0.11,4.04,0.67,16.7,0.03,0.17,16.5,0.0,0.0,0.00,0.31,0.31,0.08,-0.20,-0.20,0.22,0.0,0.0,0.45,2.13,0.67,1.57,0.00,0.11,0.0,0.00,0.0,3.26,1.01,3.15,24.3,2.57,0.34,0.224719,0.112360,0.112360,0.112360,0.000000,0.112360,0.786517,14.3,0.674157,0.561798,0.112360,0.449438,0.000000,0.224719,1.011236,0.000000,17.865169,27.528090,64.9,27.528090,8.764045,11.235955,78.0,11.235955,5.505618,8.089888,68.1,8.089888,2.022472,4.157303,48.6,4.157303,0.112360,0.000000,0.000000,0.786517,1.685393,0.786517,2.808989
11347,2022-2023,SeriaA,Sampdoria,Valerio Verre,ITA,MF,28.0,677.0,38.0,19.8,7.5,0.80,0.00,1.60,0.07,0.40,1.60,0.00,1.87,0.40,21.4,0.00,0.00,25.3,0.0,0.0,0.00,0.08,0.08,0.04,-0.08,-0.08,0.67,0.0,0.0,1.87,1.73,0.27,4.40,1.20,0.93,0.0,0.00,0.0,8.13,1.60,1.60,50.0,3.98,0.00,1.866667,0.933333,0.800000,0.666667,0.400000,1.066667,2.266667,47.1,1.200000,1.200000,0.133333,1.066667,1.200000,3.066667,0.400000,0.000000,37.600000,53.066667,70.9,53.066667,18.133333,21.466667,84.5,21.466667,13.200000,18.000000,73.3,18.000000,5.466667,9.866667,55.4,9.866667,0.000000,0.000000,0.000000,1.733333,4.000000,1.866667,6.000000


In [838]:
from rapidfuzz import process, fuzz

def fuzzy_match(left_df, right_df, left_key, right_key, threshold=80):
    """
    Improved fuzzy matching with debugging and token penalty for short matches.
    """
    left_df['normalized'] = left_df[left_key].apply(lambda x: unidecode(x.strip().lower()))
    right_df['normalized'] = right_df[right_key].apply(lambda x: unidecode(x.strip().lower()))

    matches = {}
    for value in left_df['normalized'].unique():
        # Direct match check
        if value in right_df['normalized'].values:
            matches[value] = value
        else:
            # Fuzzy match
            best_match = process.extractOne(value, right_df['normalized'], scorer=fuzz.token_set_ratio)
            if best_match and best_match[1] >= threshold:
                matches[value] = best_match[0]

    # Debugging: Print mismatches
    for key, match in matches.items():
        if key != match:
            print(f"Mismatch: {key} -> {match}")

    # Apply the matches
    left_df['fuzzy_match'] = left_df['normalized'].map(matches)

    return left_df

# Use the updated function
result_df = fuzzy_match(filtered_per90_df, valuations_df, 'player', 'player', threshold=75)

# Step 2: Merge the dataframes based on fuzzy matches and other keys
merged = filtered_per90_df.merge(
    valuations_df,
    how='left',
    left_on=['season', 'league', 'team', 'fuzzy_match'],
    right_on=['season', 'league', 'team', 'normalized']
)

# Step 3: Clean up and rename columns
merged = merged.drop(columns=['fuzzy_match', 'normalized_x', 'normalized_y']).rename(columns={'player_x': 'player'})

# Display the result
merged

Mismatch: emerson palmieri -> emerson
Mismatch: son heung-min -> heung-min son
Mismatch: martinelli -> gabriel martinelli
Mismatch: oliver mcburnie -> oli mcburnie
Mismatch: pierre hojbjerg -> pierre-emile hojbjerg
Mismatch: matthew longstaff -> matty longstaff
Mismatch: trezeguet -> mahmoud trezeguet
Mismatch: jose holebas -> jose cholevas
Mismatch: emi buendia -> emiliano buendia
Mismatch: moanes dabbur -> munas dabbur
Mismatch: joao victor santos sa -> victor sa
Mismatch: kwon chang-hoon -> chang-hoon kwon
Mismatch: leandro barreiro martins -> leandro barreiro
Mismatch: ohis felix uduokhai -> felix uduokhai
Mismatch: cauly oliveira souza -> cauly
Mismatch: dani carvajal -> daniel carvajal
Mismatch: andre-frank zambo anguissa -> frank anguissa
Mismatch: javier ontiveros -> javi ontiveros
Mismatch: yan brice eteki -> yan eteki
Mismatch: jose martinez -> jose antonio martinez
Mismatch: xabier etxeita -> xabi etxeita
Mismatch: daniel parejo -> dani parejo
Mismatch: jose luis gaya -> jos

Unnamed: 0,season,league,team,player,nation,pos,age,playing_time_min,playing_time_mn/mp,playing_time_min%,playing_time_90s,starts_starts_per90,starts_compl_per90,subs_subs_per90,team_success_ppm_per90,team_success_ong_per90,team_success_onga_per90,standard_gls_per90,standard_sh_per90,standard_sot_per90,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk_per90,standard_pkatt_per90,standard_fk_per90,expected_xg_per90,expected_npxg_per90,expected_npxg/sh,expected_g_xg_per90,expected_np:g_xg_per90,performance_crdy_per90,performance_crdr_per90,performance_2crdy_per90,performance_fls_per90,performance_fld_per90,performance_off_per90,performance_crs_per90,performance_int_per90,performance_tklw_per90,performance_pkwon_per90,performance_pkcon_per90,performance_og_per90,performance_recov_per90,aerial_duels_won_per90,aerial_duels_lost_per90,aerial_duels_won%,sca_sca90,gca_gca90,tackles_tkl90,tackles_tklw90,tackles_def_3rd90,tackles_mid_3rd90,tackles_att_3rd90,challenges_tkl90,challenges_att90,challenges_tkl_perc,challenges_lost90,blocks_blocks90,blocks_sh90,blocks_pass90,int90,tklint90,clr90,err90,total_cmp_per90,total_att_per90,total_cmp_perc,total_att_per90.1,short_cmp_per90,short_att_per90,short_cmp_perc,short_att_per90.1,medium_cmp_per90,medium_att_per90,medium_cmp_perc,medium_att_per90.1,long_cmp_per90,long_att_per90,long_cmp_perc,long_att_per90.1,ast_per90,xA_per90,ast_minus_xA_per90,kp_per90,passes_into_final_third_per90,passes_into_penalty_area_per90,progressive_passes_per90,player_y,market_value_in_eur
0,2019-2020,EPL,Liverpool,Virgil van Dijk,NED,DF,28.0,3420.0,90.0,100.0,38.0,1.00,1.00,0.00,0.07,2.24,0.87,0.13,0.82,0.37,45.2,0.16,0.36,11.8,0.0,0.0,0.00,0.08,0.08,0.09,0.06,0.06,0.03,0.0,0.0,0.42,0.66,0.03,0.00,1.05,0.32,0.0,0.00,0.0,5.79,4.95,1.47,77.0,0.92,0.18,0.605263,0.315789,0.473684,0.131579,0.000000,0.157895,0.342105,46.2,0.184211,0.526316,0.394737,0.131579,1.052632,1.657895,4.368421,0.052632,76.815789,86.210526,89.1,86.210526,24.052632,26.000000,92.5,26.000000,43.105263,45.789474,94.1,45.789474,8.710526,12.815789,68.0,12.815789,0.026316,0.039474,-0.013158,0.184211,4.789474,0.236842,4.473684,Virgil van Dijk,80000000.0
1,2019-2020,EPL,Liverpool,Trent Alexander-Arnold,ENG,DF,20.0,3175.0,84.0,92.8,35.3,0.99,0.88,0.08,0.07,2.27,0.91,0.11,1.25,0.37,29.5,0.09,0.31,22.6,0.0,0.0,0.37,0.08,0.08,0.06,0.03,0.03,0.14,0.0,0.0,0.62,0.25,0.08,10.82,1.27,0.96,0.0,0.00,0.0,6.26,0.42,0.85,33.3,4.39,0.68,1.558074,0.963173,0.991501,0.481586,0.084986,1.019830,2.209632,46.2,1.189802,0.764873,0.254958,0.509915,1.274788,2.832861,2.379603,0.084986,64.532578,91.501416,70.5,91.501416,28.130312,31.019830,90.7,31.019830,25.835694,35.410765,73.0,35.410765,8.923513,19.830028,45.0,19.830028,0.368272,0.274788,0.093484,2.464589,5.382436,2.492918,6.997167,Trent Alexander-Arnold,99000000.0
2,2019-2020,EPL,Liverpool,Georginio Wijnaldum,NED,MF,28.0,2935.0,79.0,85.8,32.6,1.07,0.61,0.06,0.08,2.27,0.86,0.12,1.10,0.46,41.7,0.11,0.27,16.2,0.0,0.0,0.00,0.08,0.08,0.07,0.04,0.04,0.00,0.0,0.0,0.46,0.98,0.03,0.15,0.52,0.61,0.0,0.00,0.0,5.80,0.92,1.56,37.0,1.69,0.12,1.165644,0.613497,0.368098,0.644172,0.153374,0.306748,1.226994,25.0,0.920245,0.552147,0.122699,0.429448,0.521472,1.687117,1.042945,0.092025,45.766871,50.490798,90.6,50.490798,26.042945,28.036810,92.9,28.036810,15.674847,16.687117,93.9,16.687117,2.055215,2.423313,84.8,2.423313,0.000000,0.049080,-0.049080,0.552147,4.018405,0.705521,4.202454,Georginio Wijnaldum,40000000.0
3,2019-2020,EPL,Liverpool,Andrew Robertson,SCO,DF,25.0,3111.0,86.0,91.0,34.6,0.98,0.87,0.06,0.08,2.31,0.90,0.06,0.64,0.17,27.3,0.09,0.33,18.6,0.0,0.0,0.00,0.05,0.05,0.08,0.01,0.01,0.06,0.0,0.0,0.52,0.38,0.06,6.07,1.10,0.90,0.0,0.00,0.0,6.07,0.95,0.98,49.3,3.76,0.64,1.560694,0.895954,0.491329,0.722543,0.346821,0.780347,1.763006,44.3,0.982659,0.838150,0.202312,0.635838,1.098266,2.658960,2.080925,0.028902,69.104046,86.589595,79.8,86.589595,37.514451,41.647399,90.1,41.647399,25.260116,30.982659,81.5,30.982659,4.421965,9.046243,48.9,9.046243,0.346821,0.170520,0.176301,1.734104,4.797688,1.618497,6.329480,Andrew Robertson,64000000.0
4,2019-2020,EPL,Liverpool,Roberto Firmino,BRA,FW,27.0,2988.0,79.0,87.4,33.2,1.02,0.42,0.12,0.08,2.26,0.78,0.27,2.98,1.14,38.4,0.09,0.24,13.7,0.0,0.0,0.00,0.42,0.42,0.14,-0.15,-0.15,0.00,0.0,0.0,0.87,0.48,0.18,0.27,0.21,0.57,0.0,0.00,0.0,4.10,0.78,2.23,26.0,3.43,0.60,1.114458,0.572289,0.240964,0.421687,0.451807,0.301205,1.295181,23.3,0.993976,1.024096,0.090361,0.933735,0.210843,1.325301,0.331325,0.030120,28.343373,35.813253,79.1,35.813253,17.108434,20.481928,83.5,20.481928,7.379518,8.825301,83.6,8.825301,1.174699,1.415663,83.0,1.415663,0.240964,0.120482,0.120482,1.566265,2.228916,0.933735,3.343373,Roberto Firmino,72000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7865,2022-2023,SeriaA,Sampdoria,Koray Günter,GER,DF,27.0,776.0,78.0,22.7,8.6,0.93,0.81,0.23,0.07,0.93,2.33,0.00,0.12,0.00,0.0,0.00,0.00,9.6,0.0,0.0,0.00,0.01,0.01,0.07,-0.01,-0.01,0.23,0.0,0.0,0.81,0.35,0.00,0.12,1.28,0.93,0.0,0.12,0.0,5.47,2.79,1.05,72.7,0.35,0.00,1.162791,0.930233,0.697674,0.465116,0.000000,0.697674,1.046512,66.7,0.348837,1.162791,0.581395,0.581395,1.279070,2.441860,5.348837,0.000000,40.000000,48.837209,81.9,48.837209,16.046512,18.255814,87.9,18.255814,20.000000,23.023256,86.9,23.023256,3.720930,6.395349,58.2,6.395349,0.000000,0.000000,0.000000,0.232558,2.441860,0.232558,2.441860,Koray Günter,3500000.0
7866,2022-2023,SeriaA,Sampdoria,Gonzalo Villar,ESP,MF,24.0,696.0,46.0,20.4,7.7,1.04,0.13,0.91,0.05,0.65,1.82,0.00,0.13,0.00,0.0,0.00,0.00,26.1,0.0,0.0,0.00,0.00,0.00,0.03,0.00,0.00,0.26,0.0,0.0,2.21,2.34,0.00,0.91,0.52,0.91,0.0,0.13,0.0,7.14,1.17,0.65,64.3,1.17,0.00,1.688312,0.909091,0.519481,1.038961,0.129870,0.909091,1.948052,46.7,1.038961,1.428571,0.389610,1.038961,0.519481,2.207792,1.168831,0.129870,41.168831,49.480519,83.2,49.480519,17.662338,21.038961,84.0,21.038961,18.831169,21.428571,87.9,21.428571,3.506494,5.324675,65.9,5.324675,0.000000,0.000000,0.000000,0.000000,3.246753,0.129870,3.376623,Gonzalo Villar,2800000.0
7867,2022-2023,SeriaA,Sampdoria,Fabio Quagliarella,ITA,FW,39.0,800.0,35.0,23.4,8.9,0.79,0.00,1.80,0.04,0.67,2.36,0.11,4.04,0.67,16.7,0.03,0.17,16.5,0.0,0.0,0.00,0.31,0.31,0.08,-0.20,-0.20,0.22,0.0,0.0,0.45,2.13,0.67,1.57,0.00,0.11,0.0,0.00,0.0,3.26,1.01,3.15,24.3,2.57,0.34,0.224719,0.112360,0.112360,0.112360,0.000000,0.112360,0.786517,14.3,0.674157,0.561798,0.112360,0.449438,0.000000,0.224719,1.011236,0.000000,17.865169,27.528090,64.9,27.528090,8.764045,11.235955,78.0,11.235955,5.505618,8.089888,68.1,8.089888,2.022472,4.157303,48.6,4.157303,0.112360,0.000000,0.000000,0.786517,1.685393,0.786517,2.808989,Fabio Quagliarella,500000.0
7868,2022-2023,SeriaA,Sampdoria,Valerio Verre,ITA,MF,28.0,677.0,38.0,19.8,7.5,0.80,0.00,1.60,0.07,0.40,1.60,0.00,1.87,0.40,21.4,0.00,0.00,25.3,0.0,0.0,0.00,0.08,0.08,0.04,-0.08,-0.08,0.67,0.0,0.0,1.87,1.73,0.27,4.40,1.20,0.93,0.0,0.00,0.0,8.13,1.60,1.60,50.0,3.98,0.00,1.866667,0.933333,0.800000,0.666667,0.400000,1.066667,2.266667,47.1,1.200000,1.200000,0.133333,1.066667,1.200000,3.066667,0.400000,0.000000,37.600000,53.066667,70.9,53.066667,18.133333,21.466667,84.5,21.466667,13.200000,18.000000,73.3,18.000000,5.466667,9.866667,55.4,9.866667,0.000000,0.000000,0.000000,1.733333,4.000000,1.866667,6.000000,Valerio Verre,1800000.0


In [839]:
# Remove Goalkeepers
merged = merged[merged['pos'] != 'GK']

In [840]:
# Drop non-merged 
merged_totals = merged[merged['market_value_in_eur'].notna()]
merged_totals

# Drop irrelevant columns
merged_totals.drop(columns=['player_y'], inplace=True)

# Re-do positions
merged_totals['pos'] = merged_totals['pos'].str[:2]

In [841]:
merged_totals['continent'] = merged_totals['nation'].map(country_to_continent).fillna("Other")

In [842]:
#Create dummy variables for country
country_dummies = pd.get_dummies(merged_totals['continent'], drop_first=True, prefix='continent')

# Create dummy variables for subposition 
pos_dummies = pd.get_dummies(merged_totals['pos'], drop_first=True, prefix='is')

# Create dummy variables for league 
league_dummies = pd.get_dummies(merged_totals['league'], drop_first=True, prefix='league')

# Create dummy for season 
season_dummies = pd.get_dummies(merged_totals['season'], drop_first=True, prefix='season')

# Combine dummy variables with the original DataFrame
merged_90s = pd.concat([merged_totals, pos_dummies, season_dummies, league_dummies, country_dummies], axis=1)

In [843]:
merged_90s.drop(columns=['player', 'league', 'season', 'pos', 'nation', 'continent'], inplace=True)

In [844]:
merged_90s.to_csv('output_data/merged_90s_df.csv')

In [845]:
merged_90s

Unnamed: 0,team,age,playing_time_min,playing_time_mn/mp,playing_time_min%,playing_time_90s,starts_starts_per90,starts_compl_per90,subs_subs_per90,team_success_ppm_per90,team_success_ong_per90,team_success_onga_per90,standard_gls_per90,standard_sh_per90,standard_sot_per90,standard_sot%,standard_g/sh,standard_g/sot,standard_dist,standard_pk_per90,standard_pkatt_per90,standard_fk_per90,expected_xg_per90,expected_npxg_per90,expected_npxg/sh,expected_g_xg_per90,expected_np:g_xg_per90,performance_crdy_per90,performance_crdr_per90,performance_2crdy_per90,performance_fls_per90,performance_fld_per90,performance_off_per90,performance_crs_per90,performance_int_per90,performance_tklw_per90,performance_pkwon_per90,performance_pkcon_per90,performance_og_per90,performance_recov_per90,aerial_duels_won_per90,aerial_duels_lost_per90,aerial_duels_won%,sca_sca90,gca_gca90,tackles_tkl90,tackles_tklw90,tackles_def_3rd90,tackles_mid_3rd90,tackles_att_3rd90,challenges_tkl90,challenges_att90,challenges_tkl_perc,challenges_lost90,blocks_blocks90,blocks_sh90,blocks_pass90,int90,tklint90,clr90,err90,total_cmp_per90,total_att_per90,total_cmp_perc,total_att_per90.1,short_cmp_per90,short_att_per90,short_cmp_perc,short_att_per90.1,medium_cmp_per90,medium_att_per90,medium_cmp_perc,medium_att_per90.1,long_cmp_per90,long_att_per90,long_cmp_perc,long_att_per90.1,ast_per90,xA_per90,ast_minus_xA_per90,kp_per90,passes_into_final_third_per90,passes_into_penalty_area_per90,progressive_passes_per90,market_value_in_eur,is_FW,is_MF,season_2020-2021,season_2021-2022,season_2022-2023,league_EPL,league_LaLiga,league_Ligue1,league_SeriaA,continent_Asia,continent_Europe,continent_North America,continent_Oceania,continent_Other,continent_South America
0,Liverpool,28.0,3420.0,90.0,100.0,38.0,1.00,1.00,0.00,0.07,2.24,0.87,0.13,0.82,0.37,45.2,0.16,0.36,11.8,0.0,0.0,0.00,0.08,0.08,0.09,0.06,0.06,0.03,0.0,0.0,0.42,0.66,0.03,0.00,1.05,0.32,0.0,0.00,0.0,5.79,4.95,1.47,77.0,0.92,0.18,0.605263,0.315789,0.473684,0.131579,0.000000,0.157895,0.342105,46.2,0.184211,0.526316,0.394737,0.131579,1.052632,1.657895,4.368421,0.052632,76.815789,86.210526,89.1,86.210526,24.052632,26.000000,92.5,26.000000,43.105263,45.789474,94.1,45.789474,8.710526,12.815789,68.0,12.815789,0.026316,0.039474,-0.013158,0.184211,4.789474,0.236842,4.473684,80000000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
1,Liverpool,20.0,3175.0,84.0,92.8,35.3,0.99,0.88,0.08,0.07,2.27,0.91,0.11,1.25,0.37,29.5,0.09,0.31,22.6,0.0,0.0,0.37,0.08,0.08,0.06,0.03,0.03,0.14,0.0,0.0,0.62,0.25,0.08,10.82,1.27,0.96,0.0,0.00,0.0,6.26,0.42,0.85,33.3,4.39,0.68,1.558074,0.963173,0.991501,0.481586,0.084986,1.019830,2.209632,46.2,1.189802,0.764873,0.254958,0.509915,1.274788,2.832861,2.379603,0.084986,64.532578,91.501416,70.5,91.501416,28.130312,31.019830,90.7,31.019830,25.835694,35.410765,73.0,35.410765,8.923513,19.830028,45.0,19.830028,0.368272,0.274788,0.093484,2.464589,5.382436,2.492918,6.997167,99000000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,Liverpool,28.0,2935.0,79.0,85.8,32.6,1.07,0.61,0.06,0.08,2.27,0.86,0.12,1.10,0.46,41.7,0.11,0.27,16.2,0.0,0.0,0.00,0.08,0.08,0.07,0.04,0.04,0.00,0.0,0.0,0.46,0.98,0.03,0.15,0.52,0.61,0.0,0.00,0.0,5.80,0.92,1.56,37.0,1.69,0.12,1.165644,0.613497,0.368098,0.644172,0.153374,0.306748,1.226994,25.0,0.920245,0.552147,0.122699,0.429448,0.521472,1.687117,1.042945,0.092025,45.766871,50.490798,90.6,50.490798,26.042945,28.036810,92.9,28.036810,15.674847,16.687117,93.9,16.687117,2.055215,2.423313,84.8,2.423313,0.000000,0.049080,-0.049080,0.552147,4.018405,0.705521,4.202454,40000000.0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0
3,Liverpool,25.0,3111.0,86.0,91.0,34.6,0.98,0.87,0.06,0.08,2.31,0.90,0.06,0.64,0.17,27.3,0.09,0.33,18.6,0.0,0.0,0.00,0.05,0.05,0.08,0.01,0.01,0.06,0.0,0.0,0.52,0.38,0.06,6.07,1.10,0.90,0.0,0.00,0.0,6.07,0.95,0.98,49.3,3.76,0.64,1.560694,0.895954,0.491329,0.722543,0.346821,0.780347,1.763006,44.3,0.982659,0.838150,0.202312,0.635838,1.098266,2.658960,2.080925,0.028902,69.104046,86.589595,79.8,86.589595,37.514451,41.647399,90.1,41.647399,25.260116,30.982659,81.5,30.982659,4.421965,9.046243,48.9,9.046243,0.346821,0.170520,0.176301,1.734104,4.797688,1.618497,6.329480,64000000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,Liverpool,27.0,2988.0,79.0,87.4,33.2,1.02,0.42,0.12,0.08,2.26,0.78,0.27,2.98,1.14,38.4,0.09,0.24,13.7,0.0,0.0,0.00,0.42,0.42,0.14,-0.15,-0.15,0.00,0.0,0.0,0.87,0.48,0.18,0.27,0.21,0.57,0.0,0.00,0.0,4.10,0.78,2.23,26.0,3.43,0.60,1.114458,0.572289,0.240964,0.421687,0.451807,0.301205,1.295181,23.3,0.993976,1.024096,0.090361,0.933735,0.210843,1.325301,0.331325,0.030120,28.343373,35.813253,79.1,35.813253,17.108434,20.481928,83.5,20.481928,7.379518,8.825301,83.6,8.825301,1.174699,1.415663,83.0,1.415663,0.240964,0.120482,0.120482,1.566265,2.228916,0.933735,3.343373,72000000.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7865,Sampdoria,27.0,776.0,78.0,22.7,8.6,0.93,0.81,0.23,0.07,0.93,2.33,0.00,0.12,0.00,0.0,0.00,0.00,9.6,0.0,0.0,0.00,0.01,0.01,0.07,-0.01,-0.01,0.23,0.0,0.0,0.81,0.35,0.00,0.12,1.28,0.93,0.0,0.12,0.0,5.47,2.79,1.05,72.7,0.35,0.00,1.162791,0.930233,0.697674,0.465116,0.000000,0.697674,1.046512,66.7,0.348837,1.162791,0.581395,0.581395,1.279070,2.441860,5.348837,0.000000,40.000000,48.837209,81.9,48.837209,16.046512,18.255814,87.9,18.255814,20.000000,23.023256,86.9,23.023256,3.720930,6.395349,58.2,6.395349,0.000000,0.000000,0.000000,0.232558,2.441860,0.232558,2.441860,3500000.0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0
7866,Sampdoria,24.0,696.0,46.0,20.4,7.7,1.04,0.13,0.91,0.05,0.65,1.82,0.00,0.13,0.00,0.0,0.00,0.00,26.1,0.0,0.0,0.00,0.00,0.00,0.03,0.00,0.00,0.26,0.0,0.0,2.21,2.34,0.00,0.91,0.52,0.91,0.0,0.13,0.0,7.14,1.17,0.65,64.3,1.17,0.00,1.688312,0.909091,0.519481,1.038961,0.129870,0.909091,1.948052,46.7,1.038961,1.428571,0.389610,1.038961,0.519481,2.207792,1.168831,0.129870,41.168831,49.480519,83.2,49.480519,17.662338,21.038961,84.0,21.038961,18.831169,21.428571,87.9,21.428571,3.506494,5.324675,65.9,5.324675,0.000000,0.000000,0.000000,0.000000,3.246753,0.129870,3.376623,2800000.0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0
7867,Sampdoria,39.0,800.0,35.0,23.4,8.9,0.79,0.00,1.80,0.04,0.67,2.36,0.11,4.04,0.67,16.7,0.03,0.17,16.5,0.0,0.0,0.00,0.31,0.31,0.08,-0.20,-0.20,0.22,0.0,0.0,0.45,2.13,0.67,1.57,0.00,0.11,0.0,0.00,0.0,3.26,1.01,3.15,24.3,2.57,0.34,0.224719,0.112360,0.112360,0.112360,0.000000,0.112360,0.786517,14.3,0.674157,0.561798,0.112360,0.449438,0.000000,0.224719,1.011236,0.000000,17.865169,27.528090,64.9,27.528090,8.764045,11.235955,78.0,11.235955,5.505618,8.089888,68.1,8.089888,2.022472,4.157303,48.6,4.157303,0.112360,0.000000,0.000000,0.786517,1.685393,0.786517,2.808989,500000.0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0
7868,Sampdoria,28.0,677.0,38.0,19.8,7.5,0.80,0.00,1.60,0.07,0.40,1.60,0.00,1.87,0.40,21.4,0.00,0.00,25.3,0.0,0.0,0.00,0.08,0.08,0.04,-0.08,-0.08,0.67,0.0,0.0,1.87,1.73,0.27,4.40,1.20,0.93,0.0,0.00,0.0,8.13,1.60,1.60,50.0,3.98,0.00,1.866667,0.933333,0.800000,0.666667,0.400000,1.066667,2.266667,47.1,1.200000,1.200000,0.133333,1.066667,1.200000,3.066667,0.400000,0.000000,37.600000,53.066667,70.9,53.066667,18.133333,21.466667,84.5,21.466667,13.200000,18.000000,73.3,18.000000,5.466667,9.866667,55.4,9.866667,0.000000,0.000000,0.000000,1.733333,4.000000,1.866667,6.000000,1800000.0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0
