In [1]:
import pandas as pd
model_performance = pd.read_csv('/kaggle/input/results/model_performance.csv')
outfield_data_df = pd.read_csv('/kaggle/input/outfield-player-data/combined_other_position_data.csv')

# Prepare player data

In [2]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encoding(df, cols):
    # Create a OneHotEncoder object
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # Fit and transform the 'Position' column
    df_encoded = pd.DataFrame(encoder.fit_transform(df[cols]))
    df_encoded.columns = encoder.get_feature_names_out()

    df.drop(cols, axis=1, inplace=True)
    return pd.concat([df, df_encoded], axis = 1)

outfield_cols = ['Position', 'Categorized Position', 'League']
outfield_data_df = one_hot_encoding(outfield_data_df, outfield_cols)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Select features (X) and target variable (y)
outfield_X = outfield_data_df.drop(['Total Days', 'Total Games Missed', 'Injury Count', 'Average Injury Cost', 'Injury Rate', 'Birthday', 'Injuries', 'Type of Injury', 'Categorized Injuries'], axis=1)
outfield_y = outfield_data_df[['Total Days', 'Total Games Missed', 'Injury Count', 'Average Injury Cost', 'Injury Rate']]

# Split the data into training and testing sets
outfield_X_train, outfield_X_test, outfield_y_train, outfield_y_test = train_test_split(outfield_X, outfield_y, test_size=0.2, random_state=42)

# Get the column names to be scaled
cols_to_scale = outfield_X.columns.difference(['Name', 'Club'])

# Perform scaling only on the selected columns
scaler = MinMaxScaler()
outfield_X[cols_to_scale] = scaler.fit_transform(outfield_X[cols_to_scale])

In [4]:
default_prediction_df = outfield_data_df[['Name', 'Club', 'Age', 'Minutes Played Total 23/24:']]
# Group by 'Club' and calculate total minutes played
club_total_minutes = default_prediction_df.groupby('Club')['Minutes Played Total 23/24:'].sum()

# Calculate individual player weights
default_prediction_df['Weight'] = default_prediction_df['Minutes Played Total 23/24:'] / default_prediction_df['Club'].map(club_total_minutes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  default_prediction_df['Weight'] = default_prediction_df['Minutes Played Total 23/24:'] / default_prediction_df['Club'].map(club_total_minutes)


In [5]:
default_prediction_df.head()

Unnamed: 0,Name,Club,Age,Minutes Played Total 23/24:,Weight
0,Tosin Adarabioyo,Fulham,26,1995,0.041889
1,Issa Diop,Fulham,27,1949,0.040923
2,Calvin Bassey,Fulham,24,2562,0.053794
3,Tim Ream,Fulham,36,1692,0.035527
4,Antonee Robinson,Fulham,26,3749,0.078718


# Find Best Performing Model on 'Injury Rate'

In [6]:
injury_rate_models = model_performance[model_performance['Target'] == 'Injury Rate']
print(injury_rate_models.idxmin())

Model                                 44
Target                                 4
Mean Square Error                     44
Mean Absolute Error                   44
Normalized Root Mean Square Error     44
R-Squared Error                      122
dtype: int64


## Based on MSE, MAE and NRMSE

In [7]:
model_performance.iloc[[44]]

Unnamed: 0,Model,Target,Mean Square Error,Mean Absolute Error,Normalized Root Mean Square Error,R-Squared Error
44,Decision Tree,Injury Rate,9.763372e-14,1.458457e-08,0.000895,0.999999


In [8]:
import pickle

# Load the saved model
with open("/kaggle/input/injury-rate-predictor/Decision Tree_Injury Rate.sav", "rb") as f:
    injury_rate_mse_model = pickle.load(f)

## Predicting based on MSE, MAE and NRMSE

In [9]:
injury_rate_mse_prediction_df = default_prediction_df[['Name', 'Club', 'Age', 'Minutes Played Total 23/24:', 'Weight']]

injury_rate_predictions = []
for index, row in outfield_X.iterrows():
    X_test = row.drop(['Name', 'Club']).values.reshape(1, -1)
    predicted_injury_rate = injury_rate_mse_model.predict(X_test)
    injury_rate_predictions.append(predicted_injury_rate[0])

injury_rate_mse_prediction_df['Predicted Injury Rate'] = injury_rate_predictions
injury_rate_mse_prediction_df.to_csv('injury_rate_mse_prediction.csv', index=False)
injury_rate_mse_prediction_df.head()

Unnamed: 0,Name,Club,Age,Minutes Played Total 23/24:,Weight,Predicted Injury Rate
0,Tosin Adarabioyo,Fulham,26,1995,0.041889,0.000504
1,Issa Diop,Fulham,27,1949,0.040923,0.000504
2,Calvin Bassey,Fulham,24,2562,0.053794,0.000504
3,Tim Ream,Fulham,36,1692,0.035527,0.000504
4,Antonee Robinson,Fulham,26,3749,0.078718,0.000504


## Based on R Squared Error

In [6]:
model_performance.iloc[[122]]

Unnamed: 0,Model,Target,Mean Square Error,Mean Absolute Error,Normalized Root Mean Square Error,R-Squared Error
122,Neural Network,Injury Rate,0.067344,0.177419,1.316764,-553035.69929


# Find Best Performing Model on 'Total Days'

In [6]:
total_days_models = model_performance[model_performance['Target'] == 'Total Days']
print(total_days_models.idxmin())

Model                                 40
Target                                 0
Mean Square Error                    100
Mean Absolute Error                  121
Normalized Root Mean Square Error     75
R-Squared Error                       50
dtype: int64


## Based on MSE

In [7]:
model_performance.iloc[[100]]

Unnamed: 0,Model,Target,Mean Square Error,Mean Absolute Error,Normalized Root Mean Square Error,R-Squared Error
100,Voting Regressor,Total Days,2993.668443,35.232941,1.823566,0.228574


In [8]:
import pickle

# Load the saved model
with open("/kaggle/input/total-days-predictors/Voting Regressor_Total Days.sav", "rb") as f:
    total_days_mse_model = pickle.load(f)

## Predicting based on MSE

In [9]:
total_days_out_mse_prediction_df = default_prediction_df[['Name', 'Club', 'Age', 'Minutes Played Total 23/24:', 'Weight']]

predicted_total_days_out = []
for index, row in outfield_X.iterrows():
    X_test = row.drop(['Name', 'Club']).values.reshape(1, -1)
    predicted_total_days_out.append(total_days_mse_model.predict(X_test)[0])

total_days_out_mse_prediction_df['Predicted Days Out'] = predicted_total_days_out
total_days_out_mse_prediction_df.to_csv('total_days_out_mse_prediction.csv', index=False)
total_days_out_mse_prediction_df.head()

Unnamed: 0,Name,Club,Age,Minutes Played Total 23/24:,Weight,Predicted Days Out
0,Tosin Adarabioyo,Fulham,26,1995,0.041889,77.4089
1,Issa Diop,Fulham,27,1949,0.040923,78.596995
2,Calvin Bassey,Fulham,24,2562,0.053794,22.061949
3,Tim Ream,Fulham,36,1692,0.035527,56.835895
4,Antonee Robinson,Fulham,26,3749,0.078718,12.757569


## Based on MAE

In [12]:
model_performance.iloc[[121]]

Unnamed: 0,Model,Target,Mean Square Error,Mean Absolute Error,Normalized Root Mean Square Error,R-Squared Error
121,Neural Network,Total Days,4331.870115,35.098744,10.036033,-0.116261


In [21]:
from tensorflow.keras.models import load_model

# Load the model
total_days_mae_model = load_model('/kaggle/input/total_days_predictors/keras/default/1/nn_total_days_model.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


## Based on NRMSE

In [10]:
model_performance.iloc[[75]]

Unnamed: 0,Model,Target,Mean Square Error,Mean Absolute Error,Normalized Root Mean Square Error,R-Squared Error
75,Random Forest (PCA n = 13),Total Days,5567.24183,44.997821,1.244698,-0.434599


In [11]:
# Load the saved model
with open("/kaggle/input/total-days-predictors/Random Forest (PCA n 13)_Total Days.sav", "rb") as f:
    total_days_nrmse_model = pickle.load(f)

In [12]:
from sklearn.decomposition import PCA

# Apply PCA for dimensionality reduction
pca = PCA(n_components=13)
outfield_X_pca = pca.fit_transform(outfield_X.drop(['Name', 'Club'], axis=1))

# Create a new DataFrame with the reduced features and original columns
outfield_X_pca_df = pd.DataFrame(outfield_X_pca, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5', 'PCA6', 'PCA7', 'PCA8', 'PCA9', 'PCA10', 'PCA11', 'PCA12', 'PCA13'])
outfield_X_pca_df['Name'] = outfield_X['Name']
outfield_X_pca_df['Club'] = outfield_X['Club']

total_days_out_nrmse_prediction_df = default_prediction_df[['Name', 'Club', 'Age', 'Minutes Played Total 23/24:', 'Weight']]

predicted_total_days_out = []
for index, row in outfield_X_pca_df.iterrows():
    X_test = row.drop(['Name', 'Club']).values.reshape(1, -1)
    predicted_total_days_out.append(total_days_nrmse_model.predict(X_test)[0])

total_days_out_nrmse_prediction_df['Predicted Days Out'] = predicted_total_days_out
total_days_out_nrmse_prediction_df.to_csv('total_days_out_nrmse_prediction.csv', index=False)
total_days_out_nrmse_prediction_df.head()

Unnamed: 0,Name,Club,Age,Minutes Played Total 23/24:,Weight,Predicted Days Out
0,Tosin Adarabioyo,Fulham,26,1995,0.041889,46.0
1,Issa Diop,Fulham,27,1949,0.040923,46.0
2,Calvin Bassey,Fulham,24,2562,0.053794,149.0
3,Tim Ream,Fulham,36,1692,0.035527,46.0
4,Antonee Robinson,Fulham,26,3749,0.078718,51.0


# Perform Optimisation
## Objective Function
Given a squad of $n$ players, the objective function is

$\begin{align*} 
    \min &\sum_{i = 1}^{n} w_i z_i x_i\\
    \text{s.t. } &\sum_{i = 1}^{n} x_i = 90 * 50 * 10 = 450000\\
    &0 \leq x_i \leq 90 * 50 = 4500\\
    &0 \leq x_i \leq 65 * 50 = 3250 \text{ , if $16 \leq$ age of player $i \leq 24$)}\\
    &0 \leq x_i \leq 90 * 50 = 4500 \text{ , if $25 \leq$ age of player $i \leq 30$)}\\
    &0 \leq x_i \leq 75 * 50 = 3750 \text{ , if age of player $i \geq 31$)}
\end{align*}$

where $w_i$ is the **weight (importance)**, $z_i$ is the **predicted total days out from injuries** and $x_i$ is the **minutes played across the whole season** for player $i$.

In other words, we are trying to minimise the weighted sum of the product of player weights, predicted total days out, and playing time. The product $w_i z_i x_i$ represents the potential impact of player $i$'s injuries on the team. A higher value indicates a greater negative impact due to:

- Higher weight: The player is more important to the team.
- Higher predicted days out: The player is more likely to be injured for a longer duration.
- Higher playing time: The player is exposed to more potential injury risks.

By minimizing this weighted sum, the optimization problem seeks to allocate playing time in a way that reduces the overall negative impact of injuries on the team. This can help ensure that the team remains competitive and avoids excessive absences due to injuries.


In [43]:
import numpy as np
from scipy.optimize import minimize

def objective_function(x, df, games_per_season, age_ranges, lambda_penalty=0.5):
    """
    Calculates the weighted sum of predicted total days out.

    Args:
    x: Array of playing minutes for each player.
    df: DataFrame containing player data (e.g., weights, predicted days out, age).
    games_per_season: Number of games played per season.
    minutes_per_age: Dictionary of minutes limits for different age groups.
    lambda_penalty: Tradeoff control between minimizing injury risk and ensuring playing time.

    Returns:
    The weighted sum of predicted total days out.
    """

    predicted_days_out = df['Predicted Days Out'].values
    weights = df['Weight'].values
    min_playing_time = df['Min Playing Time'].values
    
    # Calculate penalty term
    penalty = 100 * np.square(x - min_playing_time)
    return lambda_penalty * np.dot(weights, predicted_days_out * x) + (1 - lambda_penalty) * (np.sum(penalty))



def age_ub_constraint(x, df, games_per_season, age_ranges):
    """
    Enforces individual playing time limits based on age.

    Args:
    x: Array of playing minutes for each player.
    df: DataFrame containing player data (e.g., age).
    age_ranges: List of tuples representing age ranges and their corresponding minutes limits.

    Returns:
    A tuple of constraint values.
    """

    age = df['Age'].values
    ub_constraints = []
    
    for i in range(len(x)):
        for age_range, minutes_limit in age_ranges:
            if age_range[0] <= age[i] <= age_range[1]:
                ub_constraints.append(games_per_season * minutes_limit[1] - x[i])
                break  # Exit the loop if a match is found

    return ub_constraints

def age_lb_constraint(x, df, games_per_season, age_ranges):
    """
    Enforces individual playing time limits based on age.

    Args:
    x: Array of playing minutes for each player.
    df: DataFrame containing player data (e.g., age).
    age_ranges: List of tuples representing age ranges and their corresponding minutes limits.

    Returns:
    A tuple of constraint values.
    """

    age = df['Age'].values
    lb_constraints = []
    
    for i in range(len(x)):
        for age_range, minutes_limit in age_ranges:
            if age_range[0] <= age[i] <= age_range[1]:
                lb_constraints.append(x[i] - games_per_season * minutes_limit[0])
                break  # Exit the loop if a match is found

    return lb_constraints

def total_playing_time_constraint(x, games_per_season):
    """
    Enforces the total playing time constraint.

    Args:
    x: Array of playing minutes for each player.
    games_per_season: Number of games played per season.

    Returns:
    The constraint value.
    """
#     print((games_per_season * 90 * 10), np.sum(x))
    return (games_per_season * 90 * 10) - np.sum(x)

def non_negative_constraint(x):
    """
    Enforces non-negativity of playing time.

    Args:
    x: Array of playing minutes for each player.

    Returns:
    The constraint value.
    """

    return x

In [53]:
club_players_df = total_days_out_mse_prediction_df[total_days_out_mse_prediction_df['Club'] == 'Man City']
num_players = len(club_players_df)
age_ranges = [((16, 21), (10, 65)), 
              ((21, 35), (80, 90)), 
              ((35, 50), (60, 75))]
games_per_season = 60
# Calculate total available minutes
total_available_minutes = games_per_season * 90 * 10
# Set minimum playing time based on quantiles
base_min_playing_time = total_available_minutes / num_players

# Assuming you have a DataFrame df with player weights
weight_quantiles = club_players_df['Weight'].quantile([0.5, 0.75, 0.9])
min_playing_times = []

for index, row in club_players_df.iterrows():
    if row['Weight'] >= weight_quantiles[0.9]:
        min_playing_times.append(base_min_playing_time * 1.5) # Increase for top players
#         club_players_df.loc[index, 'min_playing_time'] = base_min_playing_time * 1.5  
    elif row['Weight'] >= weight_quantiles[0.75]:
        min_playing_times.append(base_min_playing_time * 1.2)
#         club_players_df.loc[index, 'min_playing_time'] = base_min_playing_time * 1.2
    elif row['Weight'] >= weight_quantiles[0.5]:
        min_playing_times.append(base_min_playing_time)
    else:
        min_playing_times.append(0) # No minimum playing time for lower-weighted players
#         club_players_df.loc[index, 'min_playing_time'] = 0  
club_players_df['Min Playing Time'] = min_playing_times

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  club_players_df['Min Playing Time'] = min_playing_times


In [54]:
club_players_df

Unnamed: 0,Name,Club,Age,Minutes Played Total 23/24:,Weight,Predicted Days Out,Min Playing Time
198,Rúben Dias,Man City,27,3817,0.061609,11.715614,2817.391304
199,Manuel Akanji,Man City,28,3861,0.062319,12.077287,2817.391304
200,Nathan Aké,Man City,29,3053,0.049278,13.162987,2347.826087
201,John Stones,Man City,30,1858,0.02999,56.180477,0.0
202,Taylor Harwood-Bellis,Man City,22,3942,0.063627,8.670015,2817.391304
203,Josko Gvardiol,Man City,22,3479,0.056154,27.861962,2347.826087
204,Sergio Gómez,Man City,23,445,0.007183,48.22448,0.0
205,Josh Wilson-Esbrand,Man City,21,1248,0.020144,41.324985,0.0
206,Rico Lewis,Man City,19,1606,0.025922,23.870095,0.0
207,Kyle Walker,Man City,34,4083,0.065903,19.32289,3521.73913


In [57]:
# Initialize x0 with equal distribution
x0 = np.full(num_players, base_min_playing_time) # Initial guess for playing minutes
penalty_lambda = 0.5

# Set constraints
constraints = ({'type': 'eq', 'fun': lambda x: total_playing_time_constraint(x, games_per_season)},
               {'type': 'ineq', 'fun': lambda x: age_ub_constraint(x, club_players_df, games_per_season, age_ranges)},
               {'type': 'ineq', 'fun': lambda x: age_lb_constraint(x, club_players_df, games_per_season, age_ranges)},
               {'type': 'ineq', 'fun': non_negative_constraint})

# Solve the optimization problem
result = minimize(objective_function, x0, args=(club_players_df, games_per_season, age_ranges, penalty_lambda), method='SLSQP', constraints=constraints)

# Extract the optimal playing minutes
optimal_playing_minutes = result.x

In [58]:
rounded_optimal_playing_minutes = [round(value) for value in optimal_playing_minutes]
club_players_df['Suggested Minutes'] = rounded_optimal_playing_minutes
club_players_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  club_players_df['Suggested Minutes'] = rounded_optimal_playing_minutes


Unnamed: 0,Name,Club,Age,Minutes Played Total 23/24:,Weight,Predicted Days Out,Min Playing Time,Suggested Minutes
198,Rúben Dias,Man City,27,3817,0.061609,11.715614,2817.391304,2379
199,Manuel Akanji,Man City,28,3861,0.062319,12.077287,2817.391304,2379
200,Nathan Aké,Man City,29,3053,0.049278,13.162987,2347.826087,2368
201,John Stones,Man City,30,1858,0.02999,56.180477,0.0,2315
202,Taylor Harwood-Bellis,Man City,22,3942,0.063627,8.670015,2817.391304,2379
203,Josko Gvardiol,Man City,22,3479,0.056154,27.861962,2347.826087,2368
204,Sergio Gómez,Man City,23,445,0.007183,48.22448,0.0,2315
205,Josh Wilson-Esbrand,Man City,21,1248,0.020144,41.324985,0.0,568
206,Rico Lewis,Man City,19,1606,0.025922,23.870095,0.0,568
207,Kyle Walker,Man City,34,4083,0.065903,19.32289,3521.73913,4725
