# **Import Libraries**

In [2]:
# Pandas for data manipulation
import pandas as pd

In [3]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# Xgboost Classifier
from xgboost import XGBClassifier

# Multi-layer perceptron (MLP) Classifier
from sklearn.neural_network import MLPClassifier

# For Performance Metrices
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# For splitting data into training and testing sets
from sklearn.model_selection import train_test_split

# For label encoding categorical variables
from sklearn.preprocessing import LabelEncoder

In [4]:
# Libraries for data balancing
from imblearn.over_sampling import SMOTE  # Synthetic Minority Over-sampling Technique for addressing class imbalance

In [5]:
# Libraries for model evaluation and tuning
from sklearn.model_selection import GridSearchCV  # Grid search for hyperparameter tuning
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Metrics for model evaluation

In [77]:
# Library for Data Scaling
from sklearn.preprocessing import MinMaxScaler

# **Upload Dataset**

In [6]:
# Reading the dataset from a CSV file and storing it in a DataFrame called df
df = pd.read_csv('/content/Final Dataset.csv')

In [7]:
# Dropping rows with missing values from the DataFrame in place
df.dropna(inplace=True)

# **Convert All Factors to Per-90**

In [8]:
# Displaying the column names of the DataFrame
df.columns

Index(['Player Name', 'Position', 'Club Name', 'Age', ' Weekly Salary ',
       'Matches Played', 'Minutes Played', 'Yellow Cards', 'Red Cards ',
       'Shots', 'Shots per 90', 'Tackles Made', 'Tackles Won',
       'Times Team Pressed Opposition', 'Blocks ', 'Interceptions',
       'Clearances', 'Fouls Commited', 'Fouls Drawn', 'Loose Balls Recovered',
       'Aerial Battles Won %', 'Goal Creation per 90',
       'Fully Completed Matches', 'Touches', 'Successful Dribbles',
       'Attempted Dribbles', 'Distance Ran', 'Distance Covered With Ball',
       'Times Dispossessed', 'Passes Received', 'Total Injuries',
       'Minor Injuries', 'Moderate Injuries', 'Serious Injuries', 'COVID-19'],
      dtype='object')

In [9]:
# Calculating the number of full 90-minute matches played by dividing the total minutes played by 90
df['Full 90s Played'] = df['Minutes Played'] / 90

In [10]:
# List of columns to be converted to numeric data type
columns_to_convert = ['Yellow Cards', 'Red Cards ', 'Tackles Made', 'Tackles Won', 'Times Team Pressed Opposition',
                      'Blocks ', 'Interceptions', 'Clearances', 'Fouls Commited', 'Fouls Drawn', 'Loose Balls Recovered',
                      'Touches', 'Successful Dribbles', 'Attempted Dribbles', 'Distance Ran', 'Distance Covered With Ball',
                      'Times Dispossessed', 'Passes Received', 'Total Injuries', 'Minor Injuries', 'Moderate Injuries',
                      'Serious Injuries', 'COVID-19']



In [11]:
# List of columns to select for further analysis
columns_to_select = ['Yellow Cards', 'Red Cards ', 'Tackles Made', 'Tackles Won', 'Times Team Pressed Opposition',
                      'Blocks ', 'Interceptions', 'Clearances', 'Fouls Commited', 'Fouls Drawn', 'Loose Balls Recovered',
                      'Touches', 'Successful Dribbles', 'Attempted Dribbles', 'Distance Ran', 'Distance Covered With Ball',
                      'Times Dispossessed', 'Passes Received']

In [12]:
# Iterating through each column in the list of columns to convert
for column in columns_to_convert:
    # Calculating the per 90 minutes value for the current column and creating a new column with the suffix ' per 90'
    df[column + ' per 90'] = df[column] / df['Full 90s Played']

# **Prepare Data**

In [39]:
# Displaying the column names of the DataFrame
df.columns

Index(['Player Name', 'Position', 'Club Name', 'Age', ' Weekly Salary ',
       'Matches Played', 'Minutes Played', 'Yellow Cards', 'Red Cards ',
       'Shots', 'Shots per 90', 'Tackles Made', 'Tackles Won',
       'Times Team Pressed Opposition', 'Blocks ', 'Interceptions',
       'Clearances', 'Fouls Commited', 'Fouls Drawn', 'Loose Balls Recovered',
       'Aerial Battles Won %', 'Goal Creation per 90',
       'Fully Completed Matches', 'Touches', 'Successful Dribbles',
       'Attempted Dribbles', 'Distance Ran', 'Distance Covered With Ball',
       'Times Dispossessed', 'Passes Received', 'Total Injuries',
       'Minor Injuries', 'Moderate Injuries', 'Serious Injuries', 'COVID-19',
       'Full 90s Played', 'Yellow Cards per 90', 'Red Cards  per 90',
       'Tackles Made per 90', 'Tackles Won per 90',
       'Times Team Pressed Opposition per 90', 'Blocks  per 90',
       'Interceptions per 90', 'Clearances per 90', 'Fouls Commited per 90',
       'Fouls Drawn per 90', 'Loose Ba

In [40]:
# Creating a list of columns to be dropped from the DataFrame
# These columns include player-specific details, club-related information, individual match statistics, and injury-related metrics
drop_columns = df[['Player Name','Club Name',' Weekly Salary ', 'Matches Played', 'Minutes Played', 'Yellow Cards', 'Red Cards ',
                  'Shots','Tackles Made', 'Tackles Won','Times Team Pressed Opposition', 'Blocks ', 'Interceptions',
       'Clearances', 'Fouls Commited', 'Fouls Drawn', 'Loose Balls Recovered','Fully Completed Matches', 'Touches', 'Successful Dribbles',
       'Attempted Dribbles', 'Distance Ran', 'Distance Covered With Ball','Times Dispossessed', 'Passes Received', 'Total Injuries',
       'Minor Injuries', 'Moderate Injuries', 'Serious Injuries', 'COVID-19','Yellow Cards per 90', 'Red Cards  per 90','COVID-19 per 90','Total Injuries per 90']]

In [41]:
# Dropping the selected columns from the DataFrame
df = df.drop(drop_columns, axis=1)

# **Create Injury Type Column**

In [42]:
# Initialize a new column named 'Injury_Type' and set all values to 0
df['Injury_Type'] = 0  # Initialize all values to 0

# Assign values based on conditions for each type of injury

# If 'Serious Injuries per 90' is greater than 0, assign 1 for Serious Injuries
df.loc[df['Serious Injuries per 90'] > 0, 'Injury_Type'] = 1
# If 'Moderate Injuries per 90' is greater than 0, assign 2 for Moderate Injuries
df.loc[df['Moderate Injuries per 90'] > 0, 'Injury_Type'] = 2
# If 'Minor Injuries per 90' is greater than 0, assign 3 for Minor Injuries
df.loc[df['Minor Injuries per 90'] > 0, 'Injury_Type'] = 3

# Drop the individual injury columns ('Serious Injuries per 90', 'Moderate Injuries per 90', 'Minor Injuries per 90')
df.drop(['Serious Injuries per 90', 'Moderate Injuries per 90', 'Minor Injuries per 90'], axis=1, inplace=True)


# **Encode POSITION Column**

In [43]:
# Create a LabelEncoder object to convert categorical positions into numerical labels
label_encoder = LabelEncoder()

# Fit the LabelEncoder to the 'Position' column and transform it to encode positions with numerical labels
df['Position_Encoded'] = label_encoder.fit_transform(df['Position'])

# Display the mapping of original position names to their corresponding encoded numerical values
print("Mapping of original positions to encoded values:")
for position, encoded_value in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print(f"{position}: {encoded_value}")


Mapping of original positions to encoded values:
DF: 0
DFFW: 1
DFMF: 2
FW: 3
FWDF: 4
FWMF: 5
GK: 6
MF: 7
MFDF: 8
MFFW: 9


In [44]:
df.head()

Unnamed: 0,Position,Age,Shots per 90,Aerial Battles Won %,Goal Creation per 90,Full 90s Played,Tackles Made per 90,Tackles Won per 90,Times Team Pressed Opposition per 90,Blocks per 90,...,Loose Balls Recovered per 90,Touches per 90,Successful Dribbles per 90,Attempted Dribbles per 90,Distance Ran per 90,Distance Covered With Ball per 90,Times Dispossessed per 90,Passes Received per 90,Injury_Type,Position_Encoded
0,DF,21,0.41,22.7,0.16,32.011111,2.155502,1.155849,13.557792,2.686567,...,5.52933,57.948629,1.030892,2.436654,199.368275,121.614023,0.937175,32.395002,0,0
1,DF,20,0.64,51.3,0.3,20.311111,3.741794,2.117068,19.792123,1.673961,...,6.646608,63.068928,1.969365,3.594092,182.363239,113.238512,1.575492,35.891685,2,0
2,DF,23,0.32,62.4,0.27,22.166667,1.804511,0.992481,9.293233,1.62406,...,8.390977,49.984962,0.496241,0.766917,152.706767,92.345865,0.496241,24.090226,1,0
3,DF,26,0.49,83.7,0.0,10.255556,1.462622,0.487541,6.825569,1.170098,...,7.605634,91.267606,0.0,0.195016,347.128927,217.053088,0.097508,70.595883,0,0
4,DF,22,1.61,45.0,0.54,31.7,1.388013,0.757098,9.085174,1.230284,...,10.063091,94.921136,0.820189,1.388013,190.662461,104.574132,0.820189,59.274448,2,0


In [45]:
df['Position_Encoded']

0      0
1      0
2      0
3      0
4      0
      ..
465    9
466    9
467    9
468    9
469    9
Name: Position_Encoded, Length: 430, dtype: int64

In [46]:
df.columns

Index(['Position', 'Age', 'Shots per 90', 'Aerial Battles Won %',
       'Goal Creation per 90', 'Full 90s Played', 'Tackles Made per 90',
       'Tackles Won per 90', 'Times Team Pressed Opposition per 90',
       'Blocks  per 90', 'Interceptions per 90', 'Clearances per 90',
       'Fouls Commited per 90', 'Fouls Drawn per 90',
       'Loose Balls Recovered per 90', 'Touches per 90',
       'Successful Dribbles per 90', 'Attempted Dribbles per 90',
       'Distance Ran per 90', 'Distance Covered With Ball per 90',
       'Times Dispossessed per 90', 'Passes Received per 90', 'Injury_Type',
       'Position_Encoded'],
      dtype='object')

In [74]:
df['Injury_Type'].value_counts()

Injury_Type
0    237
1     83
2     68
3     42
Name: count, dtype: int64

# **Model Training Step**

In [47]:
# Extract features (X) by dropping the 'Injury_Type' and 'Position' columns from the DataFrame
X = df.drop(columns=['Injury_Type','Position'])  # Features

# Extract the target variable (y) which is the 'Injury_Type' column
y = df['Injury_Type']  # Target variable

# **Model Traning and Evaluation**

In [75]:
# Step 1: Balance the Dataset using Synthetic Minority Over-sampling Technique (SMOTE)
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [76]:
# Split the dataset into training and testing sets, with 80% of the data for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

In [78]:
# Initialize the StandardScaler
scaler = MinMaxScaler()

# Fit the scaler to the training data and transform it
X_train = scaler.fit_transform(X_train)

# Apply the same transformation to the test data
X_test = scaler.transform(X_test)

In [79]:
# Initialize classifiers
rf_classifier = RandomForestClassifier(random_state=42)
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
mlp_classifier = MLPClassifier(random_state=42)

# List of classifiers
classifiers = [
    ('Random Forest', rf_classifier),
    ('XGBoost', xgb_classifier),
    ('MLP', mlp_classifier)
]

# Function to train and evaluate a model
def train_evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"{model_name} - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}\n")

In [80]:
# Train and evaluate each classifier
for name, classifier in classifiers:
    train_evaluate_model(classifier, name, X_train, y_train, X_test, y_test)

Random Forest - Accuracy: 0.82, Precision: 0.82, Recall: 0.82, F1-Score: 0.82

XGBoost - Accuracy: 0.78, Precision: 0.78, Recall: 0.78, F1-Score: 0.78

MLP - Accuracy: 0.56, Precision: 0.56, Recall: 0.56, F1-Score: 0.55





# **Hyperparameter Tuning**


The hyperparameter tuning conducted via GridSearchCV for each model ensures that each model is optimized for its parameters, which is crucial for achieving the best possible performance. This approach systematically explores multiple combinations of parameters, selecting the set that provides the best fit to the data.


In [81]:
from sklearn.model_selection import GridSearchCV

In [82]:
# Define the model
rf = RandomForestClassifier(random_state=42)

# Set up the parameter grid
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Configure GridSearchCV
rf_grid_search = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search to the data
rf_grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters for Random Forest:", rf_grid_search.best_params_)
print("Best score for Random Forest:", rf_grid_search.best_score_)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


  warn(


Best parameters for Random Forest: {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best score for Random Forest: 0.7427379383901123


In [83]:
# Define the model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Set up the parameter grid
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Configure GridSearchCV
xgb_grid_search = GridSearchCV(estimator=xgb, param_grid=xgb_param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search to the data
xgb_grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)
print("Best score for XGBoost:", xgb_grid_search.best_score_)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters for XGBoost: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.6}
Best score for XGBoost: 0.7414517849300458


In [84]:
# Define the model
mlp = MLPClassifier(random_state=42)

# Set up the parameter grid
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive']
}

# Configure GridSearchCV
mlp_grid_search = GridSearchCV(estimator=mlp, param_grid=mlp_param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search to the data
mlp_grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters for MLP:", mlp_grid_search.best_params_)
print("Best score for MLP:", mlp_grid_search.best_score_)


Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters for MLP: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
Best score for MLP: 0.5264288851245372




In [85]:
def evaluate_model(model, X_test, y_test):
    # Predict the target variable for the test set using the trained classifier
    y_pred = model.predict(X_test)

    # Calculate the accuracy, precision, recall, and f1-score of the classifier
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Print the metrics
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")

# Evaluate each model
print("Random Forest Model Evaluation")
evaluate_model(rf_grid_search.best_estimator_, X_test, y_test)

print("\nXGBoost Model Evaluation")
evaluate_model(xgb_grid_search.best_estimator_, X_test, y_test)

print("\nMLP Model Evaluation")
evaluate_model(mlp_grid_search.best_estimator_, X_test, y_test)

Random Forest Model Evaluation
Accuracy: 0.81, Precision: 0.81, Recall: 0.81, F1-Score: 0.81

XGBoost Model Evaluation
Accuracy: 0.78, Precision: 0.78, Recall: 0.78, F1-Score: 0.78

MLP Model Evaluation
Accuracy: 0.62, Precision: 0.61, Recall: 0.61, F1-Score: 0.61


# **Prediction for NEW INPUT**

In [93]:
import numpy as np

# Define all the expected features as used in model training
expected_features = [
    'Age',
    'Shots per 90',
    'Aerial Battles Won %',
    'Goal Creation per 90',
    'Full 90s Played',
    'Tackles Made per 90',
    'Tackles Won per 90',
    'Times Team Pressed Opposition per 90',
    'Blocks per 90',
    'Interceptions per 90',
    'Clearances per 90',
    'Fouls Commited per 90',
    'Fouls Drawn per 90',
    'Loose Balls Recovered per 90',
    'Touches per 90',
    'Successful Dribbles per 90',
    'Attempted Dribbles per 90',
    'Distance Ran per 90',
    'Distance Covered With Ball per 90',
    'Times Dispossessed per 90',
    'Passes Received per 90',
    'Position_Encoded'
]

# User provides raw data
user_input_data = {
    'Age': 25,
    'Minutes Played': 1800,  # This will be used to calculate 'Full 90s Played'
    'Shots': 17,
    'Aerial Battles Won %': 55,
    'Goal Creation': 12,
    'Tackles Made': 73,
    'Tackles Won': 58,
    'Times Team Pressed Opposition': 842,
    'Blocks': 69,
    'Interceptions': 64,
    'Clearances': 87,
    'Fouls Commited': 33,
    'Fouls Drawn': 29,
    'Loose Balls Recovered': 300,
    'Touches': 2500,
    'Successful Dribbles': 80,
    'Attempted Dribbles': 120,
    'Distance Ran': 10400,  # in meters
    'Distance Covered With Ball': 2100,  # in meters
    'Times Dispossessed': 40,
    'Passes Received': 800,
    'Position_Encoded': 2
}

# Calculate 'Full 90s Played' and derive per-90 metrics
full_90s_played = user_input_data['Minutes Played'] / 90
user_input_data['Full 90s Played'] = full_90s_played  # Make sure this is included

# Calculate per-90 metrics
for feature_name in expected_features:
    if 'per 90' in feature_name and feature_name != 'Full 90s Played':
        base_feature = feature_name.replace(' per 90', '')
        user_input_data[feature_name] = user_input_data[base_feature] / full_90s_played

# Prepare the model input list in the exact order of expected features
model_input_list = [user_input_data[feature] for feature in expected_features]

# Convert to numpy array and reshape for a single sample
model_input_array = np.array([model_input_list])

# Scale the input data using the same scaler used during training
model_input_scaled = scaler.transform(model_input_array)

best_model = rf_grid_search.best_estimator_

# Map prediction to label
prediction = best_model.predict(model_input_scaled)

injury_labels = {
    0: 'No injury',
    1: 'Serious Injuries',
    2: 'Moderate Injuries',
    3: 'Minor Injuries'
}

prediction_label = injury_labels.get(prediction[0], "Unknown label")

# Output the result
print(f"Predicted Injury Type: {prediction_label}")


Predicted Injury Type: Moderate Injuries




# **Save Best Model**

In [94]:
import pickle

# Saving the model
with open('model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# Saving the scaler
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)