In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

In [3]:
pl_19_20 = pd.read_csv('/content/drive/MyDrive/DS340W/pl_19-20.csv')
pl_20_21 = pd.read_csv('/content/drive/MyDrive/DS340W/pl_20-21.csv')
pl_21_22 = pd.read_csv('/content/drive/MyDrive/DS340W/pl_21-22.csv')
pl_22_23 = pd.read_csv('/content/drive/MyDrive/DS340W/pl_22-23.csv')
pl_23_24 = pd.read_csv('/content/drive/MyDrive/DS340W/pl_23-24.csv')
player_injuries = pd.read_csv('/content/drive/MyDrive/DS340W/player_injuries_impact.csv')

In [4]:
pl_19_20['Name'] = pl_19_20['Name'].str.strip()
pl_20_21['Name'] = pl_20_21['Name'].str.strip()
pl_21_22['Name'] = pl_21_22['Name'].str.strip()
pl_22_23['Name'] = pl_22_23['Name'].str.strip()
pl_23_24['Name'] = pl_23_24['Name'].str.strip()
player_injuries['Name'] = player_injuries['Name'].str.strip()

merged_seasons = []

seasons_data = {
    '2019/20': pl_19_20,
    '2020/21': pl_20_21,
    '2021/22': pl_21_22,
    '2022/23': pl_22_23,
    '2023/24': pl_23_24
}

for season, pl_data in seasons_data.items():
    season_injuries = player_injuries[player_injuries['Season'] == season]
    merged_season = pd.merge(pl_data, season_injuries, on='Name', how='left')
    merged_season['Season'] = season
    merged_seasons.append(merged_season)

all_seasons_merged = pd.concat(merged_seasons, ignore_index=True)

print(f"Non-null injury records: {all_seasons_merged['Injury'].notna().sum()}")
print(f"Null injury records: {all_seasons_merged['Injury'].isna().sum()}")

Non-null injury records: 525
Null injury records: 4601


In [5]:
all_seasons_merged = all_seasons_merged.drop(columns=['Unnamed: 0', 'Team Name', 'Position_y', 'Goals conceded', 'Goals Conceded',
                                                      'Big Chances Created', 'Big chances missed', 'Freekicks scored', 'Own goals',
                                                      'Errors leading to goal', 'Offsides', 'Penalties Saved'])
all_seasons_merged = all_seasons_merged.rename(columns={"Position_x":"Position"})
all_seasons_merged = all_seasons_merged[all_seasons_merged['Injury'].notna()]

In [6]:
all_seasons_merged = all_seasons_merged.fillna(0)
all_seasons_merged.to_csv('/content/drive/MyDrive/DS340W/all_seasons_merged.csv', index = False)

display(all_seasons_merged.head())

Unnamed: 0,Name,Position,Appearances,Clean sheets,Tackles,Tackle success %,Last man tackles,Blocked shots,Interceptions,Clearances,...,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Age,Season,Injury,Date of Injury,Date of return
24,Dele Alli,Midfielder,25,0.0,34.0,47%,0.0,7.0,17.0,14.0,...,0.0,0.0,0.0,0.0,0.0,23.0,2019/20,Hamstring injury,"Aug 4,2019","Aug 31,2019"
25,Dele Alli,Midfielder,25,0.0,34.0,47%,0.0,7.0,17.0,14.0,...,0.0,0.0,0.0,0.0,0.0,24.0,2019/20,Muscle injury,"Jul 3,2020","Jul 25,2020"
26,Miguel Almirón,Midfielder,36,0.0,60.0,65%,0.0,19.0,31.0,16.0,...,0.0,0.0,0.0,0.0,0.0,26.0,2019/20,Hamstring injury,9-Dec-19,20-Dec-19
35,André Gomes,Midfielder,19,0.0,22.0,68%,0.0,0.0,7.0,6.0,...,0.0,0.0,0.0,0.0,0.0,26.0,2019/20,bruised ribs,2-Sep-19,4-Oct-19
36,André Gomes,Midfielder,19,0.0,22.0,68%,0.0,0.0,7.0,6.0,...,0.0,0.0,0.0,0.0,0.0,26.0,2019/20,ankle injury,4-Nov-19,21-Feb-20


In [7]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(all_seasons_merged, test_size = 0.3, random_state = 42)
test_df, val_df = train_test_split(temp_df, test_size = 1/3, random_state = 42)

print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)
print("Validation set shape:", val_df.shape)

Training set shape: (367, 52)
Testing set shape: (105, 52)
Validation set shape: (53, 52)


In [8]:
train_df.to_csv('/content/drive/MyDrive/DS340W/asm_train.csv', index = False)
test_df.to_csv('/content/drive/MyDrive/DS340W/asm_test.csv', index = False)
val_df.to_csv('/content/drive/MyDrive/DS340W/asm_val.csv', index = False)

In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                           confusion_matrix, classification_report, mean_squared_error,
                           r2_score, mean_absolute_error)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class InjuryAnalysisML:
    def __init__(self, train_df, test_df, validation_df):
        self.train_df = pd.DataFrame(train_df.copy())
        self.test_df = pd.DataFrame(test_df.copy())
        self.validation_df = pd.DataFrame(validation_df.copy())

        for df in [self.train_df, self.test_df, self.validation_df]:
            for col in df.columns:
                if hasattr(df[col], 'cat'):
                    df[col] = df[col].astype(str)

        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.models = {}
        self.results = {}

    def preprocess_data(self):
        all_data = all_seasons_merged.reset_index(drop=True)
        for col in all_data.columns:
            if all_data[col].dtype == 'object' or str(all_data[col].dtype) == 'category':
                all_data[col] = all_data[col].astype(str).replace('nan', '')
        percentage_cols = [col for col in all_data.columns if '%' in str(col)]
        for col in percentage_cols:
            all_data[col] = pd.to_numeric(all_data[col].astype(str).str.replace('%', ''), errors='coerce')

        numeric_cols = []
        for col in ['Appearances', 'Clean sheets', 'Tackles', 'Last man tackles',
                   'Blocked shots', 'Interceptions', 'Clearances', 'Age']:
            if col in all_data.columns:
                numeric_cols.append(col)
                all_data[col] = pd.to_numeric(all_data[col], errors='coerce')

        print(f"Numeric columns identified: {numeric_cols}")

        try:
            if 'Date of Injury' in all_data.columns and 'Date of return' in all_data.columns:
                date_formats = ['%b %d,%Y', '%d-%b-%y', '%Y-%m-%d', '%m/%d/%Y']

                for date_col in ['Date of Injury', 'Date of return']:
                    all_data[date_col] = pd.to_datetime(all_data[date_col], errors='coerce')
                    if all_data[date_col].isna().all():
                        for fmt in date_formats:
                            try:
                                all_data[date_col] = pd.to_datetime(all_data[date_col], format=fmt, errors='coerce')
                                if not all_data[date_col].isna().all():
                                    break
                            except:
                                continue

                duration = (all_data['Date of return'] - all_data['Date of Injury']).dt.days
                all_data['injury_duration_days'] = duration.fillna(14).clip(lower=1, upper=365)
            else:
                print("Date columns not found, using default duration")
                all_data['injury_duration_days'] = 14

        except Exception as e:
            print(f"Date parsing failed: {e}. Using default duration of 14 days.")
            all_data['injury_duration_days'] = 14

        def get_severity(days):
            if pd.isna(days) or days <= 7:
                return 'Minor'
            elif days <= 21:
                return 'Moderate'
            elif days <= 60:
                return 'Severe'
            else:
                return 'Critical'

        all_data['injury_severity'] = all_data['injury_duration_days'].apply(get_severity)

        if 'Injury' in all_data.columns:
            all_data['injury_type_category'] = all_data['Injury'].astype(str).str.extract(r'(\w+)')[0].str.lower().fillna('unknown')
        else:
            all_data['injury_type_category'] = 'unknown'

        if 'Appearances' in all_data.columns:
            appearances = np.maximum(all_data['Appearances'].fillna(1), 1)
        else:
            appearances = 1

        for metric_col, rate_col in [('Tackles', 'tackle_rate'),
                                    ('Interceptions', 'interception_rate'),
                                    ('Clearances', 'clearance_rate')]:
            if metric_col in all_data.columns:
                all_data[rate_col] = all_data[metric_col].fillna(0) / appearances
            else:
                all_data[rate_col] = 0

        if 'Age' in all_data.columns:
            age = all_data['Age'].fillna(25)
            all_data['age_group'] = pd.cut(age, bins=[0, 23, 28, 33, 40],
                                          labels=['Young', 'Prime', 'Experienced', 'Veteran'],
                                          include_lowest=True).astype(str)
        else:
            all_data['age_group'] = 'Prime'

        potential_feature_columns = [
            'Appearances', 'Clean sheets', 'Tackles', 'Tackle success %', 'Last man tackles',
            'Blocked shots', 'Interceptions', 'Clearances', 'Age',
            'tackle_rate', 'interception_rate', 'clearance_rate'
        ]

        feature_columns = []
        for col in potential_feature_columns:
            if col in all_data.columns:
                feature_columns.append(col)
                if all_data[col].dtype in ['int64', 'float64']:
                    all_data[col] = all_data[col].fillna(all_data[col].median())
            else:
                print(f"Warning: Column '{col}' not found in data. Skipping.")

        categorical_columns = ['Position', 'injury_type_category', 'age_group']

        for col in categorical_columns:
            if col in all_data.columns:
                all_data[col] = all_data[col].astype(str).replace('nan', 'Unknown').fillna('Unknown')

                le = LabelEncoder()
                all_data[col + '_encoded'] = le.fit_transform(all_data[col])
                self.label_encoders[col] = le
                feature_columns.append(col + '_encoded')

        for col in feature_columns:
            if col in all_data.columns:
                if all_data[col].dtype in ['int64', 'float64']:
                    all_data[col] = all_data[col].fillna(0)

        train_size = len(self.train_df)
        test_size = len(self.test_df)

        self.train_processed = all_data.iloc[:train_size].copy()
        self.test_processed = all_data.iloc[train_size:train_size + test_size].copy()
        self.validation_processed = all_data.iloc[train_size + test_size:].copy()

        self.feature_columns = feature_columns


    def prepare_features_targets(self):
        #Features
        X_train = self.train_processed[self.feature_columns].copy()
        X_test = self.test_processed[self.feature_columns].copy()
        X_validation = self.validation_processed[self.feature_columns].copy()

        #Fill any remaining NaN values with 0
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)
        X_validation = X_validation.fillna(0)

        #Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        X_validation_scaled = self.scaler.transform(X_validation)

        #Injury duration
        y_duration_train = self.train_processed['injury_duration_days'].fillna(14).astype(float)
        y_duration_test = self.test_processed['injury_duration_days'].fillna(14).astype(float)
        y_duration_validation = self.validation_processed['injury_duration_days'].fillna(14).astype(float)

        #Injury severity
        y_severity_train = self.train_processed['injury_severity'].astype(str).replace('nan', 'Minor')
        y_severity_test = self.test_processed['injury_severity'].astype(str).replace('nan', 'Minor')
        y_severity_validation = self.validation_processed['injury_severity'].astype(str).replace('nan', 'Minor')

        #Injury type
        y_type_train = self.train_processed['injury_type_category'].astype(str).replace('nan', 'unknown')
        y_type_test = self.test_processed['injury_type_category'].astype(str).replace('nan', 'unknown')
        y_type_validation = self.validation_processed['injury_type_category'].astype(str).replace('nan', 'unknown')

        return {
            'X_train': X_train, 'X_test': X_test, 'X_validation': X_validation,
            'X_train_scaled': X_train_scaled, 'X_test_scaled': X_test_scaled, 'X_validation_scaled': X_validation_scaled,
            'y_duration': {'train': y_duration_train, 'test': y_duration_test, 'validation': y_duration_validation},
            'y_severity': {'train': y_severity_train, 'test': y_severity_test, 'validation': y_severity_validation},
            'y_type': {'train': y_type_train, 'test': y_type_test, 'validation': y_type_validation}
        }

    def train_classification_models(self, data_dict, target_type='severity'):
        """Train classification models"""

        X_train = data_dict['X_train_scaled']
        X_test = data_dict['X_test_scaled']
        y_train = data_dict[f'y_{target_type}']['train']
        y_test = data_dict[f'y_{target_type}']['test']

        classification_models = {
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
            'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
            'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
            'Naive Bayes': GaussianNB(),
            'Gradient Boosting': GradientBoostingClassifier(random_state=42)
        }

        results = {}

        for name, model in classification_models.items():
            try:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                cv_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
                recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
                f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

                results[name] = {
                    'model': model,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1_score': f1,
                    'cv_mean': cv_scores.mean(),
                    'cv_std': cv_scores.std(),
                    'predictions': y_pred
                }

                print(f"  Accuracy: {accuracy:.4f}")
                print(f"  CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

            except Exception as e:
                print(f"Error training {name}: {e}")
                continue

        return results

    def train_regression_models(self, data_dict):
        """Train regression models for injury duration prediction"""

        X_train = data_dict['X_train_scaled']
        X_test = data_dict['X_test_scaled']
        y_train = data_dict['y_duration']['train']
        y_test = data_dict['y_duration']['test']

        regression_models = {
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=10),
            'Linear Regression': LinearRegression()
        }

        results = {}

        for name, model in regression_models.items():
            try:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                cv_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
                mse = mean_squared_error(y_test, y_pred)
                rmse = np.sqrt(mse)
                mae = mean_absolute_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)

                results[name] = {
                    'model': model,
                    'mse': mse,
                    'rmse': rmse,
                    'mae': mae,
                    'r2_score': r2,
                    'cv_mean': -cv_scores.mean(),
                    'cv_std': cv_scores.std(),
                    'predictions': y_pred
                }

                print(f"  RMSE: {rmse:.4f}")
                print(f"  R²: {r2:.4f}")
                print(f"  CV RMSE: {np.sqrt(-cv_scores.mean()):.4f}")

            except Exception as e:
                print(f"Error training {name}: {e}")
                continue

        return results

    def feature_importance_analysis(self, model_results, model_name='Random Forest'):
        """Analyze feature importance"""
        print(f"\nFeature Importance Analysis for {model_name}...")

        if model_name in model_results:
            model = model_results[model_name]['model']

            if hasattr(model, 'feature_importances_'):
                importance_df = pd.DataFrame({
                    'feature': self.feature_columns,
                    'importance': model.feature_importances_
                }).sort_values('importance', ascending=False)

                print("\nTop 10 Most Important Features:")
                print(importance_df.head(10))

                return importance_df

        return None

    def print_model_comparison(self, results_dict, task_type='classification'):
        """Print comparison of model performances"""
        if not results_dict:
            print(f"No results available for {task_type}")
            return

        print(f"\n{task_type.upper()} MODEL COMPARISON:")
        print("-" * 80)

        if task_type == 'classification':
            print(f"{'Model':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'CV Score':<10}")
            print("-" * 80)

            for name, results in results_dict.items():
                print(f"{name:<20} {results['accuracy']:<10.4f} {results['precision']:<10.4f} "
                      f"{results['recall']:<10.4f} {results['f1_score']:<10.4f} {results['cv_mean']:<10.4f}")

        elif task_type == 'regression':
            print(f"{'Model':<20} {'RMSE':<10} {'MAE':<10} {'R²':<10} {'CV RMSE':<10}")
            print("-" * 70)

            for name, results in results_dict.items():
                cv_rmse = np.sqrt(results['cv_mean'])
                print(f"{name:<20} {results['rmse']:<10.4f} {results['mae']:<10.4f} "
                      f"{results['r2_score']:<10.4f} {cv_rmse:<10.4f}")

    def run_complete_analysis(self):
        """Run the complete ML analysis pipeline"""
        print("Starting Premier League Injury Analysis")
        print("=" * 50)

        try:
            self.preprocess_data()

            data_dict = self.prepare_features_targets()

            severity_results = self.train_classification_models(data_dict, target_type='severity')

            type_results = self.train_classification_models(data_dict, target_type='type')

            duration_results = self.train_regression_models(data_dict)

            self.print_model_comparison(severity_results, 'classification')
            self.print_model_comparison(duration_results, 'regression')

            if severity_results:
                severity_importance = self.feature_importance_analysis(severity_results, 'Random Forest')
            else:
                severity_importance = None

            self.results = {
                'severity_classification': severity_results,
                'type_classification': type_results,
                'duration_regression': duration_results,
                'feature_importance': severity_importance
            }

            print("\n" + "="*50)

            if severity_results:
                best_severity = max(severity_results.items(), key=lambda x: x[1]['accuracy'])
                print(f"\nBest Severity Classification Model: {best_severity[0]} (Accuracy: {best_severity[1]['accuracy']:.4f})")

            if duration_results:
                best_duration = min(duration_results.items(), key=lambda x: x[1]['rmse'])
                print(f"Best Duration Regression Model: {best_duration[0]} (RMSE: {best_duration[1]['rmse']:.4f})")

            return self.results

        except Exception as e:
            print(f"Error in analysis: {e}")
            import traceback
            traceback.print_exc()
            return None

In [10]:
analyzer = InjuryAnalysisML(train_df, test_df, val_df)
results = analyzer.run_complete_analysis()

Starting Premier League Injury Analysis
Numeric columns identified: ['Appearances', 'Clean sheets', 'Tackles', 'Last man tackles', 'Blocked shots', 'Interceptions', 'Clearances', 'Age']
  Accuracy: 0.3619
  CV Score: 0.3677 (+/- 0.0552)
  Accuracy: 0.3429
  CV Score: 0.3023 (+/- 0.0909)
  Accuracy: 0.3429
  CV Score: 0.3678 (+/- 0.0699)
  Accuracy: 0.3238
  CV Score: 0.3324 (+/- 0.0201)
  Accuracy: 0.3429
  CV Score: 0.2480 (+/- 0.0738)
  Accuracy: 0.3429
  CV Score: 0.3297 (+/- 0.0473)
  Accuracy: 0.4952
  CV Score: 0.4685 (+/- 0.1040)
  Accuracy: 0.8000
  CV Score: 0.8937 (+/- 0.0274)
  Accuracy: 0.3238
  CV Score: 0.3432 (+/- 0.0986)
  Accuracy: 0.1524
  CV Score: 0.1499 (+/- 0.0149)
  Accuracy: 0.7714
  CV Score: 0.8528 (+/- 0.0469)
  Accuracy: 0.7524
  CV Score: 0.8582 (+/- 0.0777)
  RMSE: 62.7474
  R²: 0.2099
  CV RMSE: 53.3211
  RMSE: 96.8483
  R²: -0.8823
  CV RMSE: 74.7006
  RMSE: 65.5644
  R²: 0.1373
  CV RMSE: 52.7425

CLASSIFICATION MODEL COMPARISON:
-----------------------