In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

from scikeras.wrappers import KerasClassifier
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

########################################
# Load and Prepare Data
########################################
# Read the expanded player table from its local CSV export.
df = pd.read_csv(r'C:\Users\Joshua_zza\Desktop\IS 597 MLC\Final Project\FC25\expanded_all_players.csv')

# Columns excluded from the feature set; names missing from the file are
# skipped instead of raising.
cols_to_drop = ['Name', 'url', 'Team', 'League', 'Nation', 'Alternative positions', 'play style', 'Weak foot']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Remove the label column from the frame so that `df` holds features only.
target = df.pop('Position')

# Map the position labels to integer class ids.
le = LabelEncoder()
y = le.fit_transform(target)

# Partition the feature columns by dtype: object columns are treated as
# categorical, every other dtype as numeric.
categorical_cols, numeric_cols = [], []
for column in df.columns:
    bucket = categorical_cols if df[column].dtype == 'object' else numeric_cols
    bucket.append(column)

# When 'Preferred foot' is one of the object columns it becomes the only
# categorical feature; any other object column then appears in neither list.
if 'Preferred foot' in categorical_cols:
    categorical_cols = ['Preferred foot']

# Numeric branch: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode, ignoring categories that were not seen during fitting.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

########################################
# Model Definitions and Parameter Grids
########################################
def create_nn_model(neurons=64, dropout_rate=0.2, meta=None):
    """Build and compile a small feed-forward softmax classifier.

    Args:
        neurons: Number of units in the first hidden layer; the second hidden
            layer uses ``neurons // 2`` units.
        dropout_rate: Dropout fraction applied after the first hidden layer.
        meta: Optional mapping of fit-time metadata. When this function is
            used as the model builder of a scikeras ``KerasClassifier``,
            scikeras passes it in; its ``n_features_in_`` and ``n_classes_``
            entries then size the input and output layers. When omitted (or
            when an entry is missing) the module-level ``X_train`` and ``y``
            are used instead.

    Returns:
        A ``Sequential`` model compiled with the Adam optimizer,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    meta = {} if meta is None else meta

    # Prefer the feature count seen at fit time: inside a pipeline the
    # network receives the preprocessed (and possibly PCA-reduced) matrix,
    # whose width need not equal the raw DataFrame's column count.
    input_dim = meta.get('n_features_in_')
    if input_dim is None:
        # Fallback for direct calls: raw column count of the training frame.
        input_dim = X_train.shape[1]

    n_classes = meta.get('n_classes_')
    if n_classes is None:
        n_classes = len(np.unique(y))

    model = Sequential()
    model.add(Dense(neurons, activation='relu', input_dim=input_dim))
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons//2, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# The NN is not registered below: its input dimension can change when PCA is
# applied, so it has to be configured later.

# Hyper-parameter grids; the 'clf__' prefix addresses the pipeline step named
# 'clf'.
logistic_grid = {
    'clf__C': [0.1, 1.0, 10],
}
forest_grid = {
    'clf__n_estimators': [100, 300],
    'clf__max_depth': [10, 20, None],
    'clf__min_samples_split': [2, 5],
}
xgb_grid = {
    'clf__n_estimators': [100, 300],
    'clf__max_depth': [3, 6],
    'clf__learning_rate': [0.1, 0.3],
}

# Display name -> (estimator, parameter grid).
models = {
    'LogisticRegression': (LogisticRegression(max_iter=1000), logistic_grid),
    'RandomForest': (RandomForestClassifier(random_state=42), forest_grid),
    'XGBoost': (XGBClassifier(eval_metric='mlogloss', random_state=42), xgb_grid),
    # You can add NN after deciding on PCA dimension
}

########################################
# Experiments Config
########################################
# Every combination of PCA on/off and SMOTE on/off is run so the effect of
# each choice can be compared. Order: PCA off (SMOTE off, on), then PCA on
# (SMOTE off, on).
experiments = [
    {'use_pca': with_pca, 'use_smote': with_smote}
    for with_pca in (False, True)
    for with_smote in (False, True)
]

pca_components = 20  # number of PCA components; adjust as needed

# One dict per (model, experiment) run is appended here.
results = []

########################################
# Run Experiments
########################################
for exp in experiments:
    use_pca, use_smote = exp['use_pca'], exp['use_smote']

    # SMOTE configurations use the imblearn pipeline; the others use
    # scikit-learn's.
    pipeline_class = ImbPipeline if use_smote else Pipeline

    # Steps shared by every model in this configuration:
    # preprocessing -> optional PCA -> optional SMOTE.
    steps = [('preprocessing', preprocessor)]
    if use_pca:
        steps += [('pca', PCA(n_components=pca_components))]
    if use_smote:
        steps += [('smote', SMOTE(random_state=42))]

    # Only LR, RF and XGB are handled here. Adding the NN would require
    # determining its input dimension after preprocessing (and PCA, if used).
    for model_name, (model, param_grid) in models.items():
        print(f"Running: Model={model_name}, PCA={use_pca}, SMOTE={use_smote}")

        clf_pipeline = pipeline_class([*steps, ('clf', model)])

        # 5-fold grid search scored by macro F1, then predict the held-out
        # test set through the fitted search object.
        grid = GridSearchCV(
            clf_pipeline,
            param_grid=param_grid,
            cv=5,
            scoring='f1_macro',
            n_jobs=-1,
        )
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)

        report = classification_report(y_test, y_pred, output_dict=True)
        accuracy = report['accuracy']

        # Record this run for later comparison.
        results.append(dict(
            Model=model_name,
            Use_PCA=use_pca,
            Use_SMOTE=use_smote,
            Best_Params=grid.best_params_,
            Accuracy=accuracy,
            Classification_Report=report,
        ))

        print("Best Params:", grid.best_params_)
        print("Accuracy:", accuracy)
        print(classification_report(y_test, y_pred))
        print("-" * 50)

########################################
# After running all experiments, you have a structured `results` variable
# You can now analyze which configuration worked best.
########################################

# Example: one line per experiment with its accuracy and selected parameters.
print("Summary of All Experiments:")
for res in results:
    summary_fields = [
        f"Model={res['Model']}",
        f"PCA={res['Use_PCA']}",
        f"SMOTE={res['Use_SMOTE']}",
        f"Accuracy={res['Accuracy']}",
        f"Params={res['Best_Params']}",
    ]
    print(", ".join(summary_fields))


Running: Model=LogisticRegression, PCA=False, SMOTE=False
Best Params: {'clf__C': 1.0}
Accuracy: 0.7790302379674361
              precision    recall  f1-score   support

           0       0.60      0.64      0.62       323
           1       0.91      0.93      0.92       993
           2       0.77      0.67      0.71       467
           3       0.78      0.87      0.82       685
           4       1.00      1.00      1.00       626
           5       0.85      0.86      0.85       418
           6       0.33      0.33      0.33       323
           7       0.28      0.11      0.16       123
           8       0.76      0.88      0.82       446
           9       0.36      0.30      0.33       305
          10       0.29      0.06      0.11       124
          11       0.86      0.96      0.90       756

    accuracy                           0.78      5589
   macro avg       0.65      0.63      0.63      5589
weighted avg       0.76      0.78      0.77      5589

-----------------