In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
column_names = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 
               'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']

df = pd.read_csv('magic04.data', names=column_names)

df['class'] = df['class'].map({'g': 1, 'h': 0})

print("Missing values in each column:")
print(df.isnull().sum())

print("\nDataset Information:")
print(f"Number of samples: {df.shape[0]}")
print(f"Number of features: {df.shape[1] - 1}")
print(f"Class distribution:\n{df['class'].value_counts(normalize=True).mul(100).round(2)}")

print("\nSummary Statistics:")
print(df.describe())

X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Missing values in each column:
fLength     0
fWidth      0
fSize       0
fConc       0
fConc1      0
fAsym       0
fM3Long     0
fM3Trans    0
fAlpha      0
fDist       0
class       0
dtype: int64

Dataset Information:
Number of samples: 19020
Number of features: 10
Class distribution:
class
1    64.84
0    35.16
Name: proportion, dtype: float64

Summary Statistics:
            fLength        fWidth         fSize         fConc        fConc1  \
count  19020.000000  19020.000000  19020.000000  19020.000000  19020.000000   
mean      53.250154     22.180966      2.825017      0.380327      0.214657   
std       42.364855     18.346056      0.472599      0.182813      0.110511   
min        4.283500      0.000000      1.941300      0.013100      0.000300   
25%       24.336000     11.863800      2.477100      0.235800      0.128475   
50%       37.147700     17.139900      2.739600      0.354150      0.196500   
75%       70.122175     24.739475      3.101600      0.503700      0.285225  

### Define models

In [3]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9, 11, 13],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Support Vector Machine': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto', 0.1, 0.01]
    },
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    }
}

### Model Performance Function

In [4]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    }

### Cross Validation and hyperparameter tuning to get best model

In [5]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_models = {}
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_scaled, y_train)
    
    best_model = grid_search.best_estimator_
    best_models[name] = best_model
    
    model_results[name] = evaluate_model(best_model, X_test_scaled, y_test)
    
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best cross-validation ROC AUC: {grid_search.best_score_:.4f}")
    print(f"Test set performance for {name}:")
    for metric, value in model_results[name].items():
        print(f"  {metric}: {value:.4f}")


Training Logistic Regression...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for Logistic Regression: {'C': 0.1, 'solver': 'saga'}
Best cross-validation ROC AUC: 0.8442
Test set performance for Logistic Regression:
  Accuracy: 0.7808
  Precision: 0.7923
  Recall: 0.8970
  F1 Score: 0.8414
  ROC AUC: 0.8276

Training K-Nearest Neighbors...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for K-Nearest Neighbors: {'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'distance'}
Best cross-validation ROC AUC: 0.9037
Test set performance for K-Nearest Neighbors:
  Accuracy: 0.8412
  Precision: 0.8214
  Recall: 0.9649
  F1 Score: 0.8874
  ROC AUC: 0.8990

Training Support Vector Machine...
Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best parameters for Support Vector Machine: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Best cross-validation ROC AUC: 0.9218
Test set performance for Support Vector Machine:
  Accuracy: 0.8747
  Precision: 0.8613
  Recall: 0.9616
  F1 Score: 0.9087
  ROC AUC: 0.9284

Training Random Forest...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation ROC AUC: 0.9329
Test set performance for Random Forest:
  Accuracy: 0.8840
  Precision: 0.8848
  Recall: 0.9441
  F1 Score: 0.9134
  ROC AUC: 0.9374

Training Gradient Boosting...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best cross-validation ROC AUC: 0.9328
Test set performance for Gradient Boosting:
  Accuracy: 0.8894
  Precision: 0.8918
  Recall: 0.9441
 