# Import Dependencies

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Load the dataset and prepare the training and test dataset

In [15]:
# Read the penguin measurements from the CSV file
df = pd.read_csv('../Dataset/penguins_size.csv')
# One row records sex as '.', so keep only the rows whose sex differs from '.'
# (rows with a missing sex compare as "not equal" and are therefore kept)
df = df.loc[df['sex'].ne('.')]
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [None]:
# Separate the target column (species) from the predictor columns
y = df['species']
X = df.drop(columns='species')
print('Shape of X:', X.shape)
print('Shape of y:', y.shape)

# Group the predictor columns by dtype so each group can get its own preprocessing
cat_features = list(X.select_dtypes(include='object').columns)
num_features = list(X.select_dtypes(include='number').columns)

# Hold out 20% of the rows for testing, keeping the species proportions
# the same in both splits (stratify) and fixing the seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Shape of X: (343, 6)
Shape of y: (343,)


### Missing Value Handling

In [17]:
# Count the missing entries in each column of the filtered frame
df.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

Approach for handling missing values:

1. For the `sex` column (categorical feature), replace any missing values with the most frequently occurring value (the mode).
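2. For the numerical features with missing values, fill in the gaps with k-nearest-neighbours imputation (`KNNImputer` with 5 neighbours). The Naive Bayes pipeline is the exception: it uses the average value (mean) of each respective column instead.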


### Data Encoding

**Approach:**
- Apply one-hot encoding to the categorical feature columns (`sex` and `island`) because these columns do not have an inherent order. `species` is the prediction target, so it is removed from the features and is not one-hot encoded.

<hr>

In [19]:
# Seed handed to every estimator below that draws random numbers while fitting
# (tree construction, bootstrapping, boosting). Without it the cross-validation
# scores, and possibly the model picked as "best", change from run to run.
# 42 is the same value the train/test split and the CV splitter use.
RANDOM_STATE = 42

# Define preprocessing for numerical features (with KNN imputation)
numerical_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),  # Use KNN imputation for numerical features
    ('scaler', StandardScaler())  # Scale numerical features
])

# Define preprocessing for categorical features
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values with mode
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))  # Encode categorical features
])

# Combine preprocessing steps for numerical and categorical features.
# The transformed matrix holds the 'num' columns first, then the 'cat' columns.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, num_features),
    ('cat', categorical_pipeline, cat_features)
])

# Define models with pipelines: every entry maps a display name to
# "preprocessing + classifier" so that imputation, scaling and encoding are
# re-fitted on the training part of each cross-validation fold.
pipelines = {
    'Logistic Regression': Pipeline([
        ('preprocessor', preprocessor),
        ('model', LogisticRegression(max_iter=1000, n_jobs=-1, class_weight='balanced'))
    ]),
    'Decision Tree': Pipeline([
        ('preprocessor', preprocessor),
        ('model', DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE))
    ]),
    'Random Forest': Pipeline([
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=RANDOM_STATE))
    ]),
    'Support Vector Machine': Pipeline([
        ('preprocessor', preprocessor),
        ('model', SVC(class_weight='balanced'))
    ]),
    'Naive Bayes': Pipeline([
        ('preprocessor', ColumnTransformer([
            ('num', SimpleImputer(strategy='mean'), num_features),  # Skip scaling for Naive Bayes
            ('cat', categorical_pipeline, cat_features)
        ])),
        ('model', GaussianNB())
    ]),
    'Gradient Boosting': Pipeline([
        ('preprocessor', preprocessor),
        ('model', GradientBoostingClassifier(random_state=RANDOM_STATE))
    ]),
    'AdaBoost': Pipeline([
        ('preprocessor', preprocessor),
        ('model', AdaBoostClassifier(random_state=RANDOM_STATE))
    ]),
    'Bagging': Pipeline([
        ('preprocessor', preprocessor),
        ('model', BaggingClassifier(n_jobs=-1, random_state=RANDOM_STATE))
    ]),
    'K-Nearest Neighbors': Pipeline([
        ('preprocessor', preprocessor),
        ('model', KNeighborsClassifier())
    ])
}

# Initialize metrics dictionary: one list per column of the results table,
# filled with one entry per model by the cross-validation loop
metrics = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': []
}

In [None]:
# Start from empty score lists: `metrics` is created in another cell, so
# re-running this cell would otherwise append a second copy of every model's row.
for collected in metrics.values():
    collected.clear()

# Perform cross-validation for each pipeline (training split only; the test
# split stays untouched until the best model has been chosen)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Use stratified cross-validation
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
for name, pipeline in pipelines.items():
    scores = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring)
    # Store the mean of the per-fold scores for every metric
    metrics['Model'].append(name)
    metrics['Accuracy'].append(scores['test_accuracy'].mean())
    metrics['Precision'].append(scores['test_precision_weighted'].mean())
    metrics['Recall'].append(scores['test_recall_weighted'].mean())
    metrics['F1 Score'].append(scores['test_f1_weighted'].mean())

# Create DataFrame from metrics
metrics_df = pd.DataFrame(metrics)
print("Cross-Validation Metrics:")
print(metrics_df)

# Evaluate the best model on the test set.
# "Best" means highest mean CV accuracy; on a tie idxmax keeps the first row.
best_model_name = metrics_df.loc[metrics_df['Accuracy'].idxmax(), 'Model']
best_model = pipelines[best_model_name]
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

print(f"\nTest Set Evaluation for {best_model_name}:")
print(classification_report(y_test, y_pred))

# Confusion matrix for the best model
print(f"Confusion Matrix for {best_model_name}:")
print(confusion_matrix(y_test, y_pred))

# Feature importance for interpretable models
fitted_model = best_model.named_steps['model']
if hasattr(fitted_model, 'feature_importances_'):
    feature_importances = fitted_model.feature_importances_
    fitted_preprocessor = best_model.named_steps['preprocessor']
    encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
    # The ColumnTransformer outputs its 'num' columns first and its 'cat'
    # columns second, so the names must be listed in that same order to line
    # up with the importances.
    feature_names = list(num_features) + encoder.get_feature_names_out(cat_features).tolist()
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    print("\nFeature Importances:")
    print(importance_df.sort_values(by='Importance', ascending=False))

Cross-Validation Metrics:
                    Model  Accuracy  Precision    Recall  F1 Score
0     Logistic Regression  0.992727   0.993007  0.992727  0.992663
1           Decision Tree  0.963636   0.964105  0.963636  0.963455
2           Random Forest  0.981818   0.982134  0.981818  0.981691
3  Support Vector Machine  0.992727   0.993007  0.992727  0.992663
4             Naive Bayes  0.901347   0.937892  0.901347  0.904475
5       Gradient Boosting  0.974545   0.975674  0.974545  0.974200
6                AdaBoost  0.926936   0.934856  0.926936  0.928665
7                 Bagging  0.978182   0.978896  0.978182  0.978005
8     K-Nearest Neighbors  0.985387   0.986307  0.985387  0.985412

Test Set Evaluation for Logistic Regression:
              precision    recall  f1-score   support

      Adelie       1.00      0.97      0.98        30
   Chinstrap       0.93      1.00      0.97        14
      Gentoo       1.00      1.00      1.00        25

    accuracy                           0