# Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Load the dataset and prepare the training and test sets

In [2]:
# Read the raw penguin measurements from disk.
df = pd.read_csv('../Dataset/penguins_size.csv')
# One record carries the placeholder '.' in `sex`; keep every row except that one.
df = df.loc[df['sex'].ne('.')]
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
# The species label is the target; every other column is a feature.
y = df['species']
X = df.drop(columns=['species'])
print('Shape of X:', X.shape)
print('Shape of y:', y.shape)

# Column groups by dtype, reused by the imputation and scaling cells below.
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(include='object').columns

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Shape of X: (343, 6)
Shape of y: (343,)


### Missing Value Handling

In [4]:
# Number of missing entries in each column of the full frame.
df.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

Approach for handling missing values:

1. For the `sex` column (categorical feature), replace any missing values with the most frequently occurring value (the mode).
2. For the remaining numerical features with missing values, use the average value (mean) of each respective column to fill in the gaps.


In [5]:
# Categorical gaps are filled with the column mode, numeric gaps with the column mean.
cat_impute = SimpleImputer(strategy='most_frequent')
num_impute = SimpleImputer(strategy='mean')

In [6]:
# Learn the fill values (column means / modes) from the training split only and
# reuse those same values on the test split. Re-fitting the imputers on X_test
# would fill the test rows with statistics computed from the test data itself,
# which leaks hold-out information into preprocessing and makes the two splits
# inconsistent with each other.
X_train[num_features] = num_impute.fit_transform(X_train[num_features])
X_test[num_features] = num_impute.transform(X_test[num_features])

X_train[cat_features] = cat_impute.fit_transform(X_train[cat_features])
X_test[cat_features] = cat_impute.transform(X_test[cat_features])

In [7]:
# Sanity check: no missing values should remain in the training features.
X_train.isnull().sum()

island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [8]:
# Sanity check: no missing values should remain in the test features.
X_test.isnull().sum()

island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

### Data Scaling

In [10]:
# Standardise the numeric columns; the scaler is fitted on the training split
# and the fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train[num_features])
X_train[num_features] = scaler.transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

### Data Encoding

**Approach:**
- Apply one-hot encoding to the categorical feature columns (`sex` and `island`) because these columns do not have an inherent order. The target column `species` is not encoded; it is passed to the classifiers as string labels.

In [13]:
# Step 3: one-hot encode the nominal feature columns
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['sex', 'island']

# Dense output (sparse_output rather than the older `sparse` argument);
# drop='first' removes one level per column to avoid multicollinearity.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Category levels are learned from the training split and reused for the test split.
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Wrap the encoded arrays in DataFrames labelled with the generated column names.
encoded_columns = encoder.get_feature_names_out(categorical_cols)
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns)

# Give the remaining columns a fresh 0..n-1 index so that the column-wise
# concat lines them up row by row with the encoded frames.
X_train_remaining = X_train.drop(columns=categorical_cols).reset_index(drop=True)
X_test_remaining = X_test.drop(columns=categorical_cols).reset_index(drop=True)

X_train_final = pd.concat([X_train_remaining, X_train_encoded_df], axis=1)
X_test_final = pd.concat([X_test_remaining, X_test_encoded_df], axis=1)


In [14]:
# Preview the first five rows of the fully preprocessed training matrix.
X_train_final.head(n=5)

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_MALE,island_Dream,island_Torgersen
0,-1.194968,0.860348,-1.570186,-1.541421,1.0,1.0,0.0
1,-0.811698,1.756003,-0.704608,-0.383053,0.0,0.0,0.0
2,0.0,1.76778e-15,2.0501e-15,0.0,1.0,0.0,1.0
3,0.192105,-1.32903,1.026548,0.994465,0.0,0.0,0.0
4,0.265109,-0.08506524,-0.3439505,-0.883969,0.0,1.0,0.0


<hr>

In [15]:
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier, BaggingClassifier
# import xgboost as xgb
# import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import math
from sklearn.model_selection import StratifiedKFold

In [16]:
# Define the models.
# Every estimator that draws random numbers gets a fixed random_state so that
# the reported scores are reproducible from one run of the notebook to the next.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, n_jobs=-1),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=42),
    'Support Vector Machine': SVC(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    #'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
    'Bagging': BaggingClassifier(n_jobs=-1, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    # 'LightGBM': lgb.LGBMClassifier(random_state = 0)
}


# Column-wise store for the evaluation results: one list per reported column,
# filled with one entry per model by the evaluation loop.
metrics = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': []
}

In [17]:
# Empty the result lists first so that re-running this cell replaces the
# previous results instead of appending a second copy of every model's row.
for collected in metrics.values():
    collected.clear()

# Fit each model on the preprocessed training data and score it on the hold-out set.
for name, model in models.items():
    model.fit(X_train_final, y_train)
    y_pred = model.predict(X_test_final)

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    print(classification_report(y_test, y_pred))

    # Precision / recall / F1 are the support-weighted averages across classes.
    metrics['Model'].append(name)
    metrics['Accuracy'].append(accuracy)
    metrics['Precision'].append(report['weighted avg']['precision'])
    metrics['Recall'].append(report['weighted avg']['recall'])
    metrics['F1 Score'].append(report['weighted avg']['f1-score'])

# Create DataFrame from metrics
metrics_df = pd.DataFrame(metrics)

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        30
   Chinstrap       1.00      1.00      1.00        14
      Gentoo       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        30
   Chinstrap       1.00      1.00      1.00        14
      Gentoo       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        30
   Chinstrap       1.00      1.00      1.00        14
      Gentoo       1.00      1.00      1.00        25

    accuracy        

In [18]:
# Display the per-model hold-out metrics table built by the evaluation loop.
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,1.0,1.0,1.0,1.0
1,Decision Tree,1.0,1.0,1.0,1.0
2,Random Forest,1.0,1.0,1.0,1.0
3,Support Vector Machine,1.0,1.0,1.0,1.0
4,Naive Bayes,0.811594,0.892419,0.811594,0.811767
5,Gradient Boosting,1.0,1.0,1.0,1.0
6,AdaBoost,0.956522,0.959645,0.956522,0.956972
7,Bagging,1.0,1.0,1.0,1.0
8,K-Nearest Neighbors,0.985507,0.986473,0.985507,0.985634


In [19]:
# Refit a default k-nearest-neighbours classifier and print its per-class report
# on the hold-out set.
knn = KNeighborsClassifier().fit(X_train_final, y_train)
y_pred = knn.predict(X_test_final)
knn_report = classification_report(y_test, y_pred)
print(knn_report)

              precision    recall  f1-score   support

      Adelie       1.00      0.97      0.98        30
   Chinstrap       0.93      1.00      0.97        14
      Gentoo       1.00      1.00      1.00        25

    accuracy                           0.99        69
   macro avg       0.98      0.99      0.98        69
weighted avg       0.99      0.99      0.99        69



In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Reload the raw data and discard the record whose `sex` is the placeholder '.'.
df = pd.read_csv('../Dataset/penguins_size.csv')
df = df.loc[df['sex'].ne('.')]

# Target is the species label; everything else is a feature.
y = df['species']
X = df.drop(columns=['species'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Column names grouped by dtype, as plain lists for the ColumnTransformer.
cat_features = X.select_dtypes(include='object').columns.tolist()
num_features = X.select_dtypes(include='number').columns.tolist()

# Numeric branch: fill gaps with the column mean, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the column mode, then one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, num_features),
    ('cat', categorical_pipeline, cat_features),
])

# Candidate classifiers. Estimators that draw random numbers get a fixed
# random_state so the cross-validation scores are reproducible between runs.
cv_estimators = {
    'Logistic Regression': LogisticRegression(max_iter=1000, n_jobs=-1),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=42),
    'Support Vector Machine': SVC(),
    'Naive Bayes': GaussianNB(),  # Note: Naive Bayes may not require scaling
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Bagging': BaggingClassifier(n_jobs=-1, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

# Define models with pipelines: each classifier is placed behind the same
# preprocessing step, under the same display name as before.
pipelines = {
    model_name: Pipeline([('preprocessor', preprocessor), ('model', estimator)])
    for model_name, estimator in cv_estimators.items()
}

# Report column -> scorer name passed to cross_validate.
scorer_for_column = {
    'Accuracy': 'accuracy',
    'Precision': 'precision_weighted',
    'Recall': 'recall_weighted',
    'F1 Score': 'f1_weighted',
}

# Column-wise result store: the model name plus one list per reported score.
metrics = {'Model': []}
for column in scorer_for_column:
    metrics[column] = []

# Stratified 5-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, pipeline in pipelines.items():
    scores = cross_validate(
        pipeline, X_train, y_train, cv=cv, scoring=list(scorer_for_column.values())
    )
    metrics['Model'].append(name)
    # Record the mean of each score across the folds.
    for column, scorer in scorer_for_column.items():
        metrics[column].append(scores['test_' + scorer].mean())

# Tabulate and print the averaged scores.
metrics_df = pd.DataFrame(metrics)
print(metrics_df)

                    Model  Accuracy  Precision    Recall  F1 Score
0     Logistic Regression  0.992727   0.993007  0.992727  0.992663
1           Decision Tree  0.956296   0.958193  0.956296  0.956172
2           Random Forest  0.978182   0.978467  0.978182  0.977671
3  Support Vector Machine  0.985455   0.986763  0.985455  0.984889
4             Naive Bayes  0.802963   0.879667  0.802963  0.798693
5       Gradient Boosting  0.974545   0.975674  0.974545  0.974200
6                AdaBoost  0.926936   0.934856  0.926936  0.928665
7                 Bagging  0.974545   0.975288  0.974545  0.974376
8     K-Nearest Neighbors  0.985387   0.986307  0.985387  0.985412


In [22]:
from sklearn.metrics import classification_report

# Pick the pipeline to evaluate on the hold-out set (replace the key with the chosen model).
best_model = pipelines['Logistic Regression']
# Fit on the full training split, then report per-class scores on the test split.
y_pred = best_model.fit(X_train, y_train).predict(X_test)
holdout_report = classification_report(y_test, y_pred)
print(holdout_report)

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        30
   Chinstrap       1.00      1.00      1.00        14
      Gentoo       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Reload the raw data and discard the record whose `sex` is the placeholder '.'.
df = pd.read_csv('../Dataset/penguins_size.csv')
df = df.loc[df['sex'].ne('.')]

# Target is the species label; everything else is a feature.
y = df['species']
X = df.drop(columns=['species'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Column names grouped by dtype, as plain lists for the ColumnTransformer.
cat_features = X.select_dtypes(include='object').columns.tolist()
num_features = X.select_dtypes(include='number').columns.tolist()

# Numeric branch: fill gaps from the 5 nearest neighbours, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the column mode, then one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, num_features),
    ('cat', categorical_pipeline, cat_features),
])

# Define models with pipelines.
# Estimators that draw random numbers get a fixed random_state so that the
# cross-validation scores (and therefore the automatically selected best model)
# are reproducible between runs.
pipelines = {
    'Logistic Regression': Pipeline([
        ('preprocessor', preprocessor),
        ('model', LogisticRegression(max_iter=1000, n_jobs=-1, class_weight='balanced'))
    ]),
    'Decision Tree': Pipeline([
        ('preprocessor', preprocessor),
        ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42))
    ]),
    'Random Forest': Pipeline([
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=42))
    ]),
    'Support Vector Machine': Pipeline([
        ('preprocessor', preprocessor),
        ('model', SVC(class_weight='balanced'))
    ]),
    # Naive Bayes gets its own preprocessor: numeric columns are mean-imputed
    # but not scaled; categorical columns use the shared categorical branch.
    'Naive Bayes': Pipeline([
        ('preprocessor', ColumnTransformer([
            ('num', SimpleImputer(strategy='mean'), num_features),  # Skip scaling for Naive Bayes
            ('cat', categorical_pipeline, cat_features)
        ])),
        ('model', GaussianNB())
    ]),
    'Gradient Boosting': Pipeline([
        ('preprocessor', preprocessor),
        ('model', GradientBoostingClassifier(random_state=42))
    ]),
    'AdaBoost': Pipeline([
        ('preprocessor', preprocessor),
        ('model', AdaBoostClassifier(random_state=42))
    ]),
    'Bagging': Pipeline([
        ('preprocessor', preprocessor),
        ('model', BaggingClassifier(n_jobs=-1, random_state=42))
    ]),
    'K-Nearest Neighbors': Pipeline([
        ('preprocessor', preprocessor),
        ('model', KNeighborsClassifier())
    ])
}

# (report column, scorer name) pairs, in the order the columns are reported.
score_columns = [
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision_weighted'),
    ('Recall', 'recall_weighted'),
    ('F1 Score', 'f1_weighted'),
]

# Column-wise result store: the model name plus one list per reported score.
metrics = {'Model': []}
metrics.update({column: [] for column, _ in score_columns})

# Stratified 5-fold cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, pipeline in pipelines.items():
    scores = cross_validate(
        pipeline, X_train, y_train, cv=cv,
        scoring=[scorer for _, scorer in score_columns],
    )
    metrics['Model'].append(name)
    # Record the mean of each score across the folds.
    for column, scorer in score_columns:
        metrics[column].append(scores['test_' + scorer].mean())

# Tabulate and print the averaged scores.
metrics_df = pd.DataFrame(metrics)
print("Cross-Validation Metrics:")
print(metrics_df)

# Select the pipeline with the highest mean cross-validated accuracy
# (idxmax returns the first row on ties).
best_row = metrics_df['Accuracy'].idxmax()
best_model_name = metrics_df.loc[best_row, 'Model']
best_model = pipelines[best_model_name]

# Refit on the whole training split and predict the hold-out set.
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Per-class report followed by the confusion matrix for the selected model.
print(f"\nTest Set Evaluation for {best_model_name}:")
print(classification_report(y_test, y_pred))

print(f"Confusion Matrix for {best_model_name}:")
print(confusion_matrix(y_test, y_pred))

# Feature importance for interpretable models.
# Only runs when the selected estimator exposes `feature_importances_`.
fitted_model = best_model.named_steps['model']
if hasattr(fitted_model, 'feature_importances_'):
    feature_importances = fitted_model.feature_importances_

    # Names of the one-hot columns produced by the fitted categorical branch.
    fitted_preprocessor = best_model.named_steps['preprocessor']
    encoded_cat_names = (
        fitted_preprocessor.named_transformers_['cat']
        .named_steps['encoder']
        .get_feature_names_out(cat_features)
        .tolist()
    )

    # The ColumnTransformer lists the 'num' branch before the 'cat' branch, so
    # the model's input columns are numeric first, then the one-hot columns.
    # The names must follow that same order or each importance would be paired
    # with the wrong feature.
    feature_names = list(num_features) + encoded_cat_names

    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    print("\nFeature Importances:")
    print(importance_df.sort_values(by='Importance', ascending=False))

Cross-Validation Metrics:
                    Model  Accuracy  Precision    Recall  F1 Score
0     Logistic Regression  0.992727   0.993007  0.992727  0.992663
1           Decision Tree  0.967273   0.968195  0.967273  0.967318
2           Random Forest  0.974545   0.974624  0.974545  0.973980
3  Support Vector Machine  0.992727   0.993007  0.992727  0.992663
4             Naive Bayes  0.901347   0.937892  0.901347  0.904475
5       Gradient Boosting  0.974545   0.975674  0.974545  0.974200
6                AdaBoost  0.926936   0.934856  0.926936  0.928665
7                 Bagging  0.974545   0.975114  0.974545  0.974419
8     K-Nearest Neighbors  0.985387   0.986307  0.985387  0.985412

Test Set Evaluation for Logistic Regression:
              precision    recall  f1-score   support

      Adelie       1.00      0.97      0.98        30
   Chinstrap       0.93      1.00      0.97        14
      Gentoo       1.00      1.00      1.00        25

    accuracy                           0