# Import Dependencies

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Load the dataset and prepare the training and test dataset

In [13]:
df = pd.read_csv('../Dataset/penguins_size.csv')
df = df[df['sex'] != '.'] #There is a row where sex = '.', so filtering it out
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [14]:
X = df.drop(columns=['species'])
y  = df.species
print('Shape of X:', X.shape)
print('Shape of y:', y.shape)

num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(include='object').columns
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=42, stratify=y)

Shape of X: (343, 6)
Shape of y: (343,)


### Missinng value Handeling

In [15]:
df.isna().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

Approach for handelling missing value:

1. For the `sex` column (categorical feature), replace any missing values with the most frequently occurring value (the mode).
2. For the remaining numerical features with missing values, use the average value (mean) of each respective column to fill in the gaps.


In [16]:


num_impute = SimpleImputer(strategy='mean')
cat_impute = SimpleImputer(strategy='most_frequent')

In [17]:
X_train[num_features] = num_impute.fit_transform(X_train[num_features])
X_test[num_features]  = num_impute.fit_transform(X_test[num_features])

X_train[cat_features]  = cat_impute.fit_transform(X_train[cat_features])
X_test[cat_features] = cat_impute.fit_transform(X_test[cat_features])

In [18]:
X_train.isna().sum()

island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [19]:
X_test.isna().sum()

island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

### Data Scalling

In [20]:
scaler = StandardScaler()
X_train[num_features]  = scaler.fit_transform(X_train[num_features])
X_test[num_features]  = scaler.transform(X_test[num_features])

### Data Encoding

**Approach:**
- Apply one-hot encoding to categorical columns (`Sex`, `island`, and `species`) because these columns do not have an inherent order.

In [21]:
# Step 3: Use OneHotEncoder for categorical features
categorical_cols = ['sex', 'island']
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder (use sparse_output instead of sparse)
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid multicollinearity

# Fit the encoder on the training data
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])

# Transform the test data using the encoder fitted on X_train
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Get the column names for the encoded variables
encoded_columns = encoder.get_feature_names_out(categorical_cols)

# Convert the encoded arrays to DataFrames
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns)

# Combine the original non-categorical columns with the encoded columns
X_train_final = pd.concat([X_train.drop(columns=categorical_cols).reset_index(drop=True), X_train_encoded_df], axis=1)
X_test_final = pd.concat([X_test.drop(columns=categorical_cols).reset_index(drop=True), X_test_encoded_df], axis=1)


In [22]:
X_train_final.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_MALE,island_Dream,island_Torgersen
0,-1.194968,0.860348,-1.570186,-1.541421,1.0,1.0,0.0
1,-0.811698,1.756003,-0.704608,-0.383053,0.0,0.0,0.0
2,0.0,1.76778e-15,2.0501e-15,0.0,1.0,0.0,1.0
3,0.192105,-1.32903,1.026548,0.994465,0.0,0.0,0.0
4,0.265109,-0.08506524,-0.3439505,-0.883969,0.0,1.0,0.0


<hr>

In [23]:
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier, BaggingClassifier
# import xgboost as xgb
# import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import math
from sklearn.model_selection import StratifiedKFold

In [24]:
# Define the models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, n_jobs=-1),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_jobs=-1),
    'Support Vector Machine': SVC(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    #'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
    'Bagging': BaggingClassifier(n_jobs=-1),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    # 'LightGBM': lgb.LGBMClassifier(random_state = 0)
}


# Initialize lists to store metrics for each fold
metrics = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1 Score': []
}

In [25]:
for name, model in models.items():
        model.fit(X_train_final, y_train)
        y_pred = model.predict(X_test_final)

        # Compute metrics
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        if name == 'K-Nearest Neighbors':
                print(report)

        metrics['Model'].append(name)
        metrics['Accuracy'].append(accuracy)
        metrics['Precision'].append(report['weighted avg']['precision'])
        metrics['Recall'].append(report['weighted avg']['recall'])
        metrics['F1 Score'].append(report['weighted avg']['f1-score'])

# Create DataFrame from metrics
metrics_df = pd.DataFrame(metrics)

{'Adelie': {'precision': 1.0, 'recall': 0.9666666666666667, 'f1-score': 0.983050847457627, 'support': 30.0}, 'Chinstrap': {'precision': 0.9333333333333333, 'recall': 1.0, 'f1-score': 0.9655172413793104, 'support': 14.0}, 'Gentoo': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 25.0}, 'accuracy': 0.9855072463768116, 'macro avg': {'precision': 0.9777777777777779, 'recall': 0.9888888888888889, 'f1-score': 0.9828560296123126, 'support': 69.0}, 'weighted avg': {'precision': 0.9864734299516907, 'recall': 0.9855072463768116, 'f1-score': 0.985634301493321, 'support': 69.0}}


In [26]:
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,1.0,1.0,1.0,1.0
1,Decision Tree,0.985507,0.986065,0.985507,0.985527
2,Random Forest,1.0,1.0,1.0,1.0
3,Support Vector Machine,1.0,1.0,1.0,1.0
4,Naive Bayes,0.811594,0.892419,0.811594,0.811767
5,Gradient Boosting,1.0,1.0,1.0,1.0
6,AdaBoost,0.956522,0.959645,0.956522,0.956972
7,Bagging,1.0,1.0,1.0,1.0
8,K-Nearest Neighbors,0.985507,0.986473,0.985507,0.985634


In [27]:
knn = KNeighborsClassifier()
knn.fit(X_train_final,y_train)
y_pred = knn.predict(X_test_final)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

      Adelie       1.00      0.97      0.98        30
   Chinstrap       0.93      1.00      0.97        14
      Gentoo       1.00      1.00      1.00        25

    accuracy                           0.99        69
   macro avg       0.98      0.99      0.98        69
weighted avg       0.99      0.99      0.99        69

