In [44]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt


#Pull data from Kaggle
path = kagglehub.dataset_download("parulpandey/palmer-archipelago-antarctica-penguin-data")

Using Colab cache for faster access to the 'palmer-archipelago-antarctica-penguin-data' dataset.


In [45]:
#Data Loading
penguins = '/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv'
dataIn = pd.read_csv(penguins)
filteredData = dataIn[dataIn['species'] != 'Chinstrap'].dropna()

#Class Labels
filteredData['species_label'] = filteredData['species'].astype('category').cat.codes
filteredData['sex_label'] = filteredData['sex'].astype('category').cat.codes
filteredData['island_label'] = filteredData['island'].astype('category').cat.codes

#Test Train Split
x = filteredData[['island_label','culmen_length_mm', 'culmen_depth_mm','flipper_length_mm', 'body_mass_g', 'sex_label']]
y = filteredData['species_label']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,shuffle=True,random_state=3)

In [51]:
#Bayes
nb = MultinomialNB()
nb.fit(x_train, y_train)
y_pred_NBTest = nb.predict(x_test)
y_pred_NBTrain = nb.predict(x_train)
y_prob_NBTest = nb.predict_proba(x_test)

#Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train,y_train)
y_pred_LRTest= lr.predict(x_test)
y_pred_LRTrain = lr.predict(x_train)
y_prob_LRTest = lr.predict_proba(x_test)


In [52]:
#Accuracy
NB_train_accuracy = accuracy_score(y_train,y_pred_NBTrain)
NB_test_accuracy = accuracy_score(y_test,y_pred_NBTest)
LR_test_accuracy = lr.score(x_test,y_test)
LR_train_accuracy = lr.score(x_train, y_train)
print("*************** Accuracy ***************")
print(f"Naive Bayes Train Accuracy: {NB_train_accuracy}")
print(f"Naive Bayes Test Accuracy: {NB_test_accuracy}")
print(f"Linear Regression Train Accuracy: {LR_train_accuracy}")
print(f"Linear Regression Test Accuracy: {LR_test_accuracy}")
print()


*************** Accuracy ***************
Naive Bayes Train Accuracy: 0.9528301886792453
Naive Bayes Test Accuracy: 0.9259259259259259
Linear Regression Train Accuracy: 1.0
Linear Regression Test Accuracy: 1.0



In [54]:
#AUC
NB_train_auc = roc_auc_score(y_train,y_pred_NBTrain)
NB_test_auc = roc_auc_score(y_test,y_pred_NBTest)
LR_test_auc = roc_auc_score(y_test,y_pred_LRTest)
LR_train_auc = roc_auc_score(y_train,y_pred_LRTrain)
print("******* Area Under The ROC Curve *******")
print(f"Naive Bayes Train AUC: {NB_train_auc}")
print(f"Naive Bayes Test AUC: {NB_test_auc}")
print(f"Linear Regression Train AUC: {LR_train_auc}")
print(f"Linear Regression Test AUC: {LR_test_auc}")

******* Area Under The ROC Curve *******
Naive Bayes Train AUC: 0.9538750335210513
Naive Bayes Test AUC: 0.9307359307359306
Linear Regression Train AUC: 1.0
Linear Regression Test AUC: 1.0
