In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

  import pandas.util.testing as tm


In [2]:
# Load the dataset;
penguin = pd.read_csv('penguins_size.csv')
penguin.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
print('The dataset has ' + str(len(penguin)) + ' rows')

The dataset has 344 rows


In [4]:
# Drop missing values
penguin.dropna(axis=0, inplace=True)

In [5]:
penguin['culmen_length_mm'] = penguin['culmen_length_mm'].astype('float32')
penguin['culmen_depth_mm'] = penguin['culmen_depth_mm'].astype('float32')
penguin['flipper_length_mm'] = penguin['flipper_length_mm'].astype('float32')
penguin['body_mass_g'] = penguin['body_mass_g'].astype('float32')

In [6]:
penguin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            334 non-null    object 
 1   island             334 non-null    object 
 2   culmen_length_mm   334 non-null    float32
 3   culmen_depth_mm    334 non-null    float32
 4   flipper_length_mm  334 non-null    float32
 5   body_mass_g        334 non-null    float32
 6   sex                334 non-null    object 
dtypes: float32(4), object(3)
memory usage: 15.7+ KB


In [7]:
# EDA with pandas-profiling
profile = ProfileReport(penguin, title='Pandas Profiling Report')
profile

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [8]:
# Drop row with invalid value:
penguin[penguin['sex']=='.'].index
penguin.drop(index=336, inplace=True)

In [9]:
# Save cleaned dataset to a .csv file:
penguin.to_csv('penguins_cleaned.csv', index=False)

In [10]:
# Feature encoding:
df = penguin.copy()
target = 'species'
encode = ['sex','island']

for col in encode:
    dummy = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df,dummy], axis=1)
    del df[col]

In [11]:
df.head()

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_FEMALE,sex_MALE,island_Biscoe,island_Dream,island_Torgersen
0,Adelie,39.099998,18.700001,181.0,3750.0,0,1,0,0,1
1,Adelie,39.5,17.4,186.0,3800.0,1,0,0,0,1
2,Adelie,40.299999,18.0,195.0,3250.0,1,0,0,0,1
4,Adelie,36.700001,19.299999,193.0,3450.0,1,0,0,0,1
5,Adelie,39.299999,20.6,190.0,3650.0,0,1,0,0,1


In [12]:
# Encode target variable:
target_mapper = {'Adelie':0, 'Chinstrap':1, 'Gentoo':2}
def target_encode(val):
    return target_mapper[val]

df['species'] = df['species'].apply(target_encode)

# Separating X and y
X = df.drop('species', axis=1)
y = df['species']

In [13]:
# Standard scaler
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

In [14]:
# Train-test-split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(233, 9) (233,)
(100, 9) (100,)


In [15]:
# Build random forest model
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Saving the model
pickle.dump(clf, open('penguins_clf.pkl', 'wb'))

Accuracy: 1.0


In [16]:
# Build KNN model:

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Saving the model
pickle.dump(knn, open('penguins_knn.pkl', 'wb'))

Accuracy: 0.73
