# Penguin Classification

## By: Tahsin Jahin Khalid

### Import Modules

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
import pickle

### Load Preprocessed Data

In [4]:
# Read the cleaned penguin dataset; the first row of the CSV holds the column names.
data_penguins = pd.read_csv(
    "data/penguins_cleaned.csv",
    encoding="utf-8",
    header=0,
)

In [5]:
# Preview the first five rows of the freshly loaded frame.
data_penguins.head(n=5)

Unnamed: 0.1,Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,1,Adelie,Torgersen,39.1,18.7,181,3750,male
1,2,Adelie,Torgersen,39.5,17.4,186,3800,female
2,3,Adelie,Torgersen,40.3,18.0,195,3250,female
3,5,Adelie,Torgersen,36.7,19.3,193,3450,female
4,6,Adelie,Torgersen,39.3,20.6,190,3650,male


In [6]:
# Remove the stray "Unnamed: 0" column. The result is rebound instead of using
# inplace=True, and a missing column is ignored, so this cell can be re-run
# without raising KeyError once the column is already gone.
data_penguins = data_penguins.drop(columns="Unnamed: 0", errors="ignore")

In [7]:
# Preview the first five rows again to check the remaining columns.
data_penguins.head(n=5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,male
1,Adelie,Torgersen,39.5,17.4,186,3800,female
2,Adelie,Torgersen,40.3,18.0,195,3250,female
3,Adelie,Torgersen,36.7,19.3,193,3450,female
4,Adelie,Torgersen,39.3,20.6,190,3650,male


In [8]:
# Show each column's dtype to see which ones are non-numeric.
data_penguins.dtypes

species               object
island                object
culmen_length_mm     float64
culmen_depth_mm      float64
flipper_length_mm      int64
body_mass_g            int64
sex                   object
dtype: object

**Note:** <span style="color: red;">Species</span> is the target variable.

We have two categorical feature variables here: `island` and `sex`. These must be one-hot encoded into numeric indicator (dummy) columns before training.

In [9]:
# Column roles: the label to predict and the categorical features to encode.
target = "species"
to_encode = ["sex", "island"]

# Work on a deep copy so the loaded frame itself is left untouched.
penguins_df = data_penguins.copy(deep=True)

In [10]:
# One-hot encode every column listed in `to_encode` in a single call.
# `columns=` drops each source column and appends its indicator columns
# (prefixed with the column name) in list order, which is the same layout the
# manual concat/del loop produced. Building from `data_penguins`, which is never
# encoded itself, means re-running this cell rebuilds the frame instead of
# failing on the already-removed columns.
penguins_df = pd.get_dummies(data_penguins, columns=to_encode)


In [11]:
# Preview the first five rows of the encoded frame.
penguins_df.head(n=5)

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
0,Adelie,39.1,18.7,181,3750,False,True,False,False,True
1,Adelie,39.5,17.4,186,3800,True,False,False,False,True
2,Adelie,40.3,18.0,195,3250,True,False,False,False,True
3,Adelie,36.7,19.3,193,3450,True,False,False,False,True
4,Adelie,39.3,20.6,190,3650,False,True,False,False,True


In [12]:
# Species name -> integer class code (Adelie=0, Chinstrap=1, Gentoo=2);
# the code is simply the position of the name in the tuple below.
target_mapper = {
    species_name: class_code
    for class_code, species_name in enumerate(("Adelie", "Chinstrap", "Gentoo"))
}

In [13]:
def target_encode(val):
    """Return the integer class code for a species name.

    Looks ``val`` up in the module-level ``target_mapper`` dict; a value that
    is not a key of that dict raises ``KeyError``.
    """
    species_code = target_mapper[val]
    return species_code

In [14]:
# Encode from the untouched source column rather than from penguins_df itself:
# re-running this cell then re-derives the same integer codes instead of
# raising KeyError on values that were already encoded. Both frames share the
# same row index, so the assignment aligns row for row.
penguins_df["species"] = data_penguins["species"].apply(target_encode)

In [15]:
# Preview the last five rows to check the encoded species column.
penguins_df.tail(n=5)

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
328,2,47.2,13.7,214,4925,True,False,True,False,False
329,2,46.8,14.3,215,4850,True,False,True,False,False
330,2,50.4,15.7,222,5750,False,True,True,False,False
331,2,45.2,14.8,212,5200,True,False,True,False,False
332,2,49.9,16.1,213,5400,False,True,True,False,False


In [16]:
# Split the encoded frame into features (X) and label (y).
# Both lines use `target` so the label column name is defined in one place.
X = penguins_df.drop(columns=target)
y = penguins_df[target]

In [20]:
# Show the feature column names, in the order the classifier is fitted on.
X.columns

Index(['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm',
       'body_mass_g', 'sex_female', 'sex_male', 'island_Biscoe',
       'island_Dream', 'island_Torgersen'],
      dtype='object')

### Classification Model: Random Forest

In [17]:
# Fix the seed: a random forest bootstraps rows and samples features at random,
# so without random_state every run would fit (and later pickle) a different model.
classifier = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)

In [18]:
# Train the forest on the full feature matrix and label vector.
classifier.fit(X, y)

### Saving the <span style='color: green;'>Model</span>

In [19]:
# Serialize the classifier to disk; the file is opened in binary write mode.
with open("penguin_clf.pkl", "wb") as model_file:
    pickle.dump(classifier, model_file)

**Note**: I have intentionally skipped the train-validation split of the dataset while training the model. The goal of this model is to be used in the Web App to showcase my <span style="color: #fff000;">Model Deployment</span> skills.