In [1]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# jupyter notebook configurations
%matplotlib inline
sns.set_style(style='whitegrid')  # seaborn theme applied to every figure below
# Cap DataFrame display at 1000 rows / 1000 columns, which is large enough to
# show this dataset in full. These are the only display limits set here, so
# the values below are the ones actually in effect.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

In [3]:
# supress warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
# read the penguins CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="penguins_cleaned.csv")

In [5]:
# first five rows of the dataset
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,male
1,Adelie,Torgersen,39.5,17.4,186,3800,female
2,Adelie,Torgersen,40.3,18.0,195,3250,female
3,Adelie,Torgersen,36.7,19.3,193,3450,female
4,Adelie,Torgersen,39.3,20.6,190,3650,male


In [6]:
# last five rows of the dataset
df.tail(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
328,Chinstrap,Dream,55.8,19.8,207,4000,male
329,Chinstrap,Dream,43.5,18.1,202,3400,female
330,Chinstrap,Dream,49.6,18.2,193,3775,male
331,Chinstrap,Dream,50.8,19.0,210,4100,male
332,Chinstrap,Dream,50.2,18.7,198,3775,female


In [7]:
# features: column labels of the loaded frame
df.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

- we will treat `species` as our target variable

In [8]:
# dataset dimensions: df.shape is (number of rows, number of columns)
for label, size in zip(("# rows:", "# cols:"), df.shape):
    print(label, size)

# rows: 333
# cols: 7


In [9]:
# metadata - dataframe makeup and structure
# (index range, per-column non-null counts and dtypes, memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    int64  
 5   body_mass_g        333 non-null    int64  
 6   sex                333 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 18.3+ KB


- dataset has both categorical and numerical features
- dataset does not have missing values

In [10]:
# number of missing values in each column
df.isna().sum(axis=0)

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [11]:
# summary statistics of the numerical features (count, mean, std, min, quartiles, max)
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,333.0,333.0,333.0,333.0
mean,43.992793,17.164865,200.966967,4207.057057
std,5.468668,1.969235,14.015765,805.215802
min,32.1,13.1,172.0,2700.0
25%,39.5,15.6,190.0,3550.0
50%,44.5,17.3,197.0,4050.0
75%,48.6,18.7,213.0,4775.0
max,59.6,21.5,231.0,6300.0


In [12]:
# category frequencies for each categorical feature
for feature in ('sex', 'species', 'island'):
    print("Feature:", feature)
    # counts and the separator are emitted on consecutive lines
    print(df[feature].value_counts(), "- - -", sep="\n")

Feature: sex
male      168
female    165
Name: sex, dtype: int64
- - -
Feature: species
Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64
- - -
Feature: island
Biscoe       163
Dream        123
Torgersen     47
Name: island, dtype: int64
- - -


In [13]:
# One-hot encode the categorical predictors 'sex' and 'island'; get_dummies
# replaces each listed column with one indicator column per category and
# leaves the remaining columns (including 'species') unchanged.
# dtype=int pins the indicators to 0/1 integers: the default indicator dtype
# differs between pandas versions, and mixing bool indicators with the float
# measurement columns can yield an object-dtype array in a later np.array(...).
df = pd.get_dummies(data=df, columns=['sex', 'island'], dtype=int)

In [14]:
# encode 'species' (the target) as integer class labels

# integer label assigned to each species name
target_mapper = {
    "Adelie": 0,
    "Gentoo": 1,
    "Chinstrap": 2,
}

# Series.map yields NaN for any value that is not a key of the mapper (an
# unexpected species name, or this cell being run a second time on labels that
# are already encoded). Validate before overwriting the column so the frame is
# never left holding NaN targets.
species_encoded = df['species'].map(target_mapper)
assert species_encoded.notna().all(), "unmapped value(s) found in 'species'"
df['species'] = species_encoded

In [15]:
# first five rows after encoding
df.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_female,sex_male,island_Biscoe,island_Dream,island_Torgersen
0,0,39.1,18.7,181,3750,0,1,0,0,1
1,0,39.5,17.4,186,3800,1,0,0,0,1
2,0,40.3,18.0,195,3250,1,0,0,0,1
3,0,36.7,19.3,193,3450,1,0,0,0,1
4,0,39.3,20.6,190,3650,0,1,0,0,1


In [16]:
# class balance of the encoded target
species_counts = df['species'].value_counts()
species_counts

0    146
1    119
2     68
Name: species, dtype: int64

In [17]:
# target vector (y) and feature matrix (X) as NumPy arrays:
# y holds the 'species' column, X holds every other column
y = np.array(df['species'])
X = np.array(df.drop(columns=['species']))

In [18]:
# inspect the feature matrix (every column of df except 'species')
X

array([[ 39.1,  18.7, 181. , ...,   0. ,   0. ,   1. ],
       [ 39.5,  17.4, 186. , ...,   0. ,   0. ,   1. ],
       [ 40.3,  18. , 195. , ...,   0. ,   0. ,   1. ],
       ...,
       [ 49.6,  18.2, 193. , ...,   0. ,   1. ,   0. ],
       [ 50.8,  19. , 210. , ...,   0. ,   1. ,   0. ],
       [ 50.2,  18.7, 198. , ...,   0. ,   1. ,   0. ]])

In [19]:
# inspect the target vector (the 'species' column of df)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [20]:
# modelling: fit a classifier on the full dataset (no train/test split is made in the cells below)

In [21]:
# random forest classifier
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the randomised parts of training (bootstrap
# sampling, feature sub-sampling) repeatable, so re-running the notebook
# produces the same fitted model.
clf = RandomForestClassifier(random_state=42)
clf.fit(X, y)

RandomForestClassifier()

In [22]:
# save the model
import pickle

# The context manager closes the file handle once the dump finishes (or
# fails), so the pickle is fully written to disk and no descriptor is leaked.
with open('penguins_random_forest_clf.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)