# Machine Learning Classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os

df = pd.read_csv('../data/penguins_cleaned.csv')
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,diet,life_stage,health_metrics
0,Adelie,Biscoe,53.4,17.8,219.0,5687.0,female,fish,adult,overweight
1,Adelie,Biscoe,49.3,18.1,245.0,6811.0,female,fish,adult,overweight
2,Adelie,Biscoe,55.7,16.6,226.0,5388.0,female,fish,adult,overweight
3,Adelie,Biscoe,38.0,15.6,221.0,6262.0,female,fish,adult,overweight
4,Adelie,Biscoe,60.7,17.9,177.0,4811.0,female,fish,juvenile,overweight
...,...,...,...,...,...,...,...,...,...,...
3425,Gentoo,Biscoe,44.0,20.4,252.0,6447.0,male,squid,adult,healthy
3426,Gentoo,Biscoe,54.5,25.2,245.0,6872.0,male,squid,adult,healthy
3427,Gentoo,Biscoe,51.4,20.4,258.0,7409.0,male,squid,adult,overweight
3428,Gentoo,Biscoe,55.9,20.5,247.0,6491.0,male,squid,adult,healthy


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3430 entries, 0 to 3429
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            3430 non-null   object 
 1   island             3430 non-null   object 
 2   bill_length_mm     3430 non-null   float64
 3   bill_depth_mm      3430 non-null   float64
 4   flipper_length_mm  3430 non-null   float64
 5   body_mass_g        3430 non-null   float64
 6   sex                3430 non-null   object 
 7   diet               3430 non-null   object 
 8   life_stage         3430 non-null   object 
 9   health_metrics     3430 non-null   object 
dtypes: float64(4), object(6)
memory usage: 268.1+ KB


In [4]:
df = df.astype({col: "category" for col in df.select_dtypes('object').columns})
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3430 entries, 0 to 3429
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   species            3430 non-null   category
 1   island             3430 non-null   category
 2   bill_length_mm     3430 non-null   float64 
 3   bill_depth_mm      3430 non-null   float64 
 4   flipper_length_mm  3430 non-null   float64 
 5   body_mass_g        3430 non-null   float64 
 6   sex                3430 non-null   category
 7   diet               3430 non-null   category
 8   life_stage         3430 non-null   category
 9   health_metrics     3430 non-null   category
dtypes: category(6), float64(4)
memory usage: 128.2 KB


In [5]:
df.shape

(3430, 10)

# Split data into train and test 

In [6]:
from sklearn.model_selection import train_test_split

# X represents the features of the data
# Y represets the targets which in this case will be species

X = df.drop(['species'], axis = 1)
Y = df['species']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

print("Train set: ", X_train.shape, y_train.shape, "/n", "Test set: ", X_test.shape, y_test.shape)


Train set:  (2744, 9) (2744,) /n Test set:  (686, 9) (686,)


# Pipeline construction

Dataset contains numerical features and categorical features. This dataset is relatively small and clean meaning that the numerical features do not need to be scaled in order to improve restuls. 
The categorical features do need to be one hot encoded though so that they are interpretable by the ML model. 
I am going to include a pre-processing step called column transformer which one hot encodes the categorical variables and ignores the numerical values. 

In [7]:
df.columns.unique()

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'diet', 'life_stage',
       'health_metrics'],
      dtype='object')

In [8]:
num_cols = [ ]

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

def pipeline_decision_tree():
    pipeline = Pipeline([
        ("feature_scaling", StandardScaler()), 
     
        ( "model", DecisionTreeClassifier(random_state=1)),
    ])
    return pipeline


Fit the pipeline

In [10]:
#Need to add a categorical encoding step, on kaggle someone manually did it, I wanted to add a onehotencoder to my pipeline

pipeline = pipeline_decision_tree()
pipeline.fit(X_train, y_train)

ValueError: could not convert string to float: 'Biscoe'

future steps:

- Add PCA step
- add cross validation