In [116]:
# Imports
import joblib
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Load Data
# Encode sex numerically (MALE -> 1, FEMALE -> 0); any other value becomes NaN
# and is removed by the dropna() below. Only the target and the three features
# used for training are kept, selected explicitly so that the feature order
# (culmen length, culmen depth, sex) is visible here rather than implied by a
# long list of dropped columns.
penguins = (
    pd.read_csv("data/penguins.csv")
    .assign(Sex=lambda frame: frame['Sex'].map({'MALE': 1, 'FEMALE': 0}))
    .loc[:, ['Species', 'Culmen Length (mm)', 'Culmen Depth (mm)', 'Sex']]
    .dropna()  # Get rid of rows w/ NaN
)

# Process Data - create input & output sets.
X = penguins.drop(columns=['Species'])  # input
y = penguins['Species']  # output

# Use 20% of data for testing. The fixed random_state makes the split (and
# therefore the reported accuracy) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

In [112]:
# Build a model.
# random_state is fixed because the tree learner shuffles candidate features at
# every split; without a seed, two runs on the same data can produce different
# trees and different accuracy values.
model = DecisionTreeClassifier(random_state=42)

# Train the model.
model.fit(X_train, y_train)

# Test model accuracy on the held-out test split.
prediction = model.predict(X_test)
accuracy = accuracy_score(y_test, prediction)
accuracy

0.9701492537313433

In [117]:
# Persist the trained classifier to disk so it can be reloaded later without
# retraining. The call's return value (the list of written file names) is the
# cell's displayed output.
joblib.dump(value=model, filename='penguin-species-identifier.joblib')

['penguin-species-identifier.joblib']

In [118]:
# Use model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
model = joblib.load('penguin-species-identifier.joblib')

# Enter Culmen Length, Culmen Depth and Sex (1 = MALE, 0 = FEMALE).
# The sample is wrapped in a DataFrame with named columns so the input is
# matched against the model's features by name instead of relying on the
# positional order of a bare list.
sample = pd.DataFrame(
    [[15, 11, 1]],
    columns=['Culmen Length (mm)', 'Culmen Depth (mm)', 'Sex'],
)
prediction = model.predict(sample)
prediction

array(['Gentoo penguin (Pygoscelis papua)'], dtype=object)

In [120]:
# Visualize model in a decision tree.
# Writes a Graphviz .dot description of the fitted tree to disk.
tree.export_graphviz(
    model,
    out_file='penguin-species-identifier.dot',
    feature_names=['Culmen Length (mm)', 'Culmen Depth (mm)', 'Sex'],
    # Take the class labels from the fitted model itself: this is the exact
    # label order the tree uses internally, and it does not depend on `y`
    # still being defined from an earlier cell.
    class_names=[str(label) for label in model.classes_],
    label='all',
    rounded=True,  # Design stuff...
    filled=True,
)