# Penguins Classifier


In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn graphviz

In [None]:
import pandas as pd
from prepare_training_data import prepare_splits, train_all_models, evaluate_all
from visualize import visualize_all, plot_class_distributions, plot_original_distribution, analyze_accuracy_vs_depth

In [None]:
# Location of the penguins CSV file; a relative path, so it is resolved
# against the notebook's current working directory when the file is read.
dataset_path = "Data/penguinsData.csv"

In [None]:
# Load the raw table, then separate the target column from the predictors.
penguins_df = pd.read_csv(dataset_path)
# Target: the "Species" column, kept as a Series.
label = penguins_df["Species"]
# Predictors: every remaining column (penguins_df itself is left untouched).
feature = penguins_df.drop(columns="Species")

In [None]:
# Chart the label column of the full, unsplit dataset. The second argument is
# presumably the chart title — confirm against the helper in visualize.py.
plot_original_distribution(label, "Original Penguins Dataset")

In [None]:
# Proportion pairs to experiment with; each pair sums to 1.0
# (presumably train fraction first, test fraction second — confirm in
# prepare_training_data.py).
splits = [
    (0.4, 0.6),
    (0.6, 0.4),
    (0.8, 0.2),
    (0.9, 0.1),
]
# Prepare the data for every proportion pair listed above.
datasets = prepare_splits(feature, label, splits)

In [None]:
# Show charts of the datasets after they have been prepared, together with
# the proportion pairs they were built from.
# NOTE(review): "Study Name" looks like a leftover placeholder for the chart
# title — confirm the intended text.
plot_class_distributions(datasets, splits, "Study Name")


In [None]:
# Train the models on the prepared datasets.
clfs = train_all_models(datasets)
# Visualize the trees of the trained models (clfs).
# The display names are listed in sorted label order (Adelie, Chinstrap,
# Gentoo): scikit-learn orders a classifier's classes by sorted label value
# and matches class names to them by position, so any other order would
# attach the wrong species name to tree nodes.
# NOTE(review): assumes visualize_all forwards class_names to scikit-learn
# unchanged and that labels are not re-encoded in another order — confirm in
# visualize.py / prepare_training_data.py.
visualize_all(
    clfs,
    feature_names=feature.columns.tolist(),
    class_names=[
        "Adelie Penguin (Pygoscelis adeliae)",
        "Chinstrap penguin (Pygoscelis antarctica)",
        "Gentoo penguin (Pygoscelis papua)",
    ],
)

In [None]:
# Evaluate every trained model against its prepared dataset.
# The display names are listed in sorted label order (Adelie, Chinstrap,
# Gentoo): scikit-learn reports classes in sorted label order and matches
# display names to them by position, so any other order would mislabel the
# per-class results.
# NOTE(review): assumes evaluate_all forwards class_names to scikit-learn
# unchanged — confirm in prepare_training_data.py.
evaluate_all(
    clfs,
    datasets,
    class_names=[
        "Adelie Penguin (Pygoscelis adeliae)",
        "Chinstrap penguin (Pygoscelis antarctica)",
        "Gentoo penguin (Pygoscelis papua)",
    ],
)


In [None]:
# Study accuracy against tree depth on a single prepared dataset.
# datasets[2] is the third entry returned by prepare_splits; presumably it
# corresponds to the third proportion pair, (0.8, 0.2) — confirm that
# prepare_splits preserves the order of `splits`.
# The display names are listed in sorted label order (Adelie, Chinstrap,
# Gentoo): scikit-learn orders classes by sorted label value and matches
# display names to them by position, so any other order would mislabel them.
# NOTE(review): assumes analyze_accuracy_vs_depth forwards class_names to
# scikit-learn unchanged — confirm in visualize.py.
results_df = analyze_accuracy_vs_depth(
    dataset=datasets[2],
    feature_names=feature.columns.tolist(),
    class_names=[
        "Adelie Penguin (Pygoscelis adeliae)",
        "Chinstrap penguin (Pygoscelis antarctica)",
        "Gentoo penguin (Pygoscelis papua)",
    ],
)

# Last expression of the cell: the notebook renders the returned results.
results_df