In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Show the TensorFlow version in use so results can be tied to a release.
print(tf.__version__)

In [None]:
# Read the raw penguin measurements; the literal text 'NA' is parsed as a
# missing value and blanks following a delimiter are ignored.
penguins_raw = pd.read_csv(
    '/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv',
    skipinitialspace=True,
    na_values='NA',
)

# Quick look at the last rows and at the column names that were loaded.
print(penguins_raw.tail())
print("Columns: ", penguins_raw.columns)

In [None]:
# Target column and the four measurement columns used as model inputs.
label = 'sex'
features = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Independent working frame: label first, followed by the feature columns.
dataset = penguins_raw[[label, *features]].copy()
print(dataset)

In [None]:
# Summary statistics of the numeric columns.
print(dataset.describe(include=['number']))

# Count / unique / top / freq summary of the object-typed (categorical) label.
print(dataset.describe(include=[object]))

In [None]:
# Inspect the raw label values and the number of missing entries per column.
print("Unique values of column 'sex': ", dataset.sex.unique())
print("NA values in our dataset:\n", dataset.isna().sum())

# Keep only rows with a valid sex label (this drops NaN and the "." entry),
# then drop any row that still has a missing measurement so the frame really
# is NA-free. The explicit .copy() yields an independent frame, which keeps
# later column assignments from raising pandas' SettingWithCopyWarning.
dataset = (
    dataset.loc[dataset.sex.isin(['FEMALE', 'MALE'])]
    .dropna()
    .copy()
)

# Fail early if nothing survived, e.g. when this cell is re-run after the
# label column has already been converted to 0/1.
assert not dataset.empty, "no rows left after cleaning - re-run from the data loading cell"
print(dataset.tail())

# Check NA values again
print("NA values in our dataset (there should be none):\n", dataset.isna().sum())

In [None]:
# Convert the label from string to 0/1 with an explicit mapping
# (0 = MALE, 1 = FEMALE).
# pd.factorize assigns codes by order of first appearance, so the encoding
# would silently flip whenever the first row of the frame is FEMALE.
sex_codes = {'MALE': 0, 'FEMALE': 1}

# Values that are not keys of the mapping (e.g. 0/1 when this cell is re-run)
# pass through unchanged; non-numeric leftovers make the integer cast fail
# instead of being encoded silently.
encoded_sex = dataset['sex'].map(lambda value: sex_codes.get(value, value))

# assign() builds a new frame rather than writing into a filtered one.
dataset = dataset.assign(sex=encoded_sex.astype('int64'))
dataset.tail()

In [None]:
# Basic exploratory analysis: one boxplot per feature, grouped by sex.
fig, axes = plt.subplots(2, 2)

# Loop over the feature list instead of repeating the call for every column.
# axes.flat walks the 2x2 grid row by row, so the panels appear in the same
# order as `features`; zip() stops after four panels if more features exist.
for feature, ax in zip(features, axes.flat):
    sns.boxplot(x=dataset['sex'], y=dataset[feature], ax=ax)

In [None]:
# Draw 70% of the rows (fixed seed) for training; every row whose index label
# was not drawn forms the test set.
train_dataset = dataset.sample(frac=0.7, random_state=42)
test_dataset = dataset[~dataset.index.isin(train_dataset.index)]

(train_dataset.shape, test_dataset.shape)

In [None]:
# Split each partition into its target column and its predictor columns,
# leaving train_dataset / test_dataset themselves untouched.
train_labels = train_dataset[label].copy()
test_labels = test_dataset[label].copy()

train_features = train_dataset.drop(columns=[label])
test_features = test_dataset.drop(columns=[label])

(train_features.shape, test_features.shape, train_labels.shape, test_labels.shape)

In [None]:
# Normalization layer whose statistics are learned from the training
# features only.
normalizer = preprocessing.Normalization()
normalizer.adapt(train_features.to_numpy())

# Sanity check: overall mean and standard deviation of the normalized
# training features (expected to be close to 0 and 1).
normalized = np.array(normalizer(train_features))
(normalized.mean(), normalized.std())

In [None]:
# Fix TensorFlow's global seed so that weight initialisation, and with it the
# training curves and test scores, is repeatable on a fresh kernel run as far
# as the TensorFlow backend allows.
tf.random.set_seed(42)

# Model definition: normalized inputs -> one hidden ReLU layer with 32 units
# -> a single sigmoid unit whose output is the probability of class 1.
dnn_model = keras.Sequential([
    normalizer,
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked as an additional metric for every epoch.
dnn_model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer overview of the model.
dnn_model.summary()

In [None]:
%%time
# The %%time cell magic above must remain the first line of the cell.
# Train for 200 epochs, holding out 20% of the training rows for validation.
# verbose=0 silences the per-epoch log; the value returned by fit() is kept in
# `history` for plotting.
history = dnn_model.fit(
    train_features, train_labels,
    validation_split=0.2,
    verbose=0,
    epochs=200)

In [None]:
def plot_loss(history):
  """Plot training and validation accuracy per epoch.

  Despite its name, this helper draws the accuracy curves, not the loss, so
  the y-axis is labelled accordingly.

  Args:
    history: object with a ``history`` mapping that contains the per-epoch
      lists under the keys ``'accuracy'`` and ``'val_accuracy'`` (such as
      the value returned by ``fit`` with an accuracy metric and validation
      data).

  Raises:
    KeyError: if either of the two keys is missing from ``history.history``.
  """
  plt.plot(history.history['accuracy'], label='accuracy')
  plt.plot(history.history['val_accuracy'], label='val_accuracy')
  # Accuracy is a fraction, so pin the axis to the full [0, 1] range.
  plt.ylim([0, 1])
  plt.xlabel('Epoch')
  # The plotted curves are accuracies, not cross-entropy values.
  plt.ylabel('Accuracy')
  plt.legend()
  plt.grid(True)

# Draw the training and validation accuracy curves recorded in `history`.
plot_loss(history)

In [None]:
# Score the trained network on the held-out test rows; results are stored
# per model name so further models can be added to the same dict.
test_evaluation = {
    'dnn_model': dnn_model.evaluate(test_features, test_labels, verbose=0),
}
test_evaluation