In [93]:
import os
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight

np.random.seed(42)
tf.random.set_seed(42)

DATA_PATH = os.path.join(os.getcwd(), 'data', 'breast-cancer-wisconsin.data')

In [94]:
names = [
    'id', 
    'clump_thickness', 
    'cell_size_uniformity', 
    'cell_shape_uniformity',
    'marginal_adhesion',
    'epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class'
]

df = pd.read_csv(DATA_PATH, names=names)

# remove NaN values
df = df.replace('?', np.NaN)
df.dropna(inplace=True)

# remove non-feature col
df.drop(columns=['id'], inplace=True)

# convert labels to M=1 and B=0
df['class'] = (df['class'] == 4).astype(int)

df.head()

Unnamed: 0,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


In [95]:
# spilt features and labels
X = df.drop(columns=['class'])
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [96]:
# scale features between 0 and 1
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [97]:
# compute class weights (data is unbalanced)
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = {
    0: class_weights[0],
    1: class_weights[1],
}
class_weights

{0: 0.7479452054794521, 1: 1.5082872928176796}

In [112]:
# model architecture
model = keras.models.Sequential([
    keras.layers.Input(shape=[9,]),
    keras.layers.Dense(16, activation='selu'),
    keras.layers.Dense(8, activation='selu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

early_stop = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [113]:
model.fit(X_train, y_train, epochs=50, 
          validation_data=(X_test, y_test), 
          class_weight=class_weights,
          callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x153629950>