In [36]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Input  
from keras.optimizers import Adam

In [30]:
# Load data
dfdata = pd.read_csv("dfdata.csv")

# Ensure outcome is binary numeric
dfdata['outcome'] = dfdata['outcome'].astype('category').cat.codes

X = dfdata.drop(columns=['outcome'])
y = dfdata['outcome']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [31]:
# Function to train model and collect metrics
def run_model(X, y, size, hidden_layers):
    idx = np.random.choice(len(X), size=size, replace=True)
    X_sample = X[idx]
    y_sample = y.iloc[idx]
    
    X_train, X_val, y_train, y_val = train_test_split(X_sample, y_sample, test_size=0.2, random_state=123, stratify=y_sample)
    
    model = Sequential()
    model.add(Input(shape=(X.shape[1],)))  # Define input shape using the Input layer
    
    model.add(Dense(4, activation='relu'))
    
    if hidden_layers == 2:
        model.add(Dense(4, activation='relu'))
    
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

    start_time = time.time()
    history = model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=0, validation_data=(X_val, y_val))
    end_time = time.time()
    
    training_error = history.history['loss'][-1]
    validation_error = history.history['val_loss'][-1]
    elapsed_time = end_time - start_time
    
    return {
        'Data size': size,
        'Configuration': '1 hidden layer 4 nodes' if hidden_layers == 1 else '2 hidden layers of 4 nodes each',
        'Training error': round(training_error, 4),
        'Validation error': round(validation_error, 4),
        'Time of execution': round(elapsed_time, 2)
    }

# Run for specified configurations
sizes = [1000, 10000, 100000]
results = []

for sz in sizes:
    results.append(run_model(X_scaled, y, sz, hidden_layers=1))
for sz in sizes:
    results.append(run_model(X_scaled, y, sz, hidden_layers=2))

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Data size,Configuration,Training error,Validation error,Time of execution
0,1000,1 hidden layer 4 nodes,0.4426,0.4545,6.53
1,10000,1 hidden layer 4 nodes,0.0294,0.025,16.46
2,100000,1 hidden layer 4 nodes,0.007,0.0069,115.71
3,1000,2 hidden layers of 4 nodes each,0.1798,0.2111,6.25
4,10000,2 hidden layers of 4 nodes each,0.0212,0.0165,16.53
5,100000,2 hidden layers of 4 nodes each,0.0045,0.0044,116.29
