In [27]:
#Loading the packages
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
import time

In [26]:
#loading the dataset downloaded from the assignment-data-generator.R file given in the same week-12 folder in the github
data = pd.read_csv('dfdata_10M.csv')
data.head()

Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,outcome
0,5,88,58,41,144,20.4,0.466,25,0
1,7,88,72,22,92,38.7,0.378,24,0
2,1,101,84,29,140,35.8,0.401,21,0
3,2,120,64,31,176,23.8,0.245,22,0
4,1,83,74,16,56,32.9,0.687,23,0


In [16]:
#defining the x(predictors) and y(outcome)
X = data.drop(columns='outcome').values  # all feature columns
y = data['outcome'].values               # target column

In [17]:
#shuffle the entire dataset first to ensure random sampling
from sklearn.utils import shuffle
X, y = shuffle(X, y, random_state=42)
#subsets of required sizes
X_1k, y_1k = X[:1000], y[:1000]
X_10k, y_10k = X[:10000], y[:10000]
X_100k, y_100k = X[:100000], y[:100000]

In [25]:
#building the neural network models with 1 and 2 layers for each of the dataset
def build_model(input_dim, hidden_layers=1, nodes=4):
    model = Sequential()
    model.add(Dense(nodes, activation='relu', input_dim=input_dim))# first hidden layer (for both models with 1 and 2 hidden layers)
    if hidden_layers == 2:
        model.add(Dense(nodes, activation='relu'))# second hidden layer for model with 2 hidden layers
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [21]:
#training and evaluating the model with 8:2 train and test along with 20 epochs 
def train_and_evaluate(X, y, hidden_layers):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = build_model(X.shape[1], hidden_layers)
    start_time = time.time()
    history = model.fit(X_train, y_train, epochs=20, batch_size=32, 
                        validation_data=(X_test, y_test), verbose=0)
    end_time = time.time()
    train_loss = history.history['loss'][-1]
    test_loss = history.history['val_loss'][-1]
    exec_time = round(end_time - start_time, 2)
    return train_loss, test_loss, exec_time

In [22]:
#loading the results into a list in a required format
results = []
datasets = [
    (X_1k, y_1k, '1000'),
    (X_10k, y_10k, '10000'),
    (X_100k, y_100k, '100000')
]
for X_sub, y_sub, label in datasets:
    for layers in [1, 2]:
        print(f"Training on data size {label} with {layers} hidden layer(s)...")
        train_loss, test_loss, exec_time = train_and_evaluate(X_sub, y_sub, layers)
        results.append({
            "Data size": label,
            "Configuration": f"{layers} hidden layer(s), 4 nodes each",
            "Training error": round(train_loss, 4),
            "Validation error": round(test_loss, 4),
            "Time of execution (s)": exec_time
        })

Training on data size 1000 with 1 hidden layer(s)...
Training on data size 1000 with 2 hidden layer(s)...
Training on data size 10000 with 1 hidden layer(s)...
Training on data size 10000 with 2 hidden layer(s)...
Training on data size 100000 with 1 hidden layer(s)...
Training on data size 100000 with 2 hidden layer(s)...


In [23]:
#results dataframe containing the datasize, configuration, training error, validation error, and time of execution
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Data size,Configuration,Training error,Validation error,Time of execution (s)
0,1000,"1 hidden layer(s), 4 nodes each",0.5531,0.7136,8.06
1,1000,"2 hidden layer(s), 4 nodes each",0.6049,0.608,2.73
2,10000,"1 hidden layer(s), 4 nodes each",0.3269,0.3206,5.72
3,10000,"2 hidden layer(s), 4 nodes each",0.538,0.5266,7.87
4,100000,"1 hidden layer(s), 4 nodes each",0.0696,0.081,51.89
5,100000,"2 hidden layer(s), 4 nodes each",0.0758,0.083,56.26
