In [20]:
import itertools

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout

# Pin TensorFlow's global random seed so that seeded TF operations are
# repeatable from one run of the notebook to the next.
tf.random.set_seed(123)

# Read the already-transformed dataset from disk.
tf_df = pd.read_parquet('data/transformed.parquet')

# Preview the first ten rows, transposed so each column of the frame is
# shown as a row.
tf_df.iloc[:10].transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
STATUS,1,1,1,1,1,1,1,1,1,1
SPECIAL_CONSIDERATIONS,0,0,0,0,0,0,0,0,0,0
IS_SUCCESSFUL,1,1,0,1,1,1,1,1,1,0
AFFILIATION_CompanySponsored,0,0,1,1,0,0,0,0,0,1
AFFILIATION_Family/Parent,0,0,0,0,0,0,0,0,0,0
AFFILIATION_Independent,1,1,0,0,1,1,1,1,1,0
AFFILIATION_National,0,0,0,0,0,0,0,0,0,0
AFFILIATION_Other,0,0,0,0,0,0,0,0,0,0
AFFILIATION_Regional,0,0,0,0,0,0,0,0,0,0
APPLICATION_TYPE_T10,1,0,0,0,0,0,0,0,0,0


In [21]:
# Separate the label from the predictors: `y` holds the IS_SUCCESSFUL column,
# `X` holds every other column, both as plain arrays via `.values`.
is_feature_column = tf_df.columns != 'IS_SUCCESSFUL'

y = tf_df['IS_SUCCESSFUL'].values
X = tf_df.loc[:, is_feature_column].values


In [23]:

# Hyperparameter grid to search over.
# NOTE(review): the grid previously listed 0.01 twice, which made a third of
# the configurations redundant. A log-spaced sweep is presumably what was
# intended -- confirm that 0.0001 is the value that was meant.
learning_rates = [0.0001, 0.001, 0.01]
batch_sizes = [32, 64, 128]
layer_configs = [(32, 16), (64, 32), (128, 64)]  # (units in hidden layer 1, units in hidden layer 2)
activation_functions = ['relu', 'tanh', 'sigmoid']

best_accuracy = 0
best_hyperparameters = {}
# One record per evaluated configuration, so partial results survive an
# interrupted run and can be inspected with pd.DataFrame(search_results).
search_results = []

# Every hyperparameter combination is scored with stratified 5-fold
# cross-validation; the fixed random_state keeps the folds identical across
# configurations so their scores are comparable.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Take the input width from the array that is actually fed to the network.
n_features = X.shape[1]
iteration = 0
total_iterations = len(learning_rates) * len(batch_sizes) * len(layer_configs) * len(activation_functions)


def _build_model(layer_config, activation_func, learning_rate):
    """Build and compile a binary classifier with two hidden layers.

    Args:
        layer_config: pair ``(units_1, units_2)`` giving the width of the two
            hidden Dense layers.
        activation_func: activation used by both hidden layers.
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` Sequential model with a single sigmoid output
        unit, binary cross-entropy loss and an accuracy metric.
    """
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(n_features,)),
        Dense(units=layer_config[0], activation=activation_func),
        Dropout(0.5),
        Dense(units=layer_config[1], activation=activation_func),
        Dense(units=1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model


def _cross_validate(learning_rate, batch_size, layer_config, activation_func):
    """Return the mean validation accuracy of one configuration over all folds.

    Reads the module-level ``X``, ``y`` and ``kf``. A fresh model is trained
    on each fold for up to 100 epochs with early stopping on ``val_loss``.
    """
    val_accuracies = []

    for train_idx, val_idx in kf.split(X, y):
        # Drop the global Keras state left behind by the previous model;
        # without this, hundreds of models accumulate over the search.
        tf.keras.backend.clear_session()

        nn = _build_model(layer_config, activation_func, learning_rate)

        X_fold, X_val_fold = X[train_idx], X[val_idx]
        y_fold, y_val_fold = y[train_idx], y[val_idx]

        # restore_best_weights=True makes the evaluation below score the best
        # epoch rather than the weights from `patience` epochs after it.
        # NOTE(review): the same fold drives early stopping and the reported
        # score, so the accuracies are somewhat optimistic.
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        nn.fit(
            X_fold,
            y_fold,
            epochs=100,
            batch_size=batch_size,
            verbose=0,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping],
        )

        _, val_accuracy = nn.evaluate(X_val_fold, y_val_fold, verbose=0)
        val_accuracies.append(val_accuracy)

    return np.mean(val_accuracies)


# itertools.product varies the right-most list fastest, i.e. it visits the
# combinations in the same order as four nested for-loops would.
hyperparameter_grid = itertools.product(learning_rates, batch_sizes, layer_configs, activation_functions)

for iteration, (learning_rate, batch_size, layer_config, activation_func) in enumerate(hyperparameter_grid, start=1):
    print(f"Iteration {iteration}/{total_iterations}: Learning Rate = {learning_rate}, Batch Size = {batch_size}, Layer Config = {layer_config}, Activation = {activation_func}")

    avg_val_accuracy = _cross_validate(learning_rate, batch_size, layer_config, activation_func)

    hyperparameters = {'learning_rate': learning_rate, 'batch_size': batch_size, 'layer_config': layer_config, 'activation_func': activation_func}
    search_results.append({**hyperparameters, 'avg_val_accuracy': avg_val_accuracy})

    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        best_hyperparameters = hyperparameters

print("Best Hyperparameters:", best_hyperparameters)
print("Best Validation Accuracy:", best_accuracy)


Iteration 1/81: Learning Rate = 0.0001, Batch Size = 48, Layer Config = (32, 16), Activation = relu


KeyboardInterrupt: 