In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# Load the transformed dataset from its parquet file.
tf_df = pd.read_parquet('data/transformed.parquet')

# Preview the first ten rows, transposed so that each column is shown as a row.
tf_df.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
STATUS,1,1,1,1,1,1,1,1,1,1
SPECIAL_CONSIDERATIONS,0,0,0,0,0,0,0,0,0,0
IS_SUCCESSFUL,1,1,0,1,1,1,1,1,1,0
AFFILIATION_CompanySponsored,0,0,1,1,0,0,0,0,0,1
AFFILIATION_Family/Parent,0,0,0,0,0,0,0,0,0,0
AFFILIATION_Independent,1,1,0,0,1,1,1,1,1,0
AFFILIATION_National,0,0,0,0,0,0,0,0,0,0
AFFILIATION_Other,0,0,0,0,0,0,0,0,0,0
AFFILIATION_Regional,0,0,0,0,0,0,0,0,0,0
APPLICATION_TYPE_T10,1,0,0,0,0,0,0,0,0,0


In [15]:
# Feature matrix: every column except the IS_SUCCESSFUL label.
X = tf_df.drop(columns=['IS_SUCCESSFUL']).values
# Target vector: the IS_SUCCESSFUL label column on its own.
y = tf_df['IS_SUCCESSFUL'].values

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed random_state makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [16]:

# Grid search over learning rate and batch size. Each configuration is scored
# by its mean validation accuracy across stratified 5-fold cross-validation on
# the training split (X_train / y_train); the held-out test split is not used.

# Define a range of hyperparameters to search over
learning_rates = [0.001, 0.01, 0.1]
batch_sizes = [32, 64, 128]

# Seed shared by the fold assignment and the per-fold weight initialisation.
SEED = 1

best_accuracy = 0
best_hyperparameters = {}
# One record per completed configuration, so partial results remain available
# for inspection if the (long-running) search is interrupted.
search_results = []

# k-fold cross-validation, k=5
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Derive the input width from the matrix that is actually fed to the network
# rather than from the DataFrame loaded in another cell.
n_features = X_train.shape[1]
iteration = 0
total_iterations = len(learning_rates) * len(batch_sizes)


def build_model(n_inputs, learning_rate):
    """Build and compile the binary classifier evaluated by the grid search.

    Args:
        n_inputs: Number of feature columns in the input matrix.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model: Dense(32, relu) -> Dropout(0.5) ->
        Dense(16, relu) -> Dense(1, sigmoid), trained with binary
        cross-entropy and reporting accuracy.
    """
    model = tf.keras.models.Sequential([
        # Explicit Input object instead of the deprecated `input_dim=` argument.
        tf.keras.Input(shape=(n_inputs,)),
        Dense(units=32, activation='relu'),
        Dropout(0.5),
        Dense(units=16, activation='relu'),
        Dense(units=1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model


for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        iteration += 1
        print(f"Iteration {iteration}/{total_iterations}: Learning Rate = {learning_rate}, Batch Size = {batch_size}")

        val_accuracies = []

        for fold_number, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
            # Many models are created in this loop; release the global Keras
            # state left behind by the previous one so memory does not grow.
            tf.keras.backend.clear_session()
            # Re-seed per fold so every configuration starts each fold from
            # the same random state, making the comparison repeatable.
            tf.keras.utils.set_random_seed(SEED + fold_number)

            nn = build_model(n_features, learning_rate)

            X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
            y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

            # restore_best_weights=True rolls the model back to the epoch with
            # the lowest val_loss; otherwise the evaluation below would score
            # weights from up to `patience` epochs after that optimum.
            early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

            fit_model = nn.fit(
                X_train_fold,
                y_train_fold,
                epochs=100,
                batch_size=batch_size,
                verbose=0,
                validation_data=(X_val_fold, y_val_fold),
                callbacks=[early_stopping],
            )

            # NOTE(review): the validation fold drives early stopping and is
            # also the fold being scored, so this accuracy may be optimistic.
            _, val_accuracy = nn.evaluate(X_val_fold, y_val_fold, verbose=0)
            val_accuracies.append(val_accuracy)

        avg_val_accuracy = np.mean(val_accuracies)
        search_results.append({
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'mean_val_accuracy': float(avg_val_accuracy),
        })

        # Strict comparison: on a tie the earlier configuration is kept.
        if avg_val_accuracy > best_accuracy:
            best_accuracy = avg_val_accuracy
            best_hyperparameters = {'learning_rate': learning_rate, 'batch_size': batch_size}

print("Best Hyperparameters:", best_hyperparameters)
print("Best Validation Accuracy:", best_accuracy)



Iteration 1/9: Learning Rate = 0.001, Batch Size = 32




KeyboardInterrupt: 