## Preprocessing

In [1]:
import keras_tuner as kt
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
#  Import and read the charity_data.csv.
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [3]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df.drop(["EIN", "NAME"], axis=1, inplace=True)

In [4]:
def apply_cutoff_to_column(df: pd.DataFrame,
                           column: str,
                           cutoff_value: int,
                           new_val_name: str="Other") -> pd.DataFrame:
    """Replaces (Bins) values in the `column` where the
     count is < `cutoff_value` with `new_val_name`.

    :param df: DataFrame with column
    :param column: Column name
    :param cutoff_value: count of values to cutoff
    :param new_val_name: string to replace cutoff values with; default=Other
    :return: DataFrame with replaced values
    """
    _col_unique = df[column].unique()
    _col_value_counts = df[column].value_counts()
    _vals_to_replace = [x for x in _col_unique if _col_value_counts[x] < cutoff_value]
    print(f"Replace with '{new_val_name}': {_vals_to_replace}")

    # Replace in DataFrame
    for app in _vals_to_replace:
        df[column] = df[column].replace(app, new_val_name)

    return df

In [5]:
# APPLICATION_TYPE cutoff = 500
app_df_cutoffs = apply_cutoff_to_column(application_df, "APPLICATION_TYPE", 500)
app_df_cutoffs['APPLICATION_TYPE'].value_counts()

Replace with 'Other': ['T2', 'T9', 'T13', 'T12', 'T29', 'T25', 'T14', 'T17', 'T15']


T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64

In [6]:
# CLASSIFICATION cutoff = 700
app_df_cutoffs = apply_cutoff_to_column(app_df_cutoffs, "CLASSIFICATION", 500)
app_df_cutoffs['CLASSIFICATION'].value_counts()

Replace with 'Other': ['C2700', 'C7200', 'C1700', 'C4000', 'C7100', 'C2800', 'C6000', 'C1238', 'C5000', 'C7120', 'C1800', 'C4100', 'C1400', 'C1270', 'C2300', 'C8200', 'C1500', 'C7210', 'C1300', 'C1230', 'C1280', 'C1240', 'C2710', 'C2561', 'C1250', 'C8000', 'C1245', 'C1260', 'C1235', 'C1720', 'C1257', 'C4500', 'C2400', 'C8210', 'C1600', 'C1278', 'C1237', 'C4120', 'C2170', 'C1728', 'C1732', 'C2380', 'C1283', 'C1570', 'C2500', 'C1267', 'C3700', 'C1580', 'C2570', 'C1256', 'C1236', 'C1234', 'C1246', 'C2190', 'C4200', 'C0', 'C3200', 'C5200', 'C1370', 'C2600', 'C1248', 'C6100', 'C1820', 'C1900', 'C2150']


C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
Other     1484
C7000      777
Name: CLASSIFICATION, dtype: int64

In [7]:
display(application_df["AFFILIATION"].value_counts())
display(application_df["USE_CASE"].value_counts())
display(application_df["ORGANIZATION"].value_counts())
display(application_df["INCOME_AMT"].value_counts())
application_df["SPECIAL_CONSIDERATIONS"].value_counts()

Independent         18480
CompanySponsored    15705
Family/Parent          64
National               33
Regional               13
Other                   4
Name: AFFILIATION, dtype: int64

Preservation     28095
ProductDev        5671
CommunityServ      384
Heathcare          146
Other                3
Name: USE_CASE, dtype: int64

Trust           23515
Association     10255
Co-operative      486
Corporation        43
Name: ORGANIZATION, dtype: int64

0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: INCOME_AMT, dtype: int64

N    34272
Y       27
Name: SPECIAL_CONSIDERATIONS, dtype: int64

Try adding additional cutoffs for `AFFILIATION` and `INCOME_AMT`

In [8]:
# INCOME_AMT cutoff = 500
app_df_cutoffs = apply_cutoff_to_column(app_df_cutoffs, "INCOME_AMT", 500, "5M+")
app_df_cutoffs['INCOME_AMT'].value_counts()

Replace with '5M+': ['10M-50M', '50M+', '5M-10M']


0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
5M+                564
10000-24999        543
Name: INCOME_AMT, dtype: int64

In [9]:
# AFFILIATION cutoff = 1000
app_df_cutoffs = apply_cutoff_to_column(app_df_cutoffs, "AFFILIATION", 1000)
app_df_cutoffs['AFFILIATION'].value_counts()

Replace with 'Other': ['Family/Parent', 'National', 'Regional', 'Other']


Independent         18480
CompanySponsored    15705
Other                 114
Name: AFFILIATION, dtype: int64

Drop 'SPECIAL_CONSIDERATIONS' instead of making it a dummy, Special Considerations could be very broad and it is only true for a tiny percentage (<.1%) of organizations.

In [10]:
# Drop SPECIAL_CONSIDERATIONS
application_df.drop("SPECIAL_CONSIDERATIONS", axis=1, inplace=True)

In [11]:
# Convert categorical data to numeric with `pd.get_dummies`
df_app_clean = pd.get_dummies(application_df, columns=["AFFILIATION", "USE_CASE", "ORGANIZATION", "INCOME_AMT",
                                                       "CLASSIFICATION", "APPLICATION_TYPE"])
df_app_clean.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,AFFILIATION_CompanySponsored,AFFILIATION_Independent,AFFILIATION_Other,USE_CASE_CommunityServ,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,...,CLASSIFICATION_Other,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8
0,1,5000,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1,108590,1,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,1,5000,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,6692,1,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,1,142590,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [12]:
# Split our preprocessed data into our features and target arrays
X = df_app_clean.drop("IS_SUCCESSFUL", axis=1).values
y = df_app_clean["IS_SUCCESSFUL"].values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instances
# Fit the StandardScaler
scaler = StandardScaler().fit(X_train)

# Scale the data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train.shape

(25724, 37)

## Compile, Train and Evaluate the Model
---
## Attempt 1, Use same model setup as original
Only the input data/number of features has changed

In [34]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
num_features = X_train.shape[1]
print("Features:", num_features)

nn = tf.keras.models.Sequential()

# number of neurons should be 2-3 times number of inputs=number of features
# there is only 1 output
nn.add(tf.keras.layers.Dense(units=num_features*2, activation="relu", input_dim=num_features))
nn.add(tf.keras.layers.Dense(units=num_features*2, activation="relu"))
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


# Check the structure of the model
nn.summary()

Features: 37
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 74)                2812      
                                                                 
 dense_6 (Dense)             (None, 74)                5550      
                                                                 
 dense_7 (Dense)             (None, 1)                 75        
                                                                 
Total params: 8,437
Trainable params: 8,437
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [37]:
# Train the model
nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x214a2f97be0>

In [38]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5552 - accuracy: 0.7251 - 295ms/epoch - 1ms/step
Loss: 0.5551943778991699, Accuracy: 0.7251312136650085


---
## Attempt 2
## Try using Keras Tuner
- Allow activation function to choose between `relu`, `sigmoid`, `tanh`
    - `sigmoid` is still the only option for the final layer
- Allow number of neurons to vary from 6 to ~75
- Allow for 1, 2, or 3 extra hidden layers (not input or output)

In [31]:
NUM_FEATURES = X_train.shape[1]
print("Features:", NUM_FEATURES)

def create_model(hp: kt.HyperParameters) -> tf.keras.models.Sequential:
    model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice("activation", ["relu", "sigmoid", "tanh"])

    # Allow kerastuner to decide number of neurons in first layer
    model.add(tf.keras.layers.Dense(units=hp.Int("first_units", 6, NUM_FEATURES*2, step=6),
                                    activation=activation,
                                    input_dim=NUM_FEATURES))
    
    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int("num_layers", 1, 3)):
        model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
                                                     6, NUM_FEATURES*2, step=6),
                                        activation=activation))
    # output layer
    model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    model.compile(loss="binary_crossentropy",
                  optimizer='adam',
                  metrics=["accuracy"])
    return model

tuner = kt.Hyperband(create_model,
                     objective="val_accuracy",
                     max_epochs=25,
                     overwrite=True,
                     hyperband_iterations=2)

Features: 37


In [32]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,
             y_train,
             epochs=25,
             validation_data=(X_test_scaled, y_test))

Trial 60 Complete [00h 00m 39s]
val_accuracy: 0.7281632423400879

Best val_accuracy So Far: 0.7281632423400879
Total elapsed time: 00h 13m 27s
INFO:tensorflow:Oracle triggered exit


In [33]:
# Get top 3 model hyperparameters and print the values
top_3_hp = tuner.get_best_hyperparameters(3)
for hparam in top_3_hp:
    print(hparam.values)

# Evaluate the top 3 models against the test dataset
top_3_model = tuner.get_best_models(3)
for model in top_3_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test, verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

{'activation': 'tanh', 'first_units': 66, 'num_layers': 3, 'units_0': 36, 'units_1': 42, 'units_2': 48, 'tuner/epochs': 25, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
{'activation': 'tanh', 'first_units': 36, 'num_layers': 3, 'units_0': 66, 'units_1': 36, 'units_2': 48, 'tuner/epochs': 25, 'tuner/initial_epoch': 9, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0044'}
{'activation': 'sigmoid', 'first_units': 42, 'num_layers': 3, 'units_0': 60, 'units_1': 66, 'units_2': 66, 'tuner/epochs': 25, 'tuner/initial_epoch': 9, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0053'}
268/268 - 0s - loss: 0.5529 - accuracy: 0.7282 - 487ms/epoch - 2ms/step
Loss: 0.552858293056488, Accuracy: 0.7281632423400879
268/268 - 0s - loss: 0.5527 - accuracy: 0.7275 - 457ms/epoch - 2ms/step
Loss: 0.552712082862854, Accuracy: 0.7274635434150696
268/268 - 0s - loss: 0.5545 - accuracy: 0.7275 - 463ms/epoch - 2ms/step
Loss: 0.5544846653938293, Accuracy: 0.7274635434150696


In [64]:
best_tuned_model = tuner.get_best_models(1)[0]
# Evaluate the model using the test data
model_loss2, model_accuracy2 = best_tuned_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss2}, Accuracy: {model_accuracy2}")

268/268 - 0s - loss: 0.5529 - accuracy: 0.7282 - 349ms/epoch - 1ms/step
Loss: 0.552858293056488, Accuracy: 0.7281632423400879


---
## Attempt 3
Try using few neurons (< number of features) with sigmoid for non input layer

In [58]:
NUM_FEATURES = X_train.shape[1]
print("Features:", NUM_FEATURES)

nn3 = tf.keras.models.Sequential()

nn3.add(tf.keras.layers.Dense(units=10, activation="relu", input_dim=NUM_FEATURES))
nn3.add(tf.keras.layers.Dense(units=8, activation="sigmoid"))
nn3.add(tf.keras.layers.Dense(units=6, activation="sigmoid"))
nn3.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

display(nn3.summary())
nn3.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Features: 37
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_42 (Dense)            (None, 10)                380       
                                                                 
 dense_43 (Dense)            (None, 8)                 88        
                                                                 
 dense_44 (Dense)            (None, 6)                 54        
                                                                 
 dense_45 (Dense)            (None, 1)                 7         
                                                                 
Total params: 529
Trainable params: 529
Non-trainable params: 0
_________________________________________________________________


None

In [59]:
# Train the model
nn3.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x214d6b2ac70>

In [60]:
# Evaluate the model using the test data
model_loss3, model_accuracy3 = nn3.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss3}, Accuracy: {model_accuracy3}")

268/268 - 0s - loss: 0.5547 - accuracy: 0.7270 - 493ms/epoch - 2ms/step
Loss: 0.5547155737876892, Accuracy: 0.7269970774650574


---
## Attempt 4
Many more internal layers

In [61]:
NUM_FEATURES = X_train.shape[1]
print("Features:", NUM_FEATURES)

nn4 = tf.keras.models.Sequential()

nn4.add(tf.keras.layers.Dense(units=NUM_FEATURES*2, activation="relu", input_dim=NUM_FEATURES))
nn4.add(tf.keras.layers.Dense(units=NUM_FEATURES*2, activation="relu"))
nn4.add(tf.keras.layers.Dense(units=NUM_FEATURES*2, activation="relu"))
nn4.add(tf.keras.layers.Dense(units=NUM_FEATURES*2, activation="relu"))
nn4.add(tf.keras.layers.Dense(units=NUM_FEATURES*2, activation="relu"))
nn4.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

display(nn4.summary())
nn4.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Features: 37
Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_46 (Dense)            (None, 74)                2812      
                                                                 
 dense_47 (Dense)            (None, 74)                5550      
                                                                 
 dense_48 (Dense)            (None, 74)                5550      
                                                                 
 dense_49 (Dense)            (None, 74)                5550      
                                                                 
 dense_50 (Dense)            (None, 74)                5550      
                                                                 
 dense_51 (Dense)            (None, 1)                 75        
                                                                 
Total params: 25,087
Trainable params: 2

None

In [62]:
# Train the model
nn4.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x214d6b30a60>

In [63]:
# Evaluate the model using the test data
model_loss4, model_accuracy4 = nn4.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss4}, Accuracy: {model_accuracy4}")

268/268 - 0s - loss: 0.5679 - accuracy: 0.7247 - 430ms/epoch - 2ms/step
Loss: 0.5678597092628479, Accuracy: 0.7246647477149963


---
## Overall Results

None of the 4 different attempts at improving accuracy succeeded in passing 75%.

All four settled around 72.5-72.8% which is trivially better than the original 72.4% found in `AlphabetSoupCharity.ipynb`.

## Save the Best Final Model
This was the best result from using the Keras tuner

In [65]:
# Export our model to HDF5 file
best_tuned_model.save("models/AlphabetSoupCharity_Optimization_best.h5")