## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df_cleaned = application_df.drop(columns=['EIN', 'NAME'])
application_df_cleaned.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [3]:
# Determine the number of unique values in each column.
application_df_cleaned_unique = application_df_cleaned.nunique()
application_df_cleaned_unique

Unnamed: 0,0
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


In [4]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
# View the original value counts for the APPLICATION_TYPE column
print("Original APPLICATION_TYPE counts:")
print(application_df_cleaned['APPLICATION_TYPE'].value_counts())

# Replace all values in the APPLICATION_TYPE column with "Other"
application_df_cleaned['APPLICATION_TYPE'] = 'Other'

# Verify the changes
print("\nUpdated APPLICATION_TYPE counts:")
print(application_df_cleaned['APPLICATION_TYPE'].value_counts())


Original APPLICATION_TYPE counts:
APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64

Updated APPLICATION_TYPE counts:
APPLICATION_TYPE
Other    34299
Name: count, dtype: int64


In [5]:
# Calculate value counts of application types
value_counts = application_df_cleaned['APPLICATION_TYPE'].value_counts()

# Choose a cutoff value
cutoff_value = 35000

# Create a list of application types to be replaced
application_types_to_replace = value_counts[value_counts < cutoff_value].index.tolist()

# Display the list of application types to be replaced
print("Application types to be replaced:", application_types_to_replace)

# Replace in dataframe using a single replace call
application_df_cleaned['APPLICATION_TYPE'] = application_df_cleaned['APPLICATION_TYPE'].replace(application_types_to_replace, "Other")

# Check to make sure replacement was successful
print(application_df_cleaned['APPLICATION_TYPE'].value_counts())

Application types to be replaced: ['Other']
APPLICATION_TYPE
Other    34299
Name: count, dtype: int64


In [6]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
print("Original CLASSIFICATION counts:")
print(application_df_cleaned['CLASSIFICATION'].value_counts())

# Calculate classification value counts
value_counts = application_df_cleaned['CLASSIFICATION'].value_counts()

# Filter to find application types with counts greater than 1
print("List of Classification value counts (greater than 1):")
classification_value_counts = value_counts[value_counts > 1]
print(classification_value_counts)
# Display the total of application types with counts greater than 1
print("Classification value counts (greater than 1):")
print(classification_value_counts.count())

# Replace all values in the CLASSIFICATION column with "Other"
application_df_cleaned['CLASSIFICATION'] = 'Other'

# Verify the changes
print("\nUpdated CLASSIFICATION counts:")
print(application_df_cleaned['CLASSIFICATION'].value_counts())

Original CLASSIFICATION counts:
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: count, Length: 71, dtype: int64
List of Classification value counts (greater than 1):
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: 

In [7]:
# Calculate value counts of classifications
value_counts = application_df_cleaned['CLASSIFICATION'].value_counts()

# Choose a cutoff value
cutoff_value = 35000

# Create a list of classifications to be replaced
classifications_to_replace = value_counts[value_counts < cutoff_value].index.tolist()

# Replace in dataframe
for cls in classifications_to_replace:
    application_df_cleaned['CLASSIFICATION'] = application_df_cleaned['CLASSIFICATION'].replace(cls, "Other")

# Check to make sure replacement was successful
print("Updated CLASSIFICATION counts:")
print(application_df_cleaned['CLASSIFICATION'].value_counts())

Updated CLASSIFICATION counts:
CLASSIFICATION
Other    34299
Name: count, dtype: int64


In [8]:
# Convert categorical data to numeric with pd.get_dummies
categorical_df_dummies = pd.get_dummies(application_df_cleaned, columns=['APPLICATION_TYPE', 'CLASSIFICATION'], drop_first=True)

# Display the first few rows of the new DataFrame to verify the conversion
categorical_df_dummies.head()

Unnamed: 0,AFFILIATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,Independent,ProductDev,Association,1,0,N,5000,1
1,Independent,Preservation,Co-operative,1,1-9999,N,108590,1
2,CompanySponsored,ProductDev,Association,1,0,N,5000,0
3,CompanySponsored,Preservation,Trust,1,10000-24999,N,6692,1
4,Independent,Heathcare,Trust,1,100000-499999,N,142590,1


In [9]:
# Split our preprocessed data into our features and target arrays
X = categorical_df_dummies.drop(columns=['IS_SUCCESSFUL'])  # Features
y = categorical_df_dummies['IS_SUCCESSFUL']  # Target variable

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Re-apply one-hot encoding if necessary
X_train_dummies = pd.get_dummies(X_train, drop_first=True)
X_test_dummies = pd.get_dummies(X_test, drop_first=True)

# Align the two DataFrames to ensure they have the same columns
X_train_dummies, X_test_dummies = X_train_dummies.align(X_test_dummies, join='left', axis=1, fill_value=0)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train_dummies)

# Scale the data
X_train_scaled = X_scaler.transform(X_train_dummies)
X_test_scaled = X_scaler.transform(X_test_dummies)

## Compile, Train and Evaluate the Model

In [11]:
# Determine the number of input features
number_input_features = X_train_scaled.shape[1]
# Define the number of nodes in each hidden layer
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

# Create the Sequential model
nn_model = tf.keras.models.Sequential()

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))  # For binary classification

# Compile the model
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Check the structure of the model
nn_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
# Compile the model
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [18]:
class CustomModelCheckpoint(tf.keras.callbacks.Callback):
    def __init__(self, save_freq):
        super(CustomModelCheckpoint, self).__init__()
        self.save_freq = save_freq

    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.save_freq == 0:  # Check if the current epoch is a multiple of save_freq
            self.model.save_weights(f'model_weights_epoch_{epoch + 1:02d}.weights.h5')  # Save weights

# Create an instance of the custom callback
custom_checkpoint_callback = CustomModelCheckpoint(save_freq=5)

In [19]:
# Train the model
nn_model.fit(X_train_scaled, y_train, epochs=50, callbacks=[custom_checkpoint_callback])

Epoch 1/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6989 - loss: 0.6075
Epoch 2/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7053 - loss: 0.5998
Epoch 3/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6987 - loss: 0.6019
Epoch 4/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6997 - loss: 0.6004
Epoch 5/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6997 - loss: 0.6005
Epoch 6/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6997 - loss: 0.5999
Epoch 7/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7002 - loss: 0.5993
Epoch 8/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.6993 - loss: 0.6003
Epoch 9/50
[1m858/858[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x79ad25c6fbb0>

In [20]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled, y_test, verbose=2)

# Print the results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 0s - 2ms/step - accuracy: 0.6946 - loss: 0.6042
Loss: 0.6042428612709045, Accuracy: 0.6946064233779907


In [21]:
# Export our model to HDF5 file
nn_model.save('AlphabetSoupCharity.h5')

