## Preprocessing

In [4]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Read charity_data.csv directly from the course's static host.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap
# is needed, and pandas is already imported at the top of this cell.
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")

# Last expression in the cell -> rendered as the cell output
application_df

2024-07-06 15:48:40.824040: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [None]:
!pip install keras-tuner

In [5]:
# 'EIN' and 'NAME' are non-beneficial identifier columns; build the working
# frame without them (the original application_df is left untouched).
id_columns = ['EIN', 'NAME']
clean_application_df = application_df.drop(columns=id_columns)

In [6]:
# Display the working frame to confirm 'EIN' and 'NAME' are no longer present.
clean_application_df

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...
34294,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [7]:
# Count the distinct (non-null) values in every column; columns with many
# distinct categories are candidates for binning.
clean_application_df.nunique(axis=0, dropna=True)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

### For columns with more than 10 unique values, count the data points for each unique value
### Bin the rare categorical values into a new category called 'Other'

In [8]:
# Bin rare 'APPLICATION_TYPE' categories into a single 'Other' bucket.
# ('APPLICATION_TYPE' and 'CLASSIFICATION' are the categorical columns with more than 10 unique values.)

# Step 1: Count the rows belonging to each application type
application_type_counts = clean_application_df['APPLICATION_TYPE'].value_counts()

# Step 2: Collect the application types that occur fewer than `threshold` times
threshold = 10
is_rare_type = application_type_counts < threshold
application_types_to_replace = application_type_counts.loc[is_rare_type].index.tolist()

# Step 3: Swap every rare application type for 'Other' using .replace()
clean_application_df['APPLICATION_TYPE'] = clean_application_df['APPLICATION_TYPE'].replace(
    to_replace=application_types_to_replace,
    value="Other",
)

# Step 4: Print the new counts to confirm the replacement worked
print(clean_application_df['APPLICATION_TYPE'].value_counts())

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
T9         156
T13         66
T12         27
T2          16
Other       11
Name: count, dtype: int64


In [9]:
# Bin rare 'CLASSIFICATION' categories into a single 'Other' bucket.

# Step 1: Count the rows belonging to each classification
classification_counts = clean_application_df['CLASSIFICATION'].value_counts()

# Step 2: Collect the classifications that occur fewer than `threshold` times
threshold = 10
is_rare_classification = classification_counts < threshold
classifications_to_replace = classification_counts.loc[is_rare_classification].index.tolist()

# Step 3: Swap every rare classification for 'Other' using .replace()
clean_application_df['CLASSIFICATION'] = clean_application_df['CLASSIFICATION'].replace(
    to_replace=classifications_to_replace,
    value="Other",
)

# Step 4: Print the new counts to confirm the replacement worked
print(clean_application_df['CLASSIFICATION'].value_counts())

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
Other       98
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
Name: count, dtype: int64


In [17]:
# One-hot encode the categorical columns with `pd.get_dummies`.
# The numeric columns in the selection (STATUS, IS_SUCCESSFUL, ASK_AMT) are
# carried over as-is alongside the generated indicator columns.
columns_to_encode = [
    'APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE',
    'ORGANIZATION', 'INCOME_AMT', 'SPECIAL_CONSIDERATIONS',
    'STATUS', 'IS_SUCCESSFUL', 'ASK_AMT',
]
cat_application_df = pd.get_dummies(clean_application_df[columns_to_encode])

In [18]:
# Display the encoded frame to inspect the indicator columns created by get_dummies.
cat_application_df

Unnamed: 0,STATUS,IS_SUCCESSFUL,ASK_AMT,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T19,APPLICATION_TYPE_T2,APPLICATION_TYPE_T3,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,1,5000,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,1,1,108590,False,False,False,False,False,False,True,...,True,False,False,False,False,False,False,False,True,False
2,1,0,5000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,1,1,6692,False,False,False,False,False,False,True,...,False,True,False,False,False,False,False,False,True,False
4,1,1,142590,False,False,False,False,False,False,True,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,1,0,5000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
34295,1,0,5000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
34296,1,0,5000,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
34297,1,1,5000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [25]:
# Separate the preprocessed data into the target array (y) and the feature
# array (X); the target column must not remain among the features.
target_column = "IS_SUCCESSFUL"
y = cat_application_df[target_column].values
X = cat_application_df.drop(columns=[target_column]).values

# Hold out a test set. The fixed random_state makes the split repeatable, and
# stratifying on y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [26]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Learn the scaling statistics from the training features only;
# fit() returns the fitted scaler itself.
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [32]:
# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable when the notebook is restarted and re-run.
tf.keras.utils.set_random_seed(42)

# The number of input features must match the column count of the scaled training data
input_dim = X_train_scaled.shape[1]

# Define the model - deep neural net.
# The input shape is declared with an explicit Input object instead of passing
# `input_dim=` to the first Dense layer, which is the form Keras recommends
# for Sequential models.
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.Input(shape=(input_dim,)))

# Hidden layers: three ReLU layers of decreasing width
nn_model.add(tf.keras.layers.Dense(units=32, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=16, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=8, activation="relu"))

# Output layer: a single sigmoid unit for the binary IS_SUCCESSFUL target
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

# Compile the model
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model on the scaled training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.6883 - loss: 0.6159
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7343 - loss: 0.5516
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7310 - loss: 0.5503
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7314 - loss: 0.5499
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7342 - loss: 0.5485
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7352 - loss: 0.5424
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7364 - loss: 0.5416
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7333 - loss: 0.5448
Epoch 9/100
[1m804/804[0m [32

In [33]:
# Persist the trained network to disk; the .h5 extension selects the HDF5 format.
nn_model.save(filepath="AlphabetSoupCharity_Optimization.h5")

