### The first step in optimizing the model is to repeat the data preprocessing from the original code, with modifications where needed.

In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [5]:
# Load the raw charity CSV into a pandas DataFrame and display it
application_df = pd.read_csv(filepath_or_buffer="charity_data.csv")
application_df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [6]:
# Judge how useful each candidate column is by printing its value counts.
# A "..." line separates consecutive columns in the printed output.
columns_to_inspect = ["STATUS", "SPECIAL_CONSIDERATIONS", "INCOME_AMT"]
for position, column in enumerate(columns_to_inspect):
    if position > 0:
        print("...")
    print(column)
    print(application_df[column].value_counts())

STATUS
1    34294
0        5
Name: STATUS, dtype: int64
...
SPECIAL_CONSIDERATIONS
N    34272
Y       27
Name: SPECIAL_CONSIDERATIONS, dtype: int64
...
INCOME_AMT
0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: INCOME_AMT, dtype: int64


In [7]:
# Remove the non-beneficial ID columns ('EIN', 'NAME') together with "STATUS" and
# "SPECIAL_CONSIDERATIONS", which do not vary enough to justify inclusion.
dropped = application_df.drop(columns=['EIN', 'NAME', "STATUS", "SPECIAL_CONSIDERATIONS"])
dropped.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,0,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1-9999,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,0,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,10000-24999,6692,1
4,T3,Independent,C1000,Heathcare,Trust,100000-499999,142590,1


In [8]:
# Work on an independent (deep) copy so that `dropped` itself is left unmodified
dropped_copy = dropped.copy(deep=True)

In [9]:
# Application types with under 100 instances are lumped together as 'Other'
application_types_to_replace = ['T13', 'T12', 'T2', 'T25', 'T14', 'T15', 'T29', 'T17']

# Swap every listed type for "Other" with a single list-based replace
dropped_copy['APPLICATION_TYPE'] = dropped_copy['APPLICATION_TYPE'].replace(
    to_replace=application_types_to_replace,
    value="Other",
)

# Confirm the binning by looking at the remaining category counts
dropped_copy['APPLICATION_TYPE'].value_counts()

T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
T9         156
Other      120
Name: APPLICATION_TYPE, dtype: int64

In [12]:
# Continue binning with the CLASSIFICATION column: first find the values to be binned as "Other".
# value_counts() returns a Series indexed by classification code, so it is filtered directly.
# This avoids relying on the column labels generated by reset_index(), which changed in
# pandas 2.0 ("index"/"CLASSIFICATION" became "CLASSIFICATION"/"count") and would make a
# row["CLASSIFICATION"] < 100 comparison fail with a TypeError there.
classification_counts = dropped_copy['CLASSIFICATION'].value_counts()

# Tabular view of the counts with explicit, version-independent labels:
# "index" holds the classification code, "CLASSIFICATION" holds its number of occurrences.
v_counts = pd.DataFrame({
    "index": classification_counts.index,
    "CLASSIFICATION": classification_counts.values,
})

# Every classification code that occurs fewer than 100 times, most frequent first
classifications_to_replace = classification_counts[classification_counts < 100].index.tolist()
        

In [13]:
# Bin the CLASSIFICATION column: every code listed in `classifications_to_replace` becomes "Other"
is_rare_classification = dropped_copy['CLASSIFICATION'].isin(classifications_to_replace)
dropped_copy['CLASSIFICATION'] = dropped_copy['CLASSIFICATION'].mask(is_rare_classification, "Other")

# Confirm the binning by looking at the remaining category counts
dropped_copy['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
Other      669
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
Name: CLASSIFICATION, dtype: int64

In [15]:
# Convert categorical data to numeric with `pd.get_dummies` (one indicator column per category).
# dtype=int pins the indicator columns to numeric 0/1; without it their dtype depends on the
# installed pandas version (uint8 before 2.0, bool from 2.0 onward).
dum = pd.get_dummies(dropped_copy, dtype=int)
dum

Unnamed: 0,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,...,ORGANIZATION_Trust,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
0,5000,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,108590,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,5000,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,6692,1,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,142590,1,0,0,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,5000,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
34295,5000,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
34296,5000,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
34297,5000,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [16]:
# Separate the encoded data into the feature matrix (X) and the target vector (y)
y = dum.loc[:, 'IS_SUCCESSFUL']

X = dum.drop(columns='IS_SUCCESSFUL')

In [17]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [18]:
# Preview the training features; the bare expression is rendered as the cell's output
X_train

Unnamed: 0,ASK_AMT,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,...,ORGANIZATION_Trust,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
10491,5000,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
9384,5000,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
11614,5000,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
17386,5000,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
3844,5000,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,333518,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
6265,5000,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
11284,5000,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
860,5000,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
