# Alphabet Soup's Charity Risk Assessment

Alphabet Soup is a fictional nonprofit foundation that makes donations to various organizations that aim to make the world a better place by protecting the environment, improving people's well-being, etc. Unfortunately, not every investment by the foundation is impactful. The goal of this project is to develop a data-driven solution to help ensure that the foundation's money goes to organizations that are worth donating to. The solution is to build a model that assesses an organization's risk before the foundation donates to it.

### Data

The data contains more than 34,000 organizations that have received funding from Alphabet Soup over the years. Within this dataset are a number of columns that capture metadata about each organization:
- `EIN` and `NAME` — Identification columns
- `APPLICATION_TYPE` — Alphabet Soup application type
- `AFFILIATION` — Affiliated sector of industry
- `CLASSIFICATION` — Government organization classification
- `USE_CASE` — Use case for funding
- `ORGANIZATION` — Organization type
- `STATUS` — Active status
- `INCOME_AMT` — Income classification
- `SPECIAL_CONSIDERATIONS` — Special consideration for application
- `ASK_AMT` — Funding amount requested
- `IS_SUCCESSFUL` — Was the money used effectively

### Dependencies and data

In [None]:
# Dependencies
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from tensorflow import keras

In [None]:
# Data
# Read the charity dataset from a CSV path relative to the working directory.
df = pd.read_csv('data/charity_data.csv')
df.head(3)  # preview the first 3 rows

### Numerical variables

In [None]:
# Replace the original headers with short snake_case names (assigned by position)
new_names = [
    'ein', 'name', 'application_type', 'affiliation', 'classification', 'use',
    'organization', 'active', 'income', 'special_considerations', 'amount', 'successful',
]
df.columns = new_names
df.info()

In [None]:
# Unique value counts
# Number of distinct values in every column of `df`.
df.nunique()

In [None]:
# Inspect numerical columns
# Summary statistics as produced by `DataFrame.describe()` with default arguments.
df.describe()

In [None]:
# Histogram of `amount` drawn on log-log axes
ask = df['amount']
log_bins = 10 ** np.arange(3, 10, 0.5)  # bin edges at 10^3, 10^3.5, ..., 10^9.5
ax = ask.plot.hist(bins=log_bins, grid=True, figsize=(12, 4), title='Ask Amount Distribution')
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('Ask amount')
plt.show()

### Categorical variables

In [None]:
# Treat `ein` as a string identifier rather than a number
df['ein'] = df['ein'].astype(str)

# Names of all object-dtype columns, followed by their distinct-value counts
cat_feats = [col for col, dtype in df.dtypes.items() if dtype == object]
df[cat_feats].nunique()

In [None]:
# Ten most frequent labels of `classification` (printed) and `application_type` (displayed)
print(df['classification'].value_counts().head(10))
df['application_type'].value_counts().head(10)

In [None]:
# Label frequencies for the 4th entry of `cat_feats` and for every entry from the 6th onwards
remaining_feats = [cat_feats[3]] + cat_feats[5:]
for col in remaining_feats:
    label_counts = df[col].value_counts()
    print(label_counts)

In [None]:
# Keep three `classification` labels; every other label is replaced by '0ther'.
# NOTE(review): '0ther' starts with the digit zero, so it sorts before the kept
# labels — presumably so that a `drop='first'` encoder treats it as the baseline;
# confirm this is intentional.
top_classes = ['C1000', 'C2000', 'C1200']
is_top_class = df['classification'].isin(top_classes)
df['classification'] = df['classification'].where(is_top_class, '0ther')

# Binary flag: 1 when `application_type` is 'T3', otherwise 0
df['application_T3'] = df['application_type'].eq('T3').astype(int)

df.head(3)

In [None]:
# For each column keep the two listed labels; every other label becomes '0ther'
labels_to_keep = {
    'affiliation': ['Independent', 'CompanySponsored'],
    'use': ['Preservation', 'ProductDev'],
    'organization': ['Trust', 'Association'],
}
for col, keep in labels_to_keep.items():
    df[col] = df[col].where(df[col].isin(keep), '0ther')

# Binary flag: 1 when `special_considerations` is 'Y', otherwise 0
df['special'] = df['special_considerations'].eq('Y').astype(int)

df.head(3)

In [None]:
# Collapse `income` into 3 buckets: 0, <100k, 100k+
labels_lt100k = ['1-9999', '10000-24999', '25000-99999'] # labels for <100k bucket


def _income_bucket(label):
    """Map one raw `income` label to '0', '<100k' or '100k+'."""
    if label in labels_lt100k or label == '<100k':
        return '<100k'
    if label == '0':
        return '0'
    return '100k+'  # anything that is neither zero nor below 100k


df['income'] = df['income'].apply(_income_bucket)
df.head(3)

In [None]:
# Unique value counts for categorical variables
# The features to one-hot encode are picked by position in `cat_feats`:
# the first three entries and the last two are skipped.
# NOTE(review): presumably this leaves `affiliation`, `classification`, `use`
# and `organization` — confirm against the actual contents of `cat_feats`.
cols_to_encode = cat_feats[3:-2]
df[cols_to_encode].nunique()

### Categorical encoding

In [None]:
# Print the label frequencies of every feature that will be one-hot encoded
for col in cols_to_encode:
    label_counts = df[col].value_counts()
    print(label_counts)

In [None]:
# One-hot encoding (the first label of each feature is dropped)
# The encoder's default sparse output is densified explicitly and the
# feature-name helper is looked up at runtime, because the `sparse=` argument
# and `get_feature_names` are not available in newer scikit-learn releases.
ohe = OneHotEncoder(drop='first')
encoded = ohe.fit_transform(df[cols_to_encode]).toarray()
get_names = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
# Reuse df's index so that an index-based merge with `df` lines the rows up
df_ohe = pd.DataFrame(encoded, columns=get_names(cols_to_encode), index=df.index).astype(int)
df_ohe.head(3)

In [None]:
# One-hot encoding for `income`
# Ask for integer dummies explicitly: newer pandas releases return boolean
# dummy columns by default, which would leave the merged frame with a mix of
# bool and numeric columns.
df = pd.get_dummies(df, columns=['income'], dtype=int)

# Merge data (rows are matched on the index of both frames)
df_merged = df.merge(df_ohe, left_index=True, right_index=True)
df_merged.shape

In [None]:
# Drop categorical columns
# `income` was already replaced by its dummy columns, so it is left out here.
# A new list is built instead of calling `cat_feats.remove('income')`, so
# `cat_feats` keeps its contents: cells that slice it by position give the
# same result when re-run, and this line no longer raises ValueError on a
# second execution.
raw_cat_cols = [col for col in cat_feats if col != 'income']
df_merged.drop(columns=raw_cat_cols, inplace=True)
df_merged.head(3)

### Data preprocessing

In [None]:
# Replace `amount` with log10(amount) / 10; `pop` removes the raw column as it is read
df_merged['transformed_amount'] = np.log10(df_merged.pop('amount')) / 10

# Histogram of the transformed values
ax = df_merged['transformed_amount'].hist()
ax.set_xlabel('Log10(amount) / 10')
plt.show()

In [None]:
# Inspect values
# Summary statistics of the merged frame, to check value ranges before modelling.
df_merged.describe()

In [None]:
# Separate the target column `successful` from the remaining feature columns
y = df_merged['successful']
X = df_merged.drop(columns=['successful'])

# Frequency of each target value
y.value_counts()

In [None]:
# Split stratified on the target, with a fixed seed for reproducibility
splits = train_test_split(X, y, stratify=y, random_state=24)
X_train, X_test, y_train, y_test = splits
tuple(part.shape for part in splits)

### Logistic regression

In [None]:
# Logistic regression
# `max_iter` is given as an integer: recent scikit-learn releases validate
# parameter types and reject the float 1e4.
lr = LogisticRegression(max_iter=10000, random_state=1)
lr.fit(X_train, y_train)

# Evaluate model (mean accuracy on each split)
print('Training set accuracy:', lr.score(X_train, y_train))
print('Test set accuracy:', lr.score(X_test, y_test))

In [None]:
# Pair each fitted coefficient with its feature name, sorted ascending by coefficient
coef_by_feature = zip(lr.coef_[0], X_train.columns)
sorted(coef_by_feature)

### Random forest

In [None]:
# Fit a random forest with default hyperparameters and a fixed seed
rf = RandomForestClassifier(random_state=100)
rf.fit(X_train, y_train)

# Report accuracy on both splits
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print('Training set accuracy:', train_acc)
print('Test set accuracy:', test_acc)

In [None]:
# Hyperparameter grid: three candidate values per parameter
params = dict(
    max_depth=[5, 10, None],
    min_samples_split=[2, 8, 32],
    min_samples_leaf=[1, 4, 16],
    max_features=['sqrt', 0.5, None],
    max_samples=[0.5, 0.75, None],
)

# Exhaustive search over the grid with 3-fold cross-validation
base_rf = RandomForestClassifier(random_state=100)
rf_search = GridSearchCV(estimator=base_rf, param_grid=params, cv=3, verbose=2, n_jobs=-1)
rf_search.fit(X_train, y_train)
print('Best score:', rf_search.best_score_)
rf_search.best_params_

In [None]:
# Take the best estimator found by the grid search and score it on both splits
rf_best = rf_search.best_estimator_
best_train_acc = rf_best.score(X_train, y_train)
best_test_acc = rf_best.score(X_test, y_test)
print('Training set accuracy:', best_train_acc)
print('Test set accuracy:', best_test_acc)

In [None]:
# Feature importances of the tuned forest, largest first
importance_by_feature = zip(rf_best.feature_importances_, X_train.columns)
sorted(importance_by_feature, reverse=True)

### Basic neural network

In [None]:
# Basic NN: one hidden layer followed by a single sigmoid output unit

# Layer sizes
input_dim = X_train.shape[1]  # one input per feature column
hidden_units = input_dim * 2
output_units = 1

# Build the model from a layer list instead of successive `add` calls
bnn = keras.models.Sequential(
    [
        keras.layers.Dense(units=hidden_units, activation='relu', input_dim=input_dim),  # input & hidden layer
        keras.layers.Dense(units=output_units, activation='sigmoid'),  # output layer
    ],
    name='risk_basic',
)

bnn.summary()

In [None]:
# Checkpoint path
bnn_cdir = 'checkpoints/bnn/'
os.makedirs(bnn_cdir, exist_ok=True) # create directory for checkpoints
bnn_cpath = bnn_cdir + 'bnn_cp{epoch}.hdf5'

# Checkpoint callback
# An integer `save_freq` is counted in batches, not samples, so "every 10
# epochs" has to be expressed as a number of batches. The `fit` calls in this
# notebook pass no `batch_size`, so Keras' default of 32 is assumed here.
fit_batch_size = 32
batches_per_epoch = (X_train.shape[0] + fit_batch_size - 1) // fit_batch_size # ceiling division
cfreq = batches_per_epoch * 10 # save cp every 10 epochs
bnn_ccallback = keras.callbacks.ModelCheckpoint(bnn_cpath, save_weights_only=True, save_freq=cfreq)

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
bnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for 200 epochs with the checkpoint callback attached
bnn.fit(x=X_train, y=y_train, epochs=200, callbacks=[bnn_ccallback])

In [None]:
# Evaluate model
# Runs the basic network on the held-out test split; the values returned
# follow the loss and metrics the model was compiled with.
bnn.evaluate(X_test, y_test)

### 2-layer NN

In [None]:
# Network with two hidden layers

# Hidden layer sizes
h1_units = 12
h2_units = 6

# Build the model from a layer list instead of successive `add` calls
nn1 = keras.models.Sequential(
    [
        keras.layers.Dense(units=h1_units, activation='relu', input_dim=input_dim),  # input & 1st hidden layer
        keras.layers.Dense(units=h2_units, activation='relu'),  # 2nd hidden layer
        keras.layers.Dense(units=output_units, activation='sigmoid'),  # output layer
    ],
    name='risk1',
)

nn1.summary()

In [None]:
# Directory and file-name template for the nn1 checkpoints
nn1_cdir = 'checkpoints/nn1/'
os.makedirs(nn1_cdir, exist_ok=True)  # no error if the directory already exists
nn1_cpath = '{}nn1_cp{{epoch}}.hdf5'.format(nn1_cdir)  # keeps a literal {epoch} placeholder

# Callback that writes weights only, at the interval given by `cfreq`
nn1_ccallback = keras.callbacks.ModelCheckpoint(
    filepath=nn1_cpath,
    save_weights_only=True,
    save_freq=cfreq,
)

In [None]:
# Compile model
# Keyword arguments are used, as in the basic-NN cell, because the positional
# order of `compile`'s parameters after `loss` is not the same in every Keras
# release; a positional metrics list could be bound to a different parameter.
nn1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
nn1.fit(X_train, y_train, epochs=200, callbacks=[nn1_ccallback])

In [None]:
# Evaluate model
# Runs the 2-hidden-layer network on the held-out test split.
nn1.evaluate(X_test, y_test)

### 3-layer NN

In [None]:
# Network with three hidden layers

# Hidden layer sizes
h1_units = 12
h2_units = 6
h3_units = 6

# Build the model from a layer list instead of successive `add` calls
nn2 = keras.models.Sequential(
    [
        keras.layers.Dense(units=h1_units, activation='relu', input_dim=input_dim),  # input & 1st hidden layer
        keras.layers.Dense(units=h2_units, activation='relu'),  # 2nd hidden layer
        keras.layers.Dense(units=h3_units, activation='relu'),  # 3rd hidden layer
        keras.layers.Dense(units=output_units, activation='sigmoid'),  # output layer
    ],
    name='risk2',
)

nn2.summary()

In [None]:
# Checkpoint path
# nn2 gets its own directory, like the other models; the original pointed at
# 'checkpoints/nn1/' and mixed nn2's files in with nn1's.
nn2_cdir = 'checkpoints/nn2/'
os.makedirs(nn2_cdir, exist_ok=True) # create directory for checkpoints
nn2_cpath = nn2_cdir + 'nn2_cp{epoch}.hdf5'

# Checkpoint callback (weights only, saved at the interval given by `cfreq`)
nn2_ccallback = keras.callbacks.ModelCheckpoint(nn2_cpath, save_weights_only=True, save_freq=cfreq)

In [None]:
# Compile model
# Keyword arguments are used, as in the basic-NN cell, because the positional
# order of `compile`'s parameters after `loss` is not the same in every Keras
# release; a positional metrics list could be bound to a different parameter.
nn2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
nn2.fit(X_train, y_train, epochs=100, callbacks=[nn2_ccallback])

In [None]:
# Evaluate model
# Runs the 3-hidden-layer network on the held-out test split.
nn2.evaluate(X_test, y_test)

### NN with 12 features

In [None]:
# Remove the same four feature columns from the train and test matrices
cols_to_drop = ['active', 'special', 'classification_C1200', 'classification_C2000']
X12_train, X12_test = (split.drop(columns=cols_to_drop) for split in (X_train, X_test))
X12_train.shape, X12_test.shape

In [None]:
# Three-hidden-layer network for the reduced feature set

# Layer sizes (note: `input_dim` is rebound to the reduced matrix's column count)
input_dim = X12_train.shape[1]
h1_units = 8
h2_units = 4
h3_units = 4

# Build the model from a layer list instead of successive `add` calls
nn3 = keras.models.Sequential(
    [
        keras.layers.Dense(units=h1_units, activation='relu', input_dim=input_dim),  # input & 1st hidden layer
        keras.layers.Dense(units=h2_units, activation='relu'),  # 2nd hidden layer
        keras.layers.Dense(units=h3_units, activation='relu'),  # 3rd hidden layer
        keras.layers.Dense(units=output_units, activation='sigmoid'),  # output layer
    ],
    name='risk3',
)

nn3.summary()

In [None]:
# Directory and file-name template for the nn3 checkpoints
nn3_cdir = 'checkpoints/nn3/'
os.makedirs(nn3_cdir, exist_ok=True)  # no error if the directory already exists
nn3_cpath = '{}nn3_cp{{epoch}}.hdf5'.format(nn3_cdir)  # keeps a literal {epoch} placeholder

# Callback that writes weights only, at the interval given by `cfreq`
nn3_ccallback = keras.callbacks.ModelCheckpoint(
    filepath=nn3_cpath,
    save_weights_only=True,
    save_freq=cfreq,
)

In [None]:
# Compile model
# Keyword arguments are used, as in the basic-NN cell, because the positional
# order of `compile`'s parameters after `loss` is not the same in every Keras
# release; a positional metrics list could be bound to a different parameter.
nn3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
# Use nn3's own checkpoint callback; the original passed `nn2_ccallback`, which
# wrote nn3's weights to the nn2 checkpoint path.
nn3.fit(X12_train, y_train, epochs=100, callbacks=[nn3_ccallback])

In [None]:
# Evaluate model
# Runs the reduced-feature network on the matching reduced test matrix.
nn3.evaluate(X12_test, y_test)