In [48]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

In [49]:
# Loading the dataset from the resources folder

charity_df = pd.read_csv(Path('Resources/charity_data.csv'))

In [50]:
charity_df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


# Exploratory Data Analysis (EDA):

1. Drop the unnamed column as it does not contribute to clustering or add value to input data
2. List the DataFrame's data types to ensure they're aligned to the type of data stored on each column.
3. Is there any column whose data type need to be changed? If so, make the corresponding adjustments.
4. Is there any unnecessary column that needs to be dropped? If so, make the corresponding adjustments.
5. Check for duplicates.
6. In order to use unsupervised learning algorithms, all the features should be numeric, and also, on similar scales.
7. Rename the column if needed.
8. Check for categorical data in columns.
9. Determine the number of unique values for each column.
10. For those columns that have more than 10 unique values, determine the number of data points for each unique value.
11. Use the number of data points for each unique value to pick a cutoff point to bin "rare" categorical variables together in a new value, Other, and then check if the binning was successful.
12. What variable(s) are considered the target(s) for your model?
13. What variable(s) are considered the feature(s) for your model?

In [51]:
charity_df.shape

(34299, 12)

In [52]:
charity_df.dtypes

EIN                        int64
NAME                      object
APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
STATUS                     int64
INCOME_AMT                object
SPECIAL_CONSIDERATIONS    object
ASK_AMT                    int64
IS_SUCCESSFUL              int64
dtype: object

In [53]:
# Find null values
for column in charity_df.columns:
    print(f"Column {column} has {charity_df[column].isnull().sum()} null values")


Column EIN has 0 null values
Column NAME has 0 null values
Column APPLICATION_TYPE has 0 null values
Column AFFILIATION has 0 null values
Column CLASSIFICATION has 0 null values
Column USE_CASE has 0 null values
Column ORGANIZATION has 0 null values
Column STATUS has 0 null values
Column INCOME_AMT has 0 null values
Column SPECIAL_CONSIDERATIONS has 0 null values
Column ASK_AMT has 0 null values
Column IS_SUCCESSFUL has 0 null values


In [54]:
# Find duplicate entries
print(f"Duplicate entries: {charity_df.duplicated().sum()}")

Duplicate entries: 0


In [55]:
# A list of the columns from the original DataFrame
charity_df.columns

Index(['EIN', 'NAME', 'APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION',
       'USE_CASE', 'ORGANIZATION', 'STATUS', 'INCOME_AMT',
       'SPECIAL_CONSIDERATIONS', 'ASK_AMT', 'IS_SUCCESSFUL'],
      dtype='object')

In [56]:
charity_df.columns

Index(['EIN', 'NAME', 'APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION',
       'USE_CASE', 'ORGANIZATION', 'STATUS', 'INCOME_AMT',
       'SPECIAL_CONSIDERATIONS', 'ASK_AMT', 'IS_SUCCESSFUL'],
      dtype='object')

In [57]:
charity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   EIN                     34299 non-null  int64 
 1   NAME                    34299 non-null  object
 2   APPLICATION_TYPE        34299 non-null  object
 3   AFFILIATION             34299 non-null  object
 4   CLASSIFICATION          34299 non-null  object
 5   USE_CASE                34299 non-null  object
 6   ORGANIZATION            34299 non-null  object
 7   STATUS                  34299 non-null  int64 
 8   INCOME_AMT              34299 non-null  object
 9   SPECIAL_CONSIDERATIONS  34299 non-null  object
 10  ASK_AMT                 34299 non-null  int64 
 11  IS_SUCCESSFUL           34299 non-null  int64 
dtypes: int64(4), object(8)
memory usage: 3.1+ MB


In [58]:
# Function checking for missing values
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
            " columns that have missing values.")
    return mis_val_table_ren_columns

In [59]:
missing_values_table(charity_df)

Your selected dataframe has 12 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


## Data Preprocessing:

In [60]:
# Drop 'EIN', 'NAME' columns from the dataframe

charity_df = charity_df.drop(['EIN', 'NAME'], axis=1)
charity_df

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...
34294,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [61]:
charity_df["INCOME_AMT"]

0                    0
1               1-9999
2                    0
3          10000-24999
4        100000-499999
             ...      
34294                0
34295                0
34296                0
34297                0
34298            1M-5M
Name: INCOME_AMT, Length: 34299, dtype: object

In [62]:
# new data frame with split value columns
charity_df[['INCOME_LOWER','INCOME_UPPER']] = charity_df["INCOME_AMT"].str.split("-", n = 1, expand = True)
charity_df

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,INCOME_LOWER,INCOME_UPPER
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1,0,
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1,1,9999
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,0,
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1,10000,24999
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1,100000,499999
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0,0,
34295,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,0,
34296,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0,0,
34297,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1,0,


In [63]:
charity_df = charity_df.fillna(0)
charity_df

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,INCOME_LOWER,INCOME_UPPER
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1,0,0
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1,1,9999
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,0,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1,10000,24999
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1,100000,499999
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0,0,0
34295,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,0,0
34296,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0,0,0
34297,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1,0,0


In [64]:
charity_df['INCOME_UPPER'] = charity_df['INCOME_UPPER'].replace('M','', regex=True)
charity_df['INCOME_LOWER'] = charity_df['INCOME_LOWER'].replace('M','', regex=True)

In [65]:
charity_df["INCOME_LOWER"] 

0             0
1             1
2             0
3         10000
4        100000
          ...  
34294         0
34295         0
34296         0
34297         0
34298         1
Name: INCOME_LOWER, Length: 34299, dtype: object

In [66]:
# Drop 'EIN', 'NAME' columns from the dataframe

charity_df = charity_df.drop(['INCOME_AMT', 'CLASSIFICATION', 'APPLICATION_TYPE'], axis=1)
charity_df

Unnamed: 0,AFFILIATION,USE_CASE,ORGANIZATION,STATUS,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,INCOME_LOWER,INCOME_UPPER
0,Independent,ProductDev,Association,1,N,5000,1,0,0
1,Independent,Preservation,Co-operative,1,N,108590,1,1,9999
2,CompanySponsored,ProductDev,Association,1,N,5000,0,0,0
3,CompanySponsored,Preservation,Trust,1,N,6692,1,10000,24999
4,Independent,Heathcare,Trust,1,N,142590,1,100000,499999
...,...,...,...,...,...,...,...,...,...
34294,Independent,ProductDev,Association,1,N,5000,0,0,0
34295,CompanySponsored,ProductDev,Association,1,N,5000,0,0,0
34296,CompanySponsored,Preservation,Association,1,N,5000,0,0,0
34297,Independent,ProductDev,Association,1,N,5000,1,0,0


In [67]:
# Generate our categorical variable lists
charity_cat = charity_df.dtypes[charity_df.dtypes == "object"].index.tolist()

In [68]:
# Check the number of unique values in each column
charity_df[charity_cat].nunique()

AFFILIATION               6
USE_CASE                  5
ORGANIZATION              4
SPECIAL_CONSIDERATIONS    2
INCOME_LOWER              8
INCOME_UPPER              8
dtype: int64

In [73]:
charity_df[charity_cat].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   AFFILIATION             34299 non-null  object
 1   USE_CASE                34299 non-null  object
 2   ORGANIZATION            34299 non-null  object
 3   SPECIAL_CONSIDERATIONS  34299 non-null  object
 4   INCOME_LOWER            34299 non-null  object
 5   INCOME_UPPER            34299 non-null  object
dtypes: object(6)
memory usage: 1.6+ MB


In [75]:
# # Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse=False)

# # Fit and transform the OneHotEncoder using the categorical variable list
# encode_df = pd.DataFrame(enc.fit_transform(charity_df[charity_cat]))

# # Add the encoded variable names to the dataframe
# encode_df.columns = enc.get_feature_names(charity_cat)
# encode_df.head()

In [78]:
# import pandas as pd
# from sklearn.preprocessing import LabelBinarizer

# jobs_encoder = LabelBinarizer()
# jobs_encoder.fit(charity_df[charity_cat])
# transformed = jobs_encoder.transform(charity_df[charity_cat])
# ohe_df = pd.DataFrame(transformed)
# data = pd.concat([charity_df, ohe_df], axis=1).drop(['charity_cat'], axis=1)