In [159]:
import pandas as pd
import numpy as np
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [160]:
df = pd.read_csv('data/ppp_data_after_openrefine.csv')
original_df = pd.read_csv('data/ppp_data.csv')

In [161]:
df.head()

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD
0,149957.5,Honolulu,HI,96813,238220.0,Corporation,Unanswered,Unanswered,Unanswered,,14.0,04/16/2020,First Hawaiian Bank,HI-01
1,149900.0,Honolulu,HI,96814,541990.0,Non-Profit Organization,Unanswered,Unanswered,Unanswered,Y,17.0,04/08/2020,First Hawaiian Bank,HI-01
2,149800.0,Honolulu,HI,96816,722511.0,Corporation,Asian,Male Owned,Non-Veteran,,42.0,04/12/2020,Central Pacific Bank,HI-01
3,149800.0,Honolulu,HI,96815,722511.0,Corporation,Unanswered,Unanswered,Unanswered,,27.0,05/27/2020,"American Savings Bank, FSB",HI-01
4,149700.0,Aiea,HI,96701,621111.0,Limited Liability Partnership,Unanswered,Unanswered,Unanswered,,10.0,04/06/2020,"American Savings Bank, FSB",HI-01


# Additional Cleaning

In [162]:
# Replace all 0s in City with NaN
df.loc[df.City == '0', 'City'] = np.nan

# missed a '_' in OpenRefine so change that to NaN as well
df.loc[df.City == '_', 'City'] = np.nan

# also missed a "Suite 620"
df.loc[df.City == 'Suite 620', 'City'] = np.nan

# Set NaN jobs to 0
df['JobsReported'] = df['JobsReported'].fillna(0)

# Set JobsReported to int
df['JobsReported'] = df['JobsReported'].astype('int')

# Set NaN to N (NonProfit originally only consists of "Y" and "N/A")
df['NonProfit'] = df['NonProfit'].fillna('N')

# Change the single 'OR-02' value in the CD column to NaN
df.loc[df.CD == 'OR-02', 'CD'] = np.nan

# keep a copy of df without dropping NaNs for data summary purposes
df_no_drop_nans = df

# Drop all rows with NaN
df = df.dropna()

In [172]:
df.head()

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD
0,149957.5,Honolulu,HI,96813,238220.0,Corporation,Unanswered,Unanswered,Unanswered,N,14,04/16/2020,First Hawaiian Bank,HI-01
1,149900.0,Honolulu,HI,96814,541990.0,Non-Profit Organization,Unanswered,Unanswered,Unanswered,Y,17,04/08/2020,First Hawaiian Bank,HI-01
2,149800.0,Honolulu,HI,96816,722511.0,Corporation,Asian,Male Owned,Non-Veteran,N,42,04/12/2020,Central Pacific Bank,HI-01
3,149800.0,Honolulu,HI,96815,722511.0,Corporation,Unanswered,Unanswered,Unanswered,N,27,05/27/2020,"American Savings Bank, FSB",HI-01
4,149700.0,Aiea,HI,96701,621111.0,Limited Liability Partnership,Unanswered,Unanswered,Unanswered,N,10,04/06/2020,"American Savings Bank, FSB",HI-01


In [111]:
set(df['CD'])

{'HI-01', 'HI-02'}

# Data Changes

In [197]:
def num_changed_values(col_name):
    og_col = original_df[col_name]
    new_col = df_no_drop_nans[col_name]
    num_changed = 0
    for i in range(len(og_col)):
        if pd.isna(og_col[i]) and pd.isna(new_col[i]):
            continue
        if og_col[i] != new_col[i]:
            num_changed += 1
    return num_changed
for col_name in df.columns:
    print(col_name + ": " + str(num_changed_values(col_name)))

LoanAmount: 0
City: 15511
State: 0
Zip: 0
NAICSCode: 0
BusinessType: 0
RaceEthnicity: 0
Gender: 0
Veteran: 0
NonProfit: 21226
JobsReported: 2447
DateApproved: 0
Lender: 1
CD: 1


In [205]:
def num_unique_values_dif(col_name):
    og_unique = set(original_df[col_name])
    new_unique = set(df_no_drop_nans[col_name])
    try:
        og_unique.remove(np.nan)
    except:
        pass
    try:
        new_unique.remove(np.nan)
    except:
        pass
    
    
    return len(og_unique) - len(new_unique)
for col_name in df.columns:
    print(col_name + ": " + str(num_unique_values_dif(col_name)))

LoanAmount: 0
City: 247
State: 0
Zip: 0
NAICSCode: 0
BusinessType: 0
RaceEthnicity: 0
Gender: 0
Veteran: 0
NonProfit: -1
JobsReported: 2447
DateApproved: 0
Lender: 1
CD: 1


# Data Quality

## 1

In [62]:
# before
pysqldf("SELECT * FROM original_df WHERE City REGEXP '[0-9]+|^[.!?\\-_]$';")

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD
0,113667.5,SUITE 620,HI,96701,621210.0,Corporation,Unanswered,Unanswered,Unanswered,,7.0,04/30/2020,First Hawaiian Bank,HI-01
1,107397.5,184 Puueo Street,HI,96720,561710.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,12.0,05/01/2020,First Hawaiian Bank,HI-02
2,49442.5,D1,HI,96761,238220.0,Subchapter S Corporation,Unanswered,Male Owned,Non-Veteran,,4.0,04/15/2020,First Hawaiian Bank,HI-02
3,44000.0,5,HI,96814,541511.0,Corporation,Unanswered,Unanswered,Unanswered,,2.0,04/04/2020,"American Savings Bank, FSB",HI-01
4,24300.0,_,HI,96749,561499.0,Self-Employed Individuals,Unanswered,Unanswered,Unanswered,,35.0,04/27/2020,First Hawaiian Bank,HI-02
5,17895.0,Kailua-kona 96740,HI,96740,531390.0,Sole Proprietorship,Unanswered,Unanswered,Unanswered,,1.0,05/05/2020,WebBank,HI-02
6,17309.57,1137 11th Avenue,HI,96816,453220.0,Sole Proprietorship,Unanswered,Unanswered,Unanswered,,0.0,04/30/2020,Celtic Bank Corporation,HI-01


In [63]:
# after
pysqldf("SELECT * FROM df WHERE City REGEXP '[0-9]+|^[.!?\\-_]$';")

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD


## 2

In [79]:
# before
pysqldf("SELECT * FROM original_df WHERE CD NOT IN ('HI-01', 'HI-02');")

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD
0,12500.0,Honolulu,HI,97817,315220.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,1.0,05/03/2020,WebBank,OR-02


In [80]:
# after
pysqldf("SELECT * FROM df WHERE CD NOT IN ('HI-01', 'HI-02');")

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD


## 3

In [147]:
# before
pysqldf("SELECT * FROM original_df WHERE typeof(JobsReported) != 'real' AND typeof(JobsReported) != 'integer'")

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD
0,149657.00,CAPTAIN COOK,HI,96704,721110.0,Corporation,Unanswered,Male Owned,Non-Veteran,,,04/16/2020,Bank of Hawaii,HI-02
1,149615.00,HONOLULU,HI,96816,621210.0,Subchapter S Corporation,Unanswered,Unanswered,Unanswered,,,04/11/2020,Bank of Hawaii,HI-01
2,149320.00,KAHULUI,HI,96732,238220.0,Limited Liability Company(LLC),White,Male Owned,Unanswered,,,04/12/2020,Bank of Hawaii,HI-02
3,148595.00,AIEA,HI,96701,531311.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,,04/15/2020,Bank of Hawaii,HI-01
4,147700.00,HILO,HI,96720,621111.0,Sole Proprietorship,Unanswered,Unanswered,Unanswered,,,04/12/2020,Bank of Hawaii,HI-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2442,415.00,HONOLULU,HI,96830,561720.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,,06/19/2020,Bank of Hawaii,HI-01
2443,325.00,HONOLULU,HI,96813,485999.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,,05/05/2020,Bank of Hawaii,HI-01
2444,294.57,KAPOLEI,HI,96707,621111.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,,06/12/2020,Bank of Hawaii,HI-02
2445,255.00,HONOLULU,HI,96819,711410.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,,04/15/2020,Bank of Hawaii,HI-01


In [148]:
# after
pysqldf("SELECT * FROM df WHERE typeof(JobsReported) != 'real' AND typeof(JobsReported) != 'integer'")

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD


## 4

In [155]:
# before
pysqldf("SELECT * FROM original_df WHERE NonProfit NOT IN ('Y', 'N') OR NonProfit IS NULL;")

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD
0,149957.5,HONOLULU,HI,96813,238220.0,Corporation,Unanswered,Unanswered,Unanswered,,14.0,04/16/2020,First Hawaiian Bank,HI-01
1,149800.0,HONOLULU,HI,96816,722511.0,Corporation,Asian,Male Owned,Non-Veteran,,42.0,04/12/2020,Central Pacific Bank,HI-01
2,149800.0,HONOLULU,HI,96815,722511.0,Corporation,Unanswered,Unanswered,Unanswered,,27.0,05/27/2020,"American Savings Bank, FSB",HI-01
3,149700.0,AIEA,HI,96701,621111.0,Limited Liability Partnership,Unanswered,Unanswered,Unanswered,,10.0,04/06/2020,"American Savings Bank, FSB",HI-01
4,149657.0,CAPTAIN COOK,HI,96704,721110.0,Corporation,Unanswered,Male Owned,Non-Veteran,,,04/16/2020,Bank of Hawaii,HI-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21221,200.0,EWA BEACH,HI,96706,531210.0,Sole Proprietorship,Unanswered,Unanswered,Unanswered,,0.0,06/24/2020,Navy FCU,HI-01
21222,117.0,Honolulu,HI,96814,541922.0,Sole Proprietorship,Unanswered,Unanswered,Unanswered,,1.0,05/13/2020,First Hawaiian Bank,HI-01
21223,104.0,Haiku,HI,96708,561510.0,Sole Proprietorship,Unanswered,Unanswered,Unanswered,,1.0,05/06/2020,First Hawaiian Bank,HI-02
21224,89.0,KIHEI,HI,96753,721199.0,Sole Proprietorship,Unanswered,Unanswered,Unanswered,,,05/07/2020,Bank of Hawaii,HI-02


In [156]:
# after
pysqldf("SELECT * FROM df WHERE NonProfit NOT IN ('Y', 'N') OR NonProfit IS NULL;")

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD
