## Data Cleaning
### Thea Yang, Nick Gammal, Nick Hausman, Charlie Ward

Cleaning file: `application_train.csv`

In [48]:
#importing libraries
import pandas as pd
import numpy as np

In [49]:
# reading in data
df = pd.read_csv("/Users/nickhausman/Desktop/DATA/DATA_403_Project_2/home-credit-default-risk/application_train.csv")

In [50]:
"""
dropping columns that either had too high correlation with other columns or 
too many missing obs that could not be imputed or modified
""" 
df = df.drop(columns=['AMT_GOODS_PRICE', 'CNT_CHILDREN', 'FLAG_EMP_PHONE', 'REGION_RATING_CLIENT_W_CITY', 
 'REG_REGION_NOT_WORK_REGION', 'LIVE_CITY_NOT_WORK_CITY', 'LIVINGAPARTMENTS_MEDI', 
 'ELEVATORS_MEDI', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'LIVINGAREA_MEDI', 'EXT_SOURCE_1'])

In [51]:
# dropping obs for rows that have very little missing values or can't be modified and we still want to keep
clean_df = df[df['DAYS_LAST_PHONE_CHANGE'].notnull() & 
   df['CNT_FAM_MEMBERS'].notnull() & 
   df['EXT_SOURCE_2'].notnull() & 
   df['DEF_30_CNT_SOCIAL_CIRCLE'].notnull() &
   df['OBS_30_CNT_SOCIAL_CIRCLE'].notnull() &
   df['EXT_SOURCE_3'].notnull()
  ]

# imputing amt ammunity column with mean amt
mean_amt_annuity = clean_df.loc[:,'AMT_ANNUITY'].mean()
clean_df['AMT_ANNUITY'].fillna(value=mean_amt_annuity, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['AMT_ANNUITY'].fillna(value=mean_amt_annuity, inplace=True)


### Functions

In [52]:
def map_amt_req(c):
    if c >= 1:
        return 1
    else:
        return 0
    
# discretize own car age
def agemap(num):
    if num <= 1:
        return "new"
    elif num <= 5:
        return "young"
    elif num <= 10:
        return "middle"
    elif num <= 20:
        return "aging"
    elif num <= 60:
        return "old"
    elif num > 60:
        return "classic"
    else:
        return "no car"

# Refactor occupation type
blue = ["Laborers", "Drivers", "Medicine staff", "Security staff", "Cooking staff", "Cleaning staff", "Private service staff", "Low-skill Laborers", "Secretaries", "Waiters/barmen staff"]
white = ["Sales staff", "Core staff", "Managers", "High skill tech staff", "Accountants", "Realty agents", "HR staff", "IT staff"]
def workmap(job):
    if job in blue:
        return "blue"
    elif job in white:
        return "white"
    else:
        return "other"
    
def accompany_map(c):
    if c == 'Unaccompanied':
        return 'Unaccompanied'
    elif c in ['Family', 'Spouse, partner', 'Children', 'Other_B', 'Other_A', 'Group of people']:
        return 'Accompanied'
    else:
        return 'Unknown'

In [53]:
# makng new column based on whether the person has made an enquiry to the Credit Bureau at all in the past year

clean_df['SUM_AMT_REQ_CREDIT'] = clean_df.loc[:,['AMT_REQ_CREDIT_BUREAU_MON',
'AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_DAY',
'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_QRT',
'AMT_REQ_CREDIT_BUREAU_YEAR']].sum(axis=1)

clean_df['AMT_REQ_CREDIT'] = clean_df.loc[:,'SUM_AMT_REQ_CREDIT'].apply(map_amt_req)

clean_df = clean_df.drop(columns=['AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_DAY',
'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_QRT',
'AMT_REQ_CREDIT_BUREAU_YEAR',
'AMT_REQ_CREDIT_BUREAU_MON',
'SUM_AMT_REQ_CREDIT'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['SUM_AMT_REQ_CREDIT'] = clean_df.loc[:,['AMT_REQ_CREDIT_BUREAU_MON',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['AMT_REQ_CREDIT'] = clean_df.loc[:,'SUM_AMT_REQ_CREDIT'].apply(map_amt_req)


In [54]:
# remapping categorical columns to simplify levels and account for missing values
clean_df['OCCUPATION_TYPE'] = clean_df['OCCUPATION_TYPE'].apply(workmap)
clean_df['OWN_CAR_AGE'] = clean_df['OWN_CAR_AGE'].apply(agemap)
clean_df['NAME_TYPE_SUITE']= clean_df['NAME_TYPE_SUITE'].apply(accompany_map)

In [55]:
# get all the 'housing-related' columns and keep only the median ones
mode_cols = np.array(clean_df.columns[clean_df.columns.str.contains("_MODE")])
avg_cols = np.array(clean_df.columns[clean_df.columns.str.contains("_AVG")])
med_cols = np.array(clean_df.columns[clean_df.columns.str.contains("_MEDI")])

clean_df = clean_df.drop(columns=mode_cols).drop(columns=avg_cols)

In [56]:
# code related to created a 'HOUSING SCORE' based on number of housing columns that are above average for that row
clean_df = clean_df.reset_index(drop=True)
combine = clean_df[med_cols]

housing_columns_above_mean_cnt = pd.Series(np.zeros(len(combine.index)))
for col in combine.columns:
    housing_columns_above_mean_cnt += (combine[col] > combine[col].mean()).astype(int)

na_bool_series = [combine[col].isna() for col in combine.columns]
undefined_housing_indicies = []
for i in range(len(na_bool_series[0])):
    if all(l[i] for l in na_bool_series):
        undefined_housing_indicies.append(i)
        
for i in undefined_housing_indicies: housing_columns_above_mean_cnt[i] = 'NO INFO' 
    
clean_df['HOUSING_SCORE'] = housing_columns_above_mean_cnt
# dropping the original columns
clean_df = clean_df.drop(columns=med_cols)

In [57]:
# final filter to get columns with only known housing scores
clean_df_2 = clean_df[clean_df['HOUSING_SCORE'] != 'NO INFO']

In [58]:
# check for missing values
test = pd.DataFrame(clean_df.isna().sum()).reset_index()
test.columns = ['name', 'count']
test[test['count'] > 0 ]

Unnamed: 0,name,count


## Adding Additional Features from Supplemental Tables

In [59]:
# removing flag document columns
clean_df_2 = clean_df_2.drop(clean_df_2.loc[:, 'FLAG_DOCUMENT_2':'FLAG_DOCUMENT_21' ].columns, axis=1)

In [60]:
# reading in additional tables
df_credit = pd.read_csv('home-credit-default-risk/credit_card_balance.csv')
df_prev = pd.read_csv('home-credit-default-risk/previous_application.csv')

### Average Annuity Credit Ratio from Previous Loans

In [61]:
# get average annuity credit ratio for previous applications
df_prev['ANNUITY_CREDIT_RATIO'] = df_prev['AMT_ANNUITY']/df_prev['AMT_CREDIT']
avg_ann_cred_ratio = pd.DataFrame(df_prev.groupby('SK_ID_CURR')['ANNUITY_CREDIT_RATIO'].mean()).reset_index()

In [62]:
# merge in new feature
df_clean = clean_df_2.merge(avg_ann_cred_ratio, on='SK_ID_CURR', how='left')

In [63]:
df_clean['TARGET'].value_counts()

0    118965
1      8676
Name: TARGET, dtype: int64

In [64]:
# there's not much lost from removing obs that don't have previous data on avg annuity credit ratio so dropping
df_clean[df_clean['ANNUITY_CREDIT_RATIO'].notnull()]['TARGET'].value_counts()

0    111463
1      8249
Name: TARGET, dtype: int64

In [65]:
df_clean = df_clean[df_clean['ANNUITY_CREDIT_RATIO'].notnull()]

### Number of Months of Missed Minimum Payments

In [66]:
df_credit['payment diff'] = df_credit['AMT_PAYMENT_CURRENT'] - df_credit['AMT_INST_MIN_REGULARITY']
df_credit['CNT_MISSED_MIN'] = np.where(df_credit['payment diff'] < 0, 1, 0)

cnt_missed_min = pd.DataFrame(df_credit.groupby('SK_ID_CURR')['CNT_MISSED_MIN'].sum()).reset_index()

In [67]:
# merge in new feature
df_clean = df_clean.merge(cnt_missed_min, on='SK_ID_CURR', how='left')

In [68]:
# fill in 0 if never missed min
df_clean['CNT_MISSED_MIN'].fillna(0, inplace=True)

### Number of Previous Total Applied Loans & Number of Previous Accepted Loans

In [69]:
# previous total amount of applied loans per id
prev_loan_cnt = pd.DataFrame(df_prev['SK_ID_CURR'].value_counts()).reset_index()
prev_loan_cnt.columns = ['SK_ID_CURR', 'CNT_PREV_LOANS']

In [70]:
# previous total amount of accepted loans per id
prev_acc_loan = pd.DataFrame(df_prev[df_prev['NAME_CONTRACT_STATUS'] == 'Approved']['SK_ID_CURR'].value_counts()).reset_index()
prev_acc_loan.columns = ['SK_ID_CURR', 'CNT_ACCEPTED_LOANS']

In [71]:
# merge in new feature
df_clean = df_clean.merge(prev_loan_cnt, on='SK_ID_CURR', how='left')
df_clean = df_clean.merge(prev_acc_loan, on='SK_ID_CURR', how='left')

In [72]:
# if there is missing data means they never previously applied for loan -> fill in with 0
df_clean['CNT_PREV_LOANS'].fillna(0, inplace=True)
df_clean['CNT_ACCEPTED_LOANS'].fillna(0, inplace=True)

In [73]:
df_clean = df_clean.rename(columns={'ANNUITY_CREDIT_RATIO':'PREV_AVG_AC_RATIO'})

### Bureau data

In [74]:
len(df_clean.index)

119712

In [75]:
bureau = pd.read_csv('bureau.csv')
df_clean = pd.merge(df_clean, bureau, on='SK_ID_CURR')

In [76]:
len(df_clean.index)

119103

### Dropping FLAG_MOBIL because it breaks modeling

In [77]:
df_clean = df_clean.drop('FLAG_MOBIL', axis=1)

In [78]:
# export to csv
df_clean.to_csv('cleaned_training_data.csv', index=False)