## Data Cleaning
### Thea Yang, Nick Gammal, Nick Hausman, Charlie Ward

Cleaning file: `application_train.csv`

In [1]:
#importing libraries
import pandas as pd
import numpy as np

In [2]:
# reading in data
df = pd.read_csv("application_train.csv")

In [4]:
"""
dropping columns that either had too high correlation with other columns or 
too many missing obs that could not be imputed or modified
""" 
df = df.drop(columns=['AMT_GOODS_PRICE', 'CNT_CHILDREN', 'FLAG_EMP_PHONE', 'REGION_RATING_CLIENT_W_CITY', 
 'REG_REGION_NOT_WORK_REGION', 'LIVE_CITY_NOT_WORK_CITY', 'LIVINGAPARTMENTS_MEDI', 
 'ELEVATORS_MEDI', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'LIVINGAREA_MEDI', 'EXT_SOURCE_1'])

In [5]:
# dropping obs for rows that have very little missing values or can't be modified and we still want to keep
clean_df = df[df['DAYS_LAST_PHONE_CHANGE'].notnull() & 
   df['CNT_FAM_MEMBERS'].notnull() & 
   df['EXT_SOURCE_2'].notnull() & 
   df['DEF_30_CNT_SOCIAL_CIRCLE'].notnull() &
   df['OBS_30_CNT_SOCIAL_CIRCLE'].notnull() &
   df['EXT_SOURCE_3'].notnull()
  ]

# imputing amt ammunity column with mean amt
mean_amt_annuity = clean_df['AMT_ANNUITY'].mean()
clean_df['AMT_ANNUITY'].fillna(value=mean_amt_annuity, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


### Functions

In [45]:
def map_amt_req(c):
    if c >= 1:
        return 1
    else:
        return 0
    
# discretize own car age
def agemap(num):
    if num <= 1:
        return "new"
    elif num <= 5:
        return "young"
    elif num <= 10:
        return "middle"
    elif num <= 20:
        return "aging"
    elif num <= 60:
        return "old"
    elif num > 60:
        return "classic"
    else:
        return "no car"

# Refactor occupation type
blue = ["Laborers", "Drivers", "Medicine staff", "Security staff", "Cooking staff", "Cleaning staff", "Private service staff", "Low-skill Laborers", "Secretaries", "Waiters/barmen staff"]
white = ["Sales staff", "Core staff", "Managers", "High skill tech staff", "Accountants", "Realty agents", "HR staff", "IT staff"]
def workmap(job):
    if job in blue:
        return "blue"
    elif job in white:
        return "white"
    else:
        return "other"
    
def accompany_map(c):
    if c == 'Unaccompanied':
        return 'Unaccompanied'
    elif c in ['Family', 'Spouse, partner', 'Children', 'Other_B', 'Other_A', 'Group of people']:
        return 'Accompanied'
    else:
        return 'Unknown'

In [6]:
# makng new column based on whether the person has made an enquiry to the Credit Bureau at all in the past year

clean_df['SUM_AMT_REQ_CREDIT'] = clean_df[['AMT_REQ_CREDIT_BUREAU_MON',
'AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_DAY',
'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_QRT',
'AMT_REQ_CREDIT_BUREAU_YEAR']].sum(axis=1)

clean_df['AMT_REQ_CREDIT'] = clean_df['SUM_AMT_REQ_CREDIT'].apply(map_amt_req)

clean_df = clean_df.drop(columns=['AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_DAY',
'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_QRT',
'AMT_REQ_CREDIT_BUREAU_YEAR',
'AMT_REQ_CREDIT_BUREAU_MON',
'SUM_AMT_REQ_CREDIT'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['SUM_AMT_REQ_CREDIT'] = clean_df[['AMT_REQ_CREDIT_BUREAU_MON',


In [9]:
# remapping categorical columns to simplify levels and account for missing values
clean_df['OCCUPATION_TYPE'] = clean_df['OCCUPATION_TYPE'].apply(workmap)
clean_df['OWN_CAR_AGE'] = clean_df['OWN_CAR_AGE'].apply(agemap)
clean_df['NAME_TYPE_SUITE']= clean_df['NAME_TYPE_SUITE'].apply(accompany_map)

In [12]:
# get all the 'housing-related' columns and keep only the median ones
mode_cols = np.array(clean_df.columns[clean_df.columns.str.contains("_MODE")])
avg_cols = np.array(clean_df.columns[clean_df.columns.str.contains("_AVG")])
med_cols = np.array(clean_df.columns[clean_df.columns.str.contains("_MEDI")])

clean_df = clean_df.drop(columns=mode_cols).drop(columns=avg_cols)

In [28]:
# code related to created a 'HOUSING SCORE' based on number of housing columns that are above average for that row
clean_df = clean_df.drop(columns=['index']).reset_index(drop=True)
combine = clean_df[med_cols]

housing_columns_above_mean_cnt = pd.Series(np.zeros(len(combine.index)))
for col in combine.columns:
    housing_columns_above_mean_cnt += (combine[col] > combine[col].mean()).astype(int)

na_bool_series = [combine[col].isna() for col in combine.columns]
undefined_housing_indicies = []
for i in range(len(na_bool_series[0])):
    if all(l[i] for l in na_bool_series):
        undefined_housing_indicies.append(i)
        
for i in undefined_housing_indicies: housing_columns_above_mean_cnt[i] = 'NO INFO' 
    
clean_df['HOUSING_SCORE'] = housing_columns_above_mean_cnt
# dropping the original columns
clean_df = clean_df.drop(columns=med_cols)

0.0     139932
1.0      18704
2.0      17924
4.0      14518
3.0      14121
5.0      12202
6.0       9282
7.0       6837
8.0       5052
9.0       4274
10.0      1831
11.0       691
dtype: int64

In [41]:
# final filter to get columns with only known housing scores
clean_df_2 = clean_df[clean_df['HOUSING_SCORE'] != 'NO INFO']

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT,HOUSING_SCORE
0,100002,1,Cash loans,M,N,Y,202500.0,406597.5,24700.5,Unaccompanied,...,0,0,0,0,0,0,0,0,1,0.0
8,100016,0,Cash loans,F,N,Y,67500.0,80865.0,5881.5,Unaccompanied,...,0,0,0,0,0,0,0,0,1,2.0
9,100017,0,Cash loans,M,Y,N,225000.0,918468.0,28966.5,Unaccompanied,...,0,0,0,0,0,0,0,0,1,7.0
12,100022,0,Revolving loans,F,N,Y,112500.0,157500.0,7875.0,Accompanied,...,0,0,0,0,0,0,0,0,0,2.0
15,100026,0,Cash loans,F,N,N,450000.0,497520.0,32521.5,Unaccompanied,...,0,0,0,0,0,0,0,0,1,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245361,456244,0,Cash loans,F,N,Y,261000.0,1303812.0,35982.0,Unaccompanied,...,0,0,0,0,0,0,0,0,1,3.0
245364,456249,0,Cash loans,F,N,Y,112500.0,225000.0,22050.0,Unaccompanied,...,0,0,0,0,0,0,0,0,1,10.0
245365,456253,0,Cash loans,F,N,Y,153000.0,677664.0,29979.0,Unaccompanied,...,0,0,0,0,0,0,0,0,1,2.0
245366,456254,1,Cash loans,F,N,Y,171000.0,370107.0,20205.0,Unaccompanied,...,0,0,0,0,0,0,0,0,0,0.0


In [42]:
# check for missing values
test = pd.DataFrame(clean_df.isna().sum()).reset_index()
test.columns = ['name', 'count']
test[test['count'] > 0 ]

Unnamed: 0,name,count


In [44]:
# export to csv
# clean_df_2.to_csv('cleaned_training_data.csv')