## Discretizing Attributes

In [79]:
# Import packages
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [80]:
# Set up directory
# The data directory can be overridden with the AI_M_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used.
dir_data = os.environ.get('AI_M_DATA_DIR', r'C:\Users\victo\Dropbox\AI_M\data_p1')
f_app_train = os.path.join(dir_data, 'application_train.csv')
f_app_test = os.path.join(dir_data, 'application_test.csv')

# Load the training and testing tables
app_train = pd.read_csv(f_app_train)
app_test = pd.read_csv(f_app_test)

## Data Prep

In [81]:
from sklearn.preprocessing import LabelEncoder

# DAYS_EMPLOYED value that is flagged as anomalous and blanked out below
# (365243 days is roughly 1000 years).
DAYS_EMPLOYED_ANOMALY = 365243

# One encoder object, re-fitted for every column that gets label encoded
le = LabelEncoder()
le_count = 0

# Label-encode every object column that has 2 or fewer unique values
for col in app_train:
    if app_train[col].dtype == 'object' and len(app_train[col].unique()) <= 2:
        # Fit on the training data only, then apply the same mapping to
        # both the training and the testing data
        le.fit(app_train[col])
        app_train[col] = le.transform(app_train[col])
        app_test[col] = le.transform(app_test[col])

        # Keep track of how many columns were label encoded
        le_count += 1

# One-hot encode the remaining categorical columns.
# NOTE(review): the two frames are encoded independently, so their dummy
# columns may differ — confirm whether they need to be aligned downstream.
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Apply the same clean-up to the training and the testing data
for frame in (app_train, app_test):
    # Create an anomalous flag column before the raw value is overwritten
    frame['DAYS_EMPLOYED_ANOM'] = frame['DAYS_EMPLOYED'] == DAYS_EMPLOYED_ANOMALY
    # Assign the result back instead of calling replace(..., inplace=True)
    # on the selected column: the in-place form acts on an intermediate
    # object and does not update the frame under pandas Copy-on-Write.
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_ANOMALY: np.nan})

    # Take the absolute value of DAYS_BIRTH
    frame['DAYS_BIRTH'] = frame['DAYS_BIRTH'].abs()

In [109]:
# Express DAYS_BIRTH in years (365 days per year) as a one-column frame "age"
age_in_years = app_train['DAYS_BIRTH'] / 365
age_train = age_in_years.to_frame(name="age")
age_train.head()

Unnamed: 0,age
0,25.920548
1,45.931507
2,52.180822
3,52.068493
4,54.608219


#### Equal-Width Binning

In [110]:
# Split the age range into 4 intervals of equal width
age_train['Equal_Bin_age'] = pd.cut(age_train["age"], bins=4)

In [111]:
# Every bin spans the same width of ages, so the number of rows per bin differs
age_train['Equal_Bin_age'].value_counts()

(32.668, 44.819]    100808
(44.819, 56.97]      84685
(20.469, 32.668]     66536
(56.97, 69.121]      55482
Name: Equal_Bin_age, dtype: int64

#### Equal Depth

In [112]:
# Split at the quartiles so each of the 4 intervals holds about the same number of rows
age_train['Equal_Depth_age'] = pd.qcut(age_train["age"], q=4)

In [113]:
# Every bin holds (approximately) the same number of rows; the bin widths differ
age_train['Equal_Depth_age'].value_counts()

(43.151, 53.923]    76887
(20.517, 34.008]    76884
(34.008, 43.151]    76877
(53.923, 69.121]    76863
Name: Equal_Depth_age, dtype: int64

#### Self-Defined Bins

In [114]:
# Hand-picked bin edges: (20, 30], (30, 50], (50, 100]
custom_age_edges = [20, 30, 50, 100]
age_train["customized_age"] = pd.cut(age_train["age"], bins=custom_age_edges)

In [115]:
# Count the rows that fall into each of the user-defined age intervals
age_train["customized_age"].value_counts()

(30, 50]     158849
(50, 100]    103641
(20, 30]      45021
Name: customized_age, dtype: int64