In [1]:
# General imports
import re
import warnings
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Data processing
from sklearn.model_selection import train_test_split
import imblearn

# Models
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import calibration_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Fairlearn algorithms and utils
from fairlearn.postprocessing import ThresholdOptimizer
from fairlearn.reductions import GridSearch, ExponentiatedGradient, EqualizedOdds, DemographicParity

# Metrics
from fairlearn.metrics import demographic_parity_ratio, equalized_odds_ratio, equalized_odds_difference, demographic_parity_difference
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform


# AIF360 algorithms and utils
from aif360.algorithms.postprocessing.reject_option_classification import RejectOptionClassification
from aif360.algorithms.inprocessing import MetaFairClassifier
from aif360.datasets import StandardDataset


from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
home_df = pd.read_csv("application_train.csv", sep=',')

print(home_df["TARGET"].value_counts())
# In[3]:


home_df["EXT_SOURCE_1"].fillna(home_df["EXT_SOURCE_1"].mean(), inplace=True)
home_df["EXT_SOURCE_2"].fillna(home_df["EXT_SOURCE_1"].mean(), inplace=True)
home_df["EXT_SOURCE_3"].fillna(home_df["EXT_SOURCE_1"].mean(), inplace=True)

home_df.dropna(axis='index', how='any', subset=["AMT_ANNUITY", "DAYS_LAST_PHONE_CHANGE", "NAME_TYPE_SUITE"], inplace=True)
home_df.drop(home_df[home_df["CODE_GENDER"] == "XNA"].index, inplace=True, axis='index')

# delete 1292 rows that have missing values for these features
home_df.drop(home_df[home_df["OBS_30_CNT_SOCIAL_CIRCLE"].isna()].index, inplace=True, axis='index')

# delete ID since we're using only the trainset
home_df.drop(home_df.columns[0], inplace=True, axis=1)

for name, missing in home_df.isnull().sum().iteritems():
    norm = len(home_df["TARGET"])
    if missing > 0:
        print(name, missing / norm)




0    282686
1     24825
Name: TARGET, dtype: int64
OWN_CAR_AGE 0.6598674229391738
OCCUPATION_TYPE 0.31351230908870475
APARTMENTS_AVG 0.5080100006225814
BASEMENTAREA_AVG 0.5856655558504625
YEARS_BEGINEXPLUATATION_AVG 0.4883429833443104
YEARS_BUILD_AVG 0.6653789062883994
COMMONAREA_AVG 0.6990802179690085
ELEVATORS_AVG 0.533545666342269
ENTRANCES_AVG 0.5040287567050373
FLOORSMAX_AVG 0.49815027803172546
FLOORSMIN_AVG 0.6788954751442586
LANDAREA_AVG 0.5942244110871909
LIVINGAPARTMENTS_AVG 0.6839351073625161
LIVINGAREA_AVG 0.5024690265776703
NONLIVINGAPARTMENTS_AVG 0.6947188717515179
NONLIVINGAREA_AVG 0.5523574534456601
APARTMENTS_MODE 0.5080100006225814
BASEMENTAREA_MODE 0.5856655558504625
YEARS_BEGINEXPLUATATION_MODE 0.4883429833443104
YEARS_BUILD_MODE 0.6653789062883994
COMMONAREA_MODE 0.6990802179690085
ELEVATORS_MODE 0.533545666342269
ENTRANCES_MODE 0.5040287567050373
FLOORSMAX_MODE 0.49815027803172546
FLOORSMIN_MODE 0.6788954751442586
LANDAREA_MODE 0.5942244110871909
LIVINGAPARTMENTS_M

In [3]:
for idx in tqdm(home_df.loc[(home_df["OCCUPATION_TYPE"].isna()) & (home_df["ORGANIZATION_TYPE"] != "XNA") & (home_df["NAME_INCOME_TYPE"] != "Pensioner")].index):
    org_type = home_df.at[idx, "ORGANIZATION_TYPE"]
    unnorm_distr = home_df.loc[home_df["ORGANIZATION_TYPE"] == org_type, "OCCUPATION_TYPE"].value_counts()
    distr = unnorm_distr / unnorm_distr.sum()
    new_occ = np.random.choice(a=distr.index.tolist(), p=distr.tolist())
    home_df.at[idx, "OCCUPATION_TYPE"] = new_occ

100%|██████████| 40666/40666 [12:30<00:00, 54.22it/s]


In [4]:


# remaining jobless people
print(home_df.loc[home_df["OCCUPATION_TYPE"].isna(), "ORGANIZATION_TYPE"].value_counts())
print(home_df.loc[home_df["OCCUPATION_TYPE"].isna(), "NAME_INCOME_TYPE"].value_counts())

home_df.loc[home_df["NAME_INCOME_TYPE"] == "Pensioner", "OCCUPATION_TYPE"] = "Pensioner"
home_df.loc[home_df["NAME_INCOME_TYPE"] == "Unemployed", "OCCUPATION_TYPE"] = "Unemployed"

# home_df["ORGANIZATION_TYPE"].isna().sum().sum()


# In[6]:


# impute car age with -1
home_df.loc[(home_df["FLAG_OWN_CAR"] == "N") & (home_df["OWN_CAR_AGE"].isna()), "OWN_CAR_AGE"] = -1
home_df["OWN_CAR_AGE"].fillna(0, inplace=True)
# home_df.loc[home_df["FLAG_OWN_CAR"] == "N", "OWN_CAR_AGE"].value_counts(dropna=False)


# In[7]:


housing_info = [x for x in sorted(home_df.columns) if re.search("_AVG|_MEDI|_MODE", x)]
set(housing_info).issubset(set(home_df.isnull().sum().index))

# are there people who own a house but have missing house information
# home_df.loc[home_df["FLAG_OWN_REALTY"] == "Y", housing_info].isnull().sum().sum()

# are there people who don't own a house but have housing information
# home_df.loc[home_df["FLAG_OWN_REALTY"] == "N", housing_info]
# home_df.loc[home_df["FLAG_OWN_REALTY"] == "N", "NAME_HOUSING_TYPE"].value_counts(dropna=False)

# impute all housing info with -1 or drop if more than 60% is missing
for name, missing in home_df[housing_info].isnull().sum().iteritems():
    norm = len(home_df["TARGET"])
    if (missing / norm) > 0.6:
        home_df.drop(name, inplace=True, axis=1)

    else:
        home_df[name].fillna(-1, inplace=True)



# In[8]:


# Fill in enquirement data with -1 if missing
req_info = [x for x in sorted(home_df.columns) if re.search("_REQ", x)]

home_df[req_info] = home_df[req_info].fillna(-1)


# In[9]:


for name, col in home_df.isnull().sum().iteritems():
    if col > 0:
        print(name, col)



XNA                       55006
Business Entity Type 3        2
Trade: type 7                 1
Business Entity Type 2        1
Military                      1
School                        1
Name: ORGANIZATION_TYPE, dtype: int64
Pensioner     54993
Unemployed       19
Name: NAME_INCOME_TYPE, dtype: int64


In [12]:
names = ["NAME_TYPE_SUITE", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "NAME_CONTRACT_TYPE", "OCCUPATION_TYPE", "WEEKDAY_APPR_PROCESS_START", "ORGANIZATION_TYPE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE"]
categorical_names = names + [x for x in sorted(home_df.columns) if re.search("FLAG_", x)] + ["MALE"]


home_df["CODE_GENDER"].replace(to_replace = {'M':1, 'F':0}, inplace=True)
home_df["FLAG_OWN_CAR"].replace(to_replace = {'Y':1, 'N':0}, inplace=True)
home_df["FLAG_OWN_REALTY"].replace(to_replace = {'Y':1, 'N':0}, inplace=True)
home_df["EMERGENCYSTATE_MODE"].replace(to_replace = {'Yes':1, 'No':0}, inplace=True)
home_df.rename(columns={"CODE_GENDER":"MALE"}, inplace=True)

mappings = {}
for c in home_df.columns:
    if c in categorical_names:
        if not all([x.isnumeric() for x in home_df[c].values if type(x) == str]):
            categories = set(home_df[c].values)
            if np.nan in categories:
                categories.remove(np.nan)
            mapping = dict(zip(categories, range(len(categories))))
            mappings[c] = mapping
            home_df[c] = home_df[c].map(mapping)
    home_df[c] = home_df[c].astype('float64')

print(home_df.dtypes)
home_df

KeyError: 'CODE_GENDER'

In [16]:
names = ["NAME_TYPE_SUITE", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "NAME_CONTRACT_TYPE", "OCCUPATION_TYPE", "WEEKDAY_APPR_PROCESS_START", "ORGANIZATION_TYPE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE"]
categorical_names = names + [x for x in sorted(home_df.columns) if re.search("FLAG_", x)] + ["MALE"]
print(categorical_names)
home_df

['NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'FLAG_CONT_MOBILE', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_EMAIL', 'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_PHONE', 'FLAG_WORK_PHONE', 'MALE']


Unnamed: 0,NAME_CONTRACT_TYPE,MALE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,0.0,1.0,0.0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,0.0,67500.0,135000.0,6750.0,135000.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,135000.0,312682.5,29686.5,297000.0,4.0,...,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,0.0,1.0,0.0,1.0,0.0,121500.0,513000.0,21865.5,513000.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0.0,1.0,0.0,0.0,0.0,157500.0,254700.0,27558.0,225000.0,4.0,...,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
307507,0.0,0.0,0.0,1.0,0.0,72000.0,269550.0,12001.5,225000.0,4.0,...,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
307508,0.0,0.0,0.0,1.0,0.0,153000.0,677664.0,29979.0,585000.0,4.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
307509,0.0,0.0,0.0,1.0,0.0,171000.0,370107.0,20205.0,319500.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
print(labels)
categorical_ixs = [home_df.columns.get_loc(name) for name in categorical_names]
print(labels.value_counts(normalize=True))

oversample = imblearn.over_sampling.SMOTENC(sampling_strategy=0.12, categorical_features=categorical_ixs)
X, y = oversample.fit_resample(home_df, labels)

print(y.value_counts(normalize=True), len(y))

undersample = imblearn.under_sampling.RandomUnderSampler(sampling_strategy=0.5)
X, y = undersample.fit_resample(X, y)
print(y.value_counts(normalize=True), len(y))


# In[ ]:


X["Senior"] = X["DAYS_BIRTH"].apply(lambda x: -x / 365.25)
# Split groups in minors, middle-aged and seniors
X["Senior"] = X["Senior"].apply(lambda x : 0 if (x <= 26) else (1 if x > 50 else 2))
X.drop("DAYS_BIRTH", inplace=True, axis=1)
print(X["Senior"].value_counts())
X["TARGET"] = y
X

0         1.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
307506    0.0
307507    0.0
307508    0.0
307509    1.0
307510    0.0
Name: TARGET, Length: 305181, dtype: float64
0.0    0.919002
1.0    0.080998
Name: TARGET, dtype: float64
0.0    0.892858
1.0    0.107142
Name: TARGET, dtype: float64 314117
0.0    0.666667
1.0    0.333333
Name: TARGET, dtype: float64 100965
2    64226
1    30962
0     5777
Name: Senior, dtype: int64


Unnamed: 0,NAME_CONTRACT_TYPE,MALE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,Senior,TARGET
0,0.0,1.0,1.0,0.0,0.000000,171900.000000,1.006920e+06,42790.500000,9.000000e+05,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,2.000000,1.000000,1,0.0
1,0.0,0.0,1.0,1.0,0.000000,90000.000000,4.904955e+05,28287.000000,4.545000e+05,4.0,...,0.0,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,1,0.0
2,1.0,0.0,0.0,1.0,0.000000,360000.000000,9.000000e+05,45000.000000,9.000000e+05,4.0,...,0.0,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,2,0.0
3,1.0,1.0,1.0,1.0,1.000000,72000.000000,1.800000e+05,9000.000000,1.800000e+05,4.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,0.0
4,0.0,0.0,0.0,1.0,0.000000,270000.000000,5.925600e+05,35937.000000,4.500000e+05,4.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,5.000000,2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100960,0.0,0.0,0.0,1.0,0.000000,93442.730638,3.735000e+05,18617.173949,3.735000e+05,4.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,2.071072,1,1.0
100961,0.0,1.0,1.0,1.0,1.892397,180000.000000,5.132601e+05,22033.505553,4.587579e+05,4.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.946199,0.000000,1.322808,2,1.0
100962,0.0,1.0,0.0,1.0,0.379115,227559.026649,1.260141e+06,46294.875177,1.116470e+06,4.0,...,0.0,0.0,-0.620885,-0.620885,-0.620885,-0.620885,-0.620885,-0.241770,2,1.0
100963,0.0,0.0,0.0,1.0,0.000000,180000.000000,5.400000e+05,15618.741075,5.400000e+05,4.0,...,0.0,0.0,-0.261681,-0.261681,-0.261681,0.476637,-0.261681,1.953274,2,1.0


In [None]:
X["MALE"].value_counts()

In [None]:
X.to_csv("Homecredit_cleaned_smoted.csv", index=False)

In [None]:
Z = pd.read_csv("Homecredit_cleaned_smoted.csv")

In [None]:
Z

In [17]:
req_info = [x for x in sorted(Z.columns) if re.search("FLAG_", x)]

In [18]:
req_info

['FLAG_CONT_MOBILE',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_EMAIL',
 'FLAG_EMP_PHONE',
 'FLAG_MOBIL',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FLAG_PHONE',
 'FLAG_WORK_PHONE']