In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings

In [33]:
# Read the three competition CSVs (train, test, sample submission) in one pass;
# the tuple order matches the names on the left-hand side.
train, test, sample_sub = map(
    pd.read_csv,
    ("input/train.csv", "input/test.csv", "input/sample_submission.csv"),
)

In [34]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [35]:
# Columns of the raw train/test frames that are kept for modelling; the frames
# are subset to exactly this list right after it is defined.
# The list mixes numeric columns with string-valued ones (CODE_GENDER, the
# NAME_* columns, ORGANIZATION_TYPE, OCCUPATION_TYPE) that later cells turn
# into numbers.
original_feature  = [
    "EXT_SOURCE_1",
    "EXT_SOURCE_2", 
    "EXT_SOURCE_3",
    
    "AMT_CREDIT",
    "AMT_ANNUITY",
    "AMT_INCOME_TOTAL",
    "AMT_GOODS_PRICE",
    
    "OWN_CAR_AGE",

    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "CODE_GENDER",
    "CNT_CHILDREN",
    
    "NAME_EDUCATION_TYPE",
    "NAME_INCOME_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_CONTRACT_TYPE",
    
    "ORGANIZATION_TYPE",
    "REGION_POPULATION_RELATIVE",

    "NAME_HOUSING_TYPE",           # Housing situation (renting, own, etc.)
    "REGION_RATING_CLIENT",        # Region rating
    "DAYS_REGISTRATION",           # How long registered
    "DAYS_ID_PUBLISH",            # How long since ID published
    "FLAG_MOBIL",                 # Did client provide mobile phone
    "FLAG_EMP_PHONE",             # Did client provide work phone
    "FLAG_WORK_PHONE",            # Did client provide home phone
    "OCCUPATION_TYPE",            # Client's occupation
    "CNT_FAM_MEMBERS",            # Family size
    "DAYS_LAST_PHONE_CHANGE",    # Days since phone change
    "AMT_REQ_CREDIT_BUREAU_HOUR", # Credit bureau inquiries (last hour)
    "AMT_REQ_CREDIT_BUREAU_MON",  # Credit bureau inquiries (last month)
    "AMT_REQ_CREDIT_BUREAU_QRT",  # Credit bureau inquiries (last quarter)
    "AMT_REQ_CREDIT_BUREAU_YEAR", # Credit bureau inquiries (last year)

]

# Keep the label aside before the frame is narrowed to the feature columns.
target = train["TARGET"].values

# Take explicit copies of the column subsets: the frames are written to right
# below and in every later preprocessing cell, and assigning into a plain
# slice is what triggers pandas' SettingWithCopyWarning.
train = train[original_feature].copy()
train["TARGET"] = target
test = test[original_feature].copy()

In [36]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [37]:
# Display the training frame after it was reduced to the selected feature
# columns plus TARGET.
train

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_CREDIT,AMT_ANNUITY,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,OWN_CAR_AGE,DAYS_BIRTH,DAYS_EMPLOYED,CODE_GENDER,CNT_CHILDREN,NAME_EDUCATION_TYPE,NAME_INCOME_TYPE,NAME_FAMILY_STATUS,NAME_CONTRACT_TYPE,ORGANIZATION_TYPE,REGION_POPULATION_RELATIVE,NAME_HOUSING_TYPE,REGION_RATING_CLIENT,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,,0.372591,,755190.0,36328.5,112500.0,675000.0,,-9233,-878,F,0,Higher education,Working,Married,Cash loans,School,0.010032,House / apartment,2,-333.0,-522,1,1,1,Core staff,2.0,-292.0,,,,,0
1,,0.449567,0.553165,585000.0,16893.0,225000.0,585000.0,,-20148,365243,F,0,Secondary / secondary special,Pensioner,Married,Cash loans,XNA,0.008019,House / apartment,2,-4469.0,-3436,1,0,0,Other,2.0,-617.0,0.0,0.0,0.0,1.0,0
2,,0.569503,,334152.0,18256.5,54000.0,270000.0,,-18496,-523,F,0,Secondary / secondary special,State servant,Married,Cash loans,Postal,0.004960,House / apartment,2,-3640.0,-2050,1,1,1,Core staff,2.0,-542.0,,,,,0
3,,0.105235,0.767523,152820.0,8901.0,67500.0,135000.0,,-24177,365243,F,0,Lower secondary,Pensioner,Widow,Cash loans,XNA,0.005002,House / apartment,3,-4950.0,-3951,1,0,0,Other,1.0,0.0,0.0,0.0,0.0,0.0,0
4,0.342344,0.202490,0.669057,271066.5,21546.0,157500.0,234000.0,,-10685,-697,M,0,Secondary / secondary special,Commercial associate,Married,Cash loans,Business Entity Type 3,0.006296,With parents,3,-5101.0,-3226,1,1,1,Drivers,2.0,-1243.0,0.0,0.0,0.0,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171197,,0.404560,0.768808,404325.0,20772.0,83250.0,337500.0,21.0,-20529,-3059,M,0,Secondary / secondary special,Working,Married,Cash loans,Agriculture,0.031329,House / apartment,2,-11581.0,-3689,1,1,1,Laborers,2.0,-2341.0,0.0,0.0,1.0,0.0,0
171198,,0.608542,,601470.0,29065.5,247500.0,450000.0,1.0,-22083,-129,F,0,Secondary / secondary special,Working,Single / not married,Cash loans,Industry: type 3,0.010006,House / apartment,2,-4629.0,-1773,1,1,0,Cooking staff,1.0,-1688.0,0.0,0.0,1.0,5.0,0
171199,,0.664305,0.758393,1237684.5,49216.5,292500.0,1138500.0,,-11053,-2536,F,2,Higher education,State servant,Married,Cash loans,School,0.006629,House / apartment,2,-4858.0,-3393,1,1,0,Core staff,4.0,-515.0,0.0,0.0,0.0,1.0,0
171200,0.210918,0.627050,,239850.0,25186.5,112500.0,225000.0,7.0,-8505,-165,M,0,Secondary / secondary special,Commercial associate,Single / not married,Cash loans,Business Entity Type 3,0.009657,House / apartment,2,-3318.0,-1176,1,1,0,Sales staff,1.0,-1133.0,,,,,0


In [38]:
# Count-encode ORGANIZATION_TYPE: each category is replaced by the number of
# training rows that carry it. The counts come from train only and are applied
# to both frames; a test category never seen in train maps to NaN.
organization_ce = train["ORGANIZATION_TYPE"].value_counts()
for frame in (train, test):
    frame["ORGANIZATION_TYPE"] = frame["ORGANIZATION_TYPE"].map(organization_ce)

In [39]:
# Count-encode NAME_FAMILY_STATUS using category frequencies taken from train.
nameFamStatus = train["NAME_FAMILY_STATUS"].value_counts()
for frame in (train, test):
    frame["NAME_FAMILY_STATUS"] = frame["NAME_FAMILY_STATUS"].map(nameFamStatus)

In [40]:
# Count-encode NAME_INCOME_TYPE using category frequencies taken from train.
incomeType = train["NAME_INCOME_TYPE"].value_counts()
for frame in (train, test):
    frame["NAME_INCOME_TYPE"] = frame["NAME_INCOME_TYPE"].map(incomeType)

In [41]:
# Count-encode NAME_EDUCATION_TYPE using category frequencies taken from train.
eduType = train["NAME_EDUCATION_TYPE"].value_counts()
for frame in (train, test):
    frame["NAME_EDUCATION_TYPE"] = frame["NAME_EDUCATION_TYPE"].map(eduType)

In [42]:
# Count-encode OCCUPATION_TYPE using category frequencies taken from train.
# value_counts() skips NaN, so rows with a missing occupation stay NaN.
occType = train["OCCUPATION_TYPE"].value_counts()
for frame in (train, test):
    frame["OCCUPATION_TYPE"] = frame["OCCUPATION_TYPE"].map(occType)

In [43]:
# Count-encode NAME_HOUSING_TYPE using category frequencies taken from train.
# The counts get their own name so they do not overwrite `occType`, which
# holds the OCCUPATION_TYPE counts.
housingType = train["NAME_HOUSING_TYPE"].value_counts()
train["NAME_HOUSING_TYPE"] = train["NAME_HOUSING_TYPE"].map(housingType)
test["NAME_HOUSING_TYPE"] = test["NAME_HOUSING_TYPE"].map(housingType)

In [44]:
# Treat REGION_POPULATION_RELATIVE values of 0.07 or more (outliers) as missing.
# NOTE(review): this cell used to filter on REGION_POPULATION_RELATIVE but
# write NaN into OWN_CAR_AGE, under a comment about OWN_CAR_AGE. The OWN_CAR_AGE
# outlier rule lives in its own cell, so the filtered column is the one blanked
# here. Confirm this matches the intended preprocessing.
train.loc[train["REGION_POPULATION_RELATIVE"] >= 0.07, "REGION_POPULATION_RELATIVE"] = np.nan
test.loc[test["REGION_POPULATION_RELATIVE"] >= 0.07, "REGION_POPULATION_RELATIVE"] = np.nan

In [45]:
# Label-encode NAME_CONTRACT_TYPE (Cash loans -> 0, Revolving loans -> 1).
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on the column selection: that chained in-place
# form is deprecated and does not update the frame under pandas Copy-on-Write.
# infer_objects() turns the resulting object column into a numeric one.
contract_type_codes = {'Cash loans': 0, 'Revolving loans': 1}
train["NAME_CONTRACT_TYPE"] = train["NAME_CONTRACT_TYPE"].replace(contract_type_codes).infer_objects()
test["NAME_CONTRACT_TYPE"] = test["NAME_CONTRACT_TYPE"].replace(contract_type_codes).infer_objects()

In [46]:
# Label-encode CODE_GENDER (XNA -> 0, F -> 1, M -> 2).
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on the column selection: that chained in-place
# form is deprecated and does not update the frame under pandas Copy-on-Write.
# infer_objects() turns the resulting object column into a numeric one.
gender_codes = {'XNA': 0, 'F': 1, 'M': 2}
train["CODE_GENDER"] = train["CODE_GENDER"].replace(gender_codes).infer_objects()
test["CODE_GENDER"] = test["CODE_GENDER"].replace(gender_codes).infer_objects()

In [47]:
# Impute missing EXT_SOURCE_2 values with the training mean. The mean is
# computed once, before any filling, and the same value is used for both
# frames so the test set is imputed with a train-only statistic.
# The filled column is assigned back rather than using the chained
# `.fillna(..., inplace=True)` form, which is deprecated and does not update
# the frame under pandas Copy-on-Write.
ext_source_2_mean = train["EXT_SOURCE_2"].mean()
train["EXT_SOURCE_2"] = train["EXT_SOURCE_2"].fillna(ext_source_2_mean)
test["EXT_SOURCE_2"] = test["EXT_SOURCE_2"].fillna(ext_source_2_mean)

In [48]:
#train["EXT_SOURCE_1"].fillna(train["EXT_SOURCE_1"].mean(), inplace=True)
#test["EXT_SOURCE_1"].fillna(train["EXT_SOURCE_1"].mean(), inplace=True) 

In [49]:
#train["EXT_SOURCE_3"].fillna(train["EXT_SOURCE_3"].mean(), inplace=True)
#test["EXT_SOURCE_3"].fillna(train["EXT_SOURCE_3"].mean(), inplace=True) 

TODO: try feature engineering later to get better data.

In [50]:
# Car ages of 60 or more are treated as outliers and blanked out (set to NaN)
# in both frames.
for frame in (train, test):
    too_old = frame["OWN_CAR_AGE"] >= 60
    frame.loc[too_old, "OWN_CAR_AGE"] = np.nan

In [51]:
# Bucket OWN_CAR_AGE into 10-year groups (0-9 -> 0, 10-19 -> 1, ...);
# NaN stays NaN under floor division.
for frame in (train, test):
    frame["OWN_CAR_AGE"] = frame["OWN_CAR_AGE"].floordiv(10)

In [52]:
# One-hot encode the OWN_CAR_AGE groups. get_dummies ignores NaN by default,
# so rows with a missing car age get no indicator set.
train_car_age_ohe = pd.get_dummies(train["OWN_CAR_AGE"]).add_prefix("OWN_CAR_AGE_")
test_car_age_ohe = pd.get_dummies(test["OWN_CAR_AGE"]).add_prefix("OWN_CAR_AGE_")

# The two frames are encoded independently, so a group that is present in only
# one of them would leave train and test with different columns. Align test to
# the columns produced for train: groups missing from test become all-False
# columns, and groups seen only in test are dropped.
test_car_age_ohe = test_car_age_ohe.reindex(
    columns=train_car_age_ohe.columns, fill_value=False
)

# Append the indicator columns to train/test
train = pd.concat([train, train_car_age_ohe], axis=1)
test = pd.concat([test, test_car_age_ohe], axis=1)

# Remove the original OWN_CAR_AGE column now that it is encoded
train = train.drop(columns="OWN_CAR_AGE")
test = test.drop(columns="OWN_CAR_AGE")

In [53]:
# Display the test frame after the encoding steps and the OWN_CAR_AGE one-hot
# expansion.
test

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_CREDIT,AMT_ANNUITY,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,CODE_GENDER,CNT_CHILDREN,NAME_EDUCATION_TYPE,NAME_INCOME_TYPE,NAME_FAMILY_STATUS,NAME_CONTRACT_TYPE,ORGANIZATION_TYPE,REGION_POPULATION_RELATIVE,NAME_HOUSING_TYPE,REGION_RATING_CLIENT,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,OWN_CAR_AGE_0.0,OWN_CAR_AGE_1.0,OWN_CAR_AGE_2.0,OWN_CAR_AGE_3.0,OWN_CAR_AGE_4.0,OWN_CAR_AGE_5.0
0,,0.720416,,961146.0,28233.0,144000.0,688500.0,-12108,-2372,1,1,41645,88268,109323,0,3866,0.025164,151916,2,-2446.0,-3022,1,1,0,4744,3.0,-1.0,,,,,False,False,False,False,False,False
1,,0.287306,,296280.0,16069.5,103500.0,225000.0,-17907,-1712,1,0,121612,88268,109323,0,4991,0.007020,151916,2,-10450.0,-253,1,1,1,2567,2.0,-212.0,,,,,False,False,False,False,False,False
2,,0.352456,0.389339,183694.5,11236.5,180000.0,139500.0,-15221,-553,1,1,121612,40007,25285,0,4336,0.006852,151916,3,-1056.0,-4495,1,1,0,53745,2.0,-428.0,0.0,1.0,1.0,1.0,False,False,False,False,False,False
3,,0.470384,0.217629,450000.0,22500.0,225000.0,450000.0,-11217,-1438,1,2,41645,88268,109323,1,37943,0.035792,151916,2,-6096.0,-1189,1,1,0,53745,4.0,-442.0,0.0,0.0,0.0,3.0,False,False,False,False,False,False
4,0.269931,0.373133,,545040.0,26640.0,144000.0,450000.0,-11415,-2362,1,2,41645,88268,109323,0,37943,0.020713,151916,3,-3257.0,-1728,1,1,0,53745,4.0,-1333.0,0.0,0.0,0.0,3.0,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61495,0.263678,0.018172,0.307737,1288350.0,37800.0,315000.0,1125000.0,-11430,-792,1,0,41645,40007,109323,0,4336,0.007020,151916,2,-9772.0,-2705,1,1,0,11855,2.0,-1.0,0.0,1.0,1.0,1.0,False,True,False,False,False,False
61496,,0.668578,0.434733,273636.0,15408.0,90000.0,247500.0,-17181,-839,1,0,41645,88268,109323,0,21340,0.006671,151916,2,-5125.0,-668,1,1,0,17793,2.0,-2732.0,0.0,0.0,0.0,0.0,False,False,False,False,False,False
61497,0.510226,0.574151,,291384.0,26725.5,144000.0,270000.0,-14515,-722,1,0,41645,40007,25285,0,37943,0.018801,151916,2,-7225.0,-4795,1,1,1,5506,1.0,-615.0,0.0,0.0,1.0,0.0,False,False,False,False,False,False
61498,0.353295,0.226714,,746280.0,59094.0,193500.0,675000.0,-16914,-8756,1,1,41645,12007,109323,0,9272,0.002042,151916,3,-5233.0,-231,1,1,0,53745,3.0,-1610.0,0.0,0.0,1.0,3.0,False,False,False,False,False,False


In [54]:
# Display the training frame after the encoding steps and the OWN_CAR_AGE
# one-hot expansion.
train

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_CREDIT,AMT_ANNUITY,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,CODE_GENDER,CNT_CHILDREN,NAME_EDUCATION_TYPE,NAME_INCOME_TYPE,NAME_FAMILY_STATUS,NAME_CONTRACT_TYPE,ORGANIZATION_TYPE,REGION_POPULATION_RELATIVE,NAME_HOUSING_TYPE,REGION_RATING_CLIENT,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET,OWN_CAR_AGE_0.0,OWN_CAR_AGE_1.0,OWN_CAR_AGE_2.0,OWN_CAR_AGE_3.0,OWN_CAR_AGE_4.0,OWN_CAR_AGE_5.0
0,,0.372591,,755190.0,36328.5,112500.0,675000.0,-9233,-878,1,0,41645,88268,109323,0,4991,0.010032,151916,2,-333.0,-522,1,1,1,15271,2.0,-292.0,,,,,0,False,False,False,False,False,False
1,,0.449567,0.553165,585000.0,16893.0,225000.0,585000.0,-20148,365243,1,0,121612,30895,109323,0,30898,0.008019,151916,2,-4469.0,-3436,1,0,0,53745,2.0,-617.0,0.0,0.0,0.0,1.0,0,False,False,False,False,False,False
2,,0.569503,,334152.0,18256.5,54000.0,270000.0,-18496,-523,1,0,121612,12007,109323,0,1185,0.004960,151916,2,-3640.0,-2050,1,1,1,15271,2.0,-542.0,,,,,0,False,False,False,False,False,False
3,,0.105235,0.767523,152820.0,8901.0,67500.0,135000.0,-24177,365243,1,0,2133,30895,8952,0,30898,0.005002,151916,3,-4950.0,-3951,1,0,0,53745,1.0,0.0,0.0,0.0,0.0,0.0,0,False,False,False,False,False,False
4,0.342344,0.202490,0.669057,271066.5,21546.0,157500.0,234000.0,-10685,-697,2,0,121612,40007,109323,0,37943,0.006296,8263,3,-5101.0,-3226,1,1,1,10353,2.0,-1243.0,0.0,0.0,0.0,4.0,1,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171197,,0.404560,0.768808,404325.0,20772.0,83250.0,337500.0,-20529,-3059,2,0,121612,88268,109323,0,1338,0.031329,151916,2,-11581.0,-3689,1,1,1,30723,2.0,-2341.0,0.0,0.0,1.0,0.0,0,False,False,True,False,False,False
171198,,0.608542,,601470.0,29065.5,247500.0,450000.0,-22083,-129,1,0,121612,88268,25285,0,1833,0.010006,151916,2,-4629.0,-1773,1,1,0,3353,1.0,-1688.0,0.0,0.0,1.0,5.0,0,True,False,False,False,False,False
171199,,0.664305,0.758393,1237684.5,49216.5,292500.0,1138500.0,-11053,-2536,1,2,41645,12007,109323,0,4991,0.006629,151916,2,-4858.0,-3393,1,1,0,15271,4.0,-515.0,0.0,0.0,0.0,1.0,0,False,False,False,False,False,False
171200,0.210918,0.627050,,239850.0,25186.5,112500.0,225000.0,-8505,-165,2,0,121612,40007,25285,0,37943,0.009657,151916,2,-3318.0,-1176,1,1,0,17793,1.0,-1133.0,,,,,0,True,False,False,False,False,False
