In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings

In [109]:
train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test.csv")
sample_sub = pd.read_csv("input/sample_submission.csv")

In [110]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [111]:
original_feature  = [
    "EXT_SOURCE_1",
    "EXT_SOURCE_2", 
    "EXT_SOURCE_3",
    
    "AMT_CREDIT",
    "AMT_ANNUITY",
    "AMT_INCOME_TOTAL",
    "AMT_GOODS_PRICE",
    
    "OWN_CAR_AGE",

    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "CODE_GENDER",
    "CNT_CHILDREN",
    
    "NAME_EDUCATION_TYPE",
    "NAME_INCOME_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_CONTRACT_TYPE",
    
    "ORGANIZATION_TYPE",
    "REGION_POPULATION_RELATIVE",

    "NAME_HOUSING_TYPE",           # Housing situation (renting, own, etc.)
    "REGION_RATING_CLIENT",        # Region rating
    "DAYS_REGISTRATION",           # How long registered
    "DAYS_ID_PUBLISH",            # How long since ID published
    "FLAG_MOBIL",                 # Did client provide mobile phone
    "FLAG_EMP_PHONE",             # Did client provide work phone
    "FLAG_WORK_PHONE",            # Did client provide home phone
    "OCCUPATION_TYPE",            # Client's occupation
    "CNT_FAM_MEMBERS",            # Family size
    "DAYS_LAST_PHONE_CHANGE",    # Days since phone change
    "AMT_REQ_CREDIT_BUREAU_HOUR", # Credit bureau inquiries (last hour)
    "AMT_REQ_CREDIT_BUREAU_MON",  # Credit bureau inquiries (last month)
    "AMT_REQ_CREDIT_BUREAU_QRT",  # Credit bureau inquiries (last quarter)
    "AMT_REQ_CREDIT_BUREAU_YEAR", # Credit bureau inquiries (last year)
]


In [112]:
def create_derived_features(df):
    """Create powerful derived features"""
    
    # 1. INCOME-RELATED FEATURES (5 features)
    df['CREDIT_INCOME_RATIO'] = df['AMT_CREDIT'] / (df['AMT_INCOME_TOTAL'] + 1)
    df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / (df['AMT_INCOME_TOTAL'] + 1)
    df['CREDIT_GOODS_RATIO'] = df['AMT_CREDIT'] / (df['AMT_GOODS_PRICE'] + 1)
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / (df['CNT_FAM_MEMBERS'] + 1)
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / (df['AMT_CREDIT'] + 1)
    
    # 2. AGE-RELATED FEATURES (4 features)
    df['AGE_YEARS'] = -df['DAYS_BIRTH'] / 365.25
    df['EMPLOYED_YEARS'] = -df['DAYS_EMPLOYED'] / 365.25
    df.loc[df['EMPLOYED_YEARS'] > 100, 'EMPLOYED_YEARS'] = np.nan  # Fix anomalies
    df['EMPLOY_TO_AGE_RATIO'] = df['EMPLOYED_YEARS'] / (df['AGE_YEARS'] + 1)
    df['WORKING_LIFE_RATIO'] = df['EMPLOYED_YEARS'] / df['AGE_YEARS']
    
    # 3. EXTERNAL SOURCES COMBINATIONS (6 features)
    ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    df['EXT_SOURCE_MEAN'] = df[ext_sources].mean(axis=1)
    df['EXT_SOURCE_STD'] = df[ext_sources].std(axis=1)
    df['EXT_SOURCE_MIN'] = df[ext_sources].min(axis=1)
    df['EXT_SOURCE_MAX'] = df[ext_sources].max(axis=1)
    df['EXT_SOURCE_WEIGHTED'] = (
        df['EXT_SOURCE_1'].fillna(0) * 0.2 + 
        df['EXT_SOURCE_2'].fillna(0) * 0.4 + 
        df['EXT_SOURCE_3'].fillna(0) * 0.4
    )
    df['EXT_SOURCE_PRODUCT'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    
    # 4. DOCUMENT & REGISTRATION FEATURES (3 features)
    df['DAYS_ID_PUBLISH_YEARS'] = -df['DAYS_ID_PUBLISH'] / 365.25
    df['DAYS_REGISTRATION_YEARS'] = -df['DAYS_REGISTRATION'] / 365.25
    df['ID_PUBLISH_TO_BIRTH_RATIO'] = df['DAYS_ID_PUBLISH'] / df['DAYS_BIRTH']
    
    # 5. CREDIT BUREAU FEATURES (3 features)
    bureau_cols = ['AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_MON',
                   'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']
    df['CREDIT_BUREAU_TOTAL'] = df[bureau_cols].sum(axis=1)
    df['CREDIT_BUREAU_WEIGHTED'] = (
        df['AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(0) * 4 +
        df['AMT_REQ_CREDIT_BUREAU_MON'].fillna(0) * 2 +
        df['AMT_REQ_CREDIT_BUREAU_QRT'].fillna(0) * 1 +
        df['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(0) * 0.5
    )
    df['HAS_RECENT_INQUIRY'] = (df['AMT_REQ_CREDIT_BUREAU_HOUR'] > 0).astype(int)
    
    # 6. PHONE CHANGE FEATURES (2 features)
    df['DAYS_LAST_PHONE_CHANGE_YEARS'] = -df['DAYS_LAST_PHONE_CHANGE'] / 365.25
    df['PHONE_COUNT'] = (
        df['FLAG_MOBIL'].fillna(0) + 
        df['FLAG_EMP_PHONE'].fillna(0) + 
        df['FLAG_WORK_PHONE'].fillna(0)
    )
    
    # 7. CAR FEATURES (1 feature)
    df['CAR_TO_AGE_RATIO'] = df['OWN_CAR_AGE'] / (df['AGE_YEARS'] + 1)
    
    # 8. FAMILY FEATURES (1 feature)
    df['CHILDREN_RATIO'] = df['CNT_CHILDREN'] / (df['CNT_FAM_MEMBERS'] + 1)
    
    # 9. REGIONAL FEATURES (2 features)
    df['POPULATION_SCORE'] = df['REGION_POPULATION_RELATIVE'] * df['REGION_RATING_CLIENT']
    df['REGIONAL_INCOME'] = df['AMT_INCOME_TOTAL'] * df['REGION_POPULATION_RELATIVE']
    
    # 10. BINNED AGE GROUPS (will be one-hot encoded)
    df['AGE_GROUP'] = pd.cut(df['AGE_YEARS'], 
                             bins=[0, 25, 35, 45, 55, 100],
                             labels=['young', 'adult', 'middle', 'senior', 'elder'])
    
    # 11. FINANCIAL STRESS INDICATORS (3 features)
    df['HIGH_CREDIT_BURDEN'] = (df['CREDIT_INCOME_RATIO'] > 5).astype(int)
    df['HIGH_PAYMENT_BURDEN'] = (df['ANNUITY_INCOME_RATIO'] > 0.3).astype(int)
    df['GOODS_OVERPAID'] = (df['AMT_CREDIT'] > df['AMT_GOODS_PRICE'] * 1.5).astype(int)
    
    return df

In [113]:
train = create_derived_features(train)
test = create_derived_features(test)

In [114]:
use_features = original_feature + [
    "CREDIT_INCOME_RATIO", "ANNUITY_INCOME_RATIO",
    "CREDIT_GOODS_RATIO", "INCOME_PER_PERSON", "PAYMENT_RATE",
    "AGE_YEARS", "EMPLOYED_YEARS", "EMPLOY_TO_AGE_RATIO", "WORKING_LIFE_RATIO",
    "EXT_SOURCE_MEAN", "EXT_SOURCE_STD", "EXT_SOURCE_MIN", 
    "EXT_SOURCE_MAX", "EXT_SOURCE_WEIGHTED", "EXT_SOURCE_PRODUCT",
    "DAYS_ID_PUBLISH_YEARS", "DAYS_REGISTRATION_YEARS", "ID_PUBLISH_TO_BIRTH_RATIO",
    "CREDIT_BUREAU_TOTAL", "CREDIT_BUREAU_WEIGHTED", "HAS_RECENT_INQUIRY",
    "DAYS_LAST_PHONE_CHANGE_YEARS", "PHONE_COUNT",
    "CAR_TO_AGE_RATIO", "CHILDREN_RATIO",
    "POPULATION_SCORE", "REGIONAL_INCOME",
    "AGE_GROUP",
    "HIGH_CREDIT_BURDEN", "HIGH_PAYMENT_BURDEN", "GOODS_OVERPAID"
]

target = train["TARGET"].values

train = train[original_feature]
train["TARGET"] = target
test = test[original_feature]

In [115]:
train = train[use_features + ["TARGET"]]
test = test[use_features]

In [116]:

pd.set_option('display.max_columns', None)

In [117]:
train

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_CREDIT,AMT_ANNUITY,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,OWN_CAR_AGE,DAYS_BIRTH,DAYS_EMPLOYED,CODE_GENDER,CNT_CHILDREN,NAME_EDUCATION_TYPE,NAME_INCOME_TYPE,NAME_FAMILY_STATUS,NAME_CONTRACT_TYPE,ORGANIZATION_TYPE,REGION_POPULATION_RELATIVE,NAME_HOUSING_TYPE,REGION_RATING_CLIENT,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,CREDIT_GOODS_RATIO,INCOME_PER_PERSON,PAYMENT_RATE,AGE_YEARS,EMPLOYED_YEARS,EMPLOY_TO_AGE_RATIO,WORKING_LIFE_RATIO,EXT_SOURCE_MEAN,EXT_SOURCE_STD,EXT_SOURCE_MIN,EXT_SOURCE_MAX,EXT_SOURCE_WEIGHTED,EXT_SOURCE_PRODUCT,DAYS_ID_PUBLISH_YEARS,DAYS_REGISTRATION_YEARS,ID_PUBLISH_TO_BIRTH_RATIO,CREDIT_BUREAU_TOTAL,CREDIT_BUREAU_WEIGHTED,HAS_RECENT_INQUIRY,DAYS_LAST_PHONE_CHANGE_YEARS,PHONE_COUNT,CAR_TO_AGE_RATIO,CHILDREN_RATIO,POPULATION_SCORE,REGIONAL_INCOME,AGE_GROUP,HIGH_CREDIT_BURDEN,HIGH_PAYMENT_BURDEN,GOODS_OVERPAID,TARGET
0,,0.372591,,755190.0,36328.5,112500.0,675000.0,,-9233,-878,F,0,Higher education,Working,Married,Cash loans,School,0.010032,House / apartment,2,-333.0,-522,1,1,1,Core staff,2.0,-292.0,,,,,6.712740,0.322917,1.118798,37500.0,0.048105,25.278576,2.403833,0.091475,0.095094,0.372591,,0.372591,0.372591,0.149036,,1.429158,0.911704,0.056536,0.0,0.0,0,0.799452,3,,0.0,0.020064,1128.60000,adult,1,1,0,0
1,,0.449567,0.553165,585000.0,16893.0,225000.0,585000.0,,-20148,365243,F,0,Secondary / secondary special,Pensioner,Married,Cash loans,XNA,0.008019,House / apartment,2,-4469.0,-3436,1,0,0,Other,2.0,-617.0,0.0,0.0,0.0,1.0,2.599988,0.075080,0.999998,75000.0,0.028877,55.162218,-999.980835,-17.805223,-18.128003,0.501366,0.073255,0.449567,0.553165,0.401093,,9.407255,12.235455,0.170538,1.0,0.5,0,1.689254,1,,0.0,0.016038,1804.27500,elder,0,0,0,0
2,,0.569503,,334152.0,18256.5,54000.0,270000.0,,-18496,-523,F,0,Secondary / secondary special,State servant,Married,Cash loans,Postal,0.004960,House / apartment,2,-3640.0,-2050,1,1,1,Core staff,2.0,-542.0,,,,,6.187885,0.338077,1.237595,18000.0,0.054635,50.639288,1.431896,0.027729,0.028276,0.569503,,0.569503,0.569503,0.227801,,5.612594,9.965777,0.110835,0.0,0.0,0,1.483915,3,,0.0,0.009920,267.84000,senior,1,1,0,0
3,,0.105235,0.767523,152820.0,8901.0,67500.0,135000.0,,-24177,365243,F,0,Lower secondary,Pensioner,Widow,Cash loans,XNA,0.005002,House / apartment,3,-4950.0,-3951,1,0,0,Other,1.0,0.0,0.0,0.0,0.0,0.0,2.263966,0.131865,1.131992,33750.0,0.058245,66.193018,-999.980835,-14.882213,-15.107044,0.436379,0.468309,0.105235,0.767523,0.349103,,10.817248,13.552361,0.163420,0.0,0.0,0,-0.000000,1,,0.0,0.015006,337.63500,elder,0,0,0,0
4,0.342344,0.202490,0.669057,271066.5,21546.0,157500.0,234000.0,,-10685,-697,M,0,Secondary / secondary special,Commercial associate,Married,Cash loans,Business Entity Type 3,0.006296,With parents,3,-5101.0,-3226,1,1,1,Drivers,2.0,-1243.0,0.0,0.0,0.0,4.0,1.721046,0.136799,1.158399,52500.0,0.079486,29.253936,1.908282,0.063075,0.065232,0.404630,0.239439,0.202490,0.669057,0.417087,0.04638,8.832307,13.965777,0.301919,4.0,2.0,0,3.403149,3,,0.0,0.018888,991.62000,adult,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171197,,0.404560,0.768808,404325.0,20772.0,83250.0,337500.0,21.0,-20529,-3059,M,0,Secondary / secondary special,Working,Married,Cash loans,Agriculture,0.031329,House / apartment,2,-11581.0,-3689,1,1,1,Laborers,2.0,-2341.0,0.0,0.0,1.0,0.0,4.856698,0.249511,1.197996,27750.0,0.051374,56.205339,8.375086,0.146404,0.149009,0.586684,0.257562,0.404560,0.768808,0.469347,,10.099932,31.707050,0.179697,1.0,1.0,0,6.409309,3,0.367099,0.0,0.062658,2608.13925,elder,0,0,0,0
171198,,0.608542,,601470.0,29065.5,247500.0,450000.0,1.0,-22083,-129,F,0,Secondary / secondary special,Working,Single / not married,Cash loans,Industry: type 3,0.010006,House / apartment,2,-4629.0,-1773,1,1,0,Cooking staff,1.0,-1688.0,0.0,0.0,1.0,5.0,2.430172,0.117436,1.336597,123750.0,0.048324,60.459959,0.353183,0.005747,0.005842,0.608542,,0.608542,0.608542,0.243417,,4.854209,12.673511,0.080288,6.0,3.5,0,4.621492,2,0.016271,0.0,0.020012,2476.48500,elder,0,0,0,0
171199,,0.664305,0.758393,1237684.5,49216.5,292500.0,1138500.0,,-11053,-2536,F,2,Higher education,State servant,Married,Cash loans,School,0.006629,House / apartment,2,-4858.0,-3393,1,1,0,Core staff,4.0,-515.0,0.0,0.0,0.0,1.0,4.231386,0.168261,1.087118,58500.0,0.039765,30.261465,6.943190,0.222101,0.229440,0.711349,0.066530,0.664305,0.758393,0.569079,,9.289528,13.300479,0.306975,1.0,0.5,0,1.409993,2,,0.4,0.013258,1938.98250,adult,0,0,0,0
171200,0.210918,0.627050,,239850.0,25186.5,112500.0,225000.0,7.0,-8505,-165,M,0,Secondary / secondary special,Commercial associate,Single / not married,Cash loans,Business Entity Type 3,0.009657,House / apartment,2,-3318.0,-1176,1,1,0,Sales staff,1.0,-1133.0,,,,,2.131981,0.223878,1.065995,56250.0,0.105009,23.285421,0.451745,0.018602,0.019400,0.418984,0.294250,0.210918,0.627050,0.293004,,3.219713,9.084189,0.138272,0.0,0.0,0,3.101985,2,0.288239,0.0,0.019314,1086.41250,young,0,0,0,0


COUNT ENCODING UNDER HERE

In [118]:
# Numerization of ORGANIZATION_TYPE (Count Encoding）
organization_ce = train["ORGANIZATION_TYPE"].value_counts()
train["ORGANIZATION_TYPE"] = train["ORGANIZATION_TYPE"].map(organization_ce)
test["ORGANIZATION_TYPE"] = test["ORGANIZATION_TYPE"].map(organization_ce)

In [119]:
nameFamStatus = train["NAME_FAMILY_STATUS"].value_counts()
train["NAME_FAMILY_STATUS"] = train["NAME_FAMILY_STATUS"].map(nameFamStatus)
test["NAME_FAMILY_STATUS"] = test["NAME_FAMILY_STATUS"].map(nameFamStatus)

In [120]:
incomeType = train["NAME_INCOME_TYPE"].value_counts()
train["NAME_INCOME_TYPE"] = train["NAME_INCOME_TYPE"].map(incomeType)
test["NAME_INCOME_TYPE"] = test["NAME_INCOME_TYPE"].map(incomeType)

In [121]:
eduType = train["NAME_EDUCATION_TYPE"].value_counts()
train["NAME_EDUCATION_TYPE"] = train["NAME_EDUCATION_TYPE"].map(eduType)
test["NAME_EDUCATION_TYPE"] = test["NAME_EDUCATION_TYPE"].map(eduType)

In [122]:
occType = train["OCCUPATION_TYPE"].value_counts()
train["OCCUPATION_TYPE"] = train["OCCUPATION_TYPE"].map(occType)
test["OCCUPATION_TYPE"] = test["OCCUPATION_TYPE"].map(occType)

In [123]:
occType = train["NAME_HOUSING_TYPE"].value_counts()
train["NAME_HOUSING_TYPE"] = train["NAME_HOUSING_TYPE"].map(occType)
test["NAME_HOUSING_TYPE"] = test["NAME_HOUSING_TYPE"].map(occType)

In [124]:
# Treat values above 60 (outliers) in OWN_CAR_AGE as missing values
train.loc[train["REGION_POPULATION_RELATIVE"] >= 0.07, "OWN_CAR_AGE"] = np.nan
test.loc[test["REGION_POPULATION_RELATIVE"] >= 0.07, "OWN_CAR_AGE"] = np.nan

LABEL ENCODING UNDER HERE

In [125]:
# Numerization of NAME_CONTRACT_TYPE（Label Encoding）
train["NAME_CONTRACT_TYPE"].replace({'Cash loans': 0, 'Revolving loans': 1}, inplace=True)
test["NAME_CONTRACT_TYPE"].replace({'Cash loans': 0, 'Revolving loans': 1}, inplace=True)

In [126]:
# Numerization of NAME_CONTRACT_TYPE（Label Encoding）
train["CODE_GENDER"].replace({'XNA': 0, 'F': 1, 'M' : 2 }, inplace=True)
test["CODE_GENDER"].replace({'XNA': 0, 'F': 1, 'M' : 2}, inplace=True)

CHANGE NAN UNDER HERE

In [127]:
train["EXT_SOURCE_2"].fillna(train["EXT_SOURCE_2"].mean(), inplace=True)
test["EXT_SOURCE_2"].fillna(train["EXT_SOURCE_2"].mean(), inplace=True) 

In [128]:
train["EXT_SOURCE_1"].fillna(train["EXT_SOURCE_1"].mean(), inplace=True)
test["EXT_SOURCE_1"].fillna(train["EXT_SOURCE_1"].mean(), inplace=True) 

In [129]:
train["EXT_SOURCE_3"].fillna(train["EXT_SOURCE_3"].mean(), inplace=True)
test["EXT_SOURCE_3"].fillna(train["EXT_SOURCE_3"].mean(), inplace=True) 

In [130]:
train["AMT_REQ_CREDIT_BUREAU_HOUR"].fillna(train["AMT_REQ_CREDIT_BUREAU_HOUR"].mean(), inplace=True)
test["AMT_REQ_CREDIT_BUREAU_HOUR"].fillna(train["AMT_REQ_CREDIT_BUREAU_HOUR"].mean(), inplace=True) 

In [131]:
train["AMT_REQ_CREDIT_BUREAU_MON"].fillna(train["AMT_REQ_CREDIT_BUREAU_MON"].mean(), inplace=True)
test["AMT_REQ_CREDIT_BUREAU_MON"].fillna(train["AMT_REQ_CREDIT_BUREAU_MON"].mean(), inplace=True) 

In [132]:
train["AMT_REQ_CREDIT_BUREAU_QRT"].fillna(train["AMT_REQ_CREDIT_BUREAU_QRT"].mean(), inplace=True)
test["AMT_REQ_CREDIT_BUREAU_QRT"].fillna(train["AMT_REQ_CREDIT_BUREAU_QRT"].mean(), inplace=True) 

In [133]:
train["AMT_REQ_CREDIT_BUREAU_YEAR"].fillna(train["AMT_REQ_CREDIT_BUREAU_YEAR"].mean(), inplace=True)
test["AMT_REQ_CREDIT_BUREAU_YEAR"].fillna(train["AMT_REQ_CREDIT_BUREAU_YEAR"].mean(), inplace=True) 

In [134]:
train["CAR_TO_AGE_RATIO"].fillna(train["CAR_TO_AGE_RATIO"].mean(), inplace=True)
test["CAR_TO_AGE_RATIO"].fillna(train["CAR_TO_AGE_RATIO"].mean(), inplace=True) 

In [135]:
train["EXT_SOURCE_PRODUCT"].fillna(train["EXT_SOURCE_PRODUCT"].mean(), inplace=True)
test["EXT_SOURCE_PRODUCT"].fillna(train["EXT_SOURCE_PRODUCT"].mean(), inplace=True) 

In [136]:
train["EXT_SOURCE_PRODUCT"].fillna(train["EXT_SOURCE_PRODUCT"].mean(), inplace=True)
test["EXT_SOURCE_PRODUCT"].fillna(train["EXT_SOURCE_PRODUCT"].mean(), inplace=True) 

ONE HOT ENCONDING UNDER HERE

In [137]:
# Treat values above 60 (outliers) in OWN_CAR_AGE as missing values
train.loc[train["OWN_CAR_AGE"] >= 60, "OWN_CAR_AGE"] = np.nan
test.loc[test["OWN_CAR_AGE"] >= 60, "OWN_CAR_AGE"] = np.nan

In [138]:
# Divide OWN_CAR_AGE into groups
train["OWN_CAR_AGE"] = train["OWN_CAR_AGE"] // 10
test["OWN_CAR_AGE"] = test["OWN_CAR_AGE"] // 10

In [139]:
# Apply One Hot Encoding to OWN_CAR_AGE
train_car_age_ohe = pd.get_dummies(train["OWN_CAR_AGE"]).add_prefix("OWN_CAR_AGE_")
test_car_age_ohe = pd.get_dummies(test["OWN_CAR_AGE"]).add_prefix("OWN_CAR_AGE_")

# Add the one hot encoded columns to train/test
train = pd.concat([train, train_car_age_ohe], axis=1)
test = pd.concat([test, test_car_age_ohe], axis=1)

# Remove original OWN_CAR_AGE
train.drop('OWN_CAR_AGE', axis=1, inplace=True)
test.drop('OWN_CAR_AGE', axis=1, inplace=True)

In [140]:
train = pd.get_dummies(train, columns=['AGE_GROUP'], prefix='AGE_GROUP')
test = pd.get_dummies(test, columns=['AGE_GROUP'], prefix='AGE_GROUP')

In [141]:
train, test = train.align(test, join='left', axis=1, fill_value=0)

In [142]:
test.head(20)

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_CREDIT,AMT_ANNUITY,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,CODE_GENDER,CNT_CHILDREN,NAME_EDUCATION_TYPE,NAME_INCOME_TYPE,NAME_FAMILY_STATUS,NAME_CONTRACT_TYPE,ORGANIZATION_TYPE,REGION_POPULATION_RELATIVE,NAME_HOUSING_TYPE,REGION_RATING_CLIENT,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,CREDIT_GOODS_RATIO,INCOME_PER_PERSON,PAYMENT_RATE,AGE_YEARS,EMPLOYED_YEARS,EMPLOY_TO_AGE_RATIO,WORKING_LIFE_RATIO,EXT_SOURCE_MEAN,EXT_SOURCE_STD,EXT_SOURCE_MIN,EXT_SOURCE_MAX,EXT_SOURCE_WEIGHTED,EXT_SOURCE_PRODUCT,DAYS_ID_PUBLISH_YEARS,DAYS_REGISTRATION_YEARS,ID_PUBLISH_TO_BIRTH_RATIO,CREDIT_BUREAU_TOTAL,CREDIT_BUREAU_WEIGHTED,HAS_RECENT_INQUIRY,DAYS_LAST_PHONE_CHANGE_YEARS,PHONE_COUNT,CAR_TO_AGE_RATIO,CHILDREN_RATIO,POPULATION_SCORE,REGIONAL_INCOME,HIGH_CREDIT_BURDEN,HIGH_PAYMENT_BURDEN,GOODS_OVERPAID,TARGET,OWN_CAR_AGE_0.0,OWN_CAR_AGE_1.0,OWN_CAR_AGE_2.0,OWN_CAR_AGE_3.0,OWN_CAR_AGE_4.0,OWN_CAR_AGE_5.0,AGE_GROUP_young,AGE_GROUP_adult,AGE_GROUP_middle,AGE_GROUP_senior,AGE_GROUP_elder
0,0.501692,0.720416,0.511177,961146.0,28233.0,144000.0,688500.0,-12108,-2372,1,1,41645,88268,109323,0,3866,0.025164,151916,2,-2446.0,-3022,1,1,0,4744,3.0,-1.0,0.006652,0.268418,0.267547,1.903644,6.674579,0.196061,1.395998,36000.0,0.029374,33.149897,6.494182,0.190167,0.195904,0.720416,,0.720416,0.720416,0.288166,0.143567,8.273785,6.696783,0.249587,0.0,0.0,0,0.002738,2,0.301663,0.25,0.050328,3623.616,1,0,0,0,False,False,False,False,False,False,False,True,False,False,False
1,0.501692,0.287306,0.511177,296280.0,16069.5,103500.0,225000.0,-17907,-1712,1,0,121612,88268,109323,0,4991,0.00702,151916,2,-10450.0,-253,1,1,1,2567,2.0,-212.0,0.006652,0.268418,0.267547,1.903644,2.862581,0.155259,1.316794,34500.0,0.054237,49.026694,4.687201,0.093694,0.095605,0.287306,,0.287306,0.287306,0.114923,0.143567,0.692676,28.610541,0.014129,0.0,0.0,0,0.580424,3,0.301663,0.0,0.01404,726.57,0,0,0,0,False,False,False,False,False,False,False,False,False,True,False
2,0.501692,0.352456,0.389339,183694.5,11236.5,180000.0,139500.0,-15221,-553,1,1,121612,40007,25285,0,4336,0.006852,151916,3,-1056.0,-4495,1,1,0,53745,2.0,-428.0,0.0,1.0,1.0,1.0,1.020519,0.062425,1.316797,60000.0,0.061169,41.672827,1.514031,0.03548,0.036331,0.370897,0.02608,0.352456,0.389339,0.296718,0.143567,12.306639,2.89117,0.295316,3.0,3.5,0,1.1718,2,0.301663,0.333333,0.020556,1233.36,0,0,0,0,False,False,False,False,False,False,False,False,True,False,False
3,0.501692,0.470384,0.217629,450000.0,22500.0,225000.0,450000.0,-11217,-1438,1,2,41645,88268,109323,1,37943,0.035792,151916,2,-6096.0,-1189,1,1,0,53745,4.0,-442.0,0.0,0.0,0.0,3.0,1.999991,0.1,0.999998,45000.0,0.05,30.710472,3.937029,0.124155,0.128198,0.344006,0.178725,0.217629,0.470384,0.275205,0.143567,3.255305,16.689938,0.106,3.0,1.5,0,1.21013,2,0.301663,0.4,0.071584,8053.2,0,0,0,0,False,False,False,False,False,False,False,True,False,False,False
4,0.269931,0.373133,0.511177,545040.0,26640.0,144000.0,450000.0,-11415,-2362,1,2,41645,88268,109323,0,37943,0.020713,151916,3,-3257.0,-1728,1,1,0,53745,4.0,-1333.0,0.0,0.0,0.0,3.0,3.784974,0.184999,1.211197,28800.0,0.048877,31.252567,6.466804,0.200505,0.206921,0.321532,0.072975,0.269931,0.373133,0.203239,0.143567,4.731006,8.91718,0.15138,3.0,1.5,0,3.649555,2,0.434074,0.4,0.062139,2982.672,0,0,0,0,False,True,False,False,False,False,False,True,False,False,False
5,0.501692,0.719388,0.304672,2115000.0,55791.0,274590.0,2115000.0,-17067,-2737,2,0,41645,40007,109323,0,37943,0.072508,151916,1,-1546.0,-601,1,1,1,11855,2.0,-1.0,0.0,0.0,0.0,0.0,7.702365,0.203179,1.0,91530.0,0.026379,46.726899,7.493498,0.157008,0.160368,0.51203,0.293248,0.304672,0.719388,0.409624,0.143567,1.645448,4.232717,0.035214,0.0,0.0,0,0.002738,3,0.301663,0.0,0.072508,19909.97172,1,0,0,0,False,False,False,False,False,False,False,False,False,True,False
6,0.501692,0.187916,0.511177,560664.0,18216.0,202500.0,468000.0,-14871,-2652,2,0,121612,88268,25285,0,21340,0.010643,151916,2,-7003.0,-4273,1,1,0,10353,1.0,-2169.0,0.006652,0.268418,0.267547,1.903644,2.768697,0.089955,1.197997,101250.0,0.03249,40.714579,7.26078,0.174059,0.178334,0.187916,,0.187916,0.187916,0.075166,0.143567,11.698836,19.173169,0.287338,0.0,0.0,0,5.938398,2,0.263697,0.0,0.021286,2155.2075,0,0,0,0,False,True,False,False,False,False,False,False,True,False,False
7,0.501692,0.41203,0.511177,225000.0,22252.5,90000.0,225000.0,-20361,-1392,1,0,121612,88268,109323,0,21340,0.011657,151916,1,-10718.0,-3665,1,1,1,17793,2.0,-1761.0,0.006652,0.268418,0.267547,1.903644,2.499972,0.247247,0.999996,30000.0,0.0989,55.74538,3.811088,0.067161,0.068366,0.41203,,0.41203,0.41203,0.164812,0.143567,10.034223,29.344285,0.180001,0.0,0.0,0,4.821355,3,0.301663,0.0,0.011657,1049.13,0,0,0,0,False,False,False,False,False,False,False,False,False,False,True
8,0.501692,0.276091,0.450747,675000.0,43267.5,225000.0,675000.0,-9095,-1740,2,1,121612,88268,109323,0,21340,0.01452,151916,2,-1535.0,-1708,1,1,0,53745,3.0,-108.0,0.0,0.0,0.0,4.0,2.999987,0.192299,0.999999,56250.0,0.0641,24.900753,4.76386,0.183927,0.191314,0.363419,0.123501,0.276091,0.450747,0.290735,0.143567,4.676249,4.202601,0.187795,4.0,2.0,0,0.295688,2,0.301663,0.25,0.02904,3267.0,0,0,0,0,False,False,False,False,False,False,True,False,False,False,False
9,0.389262,0.555237,0.336062,431280.0,22149.0,180000.0,360000.0,-14033,-4037,1,0,121612,40007,25285,0,37943,0.018801,8263,2,-8160.0,-393,1,1,1,1457,1.0,-2729.0,0.0,0.0,1.0,7.0,2.395987,0.123049,1.197997,90000.0,0.051356,38.42026,11.052704,0.280381,0.287679,0.426854,0.114321,0.336062,0.555237,0.434372,0.072634,1.075975,22.340862,0.028005,8.0,4.5,0,7.471595,3,0.405883,0.0,0.037602,3384.18,0,0,0,0,False,True,False,False,False,False,False,False,True,False,False


In [143]:
train

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_CREDIT,AMT_ANNUITY,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,CODE_GENDER,CNT_CHILDREN,NAME_EDUCATION_TYPE,NAME_INCOME_TYPE,NAME_FAMILY_STATUS,NAME_CONTRACT_TYPE,ORGANIZATION_TYPE,REGION_POPULATION_RELATIVE,NAME_HOUSING_TYPE,REGION_RATING_CLIENT,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,OCCUPATION_TYPE,CNT_FAM_MEMBERS,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,CREDIT_INCOME_RATIO,ANNUITY_INCOME_RATIO,CREDIT_GOODS_RATIO,INCOME_PER_PERSON,PAYMENT_RATE,AGE_YEARS,EMPLOYED_YEARS,EMPLOY_TO_AGE_RATIO,WORKING_LIFE_RATIO,EXT_SOURCE_MEAN,EXT_SOURCE_STD,EXT_SOURCE_MIN,EXT_SOURCE_MAX,EXT_SOURCE_WEIGHTED,EXT_SOURCE_PRODUCT,DAYS_ID_PUBLISH_YEARS,DAYS_REGISTRATION_YEARS,ID_PUBLISH_TO_BIRTH_RATIO,CREDIT_BUREAU_TOTAL,CREDIT_BUREAU_WEIGHTED,HAS_RECENT_INQUIRY,DAYS_LAST_PHONE_CHANGE_YEARS,PHONE_COUNT,CAR_TO_AGE_RATIO,CHILDREN_RATIO,POPULATION_SCORE,REGIONAL_INCOME,HIGH_CREDIT_BURDEN,HIGH_PAYMENT_BURDEN,GOODS_OVERPAID,TARGET,OWN_CAR_AGE_0.0,OWN_CAR_AGE_1.0,OWN_CAR_AGE_2.0,OWN_CAR_AGE_3.0,OWN_CAR_AGE_4.0,OWN_CAR_AGE_5.0,AGE_GROUP_young,AGE_GROUP_adult,AGE_GROUP_middle,AGE_GROUP_senior,AGE_GROUP_elder
0,0.501692,0.372591,0.511177,755190.0,36328.5,112500.0,675000.0,-9233,-878,1,0,41645,88268,109323,0,4991,0.010032,151916,2,-333.0,-522,1,1,1,15271,2.0,-292.0,0.006652,0.268418,0.267547,1.903644,6.712740,0.322917,1.118798,37500.0,0.048105,25.278576,2.403833,0.091475,0.095094,0.372591,,0.372591,0.372591,0.149036,0.143567,1.429158,0.911704,0.056536,0.0,0.0,0,0.799452,3,0.301663,0.0,0.020064,1128.60000,1,1,0,0,False,False,False,False,False,False,False,True,False,False,False
1,0.501692,0.449567,0.553165,585000.0,16893.0,225000.0,585000.0,-20148,365243,1,0,121612,30895,109323,0,30898,0.008019,151916,2,-4469.0,-3436,1,0,0,53745,2.0,-617.0,0.000000,0.000000,0.000000,1.000000,2.599988,0.075080,0.999998,75000.0,0.028877,55.162218,-999.980835,-17.805223,-18.128003,0.501366,0.073255,0.449567,0.553165,0.401093,0.143567,9.407255,12.235455,0.170538,1.0,0.5,0,1.689254,1,0.301663,0.0,0.016038,1804.27500,0,0,0,0,False,False,False,False,False,False,False,False,False,False,True
2,0.501692,0.569503,0.511177,334152.0,18256.5,54000.0,270000.0,-18496,-523,1,0,121612,12007,109323,0,1185,0.004960,151916,2,-3640.0,-2050,1,1,1,15271,2.0,-542.0,0.006652,0.268418,0.267547,1.903644,6.187885,0.338077,1.237595,18000.0,0.054635,50.639288,1.431896,0.027729,0.028276,0.569503,,0.569503,0.569503,0.227801,0.143567,5.612594,9.965777,0.110835,0.0,0.0,0,1.483915,3,0.301663,0.0,0.009920,267.84000,1,1,0,0,False,False,False,False,False,False,False,False,False,True,False
3,0.501692,0.105235,0.767523,152820.0,8901.0,67500.0,135000.0,-24177,365243,1,0,2133,30895,8952,0,30898,0.005002,151916,3,-4950.0,-3951,1,0,0,53745,1.0,0.0,0.000000,0.000000,0.000000,0.000000,2.263966,0.131865,1.131992,33750.0,0.058245,66.193018,-999.980835,-14.882213,-15.107044,0.436379,0.468309,0.105235,0.767523,0.349103,0.143567,10.817248,13.552361,0.163420,0.0,0.0,0,-0.000000,1,0.301663,0.0,0.015006,337.63500,0,0,0,0,False,False,False,False,False,False,False,False,False,False,True
4,0.342344,0.202490,0.669057,271066.5,21546.0,157500.0,234000.0,-10685,-697,2,0,121612,40007,109323,0,37943,0.006296,8263,3,-5101.0,-3226,1,1,1,10353,2.0,-1243.0,0.000000,0.000000,0.000000,4.000000,1.721046,0.136799,1.158399,52500.0,0.079486,29.253936,1.908282,0.063075,0.065232,0.404630,0.239439,0.202490,0.669057,0.417087,0.046380,8.832307,13.965777,0.301919,4.0,2.0,0,3.403149,3,0.301663,0.0,0.018888,991.62000,0,0,0,1,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171197,0.501692,0.404560,0.768808,404325.0,20772.0,83250.0,337500.0,-20529,-3059,2,0,121612,88268,109323,0,1338,0.031329,151916,2,-11581.0,-3689,1,1,1,30723,2.0,-2341.0,0.000000,0.000000,1.000000,0.000000,4.856698,0.249511,1.197996,27750.0,0.051374,56.205339,8.375086,0.146404,0.149009,0.586684,0.257562,0.404560,0.768808,0.469347,0.143567,10.099932,31.707050,0.179697,1.0,1.0,0,6.409309,3,0.367099,0.0,0.062658,2608.13925,0,0,0,0,False,False,True,False,False,False,False,False,False,False,True
171198,0.501692,0.608542,0.511177,601470.0,29065.5,247500.0,450000.0,-22083,-129,1,0,121612,88268,25285,0,1833,0.010006,151916,2,-4629.0,-1773,1,1,0,3353,1.0,-1688.0,0.000000,0.000000,1.000000,5.000000,2.430172,0.117436,1.336597,123750.0,0.048324,60.459959,0.353183,0.005747,0.005842,0.608542,,0.608542,0.608542,0.243417,0.143567,4.854209,12.673511,0.080288,6.0,3.5,0,4.621492,2,0.016271,0.0,0.020012,2476.48500,0,0,0,0,True,False,False,False,False,False,False,False,False,False,True
171199,0.501692,0.664305,0.758393,1237684.5,49216.5,292500.0,1138500.0,-11053,-2536,1,2,41645,12007,109323,0,4991,0.006629,151916,2,-4858.0,-3393,1,1,0,15271,4.0,-515.0,0.000000,0.000000,0.000000,1.000000,4.231386,0.168261,1.087118,58500.0,0.039765,30.261465,6.943190,0.222101,0.229440,0.711349,0.066530,0.664305,0.758393,0.569079,0.143567,9.289528,13.300479,0.306975,1.0,0.5,0,1.409993,2,0.301663,0.4,0.013258,1938.98250,0,0,0,0,False,False,False,False,False,False,False,True,False,False,False
171200,0.210918,0.627050,0.511177,239850.0,25186.5,112500.0,225000.0,-8505,-165,2,0,121612,40007,25285,0,37943,0.009657,151916,2,-3318.0,-1176,1,1,0,17793,1.0,-1133.0,0.006652,0.268418,0.267547,1.903644,2.131981,0.223878,1.065995,56250.0,0.105009,23.285421,0.451745,0.018602,0.019400,0.418984,0.294250,0.210918,0.627050,0.293004,0.143567,3.219713,9.084189,0.138272,0.0,0.0,0,3.101985,2,0.288239,0.0,0.019314,1086.41250,0,0,0,0,True,False,False,False,False,False,True,False,False,False,False
