In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings

In [None]:
# Load the train, test and sample-submission CSV files (paths are relative to
# the notebook's working directory), in that order.
train, test, sample_sub = map(
    pd.read_csv,
    (
        "input/train.csv",
        "input/test.csv",
        "input/sample_submission.csv",
    ),
)

In [None]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Names of the raw input columns of interest, grouped by theme.
# NOTE(review): presumably the columns selected for modelling; where this list
# is consumed is not visible in this part of the notebook.
original_feature = [
    # External scores
    "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3",

    # Monetary amounts
    "AMT_CREDIT", "AMT_ANNUITY", "AMT_INCOME_TOTAL", "AMT_GOODS_PRICE",

    # Car
    "OWN_CAR_AGE",

    # Applicant profile
    "DAYS_BIRTH", "DAYS_EMPLOYED", "CODE_GENDER", "CNT_CHILDREN",

    # Categorical descriptors of the applicant and the contract
    "NAME_EDUCATION_TYPE", "NAME_INCOME_TYPE",
    "NAME_FAMILY_STATUS", "NAME_CONTRACT_TYPE",

    # Employer and region
    "ORGANIZATION_TYPE", "REGION_POPULATION_RELATIVE",

    # Additional attributes
    "NAME_HOUSING_TYPE",             # housing situation (renting, own, etc.)
    "REGION_RATING_CLIENT",          # region rating
    "DAYS_REGISTRATION",             # how long registered
    "DAYS_ID_PUBLISH",               # how long since the ID was published
    "FLAG_MOBIL",                    # did client provide mobile phone
    "FLAG_EMP_PHONE",                # did client provide work phone
    "FLAG_WORK_PHONE",               # did client provide home phone
    "OCCUPATION_TYPE",               # client's occupation
    "CNT_FAM_MEMBERS",               # family size
    "DAYS_LAST_PHONE_CHANGE",        # days since phone change

    # Credit bureau inquiries over the last hour / month / quarter / year
    "AMT_REQ_CREDIT_BUREAU_HOUR", "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT", "AMT_REQ_CREDIT_BUREAU_YEAR",
]


In [None]:
def create_derived_features(df):
    """Add engineered feature columns to ``df`` and return it.

    The frame is modified in place (new columns are assigned onto ``df``) and
    the same object is returned, so ``df = create_derived_features(df)`` and a
    bare call are equivalent.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain EXT_SOURCE_1/2/3, AMT_CREDIT, AMT_ANNUITY,
        AMT_INCOME_TOTAL, CNT_FAM_MEMBERS, DAYS_BIRTH, DAYS_EMPLOYED,
        DAYS_ID_PUBLISH and the four AMT_REQ_CREDIT_BUREAU_{HOUR,MON,QRT,YEAR}
        columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns added: EXT_SOURCE_MEAN,
        EXT_SOURCE_PRODUCT, EXT_SOURCE_WEIGHTED, CREDIT_INCOME_RATIO,
        ANNUITY_INCOME_RATIO, INCOME_PER_PERSON, AGE_YEARS, EMPLOYED_YEARS,
        CREDIT_BUREAU_TOTAL, DAYS_ID_PUBLISH_YEARS.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    # 1. EXT_SOURCE combinations.
    ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    # Row mean skips NaN, so it is defined whenever at least one score exists.
    df['EXT_SOURCE_MEAN'] = df[ext_cols].mean(axis=1)
    # The product is NaN as soon as any one of the three scores is missing.
    df['EXT_SOURCE_PRODUCT'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    # Weighted sum (0.3 / 0.4 / 0.3); a missing score contributes 0.
    df['EXT_SOURCE_WEIGHTED'] = (
        df['EXT_SOURCE_1'].fillna(0) * 0.3 +
        df['EXT_SOURCE_2'].fillna(0) * 0.4 +
        df['EXT_SOURCE_3'].fillna(0) * 0.3
    )

    # 2. Financial ratios.
    # A zero denominator would produce +/-inf, which many estimators and
    # scalers reject; treat those rows as missing (NaN) instead.
    income_denom = df['AMT_INCOME_TOTAL'].mask(df['AMT_INCOME_TOTAL'] == 0)
    family_denom = df['CNT_FAM_MEMBERS'].mask(df['CNT_FAM_MEMBERS'] == 0)
    df['CREDIT_INCOME_RATIO'] = df['AMT_CREDIT'] / income_denom
    df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / income_denom
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / family_denom

    # 3. Durations in years. The DAYS_* inputs are negated, i.e. they are
    # expected to be non-positive day offsets.
    df['AGE_YEARS'] = -df['DAYS_BIRTH'] / 365.25
    employed_years = df['DAYS_EMPLOYED'] / 365.25
    # Positive DAYS_EMPLOYED values (such as the 365243 placeholder) are not
    # real durations: blank them before flipping the sign.
    df['EMPLOYED_YEARS'] = -employed_years.mask(employed_years > 0)

    # 4. Total credit bureau inquiries; a missing count is treated as 0.
    df['CREDIT_BUREAU_TOTAL'] = (
        df['AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(0) +
        df['AMT_REQ_CREDIT_BUREAU_MON'].fillna(0) +
        df['AMT_REQ_CREDIT_BUREAU_QRT'].fillna(0) +
        df['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(0)
    )

    # 5. Years since the ID document was published.
    df['DAYS_ID_PUBLISH_YEARS'] = -df['DAYS_ID_PUBLISH'] / 365.25

    return df

In [None]:
# Run the feature-engineering step on the training frame first, then on the
# test frame, rebinding both names to the returned frames.
train, test = (create_derived_features(frame) for frame in (train, test))

In [None]:
# Display the training frame (as the cell's last expression) to inspect the
# columns added by create_derived_features.
train

In [None]:
# Display the test frame (as the cell's last expression) to inspect the
# columns added by create_derived_features.
test

In [None]:

# Lift the column cap on DataFrame rendering so wide frames are shown in full.
pd.options.display.max_columns = None

COUNT ENCODING UNDER HERE

In [None]:
# Count encoding of ORGANIZATION_TYPE: each category is replaced by the number
# of training rows carrying it. The lookup comes from train only and is applied
# to both frames; values with no entry in the lookup (NaN included) become NaN.
organization_ce = train["ORGANIZATION_TYPE"].value_counts()
train["ORGANIZATION_TYPE"], test["ORGANIZATION_TYPE"] = (
    train["ORGANIZATION_TYPE"].map(organization_ce),
    test["ORGANIZATION_TYPE"].map(organization_ce),
)

In [None]:
# Count encoding of NAME_FAMILY_STATUS using training-set frequencies; values
# with no entry in the lookup become NaN.
nameFamStatus = train["NAME_FAMILY_STATUS"].value_counts()
train["NAME_FAMILY_STATUS"], test["NAME_FAMILY_STATUS"] = (
    train["NAME_FAMILY_STATUS"].map(nameFamStatus),
    test["NAME_FAMILY_STATUS"].map(nameFamStatus),
)

In [None]:
# Count encoding of NAME_INCOME_TYPE using training-set frequencies; values
# with no entry in the lookup become NaN.
incomeType = train["NAME_INCOME_TYPE"].value_counts()
train["NAME_INCOME_TYPE"], test["NAME_INCOME_TYPE"] = (
    train["NAME_INCOME_TYPE"].map(incomeType),
    test["NAME_INCOME_TYPE"].map(incomeType),
)

In [None]:
# Count encoding of NAME_EDUCATION_TYPE using training-set frequencies; values
# with no entry in the lookup become NaN.
eduType = train["NAME_EDUCATION_TYPE"].value_counts()
train["NAME_EDUCATION_TYPE"], test["NAME_EDUCATION_TYPE"] = (
    train["NAME_EDUCATION_TYPE"].map(eduType),
    test["NAME_EDUCATION_TYPE"].map(eduType),
)

In [None]:
# Count encoding of OCCUPATION_TYPE using training-set frequencies. Missing
# occupations are not counted by value_counts, so they stay NaN after mapping.
occType = train["OCCUPATION_TYPE"].value_counts()
train["OCCUPATION_TYPE"], test["OCCUPATION_TYPE"] = (
    train["OCCUPATION_TYPE"].map(occType),
    test["OCCUPATION_TYPE"].map(occType),
)

In [None]:
# Count encoding of NAME_HOUSING_TYPE using training-set frequencies.
# NOTE: the lookup is stored under the name `occType`, so from this cell on
# that name holds housing-type counts.
occType = train["NAME_HOUSING_TYPE"].value_counts()
train["NAME_HOUSING_TYPE"], test["NAME_HOUSING_TYPE"] = (
    train["NAME_HOUSING_TYPE"].map(occType),
    test["NAME_HOUSING_TYPE"].map(occType),
)

In [None]:
# Treat REGION_POPULATION_RELATIVE values of 0.07 and above as outliers and
# mark them as missing.
# The row filter and the column being blanked must be the same column:
# filtering on REGION_POPULATION_RELATIVE while writing NaN into OWN_CAR_AGE
# would erase valid car ages for every applicant in a high-population region
# and leave the population outliers untouched.
train.loc[train["REGION_POPULATION_RELATIVE"] >= 0.07, "REGION_POPULATION_RELATIVE"] = np.nan
test.loc[test["REGION_POPULATION_RELATIVE"] >= 0.07, "REGION_POPULATION_RELATIVE"] = np.nan

LABEL ENCODING UNDER HERE

In [None]:
# Label encoding of NAME_CONTRACT_TYPE: 'Cash loans' -> 0, 'Revolving loans' -> 1.
# Values not listed in the mapping are left unchanged by replace().
# The result is assigned back to the column: calling replace(inplace=True) on
# a column selection only edits a temporary object when pandas Copy-on-Write is
# active, which would leave the frame untouched. infer_objects() turns the
# resulting object column into a numeric one once only integers remain.
contract_type_codes = {'Cash loans': 0, 'Revolving loans': 1}
train["NAME_CONTRACT_TYPE"] = train["NAME_CONTRACT_TYPE"].replace(contract_type_codes).infer_objects()
test["NAME_CONTRACT_TYPE"] = test["NAME_CONTRACT_TYPE"].replace(contract_type_codes).infer_objects()

In [None]:
# Label encoding of CODE_GENDER: 'XNA' -> 0, 'F' -> 1, 'M' -> 2.
# Values not listed in the mapping are left unchanged by replace().
# The result is assigned back to the column: calling replace(inplace=True) on
# a column selection only edits a temporary object when pandas Copy-on-Write is
# active, which would leave the frame untouched. infer_objects() turns the
# resulting object column into a numeric one once only integers remain.
gender_codes = {'XNA': 0, 'F': 1, 'M': 2}
train["CODE_GENDER"] = train["CODE_GENDER"].replace(gender_codes).infer_objects()
test["CODE_GENDER"] = test["CODE_GENDER"].replace(gender_codes).infer_objects()

FILL NAN (MISSING VALUES) UNDER HERE

In [None]:
# Mean imputation for the external scores and the credit-bureau inquiry counts.
#
# * The fill value is the TRAIN mean of each column and is reused for test, so
#   both frames receive the same value and no test statistic enters the
#   preprocessing.
# * The mean is computed once, before train is filled (filling a column with
#   its own mean leaves that mean unchanged apart from floating-point rounding).
# * The filled column is assigned back rather than calling
#   fillna(inplace=True) on a column selection, which only edits a temporary
#   object when pandas Copy-on-Write is active.
MEAN_IMPUTED_COLUMNS = [
    "EXT_SOURCE_2",
    "EXT_SOURCE_1",
    "EXT_SOURCE_3",
    "AMT_REQ_CREDIT_BUREAU_HOUR",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
]

for column in MEAN_IMPUTED_COLUMNS:
    train_mean = train[column].mean()
    train[column] = train[column].fillna(train_mean)
    test[column] = test[column].fillna(train_mean)

ONE HOT ENCODING UNDER HERE

In [None]:
# OWN_CAR_AGE values of 60 or more are considered outliers: overwrite them
# with NaN in both frames so they are handled as missing.
train.loc[train["OWN_CAR_AGE"].ge(60), "OWN_CAR_AGE"] = np.nan
test.loc[test["OWN_CAR_AGE"].ge(60), "OWN_CAR_AGE"] = np.nan

In [None]:
# Bucket OWN_CAR_AGE by floor-dividing by 10 (0-9 -> 0, 10-19 -> 1, ...);
# NaN stays NaN.
train["OWN_CAR_AGE"] = train["OWN_CAR_AGE"].floordiv(10)
test["OWN_CAR_AGE"] = test["OWN_CAR_AGE"].floordiv(10)

In [None]:
# One-hot encode OWN_CAR_AGE; the indicator columns are named
# "OWN_CAR_AGE_<value>". Rows where OWN_CAR_AGE is NaN get no indicator set.
train_car_age_ohe = pd.get_dummies(train["OWN_CAR_AGE"]).add_prefix("OWN_CAR_AGE_")
test_car_age_ohe = pd.get_dummies(test["OWN_CAR_AGE"]).add_prefix("OWN_CAR_AGE_")

# get_dummies only creates columns for values present in the frame it is
# given, so train and test can end up with different indicator columns.
# Align test to train: indicators missing from test are added as all-zero
# columns, and indicators that exist only in test are dropped.
test_car_age_ohe = test_car_age_ohe.reindex(
    columns=train_car_age_ohe.columns, fill_value=0
)

# Swap the original OWN_CAR_AGE column for its indicator columns, which are
# appended at the end of each frame.
train = pd.concat([train.drop(columns="OWN_CAR_AGE"), train_car_age_ohe], axis=1)
test = pd.concat([test.drop(columns="OWN_CAR_AGE"), test_car_age_ohe], axis=1)

In [None]:
# Display the training frame (as the cell's last expression) to inspect the
# result of the encoding and imputation steps above.
train

In [None]:
# Display the test frame (as the cell's last expression) to inspect the
# result of the encoding and imputation steps above.
test