In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
import os
import sys

# Put the directory two levels above the working directory on the import path
# (presumably the project root that holds `fileDir` and `modules` — confirm).
root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)

# Project-local helpers; these only resolve once root_path is importable.
from fileDir import getDataDir
from modules.v1.catboostModel import trainCatboost, testCatboost

In [3]:
# Version tag passed to getDataDir for the cleaned files and to the CatBoost helpers.
VERSION = 2

# Raw inputs: getDataDir is called without a version for the original CSVs.
TRAIN_PATH, TEST_PATH = getDataDir("train"), getDataDir("test")

# Destinations for the cleaned copies written by cleanDataset.
CLEANED_TRAIN_PATH, CLEANED_TEST_PATH = (
    getDataDir("train", VERSION),
    getDataDir("test", VERSION),
)

In [4]:
# Read the raw training CSV once just to inspect which columns are available.
df = pd.read_csv(TRAIN_PATH)
df.columns

Index(['ID', 'pms_i_ymd', 'Area', 'Province', 'Shop Name', 'gender',
       'date_of_birth_week', 'date_of_birth', 'marital_status',
       'number_of_children', 'postal_code', 'tel_category',
       'number_of_resident', 'living_period_year', 'living_period_month',
       'type_of_residence', 'c_postal_code', 'c_business_type',
       'c_number_of_employee', 'c_position', 'c_occupation',
       'c_employment_status', 'c_monthly_salary', 'c_number_of_working_year',
       'c_number_of_working_month', 'c_salary_payment_methods',
       'c_date_of_salary_payment', 'media', 'place_for_sending_information',
       'r_expected_credit_limit', 'r_propose', 'r_allloan_case',
       'r_allloan_amount', 'r_additional_income', 'r_spouse_income',
       'r_generalcode1', 'r_generalcode2', 'r_generalcode3', 'r_generalcode4',
       'r_generalcode5', 'apply', 'default_12month'],
      dtype='object')

## Clean Datasets

In [None]:
# Columns handed to testCatboost as categorical features. cleanDataset casts
# each of them to str before saving.
cat_features = [
    # label-encoded source columns
    'gender',
    'c_business_type',
    'c_position',
    'c_occupation',
    'c_employment_status',
    'c_monthly_salary',
    'r_expected_credit_limit',
    'r_additional_income',
    'r_generalcode3',
    'apply',
    # engineered columns
    'age',
    'months_at_residence',
    'months_at_job',
    'total_income',
]

In [6]:
def cleanDataset(df, cleanPath):
    """Clean a raw application frame, write it to ``cleanPath`` and return it.

    The frame passed in is the one that gets cleaned; it is copied first, so
    the caller's object is left untouched. (The previous version ignored
    ``df`` and always re-read the raw training CSV, so the "cleaned test set"
    was in fact training data.)

    Steps, in order:
      1. Derive ``age``, ``months_at_residence``, ``months_at_job`` and
         ``total_income`` from the raw columns.
      2. Keep a fixed set of model columns (plus ``default_12month`` when the
         frame has it), so every split ends up with the same schema.
      3. Fill missing values: median for numeric columns, most frequent value
         for the others.
      4. Label-encode ``gender``, ``apply``, ``r_generalcode3``,
         ``r_additional_income`` and the target with sorted-unique integer
         codes.
      5. Labelled frames only: drop rows outside 1.5 * IQR, one numeric
         feature at a time. Unlabelled frames keep every row so predictions
         stay aligned with IDs captured by the caller.
      6. Bin six numeric features into terciles and label-encode them together
         with the remaining categorical columns.
      7. Cast every feature column to ``str`` and save without the index.

    NOTE(review): fill values, tercile edges and label codes are computed from
    the frame given to each call, so codes from two different frames are only
    comparable when both contain the same category values — confirm this is
    acceptable for train vs. test.

    Args:
        df: Raw DataFrame holding at least the columns used below; an absent
            ``default_12month`` column marks the frame as unlabelled.
        cleanPath: Destination passed to ``DataFrame.to_csv``.

    Returns:
        The cleaned DataFrame: feature columns as strings of integer codes,
        the target (when present) as integer codes.

    Raises:
        KeyError: if a required raw column is missing.
        ValueError: from ``pd.qcut`` when a binned column has too few distinct
            values to form three bins.
    """
    target = "default_12month"
    # Final column order; the target position matches the previous output.
    column_order = [
        "gender", "c_business_type", "c_position", "c_occupation",
        "c_employment_status", "c_monthly_salary", "r_expected_credit_limit",
        "r_additional_income", "r_generalcode3", "apply", target,
        "age", "months_at_residence", "months_at_job", "total_income",
    ]
    binned_cols = [
        "c_monthly_salary", "r_expected_credit_limit", "age",
        "months_at_residence", "months_at_job", "total_income",
    ]
    early_encoded = ["gender", "apply", "r_generalcode3", "r_additional_income", target]
    late_encoded = ["c_employment_status", "c_occupation", "c_position", "c_business_type"] + binned_cols

    data = df.copy()
    has_target = target in data.columns

    # --- feature engineering -------------------------------------------------
    application_date = pd.to_datetime(data["pms_i_ymd"], errors="coerce")
    birth_date = pd.to_datetime(data["date_of_birth"], errors="coerce")
    # Floor division keeps NaN for unparseable dates (filled with the median
    # below) instead of crashing the way `.astype(int)` does on NaN.
    data["age"] = (application_date - birth_date).dt.days // 365
    data["months_at_residence"] = data["living_period_year"] * 12 + data["living_period_month"]
    data["months_at_job"] = data["c_number_of_working_year"] * 12 + data["c_number_of_working_month"]
    data["total_income"] = data["c_monthly_salary"] + data["r_additional_income"]

    # --- column selection ----------------------------------------------------
    # Explicit selection replaces the old drop list + `dropna(thresh=...)`,
    # whose outcome depended on how many values happened to be missing.
    kept = [col for col in column_order if has_target or col != target]
    data = data[kept].copy()
    feature_cols = [col for col in kept if col != target]

    # --- imputation ----------------------------------------------------------
    # Captured before encoding: a column counts as numeric by its raw dtype.
    numeric_cols = list(data.select_dtypes(include="number").columns)
    for col in kept:
        if not data[col].isna().any():
            continue
        if col in numeric_cols:
            # NOTE(review): as before, a numeric target is median-filled too;
            # consider dropping rows with a missing label instead.
            fill_value = data[col].median()
        else:
            modes = data[col].mode(dropna=True)
            if modes.empty:
                continue  # column is entirely missing; nothing to fill with
            fill_value = modes.iloc[0]  # smallest value among ties
        data[col] = data[col].fillna(fill_value)

    # --- first encoding pass -------------------------------------------------
    # factorize(sort=True) assigns 0..n-1 in sorted order of the unique values,
    # the same mapping LabelEncoder.fit_transform produced.
    for col in early_encoded:
        if col in data.columns:
            data[col] = pd.factorize(data[col], sort=True)[0]

    # --- outlier removal (labelled frames only) ------------------------------
    if has_target:
        for col in numeric_cols:
            if col == target:
                continue
            q1 = data[col].quantile(0.25)
            q3 = data[col].quantile(0.75)
            iqr = q3 - q1
            # Bounds are recomputed on the rows that survived earlier columns.
            data = data[data[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
        data = data.copy()  # detach from the filtered view before assigning

    # --- tercile binning + second encoding pass ------------------------------
    tercile_labels = ["low", "medium", "high"]
    for col in binned_cols:
        # Cast to str so the codes follow alphabetical order
        # (high=0, low=1, medium=2), matching the earlier encoding.
        data[col] = pd.qcut(data[col], q=3, labels=tercile_labels).astype(str)
    for col in late_encoded:
        data[col] = pd.factorize(data[col], sort=True)[0]

    # Every feature is treated as categorical downstream, so store them as str.
    for col in feature_cols:
        data[col] = data[col].astype(str)

    data.to_csv(cleanPath, index=False)
    return data

In [7]:
# Load both raw splits and keep the test IDs aside: cleanDataset drops the
# "ID" column, and the IDs are needed again when testCatboost is called.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
ids = test_df["ID"].copy()

# Clean each split and persist it to its versioned path.
train_df = cleanDataset(train_df, CLEANED_TRAIN_PATH)
test_df = cleanDataset(test_df, CLEANED_TEST_PATH)

In [8]:
# Preview the first rows of the cleaned training frame.
train_df.head()

Unnamed: 0,gender,c_business_type,c_position,c_occupation,c_employment_status,c_monthly_salary,r_expected_credit_limit,r_additional_income,r_generalcode3,apply,default_12month,age,months_at_residence,months_at_job,total_income
3,2,2,2,4,1,0,2,0,1,4,0,2,2,0,0
5,1,7,1,2,0,2,1,0,2,4,0,1,1,2,1
6,1,7,1,2,0,2,1,0,1,4,0,1,0,2,1
7,1,3,2,3,1,1,1,0,1,4,0,2,0,1,1
16,1,4,1,2,0,2,2,0,1,4,0,1,2,2,2


## Training and Testing

In [9]:
# Call trainCatboost on the cleaned training frame; its return value is kept as
# `threshold` and passed to testCatboost in the next cell.
# NOTE(review): the saved output of this cell is
# "ValueError: 'default_12month' is not in list", so `threshold` was never
# assigned in that run — confirm which columns trainCatboost expects.
threshold = trainCatboost(VERSION, train_df)

ValueError: 'default_12month' is not in list

In [None]:
# Call testCatboost with the cleaned test frame, the test IDs saved before
# cleaning, the value returned by trainCatboost, and the categorical column list.
# Requires the training cell to have completed; otherwise `threshold` is
# undefined (the saved output shows exactly that NameError).
testCatboost(VERSION, test_df, ids, threshold, cat_features)

NameError: name 'threshold' is not defined