In [128]:
import pandas as pd
import matplotlib as mplt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report,accuracy_score 
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

In [129]:
def load_df(file: str, direct: str) -> pd.DataFrame:
    """Read one CSV file from the project's ``data`` folder.

    Args:
        file: File name without the ``.csv`` extension.
        direct: Name of the sub-directory of ``data/`` that holds the file.

    Returns:
        The parsed contents of ``data/<direct>/<file>.csv``.
    """
    # The path is relative, so it resolves against the current working directory.
    csv_path = f"data/{direct}/{file}.csv"
    return pd.read_csv(csv_path)

In [130]:
DIR = "credit_default"
train = load_df("application_train", DIR)
test = load_df("application_test", DIR)

In [132]:
def enc_cols(df: pd.DataFrame, drop_source_cols: bool = False) -> pd.DataFrame:
    """Label-encode text columns and add engineered ratio/count features.

    The input frame is never modified; all work happens on a copy, so the
    calling cell can be re-run without changing its inputs.

    Steps:
      1. Every ``object``-dtype column except ``SK_ID_CURR`` is replaced by
         integer codes from a ``LabelEncoder`` fitted on that column.
      2. Five ratio features are added (credit/income, annuity/income,
         annuity/credit, days employed/days since birth, income per family
         member).
      3. ``DOCS_PROVIDED_COUNT`` is added as the row-wise sum of
         ``FLAG_DOCUMENT_2`` .. ``FLAG_DOCUMENT_19``.

    Args:
        df: Frame containing the ``AMT_*``, ``DAYS_*``, ``CNT_FAM_MEMBERS`` and
            ``FLAG_DOCUMENT_2``..``FLAG_DOCUMENT_19`` columns used below.
        drop_source_cols: When True, remove the document flags and the raw
            ``AMT_ANNUITY``, ``AMT_INCOME_TOTAL``, ``CNT_FAM_MEMBERS`` and
            ``DAYS_BIRTH`` columns after the derived features are built.
            Defaults to False, which keeps every original column (the earlier
            version called ``DataFrame.drop`` without using its result, so
            nothing was ever removed).

    Returns:
        A new DataFrame with the encoded and engineered columns.

    Raises:
        KeyError: If one of the required source columns is missing.

    Notes:
        Encoders are fitted on the frame passed in. Calling this separately on
        two frames (e.g. train and test) can therefore assign different integer
        codes to the same category when their sets of values differ.
    """
    out = df.copy()

    categorical_cols = [
        col for col in out.columns
        if out[col].dtype == 'object' and col != "SK_ID_CURR"
    ]
    for cat_col in categorical_cols:
        out[cat_col] = LabelEncoder().fit_transform(out[cat_col])

    out["CREDIT_INCOME_PERCENT"] = out["AMT_CREDIT"] / out["AMT_INCOME_TOTAL"]
    out["ANNUITY_INCOME_PERCENT"] = out["AMT_ANNUITY"] / out["AMT_INCOME_TOTAL"]
    out["CREDIT_TERM"] = out["AMT_ANNUITY"] / out["AMT_CREDIT"]
    out["DAYS_EMPLOYED_PERC"] = out["DAYS_EMPLOYED"] / out["DAYS_BIRTH"]
    out["INCOME_PER_PERSON"] = out["AMT_INCOME_TOTAL"] / out["CNT_FAM_MEMBERS"]

    # The count must be computed before any flag column is removed.
    # NOTE(review): range(2, 20) covers FLAG_DOCUMENT_2..19 only - confirm the
    # data has no further document flags that should be counted.
    doc_flags = [f"FLAG_DOCUMENT_{i}" for i in range(2, 20)]
    out["DOCS_PROVIDED_COUNT"] = out[doc_flags].sum(axis=1)

    if drop_source_cols:
        # DataFrame.drop returns a new frame, so the result has to be kept.
        raw_inputs = ["AMT_ANNUITY", "AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS", "DAYS_BIRTH"]
        out = out.drop(columns=doc_flags + raw_inputs)

    return out

In [147]:
# Number of TARGET == 0 rows divided by the number of TARGET == 1 rows,
# passed to XGBoost as scale_pos_weight in both parameter sets below.
ratio = (train["TARGET"] == 0).sum() / (train["TARGET"] == 1).sum()

base_params = {
    "eval_metric": "auc",
    "learning_rate": 0.1,
    "max_depth": 6,
    "max_leaves": 6,
    "tree_method": "hist",
    "scale_pos_weight": ratio
}

# Alternative configuration; not selected below.
params_2 = {
    "learning_rate": 0.03,
    "max_depth": 7,
    "n_estimators": 1000,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # Reuse `ratio` rather than `y`: `y` is only created in a later cell, so
    # referencing it here fails with NameError on a fresh kernel.
    "scale_pos_weight": ratio,
    "eval_metric": "auc",
    "tree_method": "hist",
    "random_state": 42,
}


# Parameter set used by the training cell.
params = base_params

In [161]:
# Build the feature matrix and label vector from the encoded training frame.
df = enc_cols(train)

X = df.drop(["SK_ID_CURR", "TARGET"], axis=1)
y = df['TARGET'].copy()

# NOTE(review): test_size=0.8 holds out 80% of the rows for validation and
# trains on the remaining 20% - confirm this split is intended.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.8, random_state=52)

tree = XGBClassifier(**params)
# Left as the last expression so the fitted estimator is displayed.
tree.fit(X_train, y_train, verbose=False)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [162]:
# Score the hold-out split: positive-class probability for ROC-AUC, hard
# class predictions for accuracy and the per-class report.
proba = tree.predict_proba(X_val)[:, 1]
pred = tree.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in that order.
print(f"Accuracy: {accuracy_score(y_val, pred):.6f}")
print(f"ROC-AUC: {roc_auc_score(y_val, proba):.6f}")
print(classification_report(y_val, pred))
# Recorded reference value, presumably from an earlier run: ROC-AUC 0.754266

Accurancy: 0.725126
ROC-AUC: 0.747130
              precision    recall  f1-score   support

           0       0.96      0.73      0.83    146807
           1       0.17      0.63      0.27     13099

    accuracy                           0.73    159906
   macro avg       0.57      0.68      0.55    159906
weighted avg       0.89      0.73      0.78    159906



In [168]:
# Apply the same feature engineering to the competition test set.
test_clean = enc_cols(test)

# Refit on every labelled row before scoring the test set.
tree.fit(X, y)

# Select features by the training column names so the matrix handed to the
# model has the same columns, in the same order, as the one it was fitted on.
# A missing column fails here with a KeyError.
test_features = test_clean[X.columns]

commpetitions = pd.DataFrame({
    "SK_ID_CURR": test["SK_ID_CURR"].copy(),
    "TARGET": tree.predict_proba(test_features)[:, 1],
})
commpetitions.to_csv("submission.csv", index=False)


In [169]:
# Write the submission file again. The previous self-assignment of
# SK_ID_CURR changed nothing and has been removed.
# NOTE(review): if a dtype conversion of SK_ID_CURR was intended here, add it
# explicitly before exporting.
commpetitions.to_csv("submission.csv", index=False)

In [170]:
# Display the column dtypes of the submission frame.
commpetitions.dtypes

SK_ID_CURR     object
TARGET        float32
dtype: object