In [1]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score, roc_auc_score, confusion_matrix 

In [2]:
def load_df(file_name: str, directory: str) -> pd.DataFrame:
    """Read ``data/<directory>/<file_name>.csv`` and print a short summary.

    The summary lists, in order: per-column missing-value counts, per-column
    dtypes, the column names (one per line) and the number of rows.

    Args:
        file_name: CSV file name without the ``.csv`` extension.
        directory: Sub-folder of ``data/`` that contains the file.

    Returns:
        The loaded DataFrame, unmodified.
    """
    df = pd.read_csv(f"data/{directory}/{file_name}.csv")

    # Joined outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    column_list = "\n".join(map(str, df.columns))

    print(
        f"Dataset {file_name} info:\n\n"
        f"Is NA:\n{df.isna().sum()}\n\n"
        f"Is dtype:\n{df.dtypes}\n\n"
        # The trailing blank line keeps the last column name from running
        # straight into the "Length of dataset" heading.
        f"Columns:\n{column_list}\n\n"
        f"Length of dataset:\n{df.shape[0]}"
    )
    return df
# Sub-folder of data/ that holds the CSV files loaded below.
DIRECTORY = "credit"

# Each call also prints a summary of the loaded frame (see load_df).
test = load_df("test", DIRECTORY)
train = load_df("train", DIRECTORY)

Dataset test info:

Is NA:
Id                                         0
RevolvingUtilizationOfUnsecuredLines       0
age                                        0
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
MonthlyIncome                           8950
NumberOfOpenCreditLinesAndLoans            0
NumberOfTimes90DaysLate                    0
NumberRealEstateLoansOrLines               0
NumberOfTime60-89DaysPastDueNotWorse       0
NumberOfDependents                      1175
dtype: int64

Is dtype:
Id                                        int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysP

In [3]:
# Baseline keyword arguments passed to XGBClassifier by the modelling cells.
params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    random_state=42,  # fixed seed so repeated fits are reproducible
)

In [10]:
# Fit on the full training set and write the submission file.
X = train.drop(["Id", "SeriousDlqin2yrs"], axis=1)
y = train["SeriousDlqin2yrs"].copy()

# When the notebook is run top-to-bottom, `params` carries no
# `scale_pos_weight` at this point, so this model is fit without class
# re-weighting.
model = XGBClassifier(**params)
model.fit(X, y)

# Build the test feature matrix once; "Id" is an identifier, not a feature.
X_test = test.drop("Id", axis=1)

commpetitions = pd.DataFrame({
    "Id": test["Id"],
    # Column index 1 of predict_proba: probability of the class labelled 1.
    "SeriousDlqin2yrs": model.predict_proba(X_test)[:, 1],
})
commpetitions.to_csv("submission.csv", index=False)

In [6]:
# Hold-out experiment: train on 80% of the labelled data, keep 20% for validation.
X = train.drop(["Id", "SeriousDlqin2yrs"], axis=1)
y = train["SeriousDlqin2yrs"].copy()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Negative-to-positive label ratio, used to up-weight the positive class.
# Stored under its own name instead of rebinding `params`: overwriting the
# shared config made other cells that read `params` behave differently
# depending on whether this cell had already been executed.
weighted_params = {
    **params,
    "scale_pos_weight": (y == 0).sum() / (y == 1).sum(),
}
model = XGBClassifier(**weighted_params)
model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [7]:
# Score the hold-out split: class-1 probabilities feed the AUC, hard labels
# feed accuracy, the classification report and the confusion matrix.
pred_proba = model.predict_proba(X_val)[:, 1]
pred = model.predict(X_val)
print(pred_proba)

auc = roc_auc_score(y_val, pred_proba)
accuracy = accuracy_score(y_val, pred)

print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_val, pred, digits=3))
print(confusion_matrix(y_val, pred))

[0.14047405 0.5327456  0.06610609 ... 0.55911785 0.06217201 0.42060462]
AUC: 0.8671
Accuracy: 0.8007
              precision    recall  f1-score   support

           0      0.982     0.802     0.883     19637
           1      0.210     0.777     0.330      1324

    accuracy                          0.801     20961
   macro avg      0.596     0.790     0.607     20961
weighted avg      0.933     0.801     0.848     20961

[[15755  3882]
 [  295  1029]]
