In [2]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [3]:
def load_df(file_name:str) -> pd.DataFrame:
    df = pd.read_csv(f"data/credit/{file_name}.csv")
    print(f"Dataset {file_name} info:\n\n"
            f"Is NA:\n{df.isna().sum()}\n\n" +  
            f"Is dtype:\n{df.dtypes}\n\n" +
            f"Columns:\n{'\n'.join(df.columns)}" +
            f"Length of dataset:\n{df.shape[0]}")
    return df
test = load_df("test")
train = load_df("train")

def load_df(file_name:str) -> pd.DataFrame:
    df = pd.read_csv(f"data/credit/{file_name}.csv")
    print(f"Dataset {file_name} info:\n\n"
            f"Is NA:\n{df.isna().sum()}\n\n" +  
            f"Is dtype:\n{df.dtypes}\n\n" +
            f"Columns:\n{'\n'.join(df.columns)}" +
            f"Length of dataset:\n{df.shape[0]}")
    return df
test = load_df("test")
train = load_df("train")



Dataset test info:

Is NA:
Id                                         0
RevolvingUtilizationOfUnsecuredLines       0
age                                        0
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
MonthlyIncome                           8950
NumberOfOpenCreditLinesAndLoans            0
NumberOfTimes90DaysLate                    0
NumberRealEstateLoansOrLines               0
NumberOfTime60-89DaysPastDueNotWorse       0
NumberOfDependents                      1175
dtype: int64

Is dtype:
Id                                        int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysP

In [4]:
def clean_dataset(df:pd.DataFrame) -> pd.DataFrame:
    clean_df = df.copy()
    median_age = df["age"].median()
    clean_df.loc[(df["age"] < 21) | (df["age"] > 80), "age"] = median_age
    clean_df.loc[(df["DebtRatio"] > 1), "DebtRatio"] = 1 
    clean_df["MonthlyIncome"] = clean_df["MonthlyIncome"].fillna(clean_df["MonthlyIncome"].median())
    clean_df["NumberOfDependents"] = clean_df["NumberOfDependents"].fillna(0)

    return clean_df

In [32]:
clean_train = clean_dataset(train)
clean_train.to_csv("temp/train.csv", index=False)

In [10]:
for col in clean_train:
    print(f"Median of {col}: {clean_train[col].median()}")

Median of Id: 74931.0
Median of SeriousDlqin2yrs: 0.0
Median of RevolvingUtilizationOfUnsecuredLines: 0.153983669
Median of age: 52.0
Median of NumberOfTime30-59DaysPastDueNotWorse: 0.0
Median of DebtRatio: 0.36662593
Median of MonthlyIncome: 5400.0
Median of NumberOfOpenCreditLinesAndLoans: 8.0
Median of NumberOfTimes90DaysLate: 0.0
Median of NumberRealEstateLoansOrLines: 1.0
Median of NumberOfTime60-89DaysPastDueNotWorse: 0.0
Median of NumberOfDependents: 0.0


In [49]:
X = clean_train.drop(["Id", "SeriousDlqin2yrs"], axis=1)
y = clean_train["SeriousDlqin2yrs"].copy()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, solver="lbfgs", class_weight="balanced")
)

model.fit(X_train, y_train)

0,1,2
,steps,"[('standardscaler', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [51]:
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]
print(f"{y_proba}")

print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred, digits=3))
print("ROC-AUC:", roc_auc_score(y_val, y_proba))

[0.44069942 0.49908167 0.50476369 ... 0.17130037 0.48401983 0.52614872]
[[15241  4361]
 [  451   908]]
              precision    recall  f1-score   support

           0      0.971     0.778     0.864     19602
           1      0.172     0.668     0.274      1359

    accuracy                          0.770     20961
   macro avg      0.572     0.723     0.569     20961
weighted avg      0.919     0.770     0.825     20961

ROC-AUC: 0.7980441582187519


In [43]:
clean_test = clean_dataset(test)

model.fit(X, y)
res = model.predict(clean_test.drop("Id", axis=1))
print(res)

commpetitions = pd.DataFrame({
    "Id": clean_test["Id"],
    "SeriousDlqin2yrs": res
})
commpetitions.to_csv("commpetitions.csv", index=False)

[1 0 0 ... 0 0 0]
