In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Loading and exploring data

In [2]:
filepath = os.path.join('..','raw_data', 'dataset.csv')
data = pd.read_csv(filepath, sep=';')

In [3]:
data.head(5)

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [92]:
test_data= data[data['default'].isnull()]

In [5]:
train_data =data.drop(index=test_data.index)

In [6]:
train_data.duplicated().sum()

0

In [7]:
X_train = train_data.drop(columns=['default', 'uuid'], axis=1)
y_train = train_data['default']

In [78]:
X_train['merchant_group'].nunique()

12

In [9]:
categorical_columns = X_train.select_dtypes(include=['object', 'bool']).columns.to_list()
numerical_columns = X_train.select_dtypes(exclude=['object', 'bool']).columns.to_list()

In [10]:
numerical_columns
categorical_columns

['merchant_category', 'merchant_group', 'has_paid', 'name_in_email']

In [11]:
X_train.isnull().sum()/len(X_train)

account_amount_added_12_24m            0.000000
account_days_in_dc_12_24m              0.118732
account_days_in_rem_12_24m             0.118732
account_days_in_term_12_24m            0.118732
account_incoming_debt_vs_paid_0_24m    0.593014
account_status                         0.543856
account_worst_status_0_3m              0.543856
account_worst_status_12_24m            0.667456
account_worst_status_3_6m              0.577243
account_worst_status_6_12m             0.603639
age                                    0.000000
avg_payment_span_0_12m                 0.238597
avg_payment_span_0_3m                  0.493265
merchant_category                      0.000000
merchant_group                         0.000000
has_paid                               0.000000
max_paid_inv_0_12m                     0.000000
max_paid_inv_0_24m                     0.000000
name_in_email                          0.000000
num_active_div_by_paid_inv_0_12m       0.229595
num_active_inv                         0

columns to drop:  
    worst_status_active_inv because 70% data missing  
    merchant_category because is the same thing as merchant group

In [12]:
columns_to_drop = ['merchant_category', 'worst_status_active_inv']

In [13]:
X_train.columns

Index(['account_amount_added_12_24m', 'account_days_in_dc_12_24m',
       'account_days_in_rem_12_24m', 'account_days_in_term_12_24m',
       'account_incoming_debt_vs_paid_0_24m', 'account_status',
       'account_worst_status_0_3m', 'account_worst_status_12_24m',
       'account_worst_status_3_6m', 'account_worst_status_6_12m', 'age',
       'avg_payment_span_0_12m', 'avg_payment_span_0_3m', 'merchant_category',
       'merchant_group', 'has_paid', 'max_paid_inv_0_12m',
       'max_paid_inv_0_24m', 'name_in_email',
       'num_active_div_by_paid_inv_0_12m', 'num_active_inv',
       'num_arch_dc_0_12m', 'num_arch_dc_12_24m', 'num_arch_ok_0_12m',
       'num_arch_ok_12_24m', 'num_arch_rem_0_12m',
       'num_arch_written_off_0_12m', 'num_arch_written_off_12_24m',
       'num_unpaid_bills', 'status_last_archived_0_24m',
       'status_2nd_last_archived_0_24m', 'status_3rd_last_archived_0_24m',
       'status_max_archived_0_6_months', 'status_max_archived_0_12_months',
       'status_max

In [37]:
len(y_train[y_train ==0])/len(y_train)

0.9856850715746421

# Preprocessing

In [52]:
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from imblearn.over_sampling import ADASYN
import pickle

In [15]:
# make my own column dropper
class ColumnDropper(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X.drop(columns=self.columns, axis=1)
    def get_feature_names_out(self, X, y=None):
        X= self.transform(X)
        return X.columns

In [16]:
preproc_numeric_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    RobustScaler()
)
preproc_cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse=False, handle_unknown='ignore')
)
preproc_base_pipeline = make_column_transformer(
    (ColumnDropper(columns_to_drop), columns_to_drop),
    (preproc_numeric_pipe, numerical_columns),
    (preproc_cat_pipe, categorical_columns)
)

In [80]:
X_train_preprocessed = preproc_base_pipeline.fit_transform(X_train)

In [81]:
X_train_preprocessed.shape

(89976, 116)

In [82]:
# preproc_base_pipeline.get_feature_names_out()

In [83]:
X_train_preprocessed =pd.DataFrame(X_train_preprocessed)

In [84]:
X_resampled , y_resampled = ADASYN().fit_resample(X_train_preprocessed, y_train)

# Baseline Model

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [43]:
X_train_proc, X_test_proc, y_train_proc, y_test_proc= train_test_split(X_resampled, y_resampled, test_size=0.3)

In [44]:
model = LogisticRegression(penalty='l2',solver='newton-cg', max_iter=2000)

In [45]:
# full_pipeline =make_pipeline(
#     preproc_base_pipeline,
#     model
# )

In [46]:
full_pipeline

NameError: name 'full_pipeline' is not defined

In [47]:
full_pipeline.get_params()

NameError: name 'full_pipeline' is not defined

In [48]:
# params_grid = {
#     'logisticregression__penalty': ['l2', 'l1'],
#     'logisticregression__solver': ['saga']
# }
# search = GridSearchCV(full_pipeline, param_grid=params_grid, scoring='precision',n_jobs=-1, cv=3 )

In [49]:
model.fit(X_train_proc, y_train_proc)

In [77]:
test_data['merchant_group'].nunique()

12

In [93]:
test_default_df =test_data[['uuid']]
test_data = test_data.drop(columns=['uuid', 'default'], axis=1)

In [50]:
score = cross_val_score(model,X_test_proc, y_test_proc, cv=5)

In [51]:
score

array([0.8347565 , 0.83119077, 0.82978324, 0.83427177, 0.82892267])

In [55]:
file_name = os.path.join('..','raw_data', 'base_model.sav')
pickle.dump(model, open(file_name, 'wb'))

In [56]:
loaded_model = pickle.load(open(file_name, 'rb'))

In [94]:
test_data_preproc = preproc_base_pipeline.transform(test_data)
test_data_preproc.shape

(10000, 116)

In [95]:
test_data_default= model.predict_proba(test_data_preproc)

In [96]:
test_data_default[:,1]

array([0.21996314, 0.35520693, 0.00697654, ..., 0.98128315, 0.64562722,
       0.14118798])

In [98]:
test_default_df['default'] = test_data_default[:,1]

In [100]:
csv_path = os.path.join('..','raw_data', 'predicted.csv')
test_default_df.to_csv(csv_path)