In [7]:
# Import necessary packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
import pickle

In [8]:
# Load the raw data, keep only the relevant columns, and impute missing values
# with each column's most frequent value (row 0 of DataFrame.mode()).
raw_data = pd.read_csv("defaults.csv")
selected_data = raw_data[[
    'Credit',
    'Gender',
    'Education',
    'Marital',
    'Age',
    'Default',
]]
mode_data = selected_data.mode()
processed_data = selected_data.fillna(mode_data.loc[0, :])
# Persist the cleaned subset without writing the index as a column.
processed_data.to_csv('defaults_selected.csv', index=False)

In [9]:
# Column groups by type: numerical columns are scaled and categorical columns
# are one-hot encoded by the cells below.
num_vars = [
    'Credit',
    'Age',
]
# 'Default' is kept last; the later cells slice the final encoded column off as
# the label (this assumes 'Default' has exactly two levels -- TODO confirm).
cat_vars = [
    'Gender',
    'Education',
    'Marital',
    'Default',
]

Manual data preprocessing and model training:

In [10]:
# Split the cleaned frame into its numerical and categorical column groups.
processed_data_num = processed_data.loc[:, num_vars]
processed_data_cat = processed_data.loc[:, cat_vars]

In [11]:
# Fit the scaler on the numerical columns, then apply it to the same data.
num_enc = StandardScaler().fit(processed_data_num)
processed_data_num_encoded = num_enc.transform(processed_data_num)

In [12]:
# One-hot encode the categorical columns, dropping the first level of each
# feature; fit on the data and then transform that same data.
cat_enc = OneHotEncoder(drop='first').fit(processed_data_cat)
processed_data_cat_encoded = cat_enc.transform(processed_data_cat)

In [13]:
# Densify the encoded categorical block and place it to the right of the
# scaled numerical columns.
processed_data_cat_dense = processed_data_cat_encoded.toarray()
processed_data_manuel = np.concatenate(
    [processed_data_num_encoded, processed_data_cat_dense],
    axis=1,
)

In [14]:
# Every column except the last is a feature; the last column is used as the label.
X, y = processed_data_manuel[:, :-1], processed_data_manuel[:, -1]
model_manual = LogisticRegression(class_weight='balanced')
model_manual.fit(X, y)
# Mean accuracy measured on the same rows the model was fitted on.
model_manual.score(X, y)

0.52625

Data pre-processing and model training using pipeline:

In [15]:
# Build a single transformer that scales the numerical columns and one-hot
# encodes the categorical ones (dropping the first level of each feature),
# then fit it on the cleaned data and keep the transformed matrix.
data_pipeline = ColumnTransformer([
    ('numerical', StandardScaler(), num_vars),
    ('categorical', OneHotEncoder(drop = 'first'), cat_vars)])
processed_data_pipeline = data_pipeline.fit_transform(processed_data)
# Persist the fitted transformer. The context manager guarantees the file is
# flushed and closed as soon as the dump completes.
with open('data_transformer', 'wb') as transformer_file:
    pickle.dump(data_pipeline, transformer_file)

In [16]:
# Wrap a class-weight-balanced logistic regression in a one-step pipeline.
lr_pipeline = Pipeline([('lr_model', LogisticRegression(class_weight= 'balanced'))])
# Every column except the last is a feature; the last column is used as the label.
X = processed_data_pipeline[:,0:-1]
y = processed_data_pipeline[:, -1]
lr_pipeline.fit(X,y)
# Persist the fitted model. The context manager guarantees the file is flushed
# and closed as soon as the dump completes.
with open('model_train', 'wb') as model_file:
    pickle.dump(lr_pipeline, model_file)
# Mean accuracy measured on the same rows the model was fitted on.
lr_pipeline.score(X,y)

0.52625

In [17]:
# Inspect the fitted coefficients of the logistic-regression step.
lr_pipeline['lr_model'].coef_

array([[-0.29141372,  0.12561596, -0.12146618,  0.03041223, -1.2625545 ,
         0.13516807, -0.14536495, -0.34067213]])

Data pre-processing, model scoring and prediction using the LOADED pipeline:

In [18]:
# Reload the fitted transformer from disk; the context manager closes the file
# once unpickling is done.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("data_transformer", 'rb') as transformer_file:
    data_prep_loaded = pickle.load(transformer_file)
# The unpickled transformer is already fitted, so apply it with transform()
# instead of re-fitting it; re-fitting would discard the loaded state.
processed_data_loaded = data_prep_loaded.transform(processed_data)

In [19]:
# A single hand-written record, wrapped in a one-row DataFrame.
input_dict = {
    'Credit': 120000,
    'Gender': 'Male',
    'Education': 'Graduate School',
    'Marital': 'Never Married',
    'Age': 37,
    'Default': 'No',
}
input_df = pd.DataFrame([input_dict])
# Show only the numerical columns of the record.
input_df.loc[:, num_vars]

Unnamed: 0,Credit,Age
0,120000,37


In [20]:
# Display the loaded transformer's configuration. It is already fitted at this
# point, so it is shown as-is rather than being re-fitted on the data.
data_prep_loaded

ColumnTransformer(transformers=[('numerical', StandardScaler(),
                                 ['Credit', 'Age']),
                                ('categorical', OneHotEncoder(drop='first'),
                                 ['Gender', 'Education', 'Marital',
                                  'Default'])])

In [21]:
# Encode the single input record with the loaded transformer.
x = data_prep_loaded.transform(input_df)
# Show the first (only) row without its last column.
x[0, :-1]

array([-0.28876872,  0.18581318,  1.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ])

In [22]:
# Reload the fitted model from disk; the context manager closes the file once
# unpickling is done.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('model_train', 'rb') as model_file:
    model_loaded = pickle.load(model_file)
# Every column except the last is a feature; the last column is used as the label.
X = processed_data_loaded[:,0:-1]
y = processed_data_loaded[:, -1]
# Mean accuracy of the loaded model on the transformed data.
model_loaded.score(X,y)

0.52625

In [23]:
# Predict the class of the single encoded record (last column excluded),
# passed as a one-row 2-D slice.
model_loaded.predict(x[:1, :-1])

array([0.])

In [24]:
# Probability assigned to the second class (column index 1) for the single record.
model_loaded.predict_proba(x[:1, :-1])[0, 1]

0.47149072289870225

In [25]:
# Look at one row of the cleaned data, selected by position, as a one-row frame.
sample_position = 123
processed_data.iloc[[sample_position]]

Unnamed: 0,Credit,Gender,Education,Marital,Age,Default
123,280000.0,Female,University,Married,56,No


In [26]:
# Display the feature matrix currently bound to X.
X

array([[-0.28876872, -0.97100038,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [-0.52457849, -0.12968143,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [-0.83899153,  0.18581318,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.91759478,  2.49944029,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.1315622 , -0.12968143,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.49726386, -0.34001117,  1.        , ...,  1.        ,
         0.        ,  0.        ]])

In [27]:
# Predicted class for every row of the feature matrix X.
model_loaded.predict(X)

array([0., 1., 1., ..., 1., 1., 0.])

In [29]:
# Present the per-row predictions as a single-column table.
pd.DataFrame({"Prediction": model_loaded.predict(X)})

Unnamed: 0,Prediction
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
1595,0.0
1596,0.0
1597,1.0
1598,1.0
