Import Libraries

In [None]:
%pip install plotly --upgrade

In [None]:
%pip install matplotlib

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [4]:
from sklearn.preprocessing import StandardScaler

Credit's Data Base

In [5]:
# variable for data storage 
base_credit = pd.read_csv('credit_data.csv')

In [6]:
base_credit # default(0: loan was payed; 1: loan not payed)

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


Inconsistent Data Treatment

In [7]:
base_credit.loc[base_credit['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [8]:
# erase registers with inconsistent values
base_credit_corrected1 = base_credit.drop(base_credit[base_credit['age'] < 0].index)
base_credit_corrected1

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [9]:
# manually fill in inconsistent data - most reliable technich but not always viable 

# fill in with the average age
average_age = base_credit['age'][base_credit['age'] > 0].mean() # extracts the average of the positive ages
average_age

np.float64(40.92770044906149)

In [10]:
base_credit.loc[base_credit['age'] < 0, 'age'] = average_age # fills the inconsistent data

In [11]:
base_credit.head(27)

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1
5,6,24904.06414,57.471607,15.498598,0
6,7,48430.359613,26.809132,5722.581981,0
7,8,24500.141984,32.897548,2971.00331,1
8,9,40654.892537,55.496853,4755.82528,0
9,10,25075.872771,39.776378,1409.230371,0


In [12]:
base_credit.loc[base_credit['age'] < 0] # there is no more ages below zero

Unnamed: 0,clientid,income,age,loan,default


Missing Data Treatment

In [13]:
# searches for missing data
base_credit.isnull().sum()

clientid    0
income      0
age         3
loan        0
default     0
dtype: int64

In [14]:
# finds the registers with null value for age
base_credit.loc[pd.isnull(base_credit['age'])]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [15]:
# fill these inconsistent data in
base_credit['age'].fillna(base_credit['age'].mean(), inplace=True)
base_credit.loc[pd.isnull(base_credit['age'])]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  base_credit['age'].fillna(base_credit['age'].mean(), inplace=True)


Unnamed: 0,clientid,income,age,loan,default


Division Between Predictors and Class

In [16]:
# selects income, age and loan to analyse default
x_credit = base_credit.iloc[:, 1:4].values
x_credit

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]], shape=(2000, 3))

In [17]:
# selects default 
y_credit = base_credit.iloc[:, 4].values
y_credit

array([0, 0, 0, ..., 1, 0, 0], shape=(2000,))

Attribute scaling

In [18]:
# finds the people of lowest income, age and loan, respectively
x_credit[:, 0].min(), x_credit[:,1].min(), x_credit[:,2].min()

(np.float64(20014.4894700497),
 np.float64(18.055188510566897),
 np.float64(1.37762959325451))

In [19]:
scaler_credit = StandardScaler()
x_credit = scaler_credit.fit_transform(x_credit)

In [20]:
x_credit[:, 0].min(), x_credit[:,1].min(), x_credit[:,2].min() # standardized values

(np.float64(-1.7676158019964077),
 np.float64(-1.7264154144794286),
 np.float64(-1.4592791099462408))

Training and Testing Data Bases

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
# sets 25% of the data base to testing
x_credit_training, x_credit_test, y_credit_training, y_credit_test = train_test_split(x_credit, y_credit, test_size=0.25, random_state=0)

In [24]:
import pickle

In [25]:
with open('credit.pkl', mode='wb') as f:
    pickle.dump([x_credit_training, y_credit_training, x_credit_test, y_credit_test], f)