# **INTRODUCTION**

- **NAMA**   : Wawan Setiawan
- **BATCH**  : HCK-006

# **QUERY SQL**

```
-- Pull a random 15-row sample from the public credit-card-default table.
-- sex, education_level, marital_status and default_payment_next_month are cast to INT64;
-- pay_5 / pay_6 are cast to FLOAT64 and exposed under the names pay_05 / pay_06.
SELECT limit_balance,
      CAST(sex AS INT64) AS sex,
      CAST(education_level AS INT64) AS education_level,
      CAST(marital_status AS INT64) AS marital_status,
      age,
      pay_0, pay_2, pay_3, pay_4,
      CAST(pay_5 AS FLOAT64) AS pay_05,
      CAST(pay_6 AS FLOAT64) AS pay_06,
      bill_amt_1, bill_amt_2, bill_amt_3, bill_amt_4, bill_amt_5, bill_amt_6,
      pay_amt_1, pay_amt_2, pay_amt_3, pay_amt_4, pay_amt_5, pay_amt_6,
      CAST(default_payment_next_month AS INT64) AS default_payment_next_month
FROM `bigquery-public-data.ml_datasets.credit_card_default`
-- ORDER BY RAND() shuffles the rows, so every execution returns a different sample.
ORDER BY RAND()
LIMIT 15
```





# **MODEL INFERENCE**

## **INSTALL ENGINE**

In [1]:
! pip install feature_engine

Collecting feature_engine
  Downloading feature_engine-1.6.1-py2.py3-none-any.whl (326 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/326.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m326.6/326.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.6.1


## **IMPORT LIBRARIES**

In [18]:
## import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from feature_engine.outliers import Winsorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import RandomizedSearchCV
import pickle

# **DATA LOADING**

In [3]:
# Load the inference sample into a DataFrame.
# NOTE(review): this is an absolute Colab path, while the model artifacts further
# down are opened relative to the working directory — consider a shared, configurable data dir.
df = pd.read_csv('/content/model_inference.csv')

In [4]:
# Preview the first 10 rows
df.head(10)

Unnamed: 0,limit_balance,sex,education_level,marital_status,age,pay_0,pay_2,pay_3,pay_4,pay_05,...,bill_amt_4,bill_amt_5,bill_amt_6,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,pay_amt_5,pay_amt_6,default_payment_next_month
0,50000,2,1,2,24,0,0,2,0,0,...,32140,32216,31482,5350,0,1085,2000,1226,2415,0
1,160000,2,1,2,26,1,2,0,0,2,...,104382,100535,101621,0,6497,8001,0,4000,3801,0
2,90000,2,2,2,23,2,3,3,2,2,...,82117,84395,85739,5900,0,0,3600,2900,3500,1
3,270000,2,2,1,40,0,0,0,0,0,...,17160,14818,12702,1826,1469,514,581,612,312,1
4,130000,2,2,2,37,2,2,2,2,0,...,85428,86251,88094,5000,4200,0,3000,3100,3100,1
5,230000,2,2,2,30,-1,2,-1,0,0,...,16549,6392,1828,0,6270,14085,6396,9,6172,0
6,360000,1,1,2,28,0,0,0,0,0,...,103273,91933,108163,25126,60259,27208,31262,90114,30463,0
7,360000,1,1,2,25,-1,-1,-1,0,0,...,5084,3317,1646,14076,5672,0,0,1646,3875,0
8,220000,1,2,2,31,1,2,0,0,0,...,166753,82058,20650,0,6800,6640,4100,1100,0,1
9,200000,1,1,1,53,0,0,0,0,0,...,71929,58133,40500,2500,2700,2242,2093,1300,0,0


In [5]:
# Preview the last 10 rows
df.tail(10)

Unnamed: 0,limit_balance,sex,education_level,marital_status,age,pay_0,pay_2,pay_3,pay_4,pay_05,...,bill_amt_4,bill_amt_5,bill_amt_6,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,pay_amt_5,pay_amt_6,default_payment_next_month
5,230000,2,2,2,30,-1,2,-1,0,0,...,16549,6392,1828,0,6270,14085,6396,9,6172,0
6,360000,1,1,2,28,0,0,0,0,0,...,103273,91933,108163,25126,60259,27208,31262,90114,30463,0
7,360000,1,1,2,25,-1,-1,-1,0,0,...,5084,3317,1646,14076,5672,0,0,1646,3875,0
8,220000,1,2,2,31,1,2,0,0,0,...,166753,82058,20650,0,6800,6640,4100,1100,0,1
9,200000,1,1,1,53,0,0,0,0,0,...,71929,58133,40500,2500,2700,2242,2093,1300,0,0
10,50000,1,3,2,29,1,-1,-1,-1,-1,...,0,1662,4537,1899,832,0,1662,4537,2943,0
11,140000,1,1,2,30,0,-1,-1,-1,-1,...,-885,3019,-177,6500,2515,0,8000,0,1000,0
12,140000,1,2,2,27,0,0,0,0,0,...,61129,59089,58160,50000,3000,4000,5000,4000,2000,0
13,150000,2,1,2,28,-2,-2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
14,180000,1,2,1,35,0,0,0,0,0,...,79430,81098,82791,3400,2900,3000,3100,3200,7300,0


In [6]:
# Summarise the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   limit_balance               15 non-null     int64
 1   sex                         15 non-null     int64
 2   education_level             15 non-null     int64
 3   marital_status              15 non-null     int64
 4   age                         15 non-null     int64
 5   pay_0                       15 non-null     int64
 6   pay_2                       15 non-null     int64
 7   pay_3                       15 non-null     int64
 8   pay_4                       15 non-null     int64
 9   pay_05                      15 non-null     int64
 10  pay_06                      15 non-null     int64
 11  bill_amt_1                  15 non-null     int64
 12  bill_amt_2                  15 non-null     int64
 13  bill_amt_3                  15 non-null     int64
 14  bill_amt_4  

In [7]:
# Shape of the sub-frame of fully duplicated rows; a leading 0 means there are none
df.loc[df.duplicated()].shape

(0, 24)

There are no duplicate rows.

In [8]:
# Count missing entries per column
df.isna().sum()

limit_balance                 0
sex                           0
education_level               0
marital_status                0
age                           0
pay_0                         0
pay_2                         0
pay_3                         0
pay_4                         0
pay_05                        0
pay_06                        0
bill_amt_1                    0
bill_amt_2                    0
bill_amt_3                    0
bill_amt_4                    0
bill_amt_5                    0
bill_amt_6                    0
pay_amt_1                     0
pay_amt_2                     0
pay_amt_3                     0
pay_amt_4                     0
pay_amt_5                     0
pay_amt_6                     0
default_payment_next_month    0
dtype: int64

There are no missing values.

# **DATA CLEANING**

In [9]:
# Work on a copy so the loaded `df` stays untouched; everything in this
# section operates on `df_clean`.
df_clean = df.copy()
df_clean.head()

Unnamed: 0,limit_balance,sex,education_level,marital_status,age,pay_0,pay_2,pay_3,pay_4,pay_05,...,bill_amt_4,bill_amt_5,bill_amt_6,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,pay_amt_5,pay_amt_6,default_payment_next_month
0,50000,2,1,2,24,0,0,2,0,0,...,32140,32216,31482,5350,0,1085,2000,1226,2415,0
1,160000,2,1,2,26,1,2,0,0,2,...,104382,100535,101621,0,6497,8001,0,4000,3801,0
2,90000,2,2,2,23,2,3,3,2,2,...,82117,84395,85739,5900,0,0,3600,2900,3500,1
3,270000,2,2,1,40,0,0,0,0,0,...,17160,14818,12702,1826,1469,514,581,612,312,1
4,130000,2,2,2,37,2,2,2,2,0,...,85428,86251,88094,5000,4200,0,3000,3100,3100,1


## RENAME CATEGORICAL DATA

In [15]:
# Cast the target to float (original intent: keep its dtype in line with the
# features after encoding).
# NOTE(review): the df_clean displayed below shows sex / education_level /
# marital_status as text labels, but no visible cell performs that renaming
# (execution counts jump from In [9] to In [15]). Restore that mapping step here
# so the notebook reproduces its outputs after Restart & Run All.
df_clean['default_payment_next_month'] = df_clean['default_payment_next_month'].astype(float)

In [16]:
# Display the frame as it stands after the cleaning steps
df_clean

Unnamed: 0,limit_balance,sex,education_level,marital_status,age,pay_0,pay_2,pay_3,pay_4,pay_05,...,bill_amt_4,bill_amt_5,bill_amt_6,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,pay_amt_5,pay_amt_6,default_payment_next_month
0,50000,female,graduate school,single,24,0,0,2,0,0,...,32140,32216,31482,5350,0,1085,2000,1226,2415,0.0
1,160000,female,graduate school,single,26,1,2,0,0,2,...,104382,100535,101621,0,6497,8001,0,4000,3801,0.0
2,90000,female,university,single,23,2,3,3,2,2,...,82117,84395,85739,5900,0,0,3600,2900,3500,1.0
3,270000,female,university,married,40,0,0,0,0,0,...,17160,14818,12702,1826,1469,514,581,612,312,1.0
4,130000,female,university,single,37,2,2,2,2,0,...,85428,86251,88094,5000,4200,0,3000,3100,3100,1.0
5,230000,female,university,single,30,-1,2,-1,0,0,...,16549,6392,1828,0,6270,14085,6396,9,6172,0.0
6,360000,male,graduate school,single,28,0,0,0,0,0,...,103273,91933,108163,25126,60259,27208,31262,90114,30463,0.0
7,360000,male,graduate school,single,25,-1,-1,-1,0,0,...,5084,3317,1646,14076,5672,0,0,1646,3875,0.0
8,220000,male,university,single,31,1,2,0,0,0,...,166753,82058,20650,0,6800,6640,4100,1100,0,1.0
9,200000,male,graduate school,married,53,0,0,0,0,0,...,71929,58133,40500,2500,2700,2242,2093,1300,0,0.0


# **MODEL LOAD**

In [19]:
# Load the saved artifacts: the pickled model object and two saved column lists.
# NOTE: pickle.load can execute arbitrary code while deserialising — only load
# files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading
# (the bare open(...) used previously was never closed).
with open('all_process.pkl', 'rb') as model_file:
  model = pickle.load(model_file)

# The column lists are read back as plain text, i.e. each variable holds a
# string that looks like a Python list rather than an actual list object.
with open('iqr_columns_1_cleaned.txt', 'r') as file_1:
  iqr_columns_1_cleaned = file_1.read()

with open('iqr_columns_2_cleaned.txt', 'r') as file_2:
  iqr_columns_2_cleaned = file_2.read()

In [20]:
# Inspect the first saved column list (held as raw text, hence the quoted output)
iqr_columns_1_cleaned

"['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_05', 'pay_06', 'limit_balance']"

In [21]:
# Inspect the second saved column list (also held as raw text)
iqr_columns_2_cleaned

"['pay_amt_2', 'pay_amt_3', 'pay_amt_1', 'pay_amt_4']"

In [23]:
# Preprocess and predict with the loaded model.
# The label column is dropped first so the model only ever receives features,
# never the ground truth it is compared against in the next cell.
X_new = df_clean.drop(columns=['default_payment_next_month'])
result = model.predict(X_new)
result

array([0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [24]:
# Compare the true labels with the model's predictions, row by row.
y_new = df_clean.default_payment_next_month
compare_test = pd.DataFrame({
    'y_new_value': y_new,
    'y_new_pred': result,
})

# Vectorised label assignment: one element-wise comparison instead of a Python
# loop over iterrows().
compare_test['result_test'] = np.where(
    compare_test['y_new_value'] == compare_test['y_new_pred'],
    'Match/predicted',
    'Unmatch/predicted_failed',
)
compare_test

Unnamed: 0,y_new_value,y_new_pred,result_test
0,0.0,0.0,Match/predicted
1,0.0,0.0,Match/predicted
2,1.0,1.0,Match/predicted
3,1.0,0.0,Unmatch/predicted_failed
4,1.0,1.0,Match/predicted
5,0.0,0.0,Match/predicted
6,0.0,0.0,Match/predicted
7,0.0,0.0,Match/predicted
8,1.0,0.0,Unmatch/predicted_failed
9,0.0,0.0,Match/predicted


```
Out of the 15 records, 13 were predicted correctly (the two mismatches are rows 3 and 8).

The model's accuracy on this inference sample is 13/15 ≈ 87%.
```