# ML Preprocessing & Modelling
## Data

In [1]:
import pandas as pd
data = pd.read_csv('./data/data_loanPrediction.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [2]:
print('Data size :')
data.shape

Data size :


(614, 13)

# EDA

In [3]:
print('Features available : ')
data.columns.tolist()

Features available : 


['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status']

From the column list above,
- *Loan_status* is our **target variable**
- *Loan_ID* is a row identifier which can be removed

In [4]:
data['Loan_Status'] = data['Loan_Status'].replace({'Y':1, 'N':0})

In [5]:
# Split the data into *train*, *test* and *validation*
from sklearn.model_selection import train_test_split
X = data.drop(['Loan_Status'],axis=1)
y = data['Loan_Status']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=0)

data.drop(['Loan_ID'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

print('Orig size        : {} - {} '.format(X.shape,y.shape))
print('Training size    : {} - {} '.format(X_train.shape,y_train.shape))
print('Validation size  : {} - {} '.format(X_valid.shape,y_valid.shape))
print('Testing size     : {} - {} '.format(X_test.shape,y_test.shape))
print('Check split      : {}'.format(X_train.shape[0] + X_valid.shape[0] + X_test.shape[0]))

Orig size        : (614, 12) - (614,) 
Training size    : (441, 12) - (441,) 
Validation size  : (62, 12) - (62,) 
Testing size     : (111, 12) - (111,) 
Check split      : 614


In [6]:
# check null values and save cols with null values
cols_nulls = []
print('Null Values')
for col in data.columns:
    print("    {} : {}".format(col, data[col].isnull().sum()))
    if data[col].isnull().sum() != 0:
        cols_nulls.append(col)
print('')
print('Columns with Nulls : ')
print(cols_nulls)        

Null Values
    Gender : 13
    Married : 3
    Dependents : 15
    Education : 0
    Self_Employed : 32
    ApplicantIncome : 0
    CoapplicantIncome : 0
    LoanAmount : 22
    Loan_Amount_Term : 14
    Credit_History : 50
    Property_Area : 0
    Loan_Status : 0

Columns with Nulls : 
['Gender', 'Married', 'Dependents', 'Self_Employed', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [7]:
print('Contents of labels with nulls:')
for col in cols_nulls:
    print('   {} - {}'.format(col, set(data[col])), end ='\n\n')

Contents of labels with nulls:
   Gender - {nan, 'Male', 'Female'}

   Married - {'No', nan, 'Yes'}

   Dependents - {nan, '1', '3+', '2', '0'}

   Self_Employed - {'No', nan, 'Yes'}

   LoanAmount - {nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 9.0, 17.0, 25.0, 26.0, 30.0, nan, nan, nan, nan, 35.0, nan, 36.0, nan, 40.0, 42.0, 44.0, 45.0, 46.0, 47.0, 48.0, 50.0, 53.0, 54.0, 55.0, 56.0, 58.0, 59.0, 60.0, 570.0, 62.0, 63.0, 61.0, 65.0, 66.0, 67.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 78.0, 80.0, 81.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 600.0, 90.0, 89.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 129.0, 130.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 650.0, 139.0, 138.0, 141.0, 140.0, 143.0, 144.0, 145.0, 146.0, 142.0, 148.0, 149.0, 150.0, 1

Ok, so I need to inpute some of these features. To do so, I will:
- **Gender**  : I shall pick the most promimant applicant.
- **Married** : Shall set to **No**
- **Dependents** : Inpute as **0**
- **Self_Employed** : Inpute as **No**
- **LoanAmount** : I shall the derive the **Mean** Amount as based on the *Loan_Amount_Term*
- **Loan_Amount_Term** : I shall derive the **Mean** term for all applicants
- **Credit_History** : Inpute as **1** (yes)

In [8]:
# Gender selection
from collections import Counter

def mostCommon(lst):
    data = Counter(lst)
    return data.most_common(1)[0][0]

mostCommon(X_train['Gender'])

'Male'

In [9]:
# elements of main pipeline
X_train['Gender'] = X_train['Gender'].fillna('Male')
X_train['Married'] = X_train['Married'].fillna('No')
X_train['Dependents'] = X_train['Dependents'].fillna('0')
X_train['Self_Employed'] = X_train['Self_Employed'].fillna('No')
X_train['Loan_Amount_Term'] = X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].mean())
X_train['LoanAmount'] = X_train['LoanAmount'].fillna(X_train['LoanAmount'].mean())
X_train['Credit_History'] = X_train['Credit_History'].fillna(1)

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings("ignore")

class PreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def transform(self, df, y=None):
        pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome',
                    'CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']
        
        df = df[pred_var]
        
        df['Dependents'] = df['Dependents'].fillna(0)
        df['Self_Employed'] = df['Self_Employed'].fillna('No')
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)
        df['Credit_History'] = df['Credit_History'].fillna(1)
        df['Married'] = df['Married'].fillna('No')
        df['Gender'] = df['Gender'].fillna('Male')
        df['LoanAmount'] = df['LoanAmount'].fillna(self.amt_mean_)
        
               
        return df

    def fit(self, df, y=None):
        self.term_mean_ = df['Loan_Amount_Term'].mean()
        self.amt_mean_ = df['LoanAmount'].mean()

        return self

In [11]:
pp = PreProcessing()
pp.fit_transform(X_train.head())    

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
369,Male,Yes,0,Graduate,No,19730,5266.0,570.0,360.0,1.0,Rural
147,Male,Yes,1,Graduate,No,1538,1425.0,30.0,360.0,1.0,Urban
359,Male,Yes,3+,Graduate,No,5167,3167.0,200.0,360.0,1.0,Semiurban
19,Male,Yes,0,Graduate,No,2600,3500.0,115.0,339.953596,1.0,Urban
291,Male,Yes,2,Graduate,No,4400,0.0,127.0,360.0,0.0,Semiurban


In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

class ModifiedLabelEncoder(LabelEncoder):
    def fit_transform(self, y, *args, **kwargs):
        return super().fit_transform(y).reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        return super().transform(y).reshape(-1, 1)

In [13]:
# 'Gender'
# 'Married'
# 'Dependents'
# 'Education'
# 'Self_Employed'
# 'Credit_History'
# 'Property_Area'

from sklearn.pipeline import make_pipeline
colCAT = ['Gender','Married','Dependents','Education','Self_Employed','Credit_History','Property_Area']
pipe1 = make_pipeline(ColumnSelector('Gender'), ModifiedLabelEncoder())

pipe1.fit_transform(X_train.head())

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [14]:
# ApplicantIncome        int64
# CoapplicantIncome    float64
# LoanAmount           float64
# Loan_Amount_Term     float64
from sklearn.preprocessing import StandardScaler
colCONT = ['ApplicantIncome','CoapplicantIncome','LoanAmount', 'Loan_Amount_Term']
pipe2 = make_pipeline(ColumnSelector(colCONT),StandardScaler())
pipe2.fit_transform(X_train.head())

array([[ 1.96240143,  1.43452938,  1.91646971,  0.5       ],
       [-0.77469945, -0.68928628, -0.94551492,  0.5       ],
       [-0.22869356,  0.27392301, -0.04451976,  0.5       ],
       [-0.61491487,  0.4580497 , -0.49501734, -2.        ],
       [-0.34409354, -1.47721581, -0.43141768,  0.5       ]])

In [15]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier

colCAT = ['Gender','Married','Dependents','Education','Self_Employed','Credit_History','Property_Area']
colCONT = ['ApplicantIncome','CoapplicantIncome','LoanAmount', 'Loan_Amount_Term']
pipe = Pipeline([('preProc',PreProcessing()),
                 ('fu', FeatureUnion([
                     ("cont_features", make_pipeline(ColumnSelector(colCONT), StandardScaler())),
                     ('cat_Gender', make_pipeline(ColumnSelector('Gender'), ModifiedLabelEncoder())),
                     ('cat_Married', make_pipeline(ColumnSelector('Married'), ModifiedLabelEncoder())),
                     ('cat_Dependents', make_pipeline(ColumnSelector('Dependents'), ModifiedLabelEncoder())),
                     ('cat_Education', make_pipeline(ColumnSelector('Education'), ModifiedLabelEncoder())),
                     ('cat_Self_Employed', make_pipeline(ColumnSelector('Self_Employed'), ModifiedLabelEncoder())),
                     ('cat_Credit_History', make_pipeline(ColumnSelector('Credit_History'), ModifiedLabelEncoder())),
                     ('cat_Property_Area', make_pipeline(ColumnSelector('Property_Area'), ModifiedLabelEncoder()))
                 ])
                 ),
                 ('clsfy',RandomForestClassifier())
                ])
pipe

Pipeline(memory=None,
     steps=[('preProc', PreProcessing()), ('fu', FeatureUnion(n_jobs=1,
       transformer_list=[('cont_features', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(columns=['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'])), ('standardscaler', StandardScaler(co...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [16]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
param_grid = {'clsfy__n_estimators' : [int(n) for n in np.linspace(5,100,20)],
              'clsfy__max_depth' : [None, 5, 10, 15, 20, 25, 30],
              'clsfy__max_leaf_nodes': [None, 5, 10, 15, 20, 25, 30],
              'clsfy__max_features' : [None, 'auto']
             }
param_search = RandomizedSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=1, n_iter=100, random_state=0)
param_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.0min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('preProc', PreProcessing()), ('fu', FeatureUnion(n_jobs=1,
       transformer_list=[('cont_features', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(columns=['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'])), ('standardscaler', StandardScaler(co...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'clsfy__n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], 'clsfy__max_depth': [None, 5, 10, 15, 20, 25, 30], 'clsfy__max_leaf_nodes': [None, 5, 10, 15, 20, 25, 30], 'clsfy__max_features': [None, 'auto']},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [17]:
print('Parameters: ')
print(param_search.best_params_)

Parameters: 
{'clsfy__n_estimators': 10, 'clsfy__max_leaf_nodes': 5, 'clsfy__max_features': 'auto', 'clsfy__max_depth': None}


In [18]:
model = param_search.best_estimator_

In [19]:
print('Best model score : {}'.format( model.score(X_train,y_train) ))
print('Validation score : {}'.format( model.score(X_valid, y_valid)))

Best model score : 0.8049886621315193
Validation score : 0.8709677419354839


# Serialize the model
To be able to load the model in an API, I need to serialize it.

In [20]:
import dill as pickle
filename = 'classify_model.pk'
with open('./model/'+filename, 'wb') as file:
    pickle.dump(model, file)
with open('./classifyAPI/model/'+filename, 'wb') as file:
    pickle.dump(model, file)

In [21]:
with open('./model/'+filename ,'rb') as f:
    loaded_model = pickle.load(f)

In [22]:
#  check that loading and unloading the model does not alter it
for orig, loaded in zip(model.predict(X_valid),loaded_model.predict(X_valid)):
    if orig != loaded:
        print('Not identical')
    else:
        print('{}-{} : Identical'.format(orig,loaded))

1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical


# Creation of API server within `Flask`
The above pipeline was encapsulated withint `server.py` below:

In [23]:
print('------------------------------------------------------------------')
print('server.py')
print('------------------------------------------------------------------')
with open('./classifyAPI/server.py' ,'r') as f:
    print(f.read())

------------------------------------------------------------------
server.py
------------------------------------------------------------------
#import os
import pandas as pd
import dill as pickle
from flask import Flask, jsonify, request


# Custom Functions
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

class ModifiedLabelEncoder(LabelEncoder):
    def fit_transform(self, y, *args, **kwargs):
        return super().fit_transform(y).reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        return super().transform(y).reshape(-1, 1)

# Setting up the App
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def apicall():
	"""API Call"""
	try:
		test_json = request.get_json()
		tes

## Setting up the API
- Within terminal, navigate to location of the api.
- Set [app and env](http://flask.pocoo.org/docs/1.0/tutorial/factory/). 
    - For Linux and Mac:
        - export FLASK_APP=server:app
        - export FLASK_ENV=development
        - flask run --host=0.0.0.0 --port=5000
    - For Windows cmd, use set instead of export:
        - set FLASK_APP=server:app
        - set FLASK_ENV=development
        - flask run --host=0.0.0.0 --port=5000
    - For Windows PowerShell:
        - \\$env:FLASK_APP = "server:app"
        - \\$env:FLASK_ENV = "development"
        - flask run --host=0.0.0.0 --port=5000



In [24]:
import json
import requests

In [25]:
# Set up headers to send and accept json responses
header = {'Content-Type': 'application/json', 'Accept': 'application/json'}

# Convert dataframe to json
data = X_valid.to_json(orient='records')
data

'[{"Loan_ID":"LP002453","Gender":"Male","Married":"No","Dependents":"0","Education":"Graduate","Self_Employed":"Yes","ApplicantIncome":7085,"CoapplicantIncome":0.0,"LoanAmount":84.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Semiurban"},{"Loan_ID":"LP001164","Gender":"Female","Married":"No","Dependents":"0","Education":"Graduate","Self_Employed":"No","ApplicantIncome":4230,"CoapplicantIncome":0.0,"LoanAmount":112.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Semiurban"},{"Loan_ID":"LP002734","Gender":"Male","Married":"Yes","Dependents":"0","Education":"Graduate","Self_Employed":"No","ApplicantIncome":6133,"CoapplicantIncome":3906.0,"LoanAmount":324.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP002505","Gender":"Male","Married":"Yes","Dependents":"0","Education":"Graduate","Self_Employed":"No","ApplicantIncome":4333,"CoapplicantIncome":2451.0,"LoanAmount":110.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"

In [26]:
# POST <url>/predict
resp = requests.post("http://0.0.0.0:5000/predict", data = json.dumps(data), headers= header)
resp.status_code

ConnectionError: HTTPConnectionPool(host='0.0.0.0', port=5000): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1a0f0a7ba8>: Failed to establish a new connection: [Errno 61] Connection refused',))

In [None]:
resp.json()

Trying to POST to this server will return an error that `AttributeError: Can't get attribute 'ModifiedLabelEncoder' on <module '__main__' from..`. The pipeline has to be re-written such that it utilizes base packages within python. For this reason, I re-visited the pipeline creation in the next [notebook]().