# ML Preprocessing & Modelling specifically for Seriliazation 

In [1]:
import pandas as pd
data = pd.read_csv('./data/data_loanPrediction.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [2]:
data['Loan_Status'] = data['Loan_Status'].replace({'Y':1, 'N':0})

In [3]:
# Split the data into *train*, *test* and *validation*
from sklearn.model_selection import train_test_split
X = data.drop(['Loan_Status'],axis=1)
y = data['Loan_Status']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=0)

data.drop(['Loan_ID'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

print('Orig size        : {} - {} '.format(X.shape,y.shape))
print('Training size    : {} - {} '.format(X_train.shape,y_train.shape))
print('Validation size  : {} - {} '.format(X_valid.shape,y_valid.shape))
print('Testing size     : {} - {} '.format(X_test.shape,y_test.shape))
print('Check split      : {}'.format(X_train.shape[0] + X_valid.shape[0] + X_test.shape[0]))

Orig size        : (614, 12) - (614,) 
Training size    : (441, 12) - (441,) 
Validation size  : (62, 12) - (62,) 
Testing size     : (111, 12) - (111,) 
Check split      : 614


In [4]:
# check null values and save cols with null values
cols_nulls = []
print('Null Values')
for col in data.columns:
    print("    {} : {}".format(col, data[col].isnull().sum()))
    if data[col].isnull().sum() != 0:
        cols_nulls.append(col)
print('')
print('Columns with Nulls : ')
print(cols_nulls)        

Null Values
    Gender : 13
    Married : 3
    Dependents : 15
    Education : 0
    Self_Employed : 32
    ApplicantIncome : 0
    CoapplicantIncome : 0
    LoanAmount : 22
    Loan_Amount_Term : 14
    Credit_History : 50
    Property_Area : 0
    Loan_Status : 0

Columns with Nulls : 
['Gender', 'Married', 'Dependents', 'Self_Employed', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [5]:
print('Contents of labels with nulls:')
for col in cols_nulls:
    print('   {} - {}'.format(col, set(data[col])), end ='\n\n')

Contents of labels with nulls:
   Gender - {nan, 'Male', 'Female'}

   Married - {'Yes', nan, 'No'}

   Dependents - {nan, '3+', '2', '1', '0'}

   Self_Employed - {'Yes', nan, 'No'}

   LoanAmount - {nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 9.0, 17.0, 25.0, 26.0, 30.0, nan, nan, nan, nan, 35.0, nan, 36.0, nan, 40.0, 42.0, 44.0, 45.0, 46.0, 47.0, 48.0, 50.0, 53.0, 54.0, 55.0, 56.0, 58.0, 59.0, 60.0, 570.0, 62.0, 63.0, 61.0, 65.0, 66.0, 67.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 78.0, 80.0, 81.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 600.0, 90.0, 89.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 129.0, 130.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 650.0, 139.0, 138.0, 141.0, 140.0, 143.0, 144.0, 145.0, 146.0, 142.0, 148.0, 149.0, 150.0, 1

Ok, so I need to inpute some of these features. To do so, I will:
- **Gender**  : I shall pick the most promimant applicant.
- **Married** : Shall set to **No**
- **Dependents** : Inpute as **0**
- **Self_Employed** : Inpute as **No**
- **LoanAmount** : I shall the derive the **Mean** Amount as based on the *Loan_Amount_Term*
- **Loan_Amount_Term** : I shall derive the **Mean** term for all applicants
- **Credit_History** : Inpute as **1** (yes)

In [6]:
# Gender selection
from collections import Counter

def mostCommon(lst):
    data = Counter(lst)
    return data.most_common(1)[0][0]

mostCommon(X_train['Gender'])

'Male'

In [7]:
# elements of main pipeline
X_train['Gender'] = X_train['Gender'].fillna('Male')
X_train['Married'] = X_train['Married'].fillna('No')
X_train['Dependents'] = X_train['Dependents'].fillna('0')
X_train['Self_Employed'] = X_train['Self_Employed'].fillna('No')
X_train['Loan_Amount_Term'] = X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].mean())
X_train['LoanAmount'] = X_train['LoanAmount'].fillna(X_train['LoanAmount'].mean())
X_train['Credit_History'] = X_train['Credit_History'].fillna(1)

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings("ignore")

class PreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def transform(self, df, y=None):
        pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome',
                    'CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']
        
        df = df[pred_var]
        
        df['Dependents'] = df['Dependents'].fillna(0)
        df['Self_Employed'] = df['Self_Employed'].fillna('No')
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)
        df['Credit_History'] = df['Credit_History'].fillna(1)
        df['Married'] = df['Married'].fillna('No')
        df['Gender'] = df['Gender'].fillna('Male')
        df['LoanAmount'] = df['LoanAmount'].fillna(self.amt_mean_)

        gender_values = {'Female' : 0, 'Male' : 1} 
        married_values = {'No' : 0, 'Yes' : 1}
        education_values = {'Graduate' : 0, 'Not Graduate' : 1}
        employed_values = {'No' : 0, 'Yes' : 1}

        col_OHE = ['Property_Area','Dependents']
        for col in col_OHE:
            for enum, lbl in enumerate(set(df[col])):
                col_name = col+'_'+lbl    
                Xt = df[col].replace(lbl,1)
                Xt[Xt!=1]=0
                df[col_name] = Xt
        df.drop(col_OHE, axis=1, inplace=True)
        
        df.replace({'Gender': gender_values,
                    'Married': married_values, 
                    'Education': education_values,
                    'Self_Employed': employed_values
                   }, inplace=True)
        
        # Standard Scalar for Continuous Functions
        colCONT = ['ApplicantIncome','CoapplicantIncome','LoanAmount', 'Loan_Amount_Term']
        for col in colCONT:
            df[col] = (df[col] - df[col].mean())/df[col].std(ddof=0)
        
        return df

    def fit(self, df, y=None):
        self.term_mean_ = df['Loan_Amount_Term'].mean()
        self.amt_mean_ = df['LoanAmount'].mean()

        return self

In [9]:
pp = PreProcessing()
pp.fit_transform(X_train.head())    

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Urban,Property_Area_Semiurban,Property_Area_Rural,Dependents_2,Dependents_1,Dependents_0,Dependents_3+
369,1,1,0,0,1.962401,1.434529,1.91647,0.5,1.0,0,0,1,0,0,1,0
147,1,1,0,0,-0.774699,-0.689286,-0.945515,0.5,1.0,1,0,0,0,1,0,0
359,1,1,0,0,-0.228694,0.273923,-0.04452,0.5,1.0,0,1,0,0,0,0,1
19,1,1,0,0,-0.614915,0.45805,-0.495017,-2.0,1.0,1,0,0,0,0,1,0
291,1,1,0,0,-0.344094,-1.477216,-0.431418,0.5,0.0,0,1,0,1,0,0,0


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([('preProc',PreProcessing())
                 ,('clsfy',RandomForestClassifier())
                ])

In [11]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
param_grid = {'clsfy__n_estimators' : [int(n) for n in np.linspace(5,100,20)],
              'clsfy__max_depth' : [None, 5, 10, 15, 20, 25, 30],
              'clsfy__max_leaf_nodes': [None, 5, 10, 15, 20, 25, 30],
              'clsfy__max_features' : [None, 'auto']
             }
param_search = RandomizedSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=1, n_iter=100, random_state=0)
param_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.0min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('preProc', PreProcessing()), ('clsfy', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
      ..._jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'clsfy__n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], 'clsfy__max_depth': [None, 5, 10, 15, 20, 25, 30], 'clsfy__max_leaf_nodes': [None, 5, 10, 15, 20, 25, 30], 'clsfy__max_features': [None, 'auto']},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [12]:
print('Parameters: ')
print(param_search.best_params_)

Parameters: 
{'clsfy__n_estimators': 45, 'clsfy__max_leaf_nodes': 5, 'clsfy__max_features': 'auto', 'clsfy__max_depth': 20}


In [13]:
model = param_search.best_estimator_

In [14]:
print('Best model score : {}'.format( model.score(X_train,y_train) ))
print('Validation score : {}'.format( model.score(X_valid, y_valid)))

Best model score : 0.8027210884353742
Validation score : 0.8709677419354839


# Serialize the model

In [15]:
import dill as pickle
filename = 'classify_model2.pk'
# save the model within current directory for testing
with open('./model/'+filename, 'wb') as file:
    pickle.dump(model, file)
# save mode to API directory    
with open('./classifyAPI/model/'+filename, 'wb') as file:
    pickle.dump(model, file)

In [17]:
# Check if the original and loaded model are identical
with open('./model/'+filename ,'rb') as f:
    loaded_model = pickle.load(f)
for orig, loaded in zip(model.predict(X_valid),loaded_model.predict(X_valid)):
    if orig != loaded:
        print('Not identical')
    else:
        print('{}-{} : Identical'.format(orig,loaded))

1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical


# Creation of server within `Flask`

In [18]:
print('------------------------------------------------------------------')
print('server2.py')
print('------------------------------------------------------------------')
with open('./classifyAPI/server2.py' ,'r') as f:
    print(f.read())

------------------------------------------------------------------
server2.py
------------------------------------------------------------------
#import os
import pandas as pd
import dill as pickle
from flask import Flask, jsonify, request
from sklearn.base import BaseEstimator, TransformerMixin


# PreProcessing
class PreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def transform(self, df, y=None):
        pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome',
                    'CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']

        df = df[pred_var]

        df['Dependents'] = df['Dependents'].fillna(0)
        df['Self_Employed'] = df['Self_Employed'].fillna('No')
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)
        df['Credit_History'] = df['Credit_History'].fillna(1)
        df['Married'] = df['Married'].fillna('No')
        df['G

## Setting up the API
- Within terminal, navigate to location of the api.
- Set [app and env](http://flask.pocoo.org/docs/1.0/tutorial/factory/). 
    - For Linux and Mac:
        - export FLASK_APP=server2:app
        - export FLASK_ENV=development
        - flask run --host=0.0.0.0 --port=5000
    - For Windows cmd, use set instead of export:
        - set FLASK_APP=server2:app
        - set FLASK_ENV=development
        - flask run --host=0.0.0.0 --port=5000
    - For Windows PowerShell:
        - \\$env:FLASK_APP = "server2:app"
        - \\$env:FLASK_ENV = "development"
        - flask run --host=0.0.0.0 --port=5000



In [19]:
import json
import requests

In [20]:
# Set up headers to send and accept json responses
header = {'Content-Type': 'application/json', 'Accept': 'application/json'}

# Convert dataframe to json
data = X_valid.to_json(orient='records')
data

'[{"Loan_ID":"LP002453","Gender":"Male","Married":"No","Dependents":"0","Education":"Graduate","Self_Employed":"Yes","ApplicantIncome":7085,"CoapplicantIncome":0.0,"LoanAmount":84.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Semiurban"},{"Loan_ID":"LP001164","Gender":"Female","Married":"No","Dependents":"0","Education":"Graduate","Self_Employed":"No","ApplicantIncome":4230,"CoapplicantIncome":0.0,"LoanAmount":112.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Semiurban"},{"Loan_ID":"LP002734","Gender":"Male","Married":"Yes","Dependents":"0","Education":"Graduate","Self_Employed":"No","ApplicantIncome":6133,"CoapplicantIncome":3906.0,"LoanAmount":324.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP002505","Gender":"Male","Married":"Yes","Dependents":"0","Education":"Graduate","Self_Employed":"No","ApplicantIncome":4333,"CoapplicantIncome":2451.0,"LoanAmount":110.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"

In [21]:
# POST <url>/predict
resp = requests.post("http://0.0.0.0:5000/predict", data = json.dumps(data), headers= header)
resp.status_code

200

In [22]:
resp.json()

{'predictions': '[{"0":"LP002453","1":1},{"0":"LP001164","1":1},{"0":"LP002734","1":1},{"0":"LP002505","1":1},{"0":"LP001194","1":1},{"0":"LP001207","1":0},{"0":"LP002740","1":1},{"0":"LP002386","1":1},{"0":"LP002188","1":0},{"0":"LP002537","1":1},{"0":"LP002002","1":1},{"0":"LP002600","1":1},{"0":"LP001776","1":1},{"0":"LP002170","1":1},{"0":"LP002364","1":1},{"0":"LP002151","1":1},{"0":"LP002444","1":1},{"0":"LP002239","1":1},{"0":"LP002522","1":1},{"0":"LP002788","1":0},{"0":"LP001146","1":0},{"0":"LP002387","1":1},{"0":"LP001807","1":1},{"0":"LP002149","1":1},{"0":"LP001643","1":1},{"0":"LP002640","1":1},{"0":"LP002408","1":1},{"0":"LP001594","1":1},{"0":"LP001711","1":0},{"0":"LP002776","1":0},{"0":"LP002139","1":1},{"0":"LP001653","1":1},{"0":"LP001674","1":1},{"0":"LP002409","1":1},{"0":"LP001356","1":1},{"0":"LP002335","1":0},{"0":"LP001947","1":1},{"0":"LP002732","1":1},{"0":"LP001750","1":1},{"0":"LP002807","1":1},{"0":"LP001693","1":1},{"0":"LP002837","1":0},{"0":"LP001030",

In [23]:
df_json = resp.json()
df_json = pd.read_json(df_json['predictions'], orient='records')

In [24]:
# Check that local predicted is the same as API predicted
for orig, loaded in zip(list(model.predict(X_valid)), list(df_json[1])):
    if orig != loaded:
        print('{}-{} : Not identical'.format(orig,loaded))
    else:
        print('{}-{} : Identical'.format(orig,loaded))

1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
1-1 : Identical
0-0 : Identical
1-1 : Identical


## Viola! We have managed to publish a model to an API on flask to classify our data