### Working with Machine Learning APIs

1. Options to implement Machine Learning models.
2. What are APIs?
3. Python Environment Setup & Flask Basics.
4. Creating a Machine Learning Model.
5. Saving the Machine Learning Model: Serialization & Deserialization.
6. Creating an API using Flask.

In [None]:
"""Filename: hello-world.py
    """
from flask import Flask

app = Flask(__name__)

@app.route('/users/<string:username>')
def hello_world(username=None):
    return("Hello {}!".format(username))

In [None]:
# Save the above file as a python file hello-world.py

In [1]:
# Lets build an ML script for prediction
import os
import json
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
!ls 

API-Project  hello-world.py  __pycache__  Untitled.ipynb


In [8]:
!ls '/home/michael/Documents/Programming#/Learning APIs/API-Project/data'

testing.csv  training.csv


In [2]:
path = '/home/michael/Documents/Programming#/Learning APIs/API-Project/data/'
data = pd.read_csv(path + 'training.csv')
list(data.columns)

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status']

In [10]:
data.shape

(614, 13)

In [11]:
for i in data.columns:
    print('The number of null values in: {} == {}'.format(i, data[i].isnull().sum()))

The number of null values in: Loan_ID == 0
The number of null values in: Gender == 13
The number of null values in: Married == 3
The number of null values in: Dependents == 15
The number of null values in: Education == 0
The number of null values in: Self_Employed == 32
The number of null values in: ApplicantIncome == 0
The number of null values in: CoapplicantIncome == 0
The number of null values in: LoanAmount == 22
The number of null values in: Loan_Amount_Term == 14
The number of null values in: Credit_History == 50
The number of null values in: Property_Area == 0
The number of null values in: Loan_Status == 0


In [6]:
# Lets build an ML script for prediction
import os
import json
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')


def build_all():
    data = pd.read_csv(path + 'training.csv')

    data = data.dropna(subset=['Gender', 'Married', 'Credit_History', 'LoanAmount'])

    # Create train and test sets
    pred_var = ['Gender', 'Married', 'Dependents', 'Education', 
                'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 
                'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 
                'Property_Area']

    X_train, X_test, y_train, y_test = train_test_split(data[pred_var], data['Loan_Status'], test_size = 0.25, random_state = 42)

    # Convert y_train & y_test to np.array:
    y_train = y_train.replace({'Y':1, 'N':0}).as_matrix()
    y_test = y_test.replace({'Y':1, 'N':0}).as_matrix()

    pipe = make_pipeline(PreProcessing(),
                         RandomForestClassifier())

    param_grid = {"randomforestclassifier__n_estimators" : [10, 20, 30],
                  "randomforestclassifier__max_depth" : [None, 6, 8, 10],
                  "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20], 
                  "randomforestclassifier__min_impurity_split": [0.1, 0.2, 0.3]}

    grid = GridSearchCV(pipe, param_grid=param_grid, cv = 3)

    grid.fit(X_train, y_train)

    return grid

To make sure that the pre-processing steps are followed religiously even after we are done with experimenting and we do not miss them while predictions, we’ll create a custom pre-processing Scikit-learn estimator.

In [3]:
from sklearn.base import TransformerMixin, BaseEstimator

class PreProcessing(BaseEstimator, TransformerMixin):
    """Custom pre-processing estimator for our use-case
    """
    
    def __init__(self):
        pass
    
    def transform(self, df):
        """Regular transform() that is a help for training, validation
        & testing datasets.
        Note: The operations performed here are the ones that we did prior to this cell
        """
        pred_var = ['Gender', 'Married', 'Dependents', 'Education', 
                    'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 
                    'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 
                    'Property_Area']
        df = df[pred_var]
        
        # Fill missing values
        df['Dependents'] = df['Dependents'].fillna(0)
        df['Self_Employed'] = df['Self_Employed'].fillna('No')
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)
        df['Credit_History'] = df['Credit_History'].fillna(1)
        df['Married'] = df['Married'].fillna('No')
        df['Gender'] = df['Gender'].fillna('Male')
        df['LoanAmount'] = df['LoanAmount'].fillna(self.amt_mean_)
        
        # encode values
        gender_values = {'Female':0, 'Male': 1}
        married_values = {'No':0, 'Yes':1}
        education_values = {'Graduate':0, 'Not Graduate':1}
        employed_values = {'No':0, 'Yes':1}
        property_values = {'Rural':0, 'Urban':1, 'Semiurban':2}
        dependent_values = {'3+':3, '0':0,'2':2,'1':1}
        df.replace({'Gender':gender_values, 'Married':married_values,
                    'Education':education_values, 'Self_Employed':employed_values, 
                    'Property_Area':property_values, 'Dependents':dependent_values}, inplace=True)
        
        return df.as_matrix()
    
    def fit(self, df, y = None, **fit_params):
        """Fitting the training set and calculating the required values from the train
            e.g. We will need the mean of X_train['Loan_Amount_Term'] that 
            will be used in transforming X_test
        """
        
        self.term_mean_ = df['Loan_Amount_Term'].mean()
        self.amt_mean_ = df['LoanAmount'].mean()
        return self       

In [7]:
a = build_all()

In [8]:
a

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impu...bs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'randomforestclassifier__n_estimators': [10, 20, 30], 'randomforestclassifier__max_depth': [None, 6, 8, 10], 'randomforestclassifier__max_leaf_nodes': [None, 5, 10, 20], 'randomforestclassifier__min_impurity_split': [0.1, 0.2, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [None]:
# Convert y_train & y_test to np.array:
y_train = y_train.replace({'Y':1, 'N':0}).as_matrix()
y_test = y_test.replace({'Y':1, 'N':0}).as_matrix()

We’ll create a pipeline to make sure that all the preprocessing steps that we do are just a single scikit-learn estimator.

In [17]:
pipe = make_pipeline(PreProcessing(),
                     RandomForestClassifier())

pipe

Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

To search for the best hyper-parameters (degree for Polynomial Features & alpha for Ridge), we’ll do a Grid Search:

Defining param_grid:

In [18]:
param_grid = {"randomforestclassifier__n_estimators" : [10, 20, 30],
              "randomforestclassifier__max_depth" : [None, 6, 8, 10],
              "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20], 
              "randomforestclassifier__min_impurity_split": [0.1, 0.2, 0.3]}

Running the Grid Search:

In [19]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv = 3)

Fitting the training data to the pipeline estimator:

In [21]:
grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impu...bs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'randomforestclassifier__n_estimators': [10, 20, 30], 'randomforestclassifier__max_depth': [None, 6, 8, 10], 'randomforestclassifier__max_leaf_nodes': [None, 5, 10, 20], 'randomforestclassifier__min_impurity_split': [0.1, 0.2, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

Let’s see what parameter did the Grid Search select:

In [22]:
print('Best Parameters: {}'.format(grid.best_params_))

Best Parameters: {'randomforestclassifier__max_depth': None, 'randomforestclassifier__max_leaf_nodes': 5, 'randomforestclassifier__min_impurity_split': 0.3, 'randomforestclassifier__n_estimators': 30}


In [23]:
print('Validation set score: {:.2f}'.format(grid.score(X_test, y_test)))

Validation set score: 0.77


In [9]:
# Loading the test set
test_df = pd.read_csv(path+'testing.csv', encoding='utf-8-sig')
test_df = test_df.head()

In [10]:
a.predict(test_df)

array([1, 1, 1, 1, 1])

In [27]:
grid.predict(test_df)

array([1, 1, 1, 1, 1])

In [32]:
# Dump the model to a pickle file
import dill as pickle
filename = 'model_v1.sav'

In [33]:
path = '/home/michael/Documents/Programming#/Learning APIs/API-Project/'
with open(path+'models/'+filename, 'wb') as file:
    joblib.dump(a, file)

FileNotFoundError: [Errno 2] No such file or directory: '/home/michael/Documents/Programming#/Learning APIs/API-Project/models/model_v1.sav'

In [17]:
# Load the model again
with open(path+'models/'+filename, 'rb') as f:
    loaded_model = joblib.load(f)

In [18]:
loaded_model.predict(test_df)

array([1, 1, 1, 1, 1])

Our pipeline is looking pretty swell & fairly decent to go the most important step of the tutorial: Serialize the Machine Learning Model

We have a custom Class that we need to import while running our training, hence we’ll be using dill module to packup the estimator Class with our grid object.

It is advisable to create a separate training.py file that contains all the code for training the model 

Now that we have a working model, let's serve it.

There are three important parts in constructing our wrapper function, apicall():
    
    1. Getting the request data (for which predictions are to be made)
    2. Loading our pickled estimator
    3. jsonify our predictions and send the response back with status code: 200

HTTP messages are made of a header and a body. As a standard, majority of the body content sent across are in json format. We’ll be sending (POST url-endpoint/) the incoming data as batch to get predictions.

(NOTE: You can send plain text, XML, csv or image directly but for the sake of interchangeability of the format, it is advisable to use json)

In [None]:
"""Filename: server.py
"""

import os
import pandas as pd
from sklearn.externals import joblib
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def apicall():
    """API Call
    
    Pandas dataframe (sent as a payload) from API Call
    """
    try:
        test_json = request.get_json()
        test = pd.read_json(test_json, orient='records')
        
        # To resolve the issue of TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'str'
        test['Dependents'] = [str(x) for x in list(test['Dependents'])]
    
        # Getting the Loan_IDs seperated out
        loan_ids = test['Loan_ID']
    except Exception as e:
        raise e
        
    clf ='model_v1.pk'
    
    if test.empty:
        return(bad_request())
    else:
        # Load the saved model 
        print("Loading the model...")
        loaded_model = None
        with open(path+'/models/'+clf, 'rb') as f:
            loaded_model = pickle.load(f)
            
            
        print('|The model has been loaded... doing predictions now...')
        predictions = loaded_model.predict(test)
        
        """Add the predictions as Series to a new pandas Dataframe
        
        
                                OR
                                
        
        But we need to send the response codes as well
        """
        responses = jsonify(predictions = final_predictions.to_json(orient='records'))
        
        responses.status_code = 200
        
        return responses
    

In [37]:
import json
import requests

In [28]:
"""setting the headers to send and accept json responses
"""

header = {'Content-Type':'application/json', 
          'Accept':'application/json'}

"""Reading test batch
"""
df = pd.read_csv(path + 'data/testing.csv', encoding='utf-8-sig')
df = df.head()

"""Converting Pandas dataframe to json

"""
data = df.to_json(orient = 'records')

data

'[{"Loan_ID":"LP001015","Gender":"Male","Married":"Yes","Dependents":"0","Education":"Graduate","Self_Employed":"No","ApplicantIncome":5720,"CoapplicantIncome":0,"LoanAmount":110.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP001022","Gender":"Male","Married":"Yes","Dependents":"1","Education":"Graduate","Self_Employed":"No","ApplicantIncome":3076,"CoapplicantIncome":1500,"LoanAmount":126.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP001031","Gender":"Male","Married":"Yes","Dependents":"2","Education":"Graduate","Self_Employed":"No","ApplicantIncome":5000,"CoapplicantIncome":1800,"LoanAmount":208.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP001035","Gender":"Male","Married":"Yes","Dependents":"2","Education":"Graduate","Self_Employed":"No","ApplicantIncome":2340,"CoapplicantIncome":2546,"LoanAmount":100.0,"Loan_Amount_Term":360.0,"Credit_History":null,"Property_Are

In [29]:
"""POST <url>/predict
"""
resp = requests.post("http://0.0.0.0:8000/predict", 
                     data = json.dumps(data),
                     headers= header)

In [30]:
resp.status_code

500

In [None]:
"""The final response we get is as follows:
"""
resp.json()

In [36]:
data = df.to_json(orient = 'records')

AttributeError: 'SList' object has no attribute 'to_json'

In [42]:
!pip freeze > requi.txt

In [44]:
import pickle
pickle.dump(a, open('model.pkl', 'wb'))

In [45]:
!ls

 API-Project			      model.pkl     requi.txt
 hello-world.py			      models	    server.py
'Model Classing for API call.ipynb'   __pycache__   training_1.py


### New Model

In [46]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [49]:
!ls

 API-Project			      model.pkl     server.py
 app.py				      models	    titanic.csv
 hello-world.py			      __pycache__   training_1.py
'Model Classing for API call.ipynb'   requi.txt


In [52]:
# Create dataframe
train = pd.read_csv('titanic.csv', sep='\t')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
# Drop null values
train.dropna(inplace =True)

In [54]:
# features and target
target = 'Survived'
features = ['Pclass', 'Age', 'SibSp', 'Fare']

In [55]:
# X matrix, y vector
X = train[features]
y = train[target]

In [56]:
# model
model = LogisticRegression()
model.fit(X, y)
model.score(X,y)

0.7037037037037037

In [57]:
import pickle
pickle.dump(model, open('model.pkl', 'wb'))

The structure of the code follows:

    Load pickled model
    Name flask app
    Create a route that receives JSON inputs, 
        uses the trained model to make a prediction, 
        and returns that prediction in a JSON format, 
        which can be accessed through the API endpoint.

In [58]:
# Testing your flask app
# Using the local url
url = 'http://127.0.0.1:5000/'

In [59]:
# sample data
data = {'Pclass': 3, 
        'Age': 2, 
        'SibSp': 1, 
        'Fare': 50}
data = json.dumps(data)

In [60]:
type(data)

str

Post sample data and check response code using requests.post(url, data). You want to get a response code of 200 to make sure that the app is working:

In [61]:
send_request = requests.post(url, data)
print(send_request)

<Response [200]>


In [62]:
send_request.connection

<requests.adapters.HTTPAdapter at 0x7f4308ff7e80>

In [63]:
send_request.status_code

200

In [64]:
# Print the json of th request to see the model's prediction
print(send_request.json())

{'results': {'results': 1}}


Result 1 means the passenger survived based on the dataset.

Let's shutdown the flask app by typing ctrl + c when done testing.

Create Procfile:

A Procfile specifies the commands that are executed by a Heroku app on startup. To create one, open up a new file named Procfile (no extension) in the working directory and paste the following.

In [None]:
# Procfile
web: gunicorn app:app # <Dont run this>

In [None]:
#Create requirements.txt
!pip freeze > requi.txt