# The simplest way to deploy a ML model and predict unseen samples.

In order to train a ML model you need two things: a dataset to train you model with and the model itself.<br>
To predict unseen samples you need three things: the unseen samples, the trained model and the dataset schema used to train the model.<br>
<br>
The training dataset schema is necessary for you to arrange unseen sample features in the same order as the training samples.<br>
If you neglect this, it is very possible you will end up with incorrect predictions without even noticing it.<br>
<br>
So let's get to work.

In [22]:
# Training a model.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier as gb

iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.DataFrame(data=iris.target, columns=['target'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = gb()
clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [16]:
# The training dataset schema.
schema = list(X_train.columns)
print(schema)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [33]:
# By now you have all you need to deploy the training model in a persistent manner.
# Check out this link: https://scikit-learn.org/stable/modules/model_persistence.html
persistent_model = {
	'model':clf,
	'schema':schema
}

persistent_model

{'model': GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.1, loss='deviance', max_depth=3,
               max_features=None, max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100,
               presort='auto', random_state=None, subsample=1.0, verbose=0,
               warm_start=False),
 'schema': ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)']}

In [34]:
from joblib import dump, load
dump(persistent_model, './persistent_model.joblib')

['./persistent_model.joblib']

In [42]:
# Later you can load back the pickled model (possibly in another Python process).
persistent_model = load('./persistent_model.joblib')
model = persistent_model['model']
schema = persistent_model['schema']

print('Model: ',  model)
print('Schema: ', schema)

Model:  GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Schema:  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [43]:
# Reorder unseen samples using the training schema.
X_test = X_test[schema]

assert(X_test.columns == X_train.columns).all()

In [44]:
# Predict.
clf.predict(X_test)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 1, 2, 1, 2])

In [45]:
# Ok, there you have it. Now let's DRY it.
from joblib import dump, load

def save_model(clf, schema, path):
	persistent_model = {'model':clf, 'schema':schema}
	return dump(clf, path)

def load_model(path):
	persistent_model = load(path)
	model = persistent_model['model']
	schema = persistent_model['schema']
	return model, schema