# Prepare the real-time scoring model

In [0]:
#!pip install --upgrade azureml-train-automl-runtime==1.36.0
#!pip install --upgrade azureml-automl-runtime==1.36.0
#!pip install --upgrade scikit-learn
#!pip install --upgrade numpy

In [0]:
%sh
pip install azureml-core

In [0]:
from azureml.core import Workspace
from azureml.core.model import Model

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# sklearn.externals.joblib was deprecated in 0.21
from sklearn import __version__ as sklearnver
from packaging.version import Version
if Version(sklearnver) < Version("0.21.0"):
    from sklearn.externals import joblib
else:
    import joblib

import numpy as np
import pandas as pd

# Look up the existing Azure Machine Learning workspace that the model is
# registered to later in this notebook.
ws = Workspace.get(
    name='fd-machine-learning-workspace',
    subscription_id='831359a3-06f6-4ed7-a3d0-535b8b673781',
    resource_group='fd-resource-group',
)


In [0]:
# Load the three raw CSV extracts (account info, known fraud, untagged transactions)
# from the shared ../data folder.
account_df, fraud_df, untagged_df = (
    pd.read_csv(f'../data/{dataset_name}.csv')
    for dataset_name in ('Account_Info', 'Fraud_Transactions', 'Untagged_Transactions')
)

View the fraud dataframe

In [0]:
fraud_df.head(3)

View the account info dataframe.

In [0]:
account_df.head(3)

View the untagged transactions dataframe.

In [0]:
# Reorder the dataframe's columns alphabetically (ascending) so the column
# layout is predictable regardless of the order in the CSV file.
cols = sorted(untagged_df.columns.tolist())
untagged_df = untagged_df[cols]

untagged_df.head(3)

## Prepare data

The raw data has some issues we need to clean up before we can use it to train a model. We do this in the following cells.

### Prepare accounts

Begin by cleaning the data in accounts data set.
Remove columns that have very few or no values: `accountOwnerName`, `accountAddress`, `accountCity` and `accountOpenDate`

In [0]:
account_df_clean = account_df[["accountID", "transactionDate", "transactionTime", 
                               "accountPostalCode", "accountState", "accountCountry", 
                               "accountAge", "isUserRegistered", "paymentInstrumentAgeInAccount", 
                               "numPaymentRejects1dPerUser"]]

Create a copy of the dataframe so our data manipulation does not affect the original.

In [0]:
account_df_clean = account_df_clean.copy()

Let's ensure that values that are not numeric (e.g., they have incorrect string values or garbage data) are converted to NaN and then we can fill those NaN values with 0.

In [0]:
# Coerce non-numeric garbage to NaN, then replace every NaN with 0.
account_df_clean['paymentInstrumentAgeInAccount'] = (
    pd.to_numeric(account_df_clean['paymentInstrumentAgeInAccount'], errors='coerce')
    .fillna(0)
)

Next, let's convert the `numPaymentRejects1dPerUser` so that the column has a datatype of `float` instead of `object`.

In [0]:
# Cast the column to float directly on the Series.
account_df_clean["numPaymentRejects1dPerUser"] = account_df_clean["numPaymentRejects1dPerUser"].astype(float)

In [0]:
account_df_clean["numPaymentRejects1dPerUser"].value_counts()

`account_df_clean` is now ready for use in modeling.

### Prepare untagged transactions

Next, clean up the untagged transactions data set. There are 16 columns in `untagged_df` whose values are all null. Let's drop these columns to simplify our dataset.

In [0]:
untagged_df_clean = untagged_df.dropna(axis=1, how="all").copy()

We can examine the count of non-null values, and view the inferred data type for each column by running the following cell. Looking at the output of the cell, we have some work to do. For a start, we have columns with fewer than 200,000 non-null values. This means there are some null values in that column that we need to fix.

Let's cleanup the `localHour` field. 

Replace null values in `localHour` with `-99`. Also replace values of `-1` with `-99`.

In [0]:
# Use -99 as the single "unknown hour" sentinel: it replaces both missing
# values and the -1 placeholder found in the raw data.
untagged_df_clean["localHour"] = (
    untagged_df_clean["localHour"]
    .fillna(-99)
    .replace(-1, -99)
)

Confirm the values now look good.

In [0]:
untagged_df_clean["localHour"].value_counts()

Clean up the remaining null fields:
- Fix missing values for location fields by setting them to `NA` for unknown. 
- Set `isProxyIP` to False
- Set `cardType` to `U` for unknown (which is a new level)
- Set `cvvVerifyResult` to `N`. This means that transactions that failed because the wrong CVV2 number was entered, or no CVV2 number was entered, are treated as if there was no CVV2 match.

In [0]:
# Per-column defaults for the remaining nulls: location-like fields become the
# string "NA"; the other columns get their own "unknown" level.
untagged_df_clean = untagged_df_clean.fillna(value={
    **dict.fromkeys(
        ["ipState", "ipPostcode", "ipCountryCode",
         "paymentBillingPostalCode", "paymentBillingState", "paymentBillingCountryCode"],
        "NA",
    ),
    "isProxyIP": False,
    "cardType": "U",
    "cvvVerifyResult": "N",
})

Confirm all null values have been addressed.

The `transactionScenario` column provides no insights because all rows have the same `A` value. Let's drop that column. Same idea for the `transactionType` column.

In [0]:
del untagged_df_clean["transactionScenario"]

In [0]:
del untagged_df_clean["transactionType"]

`untagged_df_clean` is now ready for use in modeling.

### Prepare fraud transactions

Now move on to preparing the fraud transactions data set.

The `transactionDeviceId` has no meaningful values, so we will drop it.

In [0]:
# drop() returns a new frame, so the original fraud_df is left untouched.
fraud_df_clean = fraud_df.drop(columns=['transactionDeviceId'])

The fraud data set has a `localHour` field with missing values that we need to fill, just as we did for the untagged transactions data set.

In [0]:
fraud_df_clean["localHour"] = fraud_df_clean["localHour"].fillna(-99)

Examine your work, you should have 8640 non-null values in each column.

`fraud_df_clean` is now ready for use in modeling.

## Create labels

The goal is to create a dataframe with all transactions, where each transaction is tagged via the `isFraud` column with a value of `0` - no fraud or `1` - fraudulent. 

Any transactions that appear in untagged_transactions dataframe that also appear in the fraud dataframe will be marked as fraudulent. 

The remaining transactions will be marked as not fraudulent. 

Run the following cells to create the labels series.

In [0]:
all_labels = untagged_df_clean["transactionID"].isin(fraud_df_clean["transactionID"])

In [0]:
all_transactions = untagged_df_clean

Then we can save our estimators module.

In [0]:
# Write the custom transformers out to ./customestimators.py so that this notebook
# (and any process that later unpickles the trained pipeline) can import the
# same NumericCleaner / CategoricalCleaner classes.
scoring_service = """
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class NumericCleaner(BaseEstimator, TransformerMixin):
    # Replaces missing and -1 values of the localHour column with -99.

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        print("NumericCleaner.fit called")
        return self

    def transform(self, X):
        print("NumericCleaner.transform called")
        # Work on a copy so the caller's dataframe is never modified in place
        # (this also avoids pandas SettingWithCopyWarning on sliced inputs).
        X = X.copy()
        X["localHour"] = X["localHour"].fillna(-99)
        X.loc[X["localHour"] == -1, "localHour"] = -99
        return X


class CategoricalCleaner(BaseEstimator, TransformerMixin):
    # Fills missing cardType with "U" and missing cvvVerifyResult with "N".

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        print("CategoricalCleaner.fit called")
        return self

    def transform(self, X):
        print("CategoricalCleaner.transform called")
        # fillna returns a new dataframe; the input is left untouched.
        return X.fillna(value={"cardType": "U", "cvvVerifyResult": "N"})
"""

with open("./customestimators.py", "w") as module_file:
    module_file.write(scoring_service)

Next, load the estimators.

In [0]:
from customestimators import NumericCleaner, CategoricalCleaner

Now build the pipeline that will prepare the data. 

The gist of the following cell is to split the data preparation into two paths, splitting the data sets vertically, and then combine the result. The `ColumnTransformer` will effectively concatenate the data frame that results from the numeric transformations with the data frame resulting from the categorical transformations. 

- Numeric Transformer Pipeline: We use the custom transformer created previously to clean up the numeric columns. Since the model you will train in this notebook is a Support Vector Machine classifier, we need to standardize the scale of numeric values, which is what the `StandardScaler` provides.
- Categorical Transformer Pipeline: We use the custom transformer created previously to clean up the categorical columns. Then we one-hot encode each value of each categorical column, resulting in a wider data frame with one column for each possible value (and 1 appearing in rows that had that value).

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns routed through the numeric branch of the preprocessor.
numeric_features = [
    "transactionAmountUSD",
    "transactionDate",
    "transactionTime",
    "localHour",
    "transactionIPaddress",
    "digitalItemCount",
    "physicalItemCount",
]

# Columns routed through the categorical branch of the preprocessor.
categorical_features = [
    "transactionCurrencyCode",
    "browserLanguage",
    "paymentInstrumentType",
    "cardType",
    "cvvVerifyResult",
]

# Numeric branch: apply the custom cleaner, then standardize the values.
numeric_transformer = Pipeline([
    ('cleaner', NumericCleaner()),
    ('scaler', StandardScaler()),
])

# Categorical branch: apply the custom cleaner, then one-hot encode.
# handle_unknown='ignore' keeps unseen categories from raising at predict time.
categorical_transformer = Pipeline([
    ('cleaner', CategoricalCleaner()),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Run both branches on their own column subsets and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

Let's confirm that we can run all of our historical data through this transformation pipeline, and observe the resulting shape.

In [0]:
preprocessed_result = preprocessor.fit_transform(all_transactions)

In [0]:
preprocessed_result.shape

In [0]:
# Preview the first five transformed rows. Slice before densifying so the whole
# matrix is never materialized just to display a handful of rows.
preview_rows = preprocessed_result[:5]
if hasattr(preview_rows, "toarray"):
    # The preprocessor returned a sparse matrix; convert only this small slice.
    preview_rows = preview_rows.toarray()
pd.DataFrame(preview_rows)

## Create pipeline and train a simple model

Now you will build upon the transformation pipeline you created previously to train a model to classify rows as fraudulent or not fraudulent.

Run the following cells to make sure you've imported the dependencies for the pipeline (you probably already have, but having them clearly loaded here will help you when porting your code to a web service).

In [0]:
from customestimators import NumericCleaner, CategoricalCleaner
from sklearn.model_selection import train_test_split

As might be obvious, our data has a lot of samples that are not fraudulent. If we proceed to train a model, we will effectively train the model to predict non-fraud. This situation where one class (non-fraud) appears much more often than the others (fraud) is called a class imbalance, and to mitigate its effect we can reduce the number of non-fraud samples so that we have the same number of non-fraud and fraud samples. 

Run the following cells to downsample by randomly selecting 1,151 non-fraud rows, and then we'll union these rows with our 1,151 fraud rows.

> Feel free to ignore any `SettingWithCopyWarning` warnings in the cell output below.

In [0]:
# Split the transactions by label. assign() returns new frames, so no column is
# ever set on a slice of all_transactions (no SettingWithCopyWarning).
only_fraud_samples = all_transactions.loc[all_labels].assign(label=True)
only_non_fraud_samples = all_transactions.loc[~all_labels].assign(label=False)

# Downsample the non-fraud class to the size of the fraud class so both classes
# are equally represented. The size is derived from the data instead of being
# hard-coded, and capped by the number of non-fraud rows available.
n_balanced_samples = min(len(only_fraud_samples), len(only_non_fraud_samples))
random_non_fraud_samples = only_non_fraud_samples.sample(n=n_balanced_samples, replace=False, random_state=42)
balanced_transactions = pd.concat([random_non_fraud_samples, only_fraud_samples])

balanced_transactions["label"].value_counts()

Next, you need to separate out the label column from the dataframe so the labels are not used as input features:

In [0]:
# pop() removes the label column from the features and returns it in one step.
balanced_labels = balanced_transactions.pop("label")

Now you will create two subsets of the balanced data frame: one that will be used for training the model (`X_train` and `y_train`), and another that is reserved for testing its performance (`X_test` and `y_test`).

In [0]:
X_train, X_test, y_train, y_test = train_test_split(balanced_transactions, balanced_labels, 
                                                    test_size=0.2, random_state=42)

Now train the model. In this case, you will use the `LinearSVC` class.

> Feel free to ignore any `ConvergenceWarning` warnings in the cell output below

In [0]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Full model: the preprocessing ColumnTransformer followed by a linear SVM.
# Steps are passed as a list (the documented type for Pipeline.steps), and
# random_state is fixed so that re-running the notebook gives the same model.
svm_clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("linear_svc", LinearSVC(C=1, loss="hinge", random_state=42))
])
svm_clf.fit(X_train, y_train)

Test the model predicting against a single row from the test set.

In [0]:
svm_clf.predict(X_test[0:1])

Next, evaluate the model by examining how well it is predicting against all data in the training set.

In [0]:
y_train_preds = svm_clf.predict(X_train)

Use a confusion matrix to see how your model performed when correctly predicting non-fraud and fraud (the top left and bottom right values). Also, examine how your model made mistakes (the bottom left and top right values). In the below, the column headers are predicted non-fraud and predicted fraud, and the row headers are actually non-fraud, and actually fraud (e.g., as described by the training data).

In [0]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
confusion_matrix(y_train, y_train_preds)

Take a look at the performance of your model using the common set of metrics for a classifier. Do you think this is good or bad?

In [0]:
# ROC AUC is defined over continuous scores, not thresholded class labels, so
# use the SVM's signed distance to the decision boundary for that metric.
y_train_scores = svm_clf.decision_function(X_train)

print("Accuracy:", accuracy_score(y_train, y_train_preds))
print("Precision:", precision_score(y_train, y_train_preds))
print("Recall:", recall_score(y_train, y_train_preds))
print("F1:", f1_score(y_train, y_train_preds))
print("AUC:", roc_auc_score(y_train, y_train_scores))

Given that this is just a parsimonious model, it provides a start that performs better than random (as indicated by the AUC being greater than 0.5). There is more work (such as additional feature engineering) that can be done to improve this beyond the current performance, which you would want to do before deploying it in production. A parsimonious model helps us both to see whether the desired classification is possible given the data and to quickly get to something we can deploy as a service to enable integration early on. Then we can iterate, deploying improved versions of the model.

Now, evaluate the same using the test data set, using data the trained model has not seen. How does it perform?

In [0]:
# Evaluate on the held-out test set, which the model has not seen.
y_test_preds = svm_clf.predict(X_test)
# ROC AUC is defined over continuous scores, not thresholded class labels, so
# use the SVM's signed distance to the decision boundary for that metric.
y_test_scores = svm_clf.decision_function(X_test)

print(confusion_matrix(y_test, y_test_preds))
print("Accuracy:", accuracy_score(y_test, y_test_preds))
print("Precision:", precision_score(y_test, y_test_preds))
print("Recall:", recall_score(y_test, y_test_preds))
print("F1:", f1_score(y_test, y_test_preds))
print("AUC:", roc_auc_score(y_test, y_test_scores))

The overall performance of the model against data it has not seen (the test data) is similar to how it performs with the training data. That's a good sign, indicating we did not overfit the model to the training data.

Next, let's look at the steps to prepare the model for deployment as a web service.

## Save the model to disk

In preparation for deploying the model, you need to save the model to disk.

In [0]:
joblib.dump(svm_clf, 'fraud_score.pkl')

## Test loading the model

Next simulate re-loading the model from disk, just like the web service (which you will create in a moment) will have to do.

In [0]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from customestimators import NumericCleaner, CategoricalCleaner

# sklearn.externals.joblib was deprecated in 0.21
from sklearn import __version__ as sklearnver
from packaging.version import Version
if Version(sklearnver) < Version("0.21.0"):
    from sklearn.externals import joblib
else:
    import joblib

# Columns selected from the raw untagged transactions file before the frame is
# passed to the reloaded pipeline.
desired_cols = [
    'accountID', 'browserLanguage', 'cardType', 'cvvVerifyResult',
    'digitalItemCount', 'ipCountryCode', 'ipPostcode', 'ipState',
    'isProxyIP', 'localHour', 'paymentBillingCountryCode',
    'paymentBillingPostalCode', 'paymentBillingState', 'paymentInstrumentType',
    'physicalItemCount', 'transactionAmount', 'transactionAmountUSD',
    'transactionCurrencyCode', 'transactionDate', 'transactionID',
    'transactionIPaddress', 'transactionTime',
]

# Reload the pipeline that was saved to disk earlier in the notebook.
scoring_pipeline = joblib.load('fraud_score.pkl')

In [0]:
# Re-read the raw (uncleaned) transactions from the same ../data folder the
# notebook loaded its data from at the start, to check that the reloaded
# pipeline can score unprepared input end to end.
untagged_df_fresh = pd.read_csv('../data/Untagged_Transactions.csv')[desired_cols]

test_pipeline_preds = scoring_pipeline.predict(untagged_df_fresh)
test_pipeline_preds

In [0]:
one_row = untagged_df_fresh.iloc[:1]
test_pipeline_preds2 = scoring_pipeline.predict(one_row)
test_pipeline_preds2

In [0]:
import os
import azureml
from azureml.core import Workspace
from azureml.core.model import Model

# sklearn.externals.joblib was deprecated in 0.21
from sklearn import __version__ as sklearnver
from packaging.version import Version
if Version(sklearnver) < Version("0.21.0"):
    from sklearn.externals import joblib
else:
    import joblib

In [0]:
def saveModelToAML(ws, model, model_folder_path="models", model_name="realtime-score"):
    """Serialize ``model`` to disk and register its folder with Azure ML.

    Args:
        ws: Workspace passed to ``Model.register``.
        model: Object to serialize with ``joblib.dump``.
        model_folder_path: Folder the pickle is written to. The whole folder is
            handed to ``Model.register``, so anything else in it is uploaded too.
        model_name: Name used for both the ``<model_name>.pkl`` file and the
            registered model.

    Returns:
        The object returned by ``Model.register``.
    """
    # Create the folder exactly where the pickle will be written (relative or
    # absolute); exist_ok avoids a separate existence check.
    os.makedirs(model_folder_path, exist_ok=True)

    # save the model to disk
    model_file_path = os.path.join(model_folder_path, model_name + '.pkl')
    joblib.dump(model, model_file_path)

    # notice for the model_path, we supply the name of the model outputs folder without a trailing slash
    # anything present in the model folder path will be uploaded to AML along with the model
    print("Registering and uploading model...")
    registered_model = Model.register(model_path=model_folder_path,
                                      model_name=model_name,
                                      workspace=ws)
    return registered_model

In [0]:
def loadModelFromAML(ws, model_name="realtime-score"):
    """Resolve a registered model's local path via Azure ML and load it with joblib.

    Args:
        ws: Workspace passed to ``Model.get_model_path``.
        model_name: Name of the registered model. Also used to locate
            ``<model_name>.pkl`` when the resolved path is a folder.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    # download the model folder from AML to the current working directory
    model_file_path = Model.get_model_path(model_name, _workspace=ws)
    # saveModelToAML registers the whole model folder, so the resolved path may be
    # a directory rather than the pickle itself; point at the pickle inside it.
    if os.path.isdir(model_file_path):
        model_file_path = os.path.join(model_file_path, model_name + '.pkl')
    print('Loading model from:', model_file_path)
    model = joblib.load(model_file_path)
    return model

In [0]:
#Save the model to the AML Workspace
registeredModel = saveModelToAML(ws, svm_clf)

In [0]:
# Round-trip check: load the model back from the AML workspace and score the
# same held-out rows with it.
# NOTE(review): `gbct` holds the model registered from `svm_clf` above; the
# name looks like a leftover from another model type — confirm before renaming.
gbct = loadModelFromAML(ws)
# Overwrites the y_test_preds computed earlier from the in-memory pipeline.
y_test_preds = gbct.predict(X_test)

In [0]:
y_test_preds