# SageMaker Payment Classification 


### 1. Setup <a class="anchor" id="Setup"></a>


In [1]:
#import sys

#!{sys.executable} -m pip install --upgrade pip       --quiet # upgrade pip to the latest vesion
#!{sys.executable} -m pip install --upgrade sagemaker --quiet # upgrade SageMaker to the latest vesion

In [2]:
import boto3
import io
import sagemaker
import time
import os

from time import sleep
from sklearn.metrics import classification_report
from sagemaker.feature_store.feature_group import FeatureGroup

import pandas as pd
import numpy as np

In [3]:
region = sagemaker.Session().boto_region_name
sm_client = boto3.client("sagemaker")
boto_session = boto3.Session(region_name=region)
sagemaker_session = sagemaker.session.Session(boto_session=boto_session, sagemaker_client=sm_client)
role = sagemaker.get_execution_role()
bucket_prefix = "payment-classification"
s3_bucket = sagemaker_session.default_bucket()

We define the factorize key which is used to map the '__transaction_category__' to numeric values

In [4]:
factorize_key = {
    "Uncategorized": 0,
    "Entertainment": 1,
    "Education": 2,
    "Shopping": 3,
    "Personal Care": 4,
    "Health and Fitness": 5,
    "Food and Dining": 6,
    "Gifts and Donations": 7,
    "Investments": 8,
    "Bills and Utilities": 9,
    "Auto and Transport": 10,
    "Travel": 11,
    "Fees and Charges": 12,
    "Business Services": 13,
    "Personal Services": 14,
    "Taxes": 15,
    "Gambling": 16,
    "Home": 17,
    "Pension and insurances": 18,
}

### 2. Data preparation <a class="anchor" id="Data-preparation"></a>

We ingest the simulated data from the public SageMaker S3 training database:

In [5]:
!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/synthetic_financial/financial_transactions_mini.csv .

download: s3://sagemaker-sample-files/datasets/tabular/synthetic_financial/financial_transactions_mini.csv to ./financial_transactions_mini.csv


Let's start by loading the dataset from our csv file into a Pandas dataframe

In [6]:
data = pd.read_csv(
    "financial_transactions_mini.csv",
    parse_dates=["timestamp"],
    infer_datetime_format=True,
    dtype={"transaction_category": "string"},
)

The dataframe looks as follows:

| | transaction_category | receiver_id | sender_id | amount | timestamp |
|------:|:-----------------------|-----------------:|-----------------:|---------:|:--------------------|
| 39733 | Shopping | 4258863736072564 | 4630246970548037 | 91.58 | 2021-03-10 01:28:23 |
| 27254 | Shopping | 4356269497886716 | 4752313573239323 | 115.17 | 2021-01-22 23:28:24 |
| 30628 | Shopping | 4233636409552058 | 4635766441812956 | 90.98 | 2021-02-05 03:24:10 |
| 46614 | Shopping | 4054967431278644 | 4823810986511227 | 86.74 | 2021-04-02 14:42:45 |
| 37957 | Shopping | 4831814582525664 | 4254514582909482 | 123.27 | 2021-03-17 11:17:18 |
| 46878 | Shopping | 4425943481448900 | 4349267977109013 | 65.53 | 2021-03-17 15:47:49 |
| 81350 | Auto and Transport | 4146116413442105 | 4062723166078919 | 91.67 | 2021-03-29 13:23:44 |
| 10613 | Entertainment | 4788727923958282 | 4485838385631386 | 76.22 | 2021-02-11 17:45:53 |
| 46715 | Shopping | 4702782703461430 | 4944181591271506 | 86.67 | 2021-03-20 15:37:17 |
| 69110 | Investments | 4180233446952120 | 4702069426390603 | 530.39 | 2021-04-21 08:28:13 |

In [7]:
data.sample(10)

Unnamed: 0,transaction_category,receiver_id,sender_id,amount,timestamp
46278,Shopping,4731292609202362,4672555770541581,66.25,2021-04-03 19:26:45
20594,Shopping,4737643537570227,4214145129441298,85.52,2021-02-26 14:02:10
48917,Shopping,4267960178972308,4112752513516164,87.78,2021-04-11 13:23:25
57356,Food and Dining,4989217930879371,4657405327740212,65.26,2021-01-29 16:07:13
20085,Shopping,4399267668778422,4205530797082881,62.05,2021-01-04 21:49:19
52102,Health and Fitness,4418563900622718,4179877536691817,88.03,2021-02-12 18:22:04
89207,Auto and Transport,4241319998581099,4663876677020334,109.34,2021-02-19 18:41:17
5206,Entertainment,4741183460284839,4969803101503108,77.14,2021-02-26 16:41:46
72642,Auto and Transport,4699729158312266,4430535174050598,153.53,2021-03-12 19:04:08
14392,Entertainment,4285140543694436,4747093946612665,14.66,2021-02-14 18:33:09


Next, we extract the year, month, day, hour, minute, second from the timestamp and remove the timestamp

In [8]:
data["year"] = data["timestamp"].dt.year
data["month"] = data["timestamp"].dt.month
data["day"] = data["timestamp"].dt.day
data["hour"] = data["timestamp"].dt.hour
data["minute"] = data["timestamp"].dt.minute
data["second"] = data["timestamp"].dt.second

del data["timestamp"]

We'll transform the transaction categories to numeric targets for the classification by factorization.

In [9]:
data["transaction_category"] = data["transaction_category"].replace(factorize_key)

### 3. Create feature store <a class="anchor" id="Create-feature-store"></a>

To enrich dataset we will use the [Feature Store](https://aws.amazon.com/sagemaker/feature-store/). 

Before creating the feature store itself we need to set a name for the feature group and identifier used

In [10]:
feature_group_name = "feature-group-payment-classification"
record_identifier_feature_name = "identifier"

With the name we defined we create the feature group, runtime and session

In [11]:
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)

featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=sm_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

Once we have defined our feature store we need to put some data in it. We create a Pandas dataframe with the columns mean_amount, count, identifier and event time to store in the feature store

In [12]:
columns = ["mean_amount", "count", "identifier", "EventTime"]
feature_store_data = pd.DataFrame(columns=columns, dtype=object)

feature_store_data["identifier"] = range(19)
feature_store_data["mean_amount"] = 0.0
feature_store_data["count"] = 1
feature_store_data["EventTime"] = time.time()

Using the created dataframe we set the feature definitions

In [13]:
feature_group.load_feature_definitions(data_frame=feature_store_data)

[FeatureDefinition(feature_name='mean_amount', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='count', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='identifier', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='EventTime', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>)]

With these definitions ready we can create the feature group itself

In [14]:
feature_group.create(
    s3_uri=f"s3://{s3_bucket}/{bucket_prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True,
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:623849366717:feature-group/feature-group-payment-classification',
 'ResponseMetadata': {'RequestId': 'bdab5ed1-4bfb-413b-89e4-702d95125c32',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'bdab5ed1-4bfb-413b-89e4-702d95125c32',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '113',
   'date': 'Tue, 15 Mar 2022 03:03:09 GMT'},
  'RetryAttempts': 0}}

It takes a couple of minutes for the feature group to be created, we need to wait for this to be done before trying to ingest data in the feature store

In [15]:
status = feature_group.describe().get("FeatureGroupStatus")
while status == "Creating":
    print("Waiting for Feature Group to be Created")
    time.sleep(5)
    status = feature_group.describe().get("FeatureGroupStatus")
print(f"FeatureGroup {feature_group.name} successfully created.")

Waiting for Feature Group to be Created
Waiting for Feature Group to be Created
Waiting for Feature Group to be Created
Waiting for Feature Group to be Created
FeatureGroup feature-group-payment-classification successfully created.


Once the feature group is created we can ingest data into it

In [16]:
feature_group.ingest(data_frame=feature_store_data, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='feature-group-payment-classification', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7fdfc486ba58>, max_workers=3, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7fdfc5e572b0>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

To retrieve data from our feature store we define a function that gets the current values from the feature store

In [17]:
def get_feature_store_values():
    response = featurestore_runtime.batch_get_record(
        Identifiers=[
            {
                "FeatureGroupName": feature_group_name,
                "RecordIdentifiersValueAsString": [str(i) for i in range(19)],
            }
        ]
    )

    columns = ["mean_amount", "count", "identifier", "EventTime"]

    feature_store_resp = pd.DataFrame(
        data=[
            [resp["Record"][i]["ValueAsString"] for i in range(len(columns))]
            for resp in response["Records"]
        ],
        columns=columns,
    )
    feature_store_resp["identifier"] = feature_store_resp["identifier"].astype(int)
    feature_store_resp["count"] = feature_store_resp["count"].astype(int)
    feature_store_resp["mean_amount"] = feature_store_resp["mean_amount"].astype(float)
    feature_store_resp["EventTime"] = feature_store_resp["EventTime"].astype(float)
    feature_store_resp = feature_store_resp.sort_values(by="identifier")

    return feature_store_resp


feature_store_resp = get_feature_store_values()

We update the values in the feature store with the real values of our data

In [18]:
feature_store_data = pd.DataFrame()
feature_store_data["mean_amount"] = data.groupby(["transaction_category"]).mean()["amount"]
feature_store_data["count"] = data.groupby(["transaction_category"]).count()["amount"]
feature_store_data["identifier"] = feature_store_data.index
feature_store_data["EventTime"] = time.time()

feature_store_data["mean_amount"] = (
    pd.concat([feature_store_resp, feature_store_data])
    .groupby("identifier")
    .apply(lambda x: np.average(x["mean_amount"], weights=x["count"]))
)
feature_store_data["count"] = (
    pd.concat([feature_store_resp, feature_store_data]).groupby("identifier").sum()["count"]
)

feature_group.ingest(data_frame=feature_store_data, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='feature-group-payment-classification', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7fdfc486ba58>, max_workers=3, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7fdfc556cfd0>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

And display them after getting them from the feature store

In [19]:
feature_store_data = get_feature_store_values()
feature_store_data

Unnamed: 0,mean_amount,count,identifier,EventTime
2,494.773326,466,0,1647313000.0
15,51.205853,14513,1,1647313000.0
11,850.011007,745,2,1647313000.0
14,100.914531,33954,3,1647313000.0
5,31.478,1210,4,1647313000.0
8,119.571685,4838,5,1647313000.0
4,93.239755,9675,6,1647313000.0
16,51.054155,2792,7,1647313000.0
17,6018.076434,931,8,1647313000.0
10,114.745296,3350,9,1647313000.0


We use the feature store to calculate the distance between the average of every category and the current amount

In [20]:
additional_features = pd.pivot_table(
    feature_store_data, values=["mean_amount"], index=["identifier"]
).T.add_suffix("_dist")
additional_features_columns = list(additional_features.columns)
data = pd.concat([data, pd.DataFrame(columns=additional_features_columns, dtype=object)])
data[additional_features_columns] = additional_features.values[0]
for col in additional_features_columns:
    data[col] = abs(data[col] - data["amount"])

data

Unnamed: 0,transaction_category,receiver_id,sender_id,amount,year,month,day,hour,minute,second,...,9_dist,10_dist,11_dist,12_dist,13_dist,14_dist,15_dist,16_dist,17_dist,18_dist
0,0.0,4.518552e+15,4.333582e+15,833.26,2021.0,3.0,10.0,19.0,57.0,42.0,...,718.514704,732.288109,481.72241,806.338389,628.667464,337.027497,2074.16073,460.037701,52.287679,627.986786
1,0.0,4.518552e+15,4.642413e+15,596.63,2021.0,2.0,11.0,17.0,53.0,32.0,...,481.884704,495.658109,245.09241,569.708389,392.037464,100.397497,2310.79073,223.407701,184.342321,391.356786
2,0.0,4.274544e+15,4.952666e+15,176.76,2021.0,2.0,21.0,18.0,29.0,32.0,...,62.014704,75.788109,174.77759,149.838389,27.832536,319.472503,2730.66073,196.462299,604.212321,28.513214
3,0.0,4.518552e+15,4.457299e+15,879.78,2021.0,4.0,9.0,16.0,14.0,19.0,...,765.034704,778.808109,528.24241,852.858389,675.187464,383.547497,2027.64073,506.557701,98.807679,674.506786
4,0.0,4.601853e+15,4.578126e+15,742.25,2021.0,4.0,4.0,15.0,50.0,16.0,...,627.504704,641.278109,390.71241,715.328389,537.657464,246.017497,2165.17073,369.027701,38.722321,536.976786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99992,18.0,4.405008e+15,4.583356e+15,205.43,2021.0,4.0,20.0,12.0,23.0,53.0,...,90.684704,104.458109,146.10759,178.508389,0.837464,290.802503,2701.99073,167.792299,575.542321,0.156786
99993,18.0,4.300417e+15,4.949241e+15,151.49,2021.0,3.0,24.0,19.0,30.0,18.0,...,36.744704,50.518109,200.04759,124.568389,53.102536,344.742503,2755.93073,221.732299,629.482321,53.783214
99994,18.0,4.405008e+15,4.996896e+15,188.28,2021.0,3.0,8.0,19.0,51.0,10.0,...,73.534704,87.308109,163.25759,161.358389,16.312536,307.952503,2719.14073,184.942299,592.692321,16.993214
99995,18.0,4.262047e+15,4.017367e+15,204.26,2021.0,2.0,14.0,23.0,25.0,7.0,...,89.514704,103.288109,147.27759,177.338389,0.332536,291.972503,2703.16073,168.962299,576.712321,1.013214


### 4. Create model <a class="anchor" id="Create-model"></a>
In this notebook we will be using the [Extreme Gradient Boosting](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html) (XGBoost) implementation of the gradient boosted trees algorithm. This model is selected due to it relatively fast training time and explainable properties. The model can be substituted at will a different [SageMaker estimator](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) or a [model of your choosing](https://aws.amazon.com/blogs/machine-learning/bring-your-own-model-with-amazon-sagemaker-script-mode/).



Now that we have the dataset we can start preparing the model. First, we create a training, validation and testing split.

In [21]:
# Randomly sort the data then split out first 70%, second 20%, and last 10%
train_data, validation_data, test_data = np.split(
    data.sample(frac=1, random_state=42), [int(0.7 * len(data)), int(0.9 * len(data))]
)

We save these sets to a file.

In [22]:
train_data.to_csv("train.csv", index=False, header=False)
validation_data.to_csv("validation.csv", index=False, header=False)
test_data.to_csv("test.csv", index=False, header=False)

And upload these files to our s3 bucket

In [23]:
boto3.Session().resource("s3").Bucket(s3_bucket).Object(
    os.path.join(bucket_prefix, "train/train.csv")
).upload_file("train.csv")
boto3.Session().resource("s3").Bucket(s3_bucket).Object(
    os.path.join(bucket_prefix, "validation/validation.csv")
).upload_file("validation.csv")

Get the XGBoost sagemaker image

In [24]:
container = sagemaker.image_uris.retrieve(region=region, framework="xgboost", version="1.2-2")

Transform our data to a sagemaker input for training

In [25]:
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/{}/train".format(s3_bucket, bucket_prefix), content_type="csv"
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/{}/validation/".format(s3_bucket, bucket_prefix), content_type="csv"
)

We define the XGBoost model

In [26]:
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(s3_bucket, bucket_prefix),
    sagemaker_session=sagemaker_session,
)

Set the parameters

In [27]:
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective="multi:softprob",
    num_class=19,
    verbosity=0,
    num_round=100,
)

And train the model

In [28]:
xgb.fit({"train": s3_input_train, "validation": s3_input_validation})

2022-03-15 03:03:39 Starting - Starting the training job...
2022-03-15 03:04:07 Starting - Preparing the instances for trainingProfilerReport-1647313419: InProgress
............
2022-03-15 03:06:10 Downloading - Downloading input data...
2022-03-15 03:06:35 Training - Downloading the training image.........
2022-03-15 03:08:05 Training - Training image download completed. Training in progress.[34m[2022-03-15 03:07:55.878 ip-10-2-143-46.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2022-03-15:03:07:56:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2022-03-15:03:07:56:INFO] Failed to parse hyperparameter objective value multi:softprob to Json.[0m
[34mReturning the value itself[0m
[34m[2022-03-15:03:07:56:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-03-15:03:07:56:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2022-03-15:03:07:56:INFO] Determined delimiter of CSV input is ','[0m
[34m[2022-

### 5. Using the endpoint <a class="anchor" id="Using-the-endpoint"></a>

Deploy the model to an endpoint

In [29]:
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=sagemaker.serializers.CSVSerializer(),
)

-------!

### 6. Evaluate performance <a class="anchor" id="Evaluate-performance"></a>

Run the model on our test data

In [30]:
def predict(data, predictor):
    predictions = []
    confidences = []
    for row in data:
        response = np.fromstring(predictor.predict(row).decode("utf-8")[1:], sep=",")
        pred = response.argmax()
        confidence = max(response)
        predictions.extend([pred])
        confidences.extend([confidence])

    return predictions, confidences

Running it on the first 3 rows in our dataset results in the following:

In [31]:
pred, conf = predict(test_data.drop(["transaction_category"], axis=1).to_numpy()[:3], xgb_predictor)
print(
    f"The predictions for the first 3 entries are {pred}, the confidence for these predictions are {conf}"
)

The predictions for the first 3 entries are [1, 1, 11], the confidence for these predictions are [0.5061870217323303, 0.5919070839881897, 0.658933162689209]




Now we run the predictions on the complete dataset

In [32]:
predictions, confidences = predict(
    test_data.drop(["transaction_category"], axis=1).to_numpy(), xgb_predictor
)



And report the prediction results

In [33]:
print(
    classification_report(
        test_data["transaction_category"].to_list(), predictions, target_names=factorize_key
    )
)

                        precision    recall  f1-score   support

         Uncategorized       1.00      0.92      0.96        51
         Entertainment       0.81      0.89      0.85      1486
             Education       1.00      0.94      0.97        80
              Shopping       0.86      0.94      0.90      3441
         Personal Care       1.00      0.98      0.99       132
    Health and Fitness       0.99      0.89      0.94       443
       Food and Dining       0.99      0.82      0.90       918
   Gifts and Donations       1.00      0.95      0.97       275
           Investments       0.99      0.97      0.98        88
   Bills and Utilities       1.00      0.99      1.00       332
    Auto and Transport       0.94      0.84      0.88      1967
                Travel       0.96      0.84      0.90       120
      Fees and Charges       1.00      0.94      0.97       106
     Business Services       1.00      0.99      1.00       146
     Personal Services       1.00      