In [1]:
!pip install awswrangler==3.12.0 evidently==0.7.8 -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dash 2.18.1 requires dash-core-components==2.0.0, which is not installed.
dash 2.18.1 requires dash-html-components==2.0.0, which is not installed.
dash 2.18.1 requires dash-table==5.0.0, which is not installed.
dash 2.18.1 requires Flask<3.1,>=1.0.4, but you have flask 3.1.0 which is incompatible.
dash 2.18.1 requires Werkzeug<3.1, but you have werkzeug 3.1.3 which is incompatible.
sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0m

In [None]:
from monitoring_utils import get_username

# GLOBAL VARIABLES

In [None]:
# User running the notebook, as resolved by the project helper
USERNAME = get_username()

# Base URL of the remote Evidently workspace the reports are uploaded to
workspace = "https://nahpmgpxmm.us-east-1.awsapprunner.com"

# Modelling configuration: label column, random seed, train fraction, model inputs
TARGET_COL = "is_fraud"
SEED = 42
TRAIN_SPLIT = 0.7
FEATURES = [
    "card_present",
    "trx_vel_last_1mths",
    "trx_vel_last_2mths",
    "amt_vel_last_1mths",
    "amt_vel_last_2mths",
]

# PROJECT CREATION OR LOADING

In [4]:
from evidently.ui.workspace import RemoteWorkspace

  np_bool = np.bool  # type: ignore[attr-defined]


In [5]:
# Client for the remote Evidently workspace located at the URL held in `workspace`
ws = RemoteWorkspace(workspace)

In [6]:
project_name = "Monitoreo de Sistema de Detección de Fraude en TC"
project_description = "Este proyecto monitorea tanto la data como el modelo del sistema de ML de fraude"

# Reuse the first project whose name matches; otherwise create one and
# persist its description.
existing_projects = ws.search_project(project_name)
if not existing_projects:
    print("Creating new project")
    project = ws.create_project(project_name)
    project.description = project_description
    project.save()
else:
    project = existing_projects[0]
    print("Project already exists")
# Cleanup (left disabled): ws.delete_project(project.id)

Project already exists


# DATA PULL

In [7]:
import awswrangler as wr
from evidently.presets import DataSummaryPreset
from evidently import Dataset
from evidently import DataDefinition
from evidently import Report

In [8]:
# Inclusive bounds of the cod_month range to load (values look like YYYYMM — confirm)
periodo_carga_inicio, periodo_carga_fin = 202412, 202412

In [9]:
# SQL for the transactions whose cod_month falls inside the configured load period
query = f"""
    SELECT  transaction_id
            ,amount
            ,merchant_category
            ,merchant_country
            ,card_present
            ,is_fraud
            ,cod_month
            ,trx_vel_last_1mths
            ,trx_vel_last_2mths
            ,amt_vel_last_1mths
            ,amt_vel_last_2mths
    FROM    RISK_MANAGEMENT.CREDIT_CARD_TRANSACTIONS
    WHERE   cod_month between {periodo_carga_inicio} and {periodo_carga_fin}
    """
query

'\n    SELECT  transaction_id\n            ,amount\n            ,merchant_category\n            ,merchant_country\n            ,card_present\n            ,is_fraud\n            ,cod_month\n            ,trx_vel_last_1mths\n            ,trx_vel_last_2mths\n            ,amt_vel_last_1mths\n            ,amt_vel_last_2mths\n    FROM    RISK_MANAGEMENT.CREDIT_CARD_TRANSACTIONS\n    WHERE   cod_month between 202412 and 202412\n    '

In [10]:
# Run the query through awswrangler's Athena reader against the
# "risk_management" database; the result is used as a DataFrame below.
df = wr.athena.read_sql_query(sql=query, database="risk_management")

2025-06-24 04:15:59,873	INFO worker.py:1852 -- Started a local Ray instance.


In [11]:
# Normalise dtypes: label as string, velocity counters as float32,
# and card_present as a string-backed categorical.
simple_casts = {
    TARGET_COL: str,
    "trx_vel_last_1mths": "float32",
    "trx_vel_last_2mths": "float32",
}
for column_name, target_dtype in simple_casts.items():
    df[column_name] = df[column_name].astype(target_dtype)
df["card_present"] = df["card_present"].astype(str).astype("category")

In [12]:
# Preview the first three rows after casting
df.head(3)

Unnamed: 0,transaction_id,amount,merchant_category,merchant_country,card_present,is_fraud,cod_month,trx_vel_last_1mths,trx_vel_last_2mths,amt_vel_last_1mths,amt_vel_last_2mths
0,105bdcc07fba,131.491882,retail,CA,0,0,202412,2.0,4.0,237.101273,487.9729
1,18276227a494,63.524712,restaurant,US,1,0,202412,2.0,5.0,195.016602,551.49762
2,371d07e33364,41.115845,restaurant,US,1,0,202412,3.0,6.0,236.132446,592.613464


In [13]:
# Column roles for the raw transactions table.
# Reference: https://docs.evidentlyai.com/docs/library/data_definition
raw_categorical_cols = [
    TARGET_COL,
    "card_present",
    "merchant_category",
    "merchant_country",
]
raw_numerical_cols = [
    "amount",
    "trx_vel_last_1mths",
    "trx_vel_last_2mths",
    "amt_vel_last_1mths",
    "amt_vel_last_2mths",
]
raw_definition = DataDefinition(
    id_column="transaction_id",
    timestamp="cod_month",
    categorical_columns=raw_categorical_cols,
    numerical_columns=raw_numerical_cols,
)

In [14]:
# Wrap the raw DataFrame as an Evidently Dataset using the raw column roles
dataset = Dataset.from_pandas(df, data_definition=raw_definition)

In [15]:
# Profile the raw dataset on its own (None is passed as the second argument),
# keep a local HTML copy, and upload the run to the workspace project.
summary_metrics = [DataSummaryPreset()]
data_summary_report = Report(summary_metrics, tags=["Data summary"])
data_summary_run = data_summary_report.run(dataset, None)
data_summary_run.save_html("data_summary.html")
data_summary_ref = ws.add_run(project.id, data_summary_run)
print(
    f"Report's link: {data_summary_ref.url}",
    f"Report's id: {data_summary_ref.id}",
    sep="\n",
)
# Cleanup (left disabled): ws.delete_run(project.id, data_summary_ref.id)

Report's link: https://nahpmgpxmm.us-east-1.awsapprunner.com/projects/01979e90-bceb-72e9-ab90-5c3b49cc3896/reports/0197a026-8091-717e-b5c8-ea077ca97331
Report's id: 0197a026-8091-717e-b5c8-ea077ca97331


# MODEL TRAINING

In [16]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pandas as pd
from evidently import BinaryClassification

In [17]:
# Model inputs and integer label (the label was cast to string earlier, so it
# is converted back to int here).
X = df[FEATURES]
y = df[TARGET_COL].astype(int)

# Seeded random split; TRAIN_SPLIT is the fraction of rows used for training
split_parts = train_test_split(X, y, train_size=TRAIN_SPLIT, random_state=SEED)
X_train, X_test, y_train, y_test = split_parts

In [18]:
# card_present is a pandas categorical feature, hence enable_categorical=True.
# random_state reuses the notebook-wide SEED so the model, like the split,
# is pinned to an explicit seed.
xgb = XGBClassifier(enable_categorical=True, random_state=SEED)
xgb.fit(X_train, y_train)

In [19]:
# Column roles for scored data: the binary task maps the label to the
# predicted class ("prediction") and the positive-class score ("prob").
fraud_task = BinaryClassification(
    target=TARGET_COL,
    prediction_labels="prediction",
    prediction_probas="prob",
    pos_label=1,
)
pred_categorical_cols = [TARGET_COL, "prediction", "card_present"]
pred_numerical_cols = [
    "trx_vel_last_1mths",
    "trx_vel_last_2mths",
    "amt_vel_last_1mths",
    "amt_vel_last_2mths",
    "prob",
]
pred_definition = DataDefinition(
    classification=[fraud_task],
    id_column="transaction_id",
    timestamp="cod_month",
    categorical_columns=pred_categorical_cols,
    numerical_columns=pred_numerical_cols,
)

In [31]:
# Building datasets
def _score_split(model, features, target):
    """Return features + target with the model's predictions appended.

    Adds a categorical ``prediction`` column (predicted class) and a ``prob``
    column holding the last column of ``predict_proba``.
    """
    scored = pd.concat([features, target], axis=1)
    scored["prediction"] = model.predict(features)
    scored["prediction"] = scored["prediction"].astype("category")
    scored["prob"] = model.predict_proba(features)[:, -1]
    return scored


# NOTE(review): these frames are built from FEATURES + target only, so they do
# not appear to carry the transaction_id / cod_month columns that
# pred_definition names as id/timestamp — confirm this is intended.

# Training rows act as the reference dataset
df_train = _score_split(xgb, X_train, y_train)
reference = Dataset.from_pandas(df_train, data_definition=pred_definition)

# Held-out rows act as the current dataset
df_test = _score_split(xgb, X_test, y_test)
current = Dataset.from_pandas(df_test, data_definition=pred_definition)

# DATA DRIFT DETECTION

In [39]:
from evidently.presets import DataDriftPreset
from monitoring_utils import detect_data_drift

In [40]:
# Share of drifted columns (excluding ID and timestamp columns) above which
# the whole dataset is considered drifted
drift_share = 0.3

## Without data drift

In [41]:
# Compare the held-out data against the training reference, keep a local HTML
# copy, and upload the run to the workspace project.
drift_metrics = [DataDriftPreset(drift_share=drift_share)]
data_drift_report = Report(drift_metrics, tags=["Data drift"])
data_drift_run = data_drift_report.run(current, reference)
data_drift_run.save_html("data_drift.html")
data_drift_ref = ws.add_run(project.id, data_drift_run)
print(
    f"Report's link: {data_drift_ref.url}",
    f"Report's id: {data_drift_ref.id}",
    sep="\n",
)
# Cleanup (left disabled): ws.delete_run(project.id, data_drift_ref.id)

Report's link: https://nahpmgpxmm.us-east-1.awsapprunner.com/projects/01979e90-bceb-72e9-ab90-5c3b49cc3896/reports/0197a033-6f7f-7158-ad72-eb4f28697566
Report's id: 0197a033-6f7f-7158-ad72-eb4f28697566


In [42]:
# Project helper evaluated on the latest drift run; the recorded cell output is a boolean
detect_data_drift(data_drift_run)

False

## With data drift

In [43]:
# Simulate drift: build a copy of the test frame in which every numeric
# feature is shifted by a constant offset.
numeric_cols = ["trx_vel_last_1mths", "trx_vel_last_2mths", "amt_vel_last_1mths", "amt_vel_last_2mths"]
noise = 100
shifted_columns = {col: df_test[col] + noise for col in numeric_cols}
df_drifted = df_test.assign(**shifted_columns)
drifted_current = Dataset.from_pandas(df_drifted, data_definition=pred_definition)

In [44]:
# Re-run the drift report on the artificially shifted data. The names
# data_drift_run / data_drift_ref are rebound on purpose: later cells read them.
data_drift_report = Report([DataDriftPreset(drift_share=drift_share)], tags=["Data drift"])
data_drift_run = data_drift_report.run(drifted_current, reference)
# Saved under its own file name so the report written by the "without data
# drift" cell (data_drift.html) is not overwritten.
data_drift_run.save_html("data_drift_with_drift.html")
data_drift_ref = ws.add_run(project.id, data_drift_run)
print(f"Report's link: {data_drift_ref.url}")
print(f"Report's id: {data_drift_ref.id}")
#ws.delete_run(project.id, data_drift_ref.id)

Report's link: https://nahpmgpxmm.us-east-1.awsapprunner.com/projects/01979e90-bceb-72e9-ab90-5c3b49cc3896/reports/0197a033-c3b5-710e-83d3-1de253723b45
Report's id: 0197a033-c3b5-710e-83d3-1de253723b45


In [45]:
# Same check, now on the run produced from the shifted data
detect_data_drift(data_drift_run)

True

# CONCEPT DRIFT DETECTION

In [47]:
from evidently.presets import ClassificationPreset
from monitoring_utils import get_metric_dict, get_metric

In [49]:
# Classification quality of the held-out data versus the training reference;
# saved locally as HTML and uploaded to the workspace project.
classification_metrics = [ClassificationPreset()]
concept_drift_report = Report(classification_metrics, tags=["Concept drift"])
concept_drift_run = concept_drift_report.run(current, reference)
concept_drift_run.save_html("concept_drift.html")
concept_drift_ref = ws.add_run(project.id, concept_drift_run)
print(
    f"Report's link: {concept_drift_ref.url}",
    f"Report's id: {concept_drift_ref.id}",
    sep="\n",
)

Report's link: https://nahpmgpxmm.us-east-1.awsapprunner.com/projects/01979e90-bceb-72e9-ab90-5c3b49cc3896/reports/0197a03b-ff46-76d0-8c14-7aeb9278b5d5
Report's id: 0197a03b-ff46-76d0-8c14-7aeb9278b5d5


In [50]:
# Project helper; the recorded output is a mapping from metric name to a metric id string
metric_dict = get_metric_dict(concept_drift_run)
metric_dict

{'Accuracy': 'e4f2ea2dab7ab3a65b16ef5124d02db4',
 'Precision': '3c2d79a01c4f6539c105bc7dff803faa',
 'Recall': '11c1894b301e560af740a01ee9739c2a',
 'F1Score': '2df7cb543daa4f42d1d5cdad5bb551ae',
 'RocAuc': '836d15857ce444b246783aad42a94779',
 'LogLoss': '275d2d67f1d9d892b74d8d9bdc07c233',
 'TPR': 'aede79b5b66270f86918a2111b694162',
 'TNR': '48b057cb0301dad5740a4a1a25050ffe',
 'FPR': '2e6d045688c97855c5661e6a8f6bd2e5',
 'FNR': 'd183f8fb51722080c04c08958a4579ff',
 'F1ByLabel': '960da7cc5ae6dfee13853a0b222466ca',
 'PrecisionByLabel': '57f465d9f518c33f7672e42e807a1737',
 'RecallByLabel': '66b666a06110f2e57883b2a7365fcf45',
 'RocAucByLabel': 'c1d27bd176ffbcc8c6b5871f2d88e6e6'}

In [None]:
# Fetch the values for the metric id stored under "RocAuc"
# (presumably returned as (current, reference) — confirm against get_metric).
# NOTE(review): the variables are named *_logloss although the RocAuc id is
# requested; the names are kept because later cells read them. Consider
# renaming every use together.
cur_logloss, ref_logloss = get_metric(concept_drift_run, metric_dict["RocAuc"])
print(cur_logloss, ref_logloss)

In [147]:
# Gap between the reference and current metric values above which the
# difference is treated as significant
delta = 0.04

In [148]:
# Despite their names, cur_logloss / ref_logloss hold the values fetched with
# the "RocAuc" metric id, so drift means the current value falling below the
# reference by more than delta. A single directional test also covers the case
# where the current value is higher by more than delta, which previously
# printed nothing at all.
if ref_logloss - cur_logloss > delta:
    print("Significant concept drift detected")
else:
    print("No significant concept drift")

Significant concept drift detected


# ALERTING

In [51]:
import boto3

In [52]:
# SNS client built with default session settings (no region or credentials passed explicitly)
sns = boto3.client("sns")
# Request the topic named "NewTopic" and keep its ARN for subscribing and publishing
# (per the SNS API docs an existing topic with this name is reused — confirm).
response = sns.create_topic(Name="NewTopic")
topic_arn = response["TopicArn"]

In [None]:
# Email address to subscribe to the alert topic (placeholder value — replace before running)
endpoint = "xxxxxx@gmail.com"

In [None]:
# Fetch the topic's subscriptions once (the earlier version issued the same
# request twice and discarded the first response).
# NOTE(review): only the first page of results is inspected; use a paginator
# if the topic can hold many subscriptions.
topic_subscriptions = sns.list_subscriptions_by_topic(TopicArn=topic_arn)["Subscriptions"]
my_endpoint = [sub["Endpoint"] for sub in topic_subscriptions if sub["Endpoint"] == endpoint]
if my_endpoint:
    print("Email already registered")
else:
    print("Registering email")
    sns.subscribe(
        TopicArn=topic_arn,
        Protocol="email",
        Endpoint=endpoint
    )

Registering email


In [58]:
# Notify the topic's subscribers when the latest drift run is flagged,
# linking to the uploaded report.
drift_detected = detect_data_drift(data_drift_run)
if drift_detected:
    alert_payload = {
        "TopicArn": topic_arn,
        "Subject": "Data drift detection on Fraud Detection model",
        "Message": f"Check more details on the report: {data_drift_ref.url}",
    }
    sns.publish(**alert_payload)