# Problem: SPEI transfer disputes

Build a model to prevent more SPEI transfer disputes.

In [None]:
import pandas as pd
# Load the dispute records, then lift the column-display cap so the preview
# below shows every column instead of truncating wide tables.
disputes = pd.read_csv("disputes.csv")
pd.set_option('display.max_columns', None)

# Notebook preview of the loaded table.
disputes

# Approach: Add ATO Features to Transfer Model

Start by fetching the historical data that will be used for training and testing.

In [None]:
from sumatra import Client
# Name of the timeline to work with; later cells reuse this variable.
timeline = "simulated_100k_over_180d"

# Create a client for the console host below and look the timeline up.
# As the last expression in the cell, the lookup result is displayed.
sumatra = Client('console.qa.sumatra.ai')
sumatra.get_timeline(timeline)

## Enrich historical events with candidate features

Replay the events through the Sumatra topology to compute historical feature values.

In [None]:
# Materialize the timeline; the result holds the enriched events.
enriched = sumatra.materialize(timeline)

# Keep only the outgoing SPEI transfer events.
# NOTE(review): presumably a pandas DataFrame (it is passed to pd.merge in a
# later cell) — confirm against the Sumatra client docs.
event_type = "spei_outgoing"
spei_outgoing = enriched.get_events(event_type)

# Notebook preview.
spei_outgoing

## Join feature data and labels

In [None]:
# Left-join the disputes onto the transfers: every transfer row is kept, and
# the dispute columns are populated only where '_id' matches a 'sumatra_id'.
# NOTE(review): if disputes can hold several rows for one 'sumatra_id', the
# join repeats that transfer once per dispute row — confirm the key is unique.
labeled = pd.merge(
    left=spei_outgoing,
    right=disputes,
    how='left',
    left_on='_id',
    right_on='sumatra_id',
)

# A transfer is labeled as fraud exactly when the join found a dispute for it.
labeled['is_fraud'] = labeled['sumatra_id'].notna()

# Class balance of the resulting label.
labeled['is_fraud'].value_counts()

## Choose subset of features

In [None]:
# Baseline feature set (plotted as the 'original' curve in the ROC cell).
original = [
    'amount',
    'name_similarity',
    'money_out_48h',
    'past_pair_money_transferred',
    'unique_senders_to_beneficiary',
]

# Candidate additions, currently disabled; uncomment a line to include it.
new = [
    #'failed_logins_by_ipc',
    #'trusted_device_updates_shared_mobile_36h',
]

# Full list of model inputs: baseline first, then any enabled candidates.
features = [*original, *new]

# Preview the selected inputs next to the label.
labeled[[*features, 'is_fraud']]

## Use standard missing-value replacement

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
# Mean-impute the feature columns only. The label is deliberately kept out of
# the imputer: it is computed from the join and has no missing values, and
# averaging a class label would be meaningless (it would also be coerced from
# boolean to float).
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed = pd.DataFrame(imp.fit_transform(labeled[features]), columns=features)

# Re-attach the untouched label by position, since the new frame has a fresh
# default index.
imputed['is_fraud'] = labeled['is_fraud'].to_numpy()

# NOTE(review): the imputer is fit on every row, including the rows that the
# later train/test split assigns to the test set, so test-set means leak into
# training. Consider fitting the imputer on the training rows only.
imputed

## Time-based Train/Test split

In [None]:
from sklearn.model_selection import train_test_split
# shuffle=False preserves row order, so the leading 80% of rows become the
# training set and the trailing 20% the test set.
# NOTE(review): this is a time-based split only if the rows are already in
# chronological order — confirm.
train, test = train_test_split(imputed, test_size=0.2, shuffle=False)

# Report (rows, columns) for each partition.
for caption, frame in (("Train set size:", train), ("Test set size: ", test)):
    print(caption, frame.shape)

## Train ML models and measure performance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

plt.figure(figsize=(8,6))
plt.rcParams['font.size'] = 18
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Test Set Performance (Random Forest)")

# Both models share the same forest size and seed, so the ROC curves differ
# only because of the feature set and reruns reproduce the same plot.
forest_params = {'n_estimators': 10, 'random_state': 0}

# The baseline is always evaluated; the extended feature set is evaluated
# only when at least one new feature is enabled.
feature_sets = {'original': original}
if new:
    feature_sets['original+new'] = original + new

for label, columns in feature_sets.items():
    clf = RandomForestClassifier(**forest_params)
    clf.fit(train[columns], train.is_fraud)

    # Column 1 of predict_proba is the probability of the second (positive)
    # class, which roc_curve uses as the ranking score.
    test_score = clf.predict_proba(test[columns])[:, 1]
    fpr, tpr, thresholds = roc_curve(test.is_fraud, test_score)

    plt.plot(fpr, tpr, label=label)

# Draw the legend unconditionally so the curve is labeled even when only the
# baseline is plotted.
plt.legend(loc='best')