In [None]:
import pandas as pd
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import (
    recall_score,
    precision_score,
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,ConfusionMatrixDisplay
)
import os
from sklearn.model_selection import TimeSeriesSplit
import pickle


                                                       Data Cleaning

In [None]:
# Load the raw payment transactions: comma-separated file whose first row holds the column names.
RAW_DATA_PATH = 'fraud_payment_data'
total_data = pd.read_csv(RAW_DATA_PATH, sep=',', header=0)


In [None]:
# Columns that are not used by any downstream feature.
REDUNDANT_COLUMNS = ['Time_step', 'Sender_lob', 'Sender_Id', 'Bene_Id', 'Transaction_Id']
payments = total_data.drop(columns=REDUNDANT_COLUMNS)

# Some transactions amounted to zero dollars. None of them were fraudulent, so they are removed.
payments = payments.loc[payments['USD_amount'] > 0].copy()

# The NaNs represent self-transactions: the missing side is copied from the side that is present
# (sender filled from beneficiary first, then beneficiary from the now-filled sender).
for sender_col, bene_col in (('Sender_Country', 'Bene_Country'), ('Sender_Account', 'Bene_Account')):
    payments[sender_col] = payments[sender_col].fillna(payments[bene_col])
    payments[bene_col] = payments[bene_col].fillna(payments[sender_col])

# A missing sender sector gets its own sentinel category.
payments['Sender_Sector'] = payments['Sender_Sector'].fillna(-1)

# Re-number the rows 0..n-1 so later positional slices and index labels agree.
total_data = payments.reset_index(drop=True)

                                                      Feature Engineering

In [None]:
# One-hot encode the transaction type as 0/1 integer columns (the first category is dropped).
transaction_type = total_data['Transaction_Type']
Type_feature = pd.get_dummies(transaction_type, drop_first=True).astype(int)

# Every distinct country (on either side of a transaction) and every distinct sender sector.
countries = list(set(total_data['Sender_Country']) | set(total_data['Bene_Country']))
sectors = list(set(total_data['Sender_Sector']))

In [None]:
from scipy.stats import expon

def amount_prob(mean, amount):
    """Score a transaction amount against an account's historical average amount.

    Amounts are modelled as exponentially distributed with the given mean, and the
    exponential CDF at ``amount`` is returned, i.e. P(X <= amount): the probability of
    seeing an amount *no larger than* the observed one. Values near 1 therefore flag
    amounts that are unusually large for the account; values near 0 flag unusually
    small ones. (The survival probability P(X >= amount) is ``1 - amount_prob(...)``.)

    Parameters
    ----------
    mean : float
        Historical average amount; used as the ``scale`` of the exponential
        distribution. Must be positive, otherwise scipy returns NaN.
    amount : float
        Observed transaction amount.

    Returns
    -------
    float
        CDF value in [0, 1].
    """
    return expon.cdf(amount, scale=mean)

In [None]:
## Distance (in outgoing transactions) from an account to the nearest account that already has a
## fraudulent transaction on record. The search is cut off at 5 hops; when nothing is found inside
## that radius a huge sentinel is returned so that the derived 1/(1+distance) feature is negligible.
def min_path_to_fraud(sender_node,G):
    """Return the fewest forward hops from ``sender_node`` to a node with ``Fraud_count > 0``.

    ``G`` is the transaction graph whose nodes carry a ``Fraud_count`` attribute. The start node
    itself is part of the search (distance 0). Returns 100000000 when no such node lies within
    5 hops.
    """
    hops_by_account = nx.single_source_shortest_path_length(G, sender_node, cutoff=5)
    return min(
        (hops for account, hops in hops_by_account.items() if G.nodes[account]['Fraud_count'] > 0),
        default=100000000,
    )

## A second connectedness measure: the share of accounts reachable within 5 outgoing transactions
## of the given account that have historically been involved in fraud.
def fraud_centrality(node,G):
    """Return the fraction (0..1) of nodes within 5 forward hops of ``node`` with ``Fraud_count > 0``.

    The reachable set includes ``node`` itself, so the denominator is never zero.
    """
    reachable = nx.single_source_shortest_path_length(G, node, cutoff=5)
    tainted = sum(1 for account in reachable if G.nodes[account]['Fraud_count'] > 0)
    return tainted / len(reachable)

## Has this (sender, beneficiary) pair already had a fraudulent transaction between them?
def repeat_fraud(G,sender_node,bene_node):
    """Return True when any existing edge sender_node -> bene_node carries ``Label == 1``.

    ``G[sender_node][bene_node]`` must exist (the pair has transacted before); it maps each
    parallel-edge key to that edge's attribute dict.
    """
    parallel_edges = G[sender_node][bene_node]
    return any(parallel_edges[edge_key]['Label'] == 1 for edge_key in parallel_edges)

In [None]:
## Transaction graph: accounts are nodes, every processed transaction adds one directed edge.
G = nx.MultiDiGraph()

## Running [fraud_rate, n_transactions] per country and per sector.
## Every key needs its OWN list: dict.fromkeys(keys, [0, 0]) hands the *same* list object to every
## key, which silently turns the per-country / per-sector rates into one shared global counter.
country_dict = {country: [0, 0] for country in countries}
sector_dict = {sector: [0, 0] for sector in sectors}

## Features we engineer, all computed from transactions seen BEFORE the current one:
##   *_prob                 amount_prob of this amount vs. the account's historical average
##                          (outgoing average for the sender, incoming average for the beneficiary)
##   Fraud_percentage_*     share of fraudulent transactions among all of the account's transactions
##   Fraud_index_*          1 / (1 + min_path_to_fraud)
##   Fraud_centrality_*     fraud_centrality of the account
##   *_in_deg / *_out_deg   number of incoming / outgoing transactions so far
##   fraud_rate_by_country_* running fraud rate of the account's country
BENE_COLUMNS = ['bene_prob', 'Fraud_percentage_bene', 'Fraud_index_bene',
                'Fraud_centrality_bene', 'bene_in_deg', 'bene_out_deg', 'fraud_rate_by_country_bene']
SENDER_COLUMNS = ['sender_prob', 'Fraud_percentage_sender', 'Fraud_index_sender',
                  'Fraud_centrality_sender', 'sender_in_deg', 'sender_out_deg', 'fraud_rate_by_country_sender']
## Repeat_Fraud comes from repeat_fraud(); fraud_rate_by_sector is the sender sector's running rate.
GENERAL_COLUMNS = ['Repeat_Fraud', 'fraud_rate_by_sector']

## Rows are collected in plain lists and converted to DataFrames in one go. Growing a DataFrame
## with `.loc[index] = row` inside the loop is extremely slow on a dataset of this size.
row_index = []
bene_rows = []
sender_rows = []
general_rows = []


def _update_running_rate(stats, label):
    """Fold one transaction label into a running [mean, count] pair, in place."""
    stats[0] = (stats[0] * stats[1] + label) / (stats[1] + 1)
    stats[1] += 1


def _account_features(G, account, amount, direction, country_rate):
    """Build the 7 per-account features for one side of a transaction.

    ``direction`` is 'out' for the sender (history = outgoing transactions, ``total_out``) and
    'in' for the beneficiary (history = incoming transactions, ``total_in``). Accounts that are
    not in the graph yet get 0 for every history-based feature.
    """
    if not G.has_node(account):
        return [0, 0, 0, 0, 0, 0, country_rate]
    in_deg = G.in_degree(account)
    out_deg = G.out_degree(account)
    fraud_percentage = G.nodes[account]['Fraud_count'] / (in_deg + out_deg)
    centrality = fraud_centrality(account, G)
    fraud_index = 1 / (1 + min_path_to_fraud(account, G))
    history_count = out_deg if direction == 'out' else in_deg
    if history_count > 0:
        prob = amount_prob(G.nodes[account]['total_' + direction] / history_count, amount)
    else:
        ## No history in this direction: default the probability feature to zero.
        prob = 0
    return [prob, fraud_percentage, fraud_index, centrality, in_deg, out_deg, country_rate]


def _feature_frames():
    """Materialise the rows collected so far as (bene, sender, general) DataFrames."""
    return (
        pd.DataFrame(bene_rows, columns=BENE_COLUMNS, index=row_index),
        pd.DataFrame(sender_rows, columns=SENDER_COLUMNS, index=row_index),
        pd.DataFrame(general_rows, columns=GENERAL_COLUMNS, index=row_index),
    )


for index, row in total_data.iterrows():
    if index % 1000 == 0:
        print(index)
    if index % 10000 == 0:
        ## Periodic checkpoint. It does not contain the Transaction_Type dummies; the final
        ## save after the loop overwrites it with the complete table.
        total = pd.concat(_feature_frames(), axis=1)
        total.to_csv('total_features', index=False)
        print('Progress Saved!')

    sender = row['Sender_Account']
    bene = row['Bene_Account']
    amount = row['USD_amount']
    label = row['Label']
    sender_country_stats = country_dict[row['Sender_Country']]
    bene_country_stats = country_dict[row['Bene_Country']]
    sector_stats = sector_dict[row['Sender_Sector']]

    ## Read the running rates BEFORE folding in this transaction's label, so that the features
    ## never contain the label they are meant to predict.
    fraud_rate_by_country_sender = sender_country_stats[0]
    fraud_rate_by_country_Bene = bene_country_stats[0]
    fraud_rate_by_sector = sector_stats[0]

    _update_running_rate(sender_country_stats, label)
    _update_running_rate(bene_country_stats, label)
    _update_running_rate(sector_stats, label)

    ## Pair-level feature: has this exact sender -> beneficiary pair seen fraud before?
    is_new_pair = not G.has_edge(sender, bene)
    repeatfraud = (not is_new_pair) and repeat_fraud(G, sender, bene)
    general_rows.append([repeatfraud, fraud_rate_by_sector])

    ## Account-level features. Whether each account already exists must be captured now,
    ## before the graph is updated below.
    sender_known = G.has_node(sender)
    bene_known = G.has_node(bene)
    sender_rows.append(_account_features(G, sender, amount, 'out', fraud_rate_by_country_sender))
    bene_rows.append(_account_features(G, bene, amount, 'in', fraud_rate_by_country_Bene))
    row_index.append(index)

    ## Add/update the nodes and the edge corresponding to this transaction.
    is_self_transfer = (sender == bene)
    if sender_known:
        G.nodes[sender]['total_out'] += amount
        G.nodes[sender]['Fraud_count'] += label
    else:
        G.add_node(sender, total_out=amount, total_in=0, Fraud_count=label)
    ## For a self-transfer the node was just created (or updated) as the sender, so it must be
    ## updated rather than re-created here.
    if bene_known or is_self_transfer:
        G.nodes[bene]['total_in'] += amount
        G.nodes[bene]['Fraud_count'] += label
    else:
        G.add_node(bene, total_in=amount, total_out=0, Fraud_count=label)

    G.add_edge(sender, bene, Label=label)

features_bene, features_sender, features_general = _feature_frames()
total = pd.concat([features_bene, features_sender, features_general, Type_feature], axis=1)
total.to_csv('total_features', index=False)


In [None]:
# Reload the engineered feature table written by the feature-engineering cell.
FEATURES_PATH = 'total_features'
final_features = pd.read_csv(FEATURES_PATH, sep=',', header=0)


In [None]:
# Display the engineered feature table as the cell output for a visual sanity check.
final_features

In [None]:
# Train / validation / test split by row position (no shuffling).
# NOTE(review): this assumes the rows are in chronological order - confirm against the raw data.
TRAIN_END = 1_000_000
VAL_END = 1_250_000

# Features and labels are matched purely by position, so both tables must cover the same rows.
assert len(final_features) == len(total_data), (
    f"feature rows ({len(final_features)}) and label rows ({len(total_data)}) differ"
)

train_features = final_features.iloc[:TRAIN_END]
val_features = final_features.iloc[TRAIN_END:VAL_END]
# Open-ended slice: the previous `[1250000:-1]` silently dropped the last row.
test_features = final_features.iloc[VAL_END:]

labels = total_data['Label']
y_train = labels.iloc[:TRAIN_END]
y_val = labels.iloc[TRAIN_END:VAL_END]
y_test = labels.iloc[VAL_END:]

# Fit the scaler on TRAIN ONLY - this learns mean and std from the training set...
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features)

# ...and transform validation/test with the SAME scaler so no information leaks backwards.
X_val = scaler.transform(val_features)
X_test = scaler.transform(test_features)

In [None]:
# Keep only the transactions that have a row of engineered features (matched on index labels).
feature_index = final_features.index
total_data = total_data.loc[feature_index]

# Feature matrix and label vector covering the same rows.
X = pd.DataFrame(final_features)
y = pd.Series(total_data["Label"])


In [None]:
## Correlation matrix of the training features together with the label.
train_with_label = pd.concat([train_features, total_data['Label'][:1000000]], axis=1)
corr_matrix = train_with_label.corr()

fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

                                                                Hyperparameter Tuning
                                                            

In [None]:


# Search space for the first XGBoost grid search: two candidates per parameter.
# scale_pos_weight is deliberately absent - it is tuned separately.
param_grid = dict(
    learning_rate=[0.01, 0.05],
    max_depth=[6, 8],
    n_estimators=[400, 600],
    min_child_weight=[1, 2],
    subsample=[0.9, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1],
)




In [None]:
#Lift scorer
#Lift at the top 10% reflects operational reality (limited review capacity) and shows the model's
#ability to prioritize: the highest lift should be found in the top percentiles.
#Lift@10% = "How good is my priority queue?"
def lift_scorer(estimator, X, y, top_pct=0.1):
    """Lift of the highest-scored ``top_pct`` fraction of rows.

    Computed as (positives captured in the top rows / all positives) / ``top_pct``, so 1.0 means
    "no better than random" and ``1 / top_pct`` is the maximum. Usable directly as a
    ``scoring=`` callable (signature ``(estimator, X, y)``).

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``.
    X : feature matrix passed straight to ``predict_proba``.
    y : array-like of 0/1 labels (pandas Series, list or ndarray).
    top_pct : float in (0, 1], fraction of rows to "review".

    Returns
    -------
    float
        The lift, or NaN when ``y`` contains no positives (lift is undefined there).

    Raises
    ------
    ValueError
        If ``top_pct`` is not in (0, 1].
    """
    if not 0 < top_pct <= 1:
        raise ValueError(f"top_pct must be in (0, 1], got {top_pct!r}")
    # np.asarray accepts Series and ndarrays alike (``y.to_numpy()`` failed on ndarrays).
    y_true = np.asarray(y)
    y_pred = estimator.predict_proba(X)[:, 1]
    total_positives = y_true.sum()
    if total_positives == 0:
        return float("nan")
    # Always review at least one row: with n == 0 the slice ``[-0:]`` would select EVERY row
    # and report the maximum possible lift.
    n = max(1, int(len(y_true) * top_pct))
    idx = y_pred.argsort()[-n:]
    return y_true[idx].sum() / (total_positives * top_pct)



In [None]:
# Time-ordered cross-validation splitter (scikit-learn TimeSeriesSplit, 5 splits).
# The same splitter object is passed as `cv=` to every grid search in this notebook.
tscv = TimeSeriesSplit(n_splits=5)


In [None]:
# Baseline XGBoost classifier with a fixed positive-class weight to counter the class imbalance.
xgb_base = XGBClassifier(random_state=831, scale_pos_weight=49)

# Exhaustive search over param_grid, ranked by lift@10% on time-ordered folds.
xgb_search = GridSearchCV(
    xgb_base,
    param_grid,
    scoring=lift_scorer,
    n_jobs=-1,
    cv=tscv,
    verbose=2,
)
xgb_search.fit(X_train, y_train)

best_grid_params = xgb_search.best_params_
print(f"Best params: {best_grid_params}")

# Audible "done" notification via the `say` shell command.
# NOTE(review): `say` is presumably only available on macOS - confirm before running elsewhere.
os.system('say "First grid search finished"')

In [None]:
# Class imbalance of the training labels: legitimate rows per fraudulent row.
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
scale = neg / pos

# Candidate positive-class weights, expressed as multiples of the raw imbalance ratio.
pos_weights = [scale * f for f in [0.5, 1, 2, 5, 7, 9, 11]]

param_grid = {
    "scale_pos_weight": pos_weights
}

# XGBoost with the hyperparameters chosen for the final model; only scale_pos_weight is searched.
# Seeded like every other model in the notebook so the search is reproducible.
xgb = XGBClassifier(
    colsample_bytree=0.8,
    gamma=0.1,
    learning_rate=0.01,
    max_depth=8,
    min_child_weight=2,
    n_estimators=600,
    subsample=0.9,
    random_state=831,
)

# Scored with lift_scorer as defined in the hyperparameter-tuning section above
# (it is not redefined here, so there is exactly one definition to maintain).
search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=lift_scorer,
    cv=tscv,
    n_jobs=-1
)

search.fit(X_train, y_train)

print(search.best_params_)
# The key must match the grid's parameter name exactly; 'scale_pos_weight_lift' raised KeyError.
best_pos_weight = search.best_params_['scale_pos_weight']

os.system('say "Pos weight grid search finished running"')


In [None]:

# Logistic-regression baseline: only the weight of the positive (fraud) class is searched.
lr = LogisticRegression(random_state=831, max_iter=1000, C=1.0)

positive_class_weights = [25, 49, 75, 100]
param_grid = {
    'class_weight': [{0: 1, 1: weight} for weight in positive_class_weights],
}

search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring=lift_scorer,
    cv=tscv,
    n_jobs=-1,
)
search.fit(X_train, y_train)

In [None]:
# Keep the refitted best logistic regression and report what the search selected.
best_lr = search.best_estimator_
for caption, value in (
    ("Best LR params:", search.best_params_),
    ("Best CV lift:", search.best_score_),
):
    print(caption, value)


                                                       Training Set
    

In [None]:
def _report_lr_split(model, features, labels):
    """Print lift@10%, classification report and confusion matrix for one data split.

    Returns ``(hard predictions, positive-class probabilities, lift@10%)``.
    """
    predictions = model.predict(features)
    probabilities = model.predict_proba(features)[:, 1]
    lift_at_10 = lift_scorer(model, features, labels, top_pct=0.1)
    print(f"Lift@10%: {lift_at_10:.2f}x")
    print(classification_report(labels, predictions))
    print(confusion_matrix(labels, predictions))
    return predictions, probabilities, lift_at_10


#LR training set
print("Logistic Regression - TRAINING")
y_pred_train_lr, y_pred_proba_train_lr, lift_train_lr = _report_lr_split(best_lr, X_train, y_train)

#LR validation set
print("Logistic Regression - VALIDATION")
y_pred_val_lr, y_pred_proba_val_lr, lift_val_lr = _report_lr_split(best_lr, X_val, y_val)

In [None]:
# XGBoost with the selected hyperparameters, fitted on the training split only so that the
# validation split stays unseen for the comparison below.
val_model_params = {
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'learning_rate': 0.01,
    'min_child_weight': 2,
    'n_estimators': 600,
    'subsample': 0.9,
    'scale_pos_weight': 23,
    'max_depth': 8,
    'random_state': 831,
}
val_model = XGBClassifier(**val_model_params)
val_model.fit(X_train, y_train)

# Predictions and lift@10% on both splits.
y_pred_train = val_model.predict(X_train)
y_pred_proba_train = val_model.predict_proba(X_train)[:, 1]
lift_train = lift_scorer(val_model, X_train, y_train, top_pct=0.1)

y_pred_val = val_model.predict(X_val)
lift_val = lift_scorer(val_model, X_val, y_val, top_pct=0.1)

# Same three-part report for each split: lift, classification report, confusion matrix.
for split_header, split_lift, split_labels, split_preds in (
    ("XGB - TRAINING", lift_train, y_train, y_pred_train),
    ("\n XGB - VALIDATION", lift_val, y_val, y_pred_val),
):
    print(split_header)
    print(f"Lift@10%: {split_lift:.2f}x")
    print(classification_report(split_labels, split_preds))
    print(confusion_matrix(split_labels, split_preds))

                                                       Test Set

In [None]:

# DataFrame built from the test features (kept for inspection); the column names that are
# stored with the model are taken from the training frame.
df_temp = pd.DataFrame(test_features)
feature_names = train_features.columns.tolist()

# The final model is trained on training + validation rows (already scaled).
X_final = np.concatenate([X_train, X_val], axis=0)
y_final = np.concatenate([y_train, y_val])

# Final XGBoost configuration.
final_model_params = {
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'learning_rate': 0.01,
    'max_depth': 8,
    'min_child_weight': 2,
    'n_estimators': 600,
    'subsample': 0.9,
    'scale_pos_weight': 23,
    'random_state': 831,
}
final_model = XGBClassifier(**final_model_params)

final_model.fit(X_final, y_final)


In [None]:
# Final XGBoost model on the held-out test split.
y_pred_test = final_model.predict(X_test)
y_pred_proba_test = final_model.predict_proba(X_test)[:, 1]
lift_test = lift_scorer(final_model, X_test, y_test, top_pct=0.1)

test_recall = recall_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)

print("XGB - TEST SET")
print(f"Lift@10%: {lift_test:.2f}x")
print(f"Recall: {test_recall:.3f}")
print(f"Precision: {test_precision:.3f}")
print(f"F1: {test_f1:.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

In [None]:
# Visualizing recall vs. lift for XGBoost (one curve per decision threshold, one point per
# scale_pos_weight) against logistic regression (one point per class weight).
# Lift in this plot is threshold-based: recall / share of rows predicted positive.
base_params = {
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'learning_rate': 0.01,
    'max_depth': 8,
    'min_child_weight': 2,
    'n_estimators': 600,
    'subsample': 0.9,
    'random_state': 831
}

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
pos_weights = [5, 10, 23, 50, 100, 200]


def _recall_and_threshold_lift(y_true, y_prob, threshold):
    """Return (recall, recall / predicted-positive rate) for predictions at ``threshold``."""
    y_pred = (y_prob >= threshold).astype(int)
    recall = recall_score(y_true, y_pred)
    ppr = y_pred.sum() / len(y_pred)
    lift = recall / ppr if ppr > 0 else 0
    return recall, lift


# A fitted model does not depend on the decision threshold, so each scale_pos_weight is trained
# ONCE and its test-set probabilities are reused for every threshold (6 fits instead of 30).
xgb_test_probs = []
for pw in pos_weights:
    xgb = XGBClassifier(**base_params, scale_pos_weight=pw)
    xgb.fit(X_final, y_final)
    xgb_test_probs.append(xgb.predict_proba(X_test)[:, 1])

for thresh in thresholds:
    recalls = []
    lifts = []
    for y_prob in xgb_test_probs:
        recall, lift = _recall_and_threshold_lift(y_test, y_prob, thresh)
        recalls.append(recall)
        lifts.append(lift)

    plt.plot(recalls, lifts,
              marker='o',
              label=f'XGB Threshold={thresh}')

# Logistic Regression curve (fixed 0.5 threshold, varying positive-class weight)
lr_recalls = []
lr_lifts = []
class_weights = [10, 25, 49, 75, 100, 150, 200, 300, 500]

for cw in class_weights:
    lr = LogisticRegression(class_weight={0: 1, 1: cw}, max_iter=1000)
    lr.fit(X_final, y_final)

    y_prob_lr = lr.predict_proba(X_test)[:, 1]
    recall_lr, lift_lr = _recall_and_threshold_lift(y_test, y_prob_lr, 0.5)

    lr_recalls.append(recall_lr)
    lr_lifts.append(lift_lr)

plt.plot(lr_recalls, lr_lifts, label='Logistic Regression', color='brown')

plt.plot([0, 1], [1, 1], 'k--', label='Baseline (random)')
plt.xlabel('Recall')
plt.ylabel('Lift')
plt.title('Recall-Lift Trade-off: XGBoost vs Logistic Regression')
plt.grid(True)
# Marker for the currently selected model.
# NOTE(review): the coordinates are hard-coded from an earlier run and will go stale whenever
# the data or the model changes - confirm them against the current test-set metrics.
plt.scatter([0.73], [6.7], s=150, color='black', marker='*', zorder=5, label='Current Model (pw=23, t=0.5)')
# Single legend call, placed after the last labelled artist so the marker is included.
plt.legend(fontsize=8)
plt.tight_layout()
plt.show()

os.system('say "Your graph is ready."')

In [None]:
# Recall vs. lift@10% with each point labelled by its scale_pos_weight, and the optimal point
# extracted automatically from final_model.

base_params = {
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'learning_rate': 0.01,
    'max_depth': 8,
    'min_child_weight': 2,
    'n_estimators': 600,
    'subsample': 0.9,
    'random_state': 831
}

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
pos_weights = [5, 10, 23, 50, 100, 200]

# Neither the fitted model nor lift@10% depends on the decision threshold, so every
# scale_pos_weight is trained and scored ONCE (6 fits instead of 30); only recall is
# recomputed per threshold from the cached test-set probabilities.
xgb_test_probs = []
xgb_lifts = []
for pw in pos_weights:
    xgb = XGBClassifier(**base_params, scale_pos_weight=pw)
    xgb.fit(X_final, y_final)
    xgb_lifts.append(lift_scorer(xgb, X_test, y_test, top_pct=0.1))
    xgb_test_probs.append(xgb.predict_proba(X_test)[:, 1])

for thresh in thresholds:
    recalls = [
        recall_score(y_test, (y_prob >= thresh).astype(int))
        for y_prob in xgb_test_probs
    ]
    lifts = list(xgb_lifts)

    line, = plt.plot(recalls, lifts, marker='o', label=f'XGB Threshold={thresh}')
    #pos_weight labels
    for i, pw in enumerate(pos_weights):
        plt.annotate(f'{pw}', (recalls[i], lifts[i]), fontsize=7, alpha=0.7)

#Logistic Regression curve (fixed 0.5 threshold, varying positive-class weight)
lr_recalls = []
lr_lifts = []
class_weights = [10, 25, 49, 75, 100, 150, 200, 300, 500]

for cw in class_weights:
    lr = LogisticRegression(class_weight={0: 1, 1: cw}, max_iter=1000, random_state=831)
    lr.fit(X_final, y_final)
    lift_lr = lift_scorer(lr, X_test, y_test, top_pct=0.1)
    y_prob_lr = lr.predict_proba(X_test)[:, 1]
    y_pred_lr = (y_prob_lr >= 0.5).astype(int)
    recall_lr = recall_score(y_test, y_pred_lr)
    lr_recalls.append(recall_lr)
    lr_lifts.append(lift_lr)

plt.plot(lr_recalls, lr_lifts, label='Logistic Regression', color='brown')
plt.plot([0, 1], [1, 1], 'k--', label='Baseline (random)')
plt.xlabel('Recall')
plt.ylabel('Lift')
plt.title('Recall-Lift Trade-off: XGBoost vs Logistic Regression')
plt.grid(True)

#Calculate optimal model point from final_model
y_pred_optimal = final_model.predict(X_test)
recall_optimal = recall_score(y_test, y_pred_optimal)
lift_optimal = lift_scorer(final_model, X_test, y_test, top_pct=0.1)

plt.scatter([recall_optimal], [lift_optimal], s=150, color='black', marker='*', zorder=5, label='Current Model (pw=23, t=0.5)')
# Single legend call, placed after the last labelled artist so the marker is included.
plt.legend(fontsize=8)
plt.tight_layout()
plt.show()
os.system('say "Your graph is ready."')

In [None]:
# Bundle the fitted model with the ordered feature names it was trained on, then pickle the bundle.
MODEL_PATH = 'xgboost_fraud_model.pkl'
model_package = dict(model=final_model, feature_names=feature_names)

with open(MODEL_PATH, mode='wb') as model_file:
    pickle.dump(model_package, model_file)

print("Model and feature names saved successfully!")