#### Hypothesis testing: difference in sender balance amounts for fraudulent transactions ####

Checking whether sender balance features differ between fraudulent and non-fraudulent transactions
- H0: the distributions of sender balance features in Fraud and Non-Fraud transactions are the same
- H1: the distributions of sender balance features in Fraud and Non-Fraud transactions differ significantly

In [3]:
import psycopg2
import pandas as pd
from dotenv import load_dotenv
import os
# Pull the POSTGRES_* settings from the .env file into the process environment.
load_dotenv()

# Rows with a missing sender balance (before or after) are filtered out in SQL;
# sender_balance_delta is computed as before - after.
query = """
    SELECT 
        transaction_id,
        amount,
        sender_balance_before,
        sender_balance_after,
        (sender_balance_before - sender_balance_after) sender_balance_delta,
        is_fraud
    FROM 
        analytics_staging.stg_transactions
     WHERE sender_balance_before is not null 
       AND sender_balance_after is not null
     
"""

# Connection settings are read from environment variables only.
conn_params = dict(
    host=os.getenv("POSTGRES_HOST"),
    port=os.getenv("POSTGRES_PORT"),
    dbname=os.getenv("POSTGRES_DB"),
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
)

conn = psycopg2.connect(**conn_params)

# Let pandas run the query and build the DataFrame in one step.
df = pd.read_sql(sql=query, con=conn)

df.describe()


  df = pd.read_sql(query, conn)


Unnamed: 0,transaction_id,amount,sender_balance_before,sender_balance_after,sender_balance_delta
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,34994410.0,179861.9,833883.1,855113.7,-21230.56
std,1836730.0,603858.2,2888243.0,2924049.0,146643.3
min,31813100.0,0.0,0.0,0.0,-1915268.0
25%,33403760.0,13389.57,0.0,0.0,0.0
50%,34994410.0,74871.94,14208.0,0.0,0.0
75%,36585070.0,208721.5,107315.2,144258.4,10150.44
max,38175720.0,92445520.0,59585040.0,49585040.0,10000000.0


In [None]:
# Store the fraud flag as integers (bool -> int).
df["is_fraud"] = df["is_fraud"].astype(int)

# Partition the rows by label: fruad_positive holds the actual fraud
# transactions (is_fraud == 1), fraud_negative the non-fraud ones (is_fraud == 0).
fruad_positive = df.loc[df["is_fraud"].eq(1)]
fraud_negative = df.loc[df["is_fraud"].eq(0)]

print(
    "Rows:", df.shape[0],
    "Fraud:", fruad_positive.shape[0],
    "Non-fraud:", fraud_negative.shape[0],
)

df.head()

Rows: 6362620 Fraud: 8213 Non-fraud: 6354407


Unnamed: 0,transaction_id,amount,sender_balance_before,sender_balance_after,sender_balance_delta,is_fraud
0,31813101,9839.64,170136.0,160296.36,9839.64,0
1,31813102,1864.28,21249.0,19384.72,1864.28,0
2,31813103,181.0,181.0,0.0,181.0,1
3,31813104,181.0,181.0,0.0,181.0,1
4,31813105,11668.14,41554.0,29885.86,11668.14,0


In [15]:
import numpy as np
from scipy import stats

# Cliff's delta: P(X > Y) - P(X < Y)
# range [-1, 1]; values close to 0 mean no stochastic dominance
def cliffs_delta(x: np.ndarray, y: np.ndarray) -> float:
    """Return Cliff's delta effect size between samples *x* and *y*.

    The statistic is ``(#{x_i > y_j} - #{x_i < y_j}) / (len(x) * len(y))``,
    i.e. the difference between the share of pairs where the *x* value is
    larger and the share of pairs where it is smaller.

    Args:
        x: 1-D numeric sample.
        y: 1-D numeric sample.

    Returns:
        A float in [-1, 1]. ``nan`` is returned when either sample is empty,
        because there are no pairs to compare.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()

    # Normalise by the number of (x_i, y_j) pairs, not by the size difference.
    n_pairs = x.size * y.size
    if n_pairs == 0:
        return float("nan")

    # A pair containing NaN is neither "greater" nor "less", so NaNs are left
    # out of the counts but still contribute to the pair total above.
    x_valid = x[~np.isnan(x)]
    y_sorted = np.sort(y[~np.isnan(y)])

    # With y sorted, the number of y values strictly below / strictly above
    # each x_i is a binary search instead of a full scan of y per x_i.
    below = np.searchsorted(y_sorted, x_valid, side="left")
    above = y_sorted.size - np.searchsorted(y_sorted, x_valid, side="right")

    gt = int(below.sum())
    lt = int(above.sum())
    return (gt - lt) / n_pairs


def check_feature_stats(feature: str):
    """Compare one feature between fraud and non-fraud transactions.

    Reads the column *feature* from the module-level ``fruad_positive`` and
    ``fraud_negative`` frames, drops missing values, and runs a two-sided
    Mann-Whitney U test and a two-sided Welch (unequal-variance) t-test.

    Returns:
        A dict with the keys ``mannwhitney_u``, ``mannwhitney_p``,
        ``welch_t`` and ``welch_p``.
    """
    fraud_values = fruad_positive[feature].dropna().to_numpy()
    legit_values = fraud_negative[feature].dropna().to_numpy()

    # Rank-based test: makes no normality assumption about the balances.
    mann_whitney = stats.mannwhitneyu(
        fraud_values, legit_values, alternative="two-sided"
    )

    # Mean-based test; equal_var=False selects Welch's variant.
    welch = stats.ttest_ind(
        fraud_values, legit_values, equal_var=False, nan_policy="omit"
    )

    return {
        "mannwhitney_u": mann_whitney.statistic,
        "mannwhitney_p": mann_whitney.pvalue,
        "welch_t": welch.statistic,
        "welch_p": welch.pvalue,
    }


In [20]:
# Run the tests for every sender balance feature: one result row per feature,
# in the same order as `features`.
features = [
    "sender_balance_before",
    "sender_balance_after",
    "sender_balance_delta",
]

stat_tests_results = pd.DataFrame([check_feature_stats(name) for name in features])

stat_tests_results.head()

Unnamed: 0,mannwhitney_u,mannwhitney_p,welch_t,welch_p
0,42337960000.0,0.0,20.85702,3.495452e-94
1,15460390000.0,0.0,-30.550072,2.69186e-194
2,50387650000.0,0.0,55.992451,0.0


In [24]:
# Per-class medians of the sender balance features, grouped by is_fraud.
print(
    df.groupby("is_fraud")[
        ["sender_balance_before", "sender_balance_after", "sender_balance_delta"]
    ].median()
)

          sender_balance_before  sender_balance_after  sender_balance_delta
is_fraud                                                                   
0                      14069.00                   0.0                  0.00
1                     438983.45                   0.0             436317.49


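- According to the two-sided Welch t-test (and the Mann-Whitney U test) for sender_balance_delta between Fraud and Non-Fraud transactions, both p-values are effectively 0, so H0 is rejected.
- Sender balance delta differs strongly between the groups (median 436317.49 for Fraud vs 0.00 for Non-Fraud), which is consistent with balance transition anomalies.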