In [5]:
import pandas as pd
import os
import matplotlib.pyplot as plt
# NOTE(review): absolute, machine-specific working directory. The relative CSV
# path used when loading the data is resolved against it, so this line must be
# adjusted before the notebook can run on another machine.
os.chdir("C:/Users/timur/WB Datasets/")

# Transaction Type Definitions (as per thesis)
1. CASH-IN: is the process of increasing the balance of account by paying in cash to a merchant.
2. CASH-OUT: is the opposite process of CASH-IN, it means to withdraw cash from a merchant which decreases the balance of the account.
3. DEBIT: is a similar process to CASH-OUT and involves sending the money from the mobile money service to a bank account.
4. PAYMENT: is the process of paying for goods or services to merchants which decreases the balance of the account and increases the balance of the receiver.
5. TRANSFER: is the process of sending money to another user of the service through the mobile money platform.

# Load Data

In [6]:
# Read the transactions file (resolved against the current working directory)
# and discard rows that exactly duplicate an earlier row.
data = (
    pd.read_csv("exercise_1_credit_card_fraud.csv")
    .drop_duplicates()
)

In [116]:
# Preview ten randomly chosen rows. The fixed seed keeps the displayed sample
# identical every time the notebook is re-run from a fresh kernel.
data.sample(n=10, random_state=42)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
597732,33,CASH_IN,364644.06,C672694214,53037.0,417681.06,C953175101,0.0,0.0,0,0
5402537,377,CASH_OUT,54118.74,C1175527925,0.0,0.0,C133501534,997518.52,1051637.26,0,0
6231480,592,TRANSFER,1558062.38,C870388405,1545.0,0.0,C174979354,0.0,1558062.38,0,0
2795083,216,PAYMENT,2879.3,C956634492,19364.21,16484.91,M1046436164,0.0,0.0,0,0
4322861,308,CASH_OUT,87460.35,C427630646,62192.0,0.0,C776672057,96228.68,183689.03,0,0
5934212,404,PAYMENT,22474.19,C1059535414,0.0,0.0,M1915400093,0.0,0.0,0,0
4218063,305,CASH_OUT,250461.23,C2060375980,21881.0,0.0,C238373120,916545.23,1167006.47,0,0
4425629,322,PAYMENT,13831.03,C1378078878,0.0,0.0,M295918054,0.0,0.0,0,0
1255093,134,PAYMENT,42887.53,C561464320,8216.61,0.0,M1495215576,0.0,0.0,0,0
4397685,321,CASH_IN,230121.66,C455722730,6456413.0,6686534.66,C204102424,1578748.69,1348627.03,0,0


# Add additional fields to dataset

In [128]:
# Add a transaction id column, taken from the DataFrame's row index
data["transactionId"] = data.index

# Calculate the change in balance for the sender and receiver
data["balanceDiffOrig"] = data["newbalanceOrig"] - data["oldbalanceOrg"]
data["balanceDiffDest"] = data["newbalanceDest"] - data["oldbalanceDest"]

# Although this is a simulation from an agent-based model, we can assume an
# initial date and derive a running timestamp from the time steps, which are
# interpreted as hours elapsed since that date.
# INITIAL_DATE is the single source of truth for the origin: it is set to the
# value the derived date columns are computed with and is actually passed to
# pd.to_datetime, rather than sitting next to a separate hard-coded literal.
INITIAL_DATE = '2020-01-01'
data['dateTime'] = pd.to_datetime(data['step'], unit='h', origin=pd.Timestamp(INITIAL_DATE))
data['date'] = data['dateTime'].dt.date
data['dayOfWeek'] = data['dateTime'].dt.dayofweek
data['hour'] = data['dateTime'].dt.hour
data['month'] = data['dateTime'].dt.month

# The initial character of nameOrig and nameDest indicates the entity type (as per thesis).
# Vectorised string test instead of a Python-level lambda applied to every row.
ENTITY_TYPE_LABELS = {True: "Client", False: "Merchant"}
data["origType"] = data["nameOrig"].str.startswith("C").map(ENTITY_TYPE_LABELS)
data["destType"] = data["nameDest"].str.startswith("C").map(ENTITY_TYPE_LABELS)

# Preview ten random rows; the seed keeps the displayed sample reproducible.
data.sample(n=10, random_state=42)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,...,transactionId,balanceDiffOrig,balanceDiffDest,dateTime,date,dayOfWeek,hour,month,origType,destType
2530537,205,CASH_OUT,222930.61,C199961083,0.0,0.0,C1094820116,1645951.08,1868881.68,0,...,2530537,0.0,222930.6,2020-01-09 13:00:00,2020-01-09,3,13,1,Client,Client
3342186,253,CASH_OUT,247779.83,C1632217655,81899.68,0.0,C1581491453,312844.84,560624.68,0,...,3342186,-81899.68,247779.84,2020-01-11 13:00:00,2020-01-11,5,13,1,Client,Client
4076807,301,CASH_OUT,1938.46,C727518885,51680.0,49741.54,C44607611,101640.46,103578.92,0,...,4076807,-1938.46,1938.46,2020-01-13 13:00:00,2020-01-13,0,13,1,Client,Client
4924876,350,CASH_OUT,106543.34,C1041698296,0.0,0.0,C423541940,911313.77,1017857.11,0,...,4924876,0.0,106543.34,2020-01-15 14:00:00,2020-01-15,2,14,1,Client,Client
2457928,203,TRANSFER,136043.83,C2071928617,1006.0,0.0,C528583904,241033.38,377077.21,0,...,2457928,-1006.0,136043.83,2020-01-09 11:00:00,2020-01-09,3,11,1,Client,Client
5046254,354,CASH_IN,98094.25,C741349739,50041.0,148135.25,C236449396,255397.41,157303.16,0,...,5046254,98094.25,-98094.25,2020-01-15 18:00:00,2020-01-15,2,18,1,Client,Client
193724,13,CASH_OUT,236074.14,C1694749942,199338.0,0.0,C1459119309,2304194.96,2897128.42,0,...,193724,-199338.0,592933.46,2020-01-01 13:00:00,2020-01-01,2,13,1,Client,Client
1726962,160,CASH_OUT,144792.3,C1288421959,0.0,0.0,C396095272,2714625.0,2859417.3,0,...,1726962,0.0,144792.3,2020-01-07 16:00:00,2020-01-07,1,16,1,Client,Client
3352106,253,CASH_IN,54582.67,C704290547,1341579.55,1396162.22,C65828706,151528.0,0.0,0,...,3352106,54582.67,-151528.0,2020-01-11 13:00:00,2020-01-11,5,13,1,Client,Client
1456744,140,PAYMENT,6235.97,C123087610,0.0,0.0,M1221759393,0.0,0.0,0,...,1456744,0.0,0.0,2020-01-06 20:00:00,2020-01-06,0,20,1,Client,Merchant


# Data quality checks
1. Transfers must be client to client = transferCheck
2. The difference in the sender's balance must be equal in size and opposite in sign to the difference in the receiver's balance = balanceCheck
3. Cash out must be between client and merchant = cashOutCheck
4. Cash in must be between client and merchant = cashinCheck
5. Payment must be between client and merchant = paymentCheck
6. Amount must be a float = amountCheck

In [118]:
# Count the transactions for every combination of transaction type,
# sender entity type and receiver entity type.
entityTypeByTransactionType = (
    data
    .groupby(["type", "origType", "destType"])["transactionId"]
    .count()
    .reset_index(name="transactionCount")
)
entityTypeByTransactionType

Unnamed: 0,type,origType,destType,transactionCount
0,CASH_IN,Client,Client,1399284
1,CASH_OUT,Client,Client,2237500
2,DEBIT,Client,Client,41432
3,PAYMENT,Client,Merchant,2151495
4,TRANSFER,Client,Client,532909


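The summary table does not align with the relationships we expect: CASH_IN and CASH_OUT transactions all appear as client-to-client, whereas we expect them to be between a client and a merchant.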

In [119]:
# Total and mean balance change of the sender and of the receiver,
# broken down by transaction type.
transaction_type_balance_diff_summary = (
    data
    .groupby(["type"])
    .agg(
        balanceDiffOrigSum=("balanceDiffOrig", "sum"),
        balanceDiffOrigMean=("balanceDiffOrig", "mean"),
        balanceDiffDestSum=("balanceDiffDest", "sum"),
        balanceDiffDestMean=("balanceDiffDest", "mean"),
    )
    .reset_index()
)
transaction_type_balance_diff_summary

Unnamed: 0,type,balanceDiffOrigSum,balanceDiffOrigMean,balanceDiffDestSum,balanceDiffDestMean
0,CASH_IN,236360300000.0,168915.203779,-169052300000.0,-120813.411259
1,CASH_OUT,-63879760000.0,-28549.612057,433108800000.0,193568.180775
2,DEBIT,-144418900.0,-3485.685163,823158400.0,19867.696753
3,PAYMENT,-13724250000.0,-6378.936662,0.0,0.0
4,TRANSFER,-23529900000.0,-44153.695021,525960500000.0,986961.106232


The change in the sender's balance and the change in the receiver's balance are not equal in absolute value, as we would expect them to be. We could apply filters to remove these rows, but we would be left with no
data, so in this case we won't apply them.

In [120]:
# Switch for the data quality filters below; left off because applying them
# removes every row of this dataset.
applyChecks = False


def apply_quality_checks(frame, tolerance=1e-6):
    """Return the rows of ``frame`` that pass every data quality check.

    All checks are evaluated row by row and combined into a single mask, so
    the result is the same as applying the filters one after another.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``type``, ``origType``, ``destType``,
        ``balanceDiffOrig``, ``balanceDiffDest`` and ``amount``.
    tolerance : float, default 1e-6
        Largest absolute value of ``balanceDiffOrig + balanceDiffDest`` that
        still counts as "equal and opposite" for the balance check.

    Returns
    -------
    pandas.DataFrame
        The subset of ``frame`` (original index labels kept) passing all checks.
    """
    sender_is_client = frame["origType"] == "Client"
    client_to_client = sender_is_client & (frame["destType"] == "Client")
    client_to_merchant = sender_is_client & (frame["destType"] == "Merchant")

    # transferCheck -- the label must match the data's upper-case "TRANSFER";
    # a differently-cased label never matches and disables the check.
    transfer_ok = (frame["type"] != "TRANSFER") | client_to_client
    # balanceCheck -- compare the sum against a tolerance rather than testing a
    # float ratio for exact equality with -1 (fragile, and NaN/inf whenever
    # the receiver's balance did not change).
    balance_ok = (frame["balanceDiffOrig"] + frame["balanceDiffDest"]).abs() <= tolerance
    # cashOutCheck
    cash_out_ok = (frame["type"] != "CASH_OUT") | client_to_merchant
    # cashinCheck
    cash_in_ok = (frame["type"] != "CASH_IN") | client_to_merchant
    # paymentCheck -- the label must match the data's "PAYMENT" spelling.
    payment_ok = (frame["type"] != "PAYMENT") | client_to_merchant
    # amountCheck -- isinstance also accepts float subclasses such as numpy.float64.
    amount_ok = frame["amount"].map(lambda value: isinstance(value, float)).astype(bool)

    keep = transfer_ok & balance_ok & cash_out_ok & cash_in_ok & payment_ok & amount_ok
    return frame[keep]


if applyChecks:
    data = apply_quality_checks(data)

# Summarise Transaction by type and whether it is fraudulent or not

In [121]:
# Total amount, mean amount and number of transactions per transaction type.
transaction_type_summary = (
    data
    .groupby(["type"])
    .agg(
        totalAmount=("amount", "sum"),
        averageAmount=("amount", "mean"),
        count=("amount", "count"),
    )
    .reset_index()
)
transaction_type_summary

Unnamed: 0,type,totalAmount,averageAmount,count
0,CASH_IN,236367400000.0,168920.242004,1399284
1,CASH_OUT,394413000000.0,176273.964346,2237500
2,DEBIT,227199200.0,5483.665314,41432
3,PAYMENT,28093370000.0,13057.60466,2151495
4,TRANSFER,485292000000.0,910647.009645,532909


In [168]:
# Per fraud class: total / mean / count of the transaction amount, the mean
# change in the sender's balance, and the mean hour of day.
fraud_summary = (
    data
    .groupby(["isFraud"])
    .agg(
        totalAmount=("amount", "sum"),
        averageAmount=("amount", "mean"),
        totalCount=("amount", "count"),
        origAverageBalancChange=("balanceDiffOrig", "mean"),
        averageHourOfDay=("hour", "mean"),
    )
    .reset_index()
)
fraud_summary

Unnamed: 0,isFraud,totalAmount,averageAmount,totalCount,origAverageBalancChange,averageHourOfDay
0,0,1132337000000.0,178197.0,6354407,23141.52,15.326333
1,1,12056420000.0,1467967.0,8213,-1457275.0,11.546451


In [123]:
# Per-class transaction totals, taken from the fraud summary computed earlier.
is_fraud_total_count = fraud_summary[["isFraud", "totalCount"]]

# Count transactions per (fraud class, transaction type), attach the class
# total, and express each count as a share of its class.
fraud_vs_transaction_type = (
    data
    .groupby(["isFraud", "type"])["transactionId"]
    .count()
    .reset_index(name="count")
    .merge(is_fraud_total_count, on="isFraud")
    .assign(proportion=lambda counts: counts["count"] / counts["totalCount"])
)
fraud_vs_transaction_type

Unnamed: 0,isFraud,type,count,totalCount,proportion
0,0,CASH_IN,1399284,6354407,0.220207
1,0,CASH_OUT,2233384,6354407,0.35147
2,0,DEBIT,41432,6354407,0.00652
3,0,PAYMENT,2151495,6354407,0.338583
4,0,TRANSFER,528812,6354407,0.08322
5,1,CASH_OUT,4116,8213,0.501157
6,1,TRANSFER,4097,8213,0.498843


We see that fraudulent transactions only consist of transfers and cash outs

# Calculate the date difference between the transfer and cash out across fraudulent and non-fraudulent transactions
While the approach below should work, it appears that the destination client of a transfer is not the same as the originating client of a cash out, which is not what we expect. For example, below
we have a transfer classed as fraudulent and a cash out with the same amount and step, yet the originating and destination customers are all completely different.

1. step 	type 	    amount 	nameOrig 	oldbalanceOrg 	newbalanceOrig 	nameDest
2. 1 	    TRANSFER 	181.00 	C1305486145 181.00 	        0.0 	        C553264065 
3. 1 	    CASH_OUT 	181.00 	C840083671 	181.00 	        0.0 	        C38997010

In [170]:
# Separate the TRANSFER rows from the CASH_OUT rows
transfers = data.loc[data["type"].eq("TRANSFER")]
cash_outs = data.loc[data["type"].eq("CASH_OUT")]

# As mentioned above, we would expect the receiver of a transfer to be the
# originator of the matching cash out, so pair the rows on
# transfer.nameDest == cash_out.nameOrig. Then:
#   1. keep only pairs where the transfer happens on or before the cash out,
#   2. compute the gap between the two in time steps (stored as hourDiff),
#   3. average that gap within each fraud class of the transfer.
transfer_to_cash_out = (
    transfers
    .merge(cash_outs, left_on="nameDest", right_on="nameOrig", suffixes=("Transfer", "Cashout"))
    .loc[lambda pairs: pairs["dateTimeTransfer"] <= pairs["dateTimeCashout"]]
    .assign(hourDiff=lambda pairs: pairs["stepCashout"] - pairs["stepTransfer"])
    .groupby(["isFraudTransfer"])["hourDiff"]
    .mean()
    .rename_axis("isFraud")
    .reset_index(name="averageHourDiff")
)
transfer_to_cash_out

Unnamed: 0,isFraud,averageHourDiff
0,0,149.496241
1,1,481.0


To account for this issue, I will instead join on transaction amount, imposing that the transfer date cannot be after the cash out date, and remove duplicate transactions by keeping the row that has the smallest date difference between the transfer and cash out. 

In [166]:

# Pair every transfer with every cash out of exactly the same amount, keep the
# pairs where the transfer is on or before the cash out, and for each transfer
# retain only the pairing with the smallest step gap (sort ascending, then keep
# the first row per transfer id). Finally summarise the gap per fraud class.
transfer_to_cash_out = (
    transfers
    .merge(cash_outs, on="amount", suffixes=("Transfer", "Cashout"))
    .loc[lambda pairs: pairs["dateTimeTransfer"] <= pairs["dateTimeCashout"]]
    .assign(hourDiff=lambda pairs: pairs["stepCashout"] - pairs["stepTransfer"])
    .sort_values(by="hourDiff")
    .drop_duplicates(subset=["transactionIdTransfer"])
    .groupby(["isFraudTransfer"])
    .agg(
        meanHourDiff=("hourDiff", "mean"),
        transactionCount=("hourDiff", "count"),
    )
    .rename_axis("isFraud")
    .reset_index()
)
transfer_to_cash_out


Unnamed: 0,isFraud,meanHourDiff,transactionCount
0,0,109.483242,9339
1,1,0.030154,4079


We see that the average time difference between the classes is vastly different, with fraudulent transactions displaying a significantly shorter time difference between transfer and cash out.