## MMCS Project | Fraud Detection Model

In [65]:
# Import the necessary libraries
import pandas as pd
from pulp import *

Load the datasets for the transactions, customers and fraudulent transactions into seperate dataframes

In [2]:
all_transactions = pd.read_excel('Data/transactions.xlsx')
customers = pd.read_excel('Data/customers.xlsx')
frauds = pd.read_excel('Data/frauds.xlsx')

Next we join the transaction and customer datasets so that we have the customer information for every transaction.

In [3]:
all_transactions = pd.merge(all_transactions, customers, on='customer_id')

Index(['transaction_id', 'description', 'Amount', 'category', 'date', 'month',
       'customer_id', 'type', 'In_or_Out', 'bank_to', 'bank_from',
       'transac_prob', 'description_prob', 'priority', 'home_bank',
       'customer_prob'],
      dtype='object')

We cleanup the DataFrame to make it easier to code the forthcoming logic, all we need to investigate is transactions that are paid out and are not cash withdrawals. So we are going to create a filtered dataframe *transations* from the *all_transactions* dataframe.

In [10]:
all_transactions.columns = map(str.lower, all_transactions.columns)
frauds.columns = map(str.lower, frauds.columns)

In [11]:
transactions = all_transactions.loc[(all_transactions['in_or_out'] == 'paid_out') & (all_transactions['category'] != 'Cash Withdrawal')]

In [61]:
transactions.set_index('transaction_id', inplace=True)

## Building the Initial Model

In [101]:
def solve_initial_model(df, lambda_c, lambda_d, daily_budget):
    # Define the cost for external investigators based on priority
    external_investigator_cost = {1: 40, 2: 60, 3: 100, 4: 150}
    # Define the time (in days) it takes to investigate a transaction based on priority
    time_needed = {1: 0.25, 2: 0.5, 3: 1, 4: 2}

    # Create parameters based on the DataFrame indexed by transaction_id
    V = df['amount']
    P = 0.5 * df['customer_prob'] + 0.3 * df['description_prob'] + 0.2 * df['transac_prob']
    C = df['priority'].map(external_investigator_cost)
    t = df['priority'].map(time_needed)

    # Bank capacity
    T_bank = {'bank_a': 8, 'bank_b': 12, 'bank_c': 10, 'bank_d': 10, 'bank_e': 10}

    # Define the model
    model = LpProblem("Fraud_Investigation", LpMaximize)

    # Decision variables
    x = LpVariable.dicts("x", df.index, cat='Binary')
    y = LpVariable.dicts("y", df.index, cat='Binary')
    z = LpVariable.dicts("z", [(b, i) for b in T_bank.keys() for i in df.index], cat='Binary')

    # Objective function
    model += lpSum([V[i] * P[i] * (x[i] + y[i]) - C[i] * y[i] for i in df.index])

    # Constraints
    # 1. Investigation Exclusivity Constraint
    for i in df.index:
        model += x[i] + y[i] <= 1

    # 2. Bank Capacity Constraints for Internal Investigation
    for b in T_bank:
        model += lpSum([t[i] * z[(b, i)] for i in df.index]) <= T_bank[b]

    # 3. Bank Investigation Assignment Constraint
    for i in df.index:
        model += lpSum([z[(b, i)] for b in T_bank]) == x[i]

    # 4. Budget Constraint for External Investigation
    model += lpSum([y[i] * C[i] for i in df.index]) <= daily_budget

    # 5. Variable Investigation per Category Ratio Constraint
    for c in lambda_c:
        I_c = df[df['category'] == c].index
        model += lpSum([x[i] + y[i] for i in I_c]) <= lambda_c[c] * lpSum([x[i] + y[i] for i in df.index])
    
    for d in lambda_d:
        I_d = df[df['description'].str.contains(d)].index
        model += lpSum([x[i] + y[i] for i in I_d]) <= lambda_d[d] * lpSum([x[i] + y[i] for i in df.index])

    # Solve the model
    model.solve()

    # Extract results
    x_result = [i for i in df.index if x[i].varValue == 1]
    y_result = [i for i in df.index if y[i].varValue == 1]

    # Get the value of the objective function
    objective_value = value(model.objective)

    return x_result, y_result, objective_value

In [102]:
lambda_c = {'Utilities': 0.3, 'Shopping': 0.3, 'Holiday': 0.1}
lambda_d = {'Sport event tickets': 0.05, 'Facebook Marketplace Upfront payment': 0.05, 'Clothes Online additional posting payment': 0.05}
daily_budget = 10000

days = list(transactions['date'].unique())
df1 = transactions[transactions['date'] == days[1]]

x_ids, y_ids, obj_value = solve_initial_model(df1, lambda_c, lambda_d, daily_budget)

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/uzaykaradag/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/69ab73adf300417a88e8573736641f24-pulp.mps max timeMode elapsed branch printingOptions all solution /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/69ab73adf300417a88e8573736641f24-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 2281 COLUMNS
At line 49826 RHS
At line 52103 BOUNDS
At line 60028 ENDATA
Problem MODEL has 2276 rows, 7924 columns and 29432 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 14127.7 - 0.09 seconds
Cgl0004I processed model has 2272 rows, 7924 columns (7924 integer (7924 of which binary)) and 20376 elements
Cbc0038I Initial state - 9 integers unsatisfied sum - 1.7
Cbc0038I Pass   1: suminf.    1.00000 (4) obj. -14104.9 it

In [103]:
def compute_real_value_saved(x_ids, y_ids, transactions_df, fraud_df):
    # Create a set of fraudulent transaction IDs for quick lookup
    fraudulent_transactions = set(fraud_df['transaction_id'])

    external_investigator_cost = {1: 40, 2: 60, 3: 100, 4: 150}

    C = transactions_df['priority'].map(external_investigator_cost)

    # Initialize the total value saved
    total_value_saved = 0

    # Calculate the value saved for each transaction identified by the model
    for i in x_ids + y_ids:
        if i in fraudulent_transactions:
            V_i = transactions_df.at[i, 'amount']  # Value of the transaction
            f_i = 1  # Transaction is fraudulent
            y_i = 1 if i in y_ids else 0  # 1 if investigated externally, 0 otherwise

            # Calculate the value saved for this transaction
            total_value_saved += V_i * f_i - C[i] * y_i

    return total_value_saved

In [104]:
# Example usage:
real_value_saved = compute_real_value_saved(x_ids, y_ids, df1, frauds)

In [105]:
real_value_saved

193.95

In [111]:
def solve_second_model(df, fraud_df, daily_budget):
    
    fraudulent_transactions = set(fraud_df['transaction_id']) 
    # Define the cost for external investigators based on priority
    external_investigator_cost = {1: 40, 2: 60, 3: 100, 4: 150}
    # Define the time (in days) it takes to investigate a transaction based on priority
    time_needed = {1: 0.25, 2: 0.5, 3: 1, 4: 2}

    # Create parameters based on the DataFrame indexed by transaction_id
    V = df['amount']
    P = 0.5 * df['customer_prob'] + 0.3 * df['description_prob'] + 0.2 * df['transac_prob']
    C = df['priority'].map(external_investigator_cost)
    t = df['priority'].map(time_needed)

    # Bank capacity
    T_bank = {'bank_a': 8, 'bank_b': 12, 'bank_c': 10, 'bank_d': 10, 'bank_e': 10}

    # Define the model
    model = LpProblem("Fraud_Investigation", LpMaximize)

    # Decision variables
    x = LpVariable.dicts("x", df.index, cat='Binary')
    y = LpVariable.dicts("y", df.index, cat='Binary')
    z = LpVariable.dicts("z", [(b, i) for b in T_bank.keys() for i in df.index], cat='Binary')

    # Objective function
    model += lpSum([df.at[i, 'amount'] * (x[i] + y[i]) - df.at[i, 'priority'] * C[i] * y[i]
                    for i in df.index if i in fraudulent_transactions])

    # Constraints
    # 1. Investigation Exclusivity Constraint
    for i in df.index:
        model += x[i] + y[i] <= 1

    # 2. Bank Capacity Constraints for Internal Investigation
    for b in T_bank:
        model += lpSum([t[i] * z[(b, i)] for i in df.index]) <= T_bank[b]

    # 3. Bank Investigation Assignment Constraint
    for i in df.index:
        model += lpSum([z[(b, i)] for b in T_bank]) == x[i]

    # 4. Budget Constraint for External Investigation
    model += lpSum([y[i] * C[i] for i in df.index]) <= daily_budget

    # Solve the model
    model.solve()

    # Extract results
    x_result = [i for i in df.index if x[i].varValue == 1]
    y_result = [i for i in df.index if y[i].varValue == 1]

    # Get the value of the objective function
    objective_value = value(model.objective)

    return x_result, y_result, objective_value

In [112]:
x_ids, y_ids, max_value_save = solve_second_model(df1, frauds, daily_budget)

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/uzaykaradag/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/983d11a0299a4a9f85f8fb0acb95354c-pulp.mps max timeMode elapsed branch printingOptions all solution /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/983d11a0299a4a9f85f8fb0acb95354c-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 2275 COLUMNS
At line 33984 RHS
At line 36255 BOUNDS
At line 44180 ENDATA
Problem MODEL has 2270 rows, 7924 columns and 15848 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 360.95 - 0.01 seconds
Cgl0004I processed model has 1137 rows, 5666 columns (5666 integer (5666 of which binary)) and 11326 elements
Cutoff increment increased from 1e-05 to 0.04995
Cbc0038I Initial state - 0 integers unsatisfied sum - 2.22045e-16
Cb

In [113]:
obj_value - real_value_saved

167.00000000000006

In [117]:
df1['amount']

transaction_id
3350     70.00
2869     70.00
2871     40.75
2868     75.25
2875     55.25
         ...  
3652    249.99
3663     55.00
3664     85.50
3246    120.30
3307     45.00
Name: amount, Length: 1132, dtype: float64