## MMCS Project | Fraud Detection Model

In [65]:
# Import the necessary libraries
import pandas as pd
from pulp import *

Load the datasets for the transactions, customers and fraudulent transactions into seperate dataframes

In [2]:
all_transactions = pd.read_excel('Data/transactions.xlsx')
customers = pd.read_excel('Data/customers.xlsx')
frauds = pd.read_excel('Data/frauds.xlsx')

Next we join the transaction and customer datasets so that we have the customer information for every transaction.

In [3]:
all_transactions = pd.merge(all_transactions, customers, on='customer_id')

Index(['transaction_id', 'description', 'Amount', 'category', 'date', 'month',
       'customer_id', 'type', 'In_or_Out', 'bank_to', 'bank_from',
       'transac_prob', 'description_prob', 'priority', 'home_bank',
       'customer_prob'],
      dtype='object')

We cleanup the DataFrame to make it easier to code the forthcoming logic, all we need to investigate is transactions that are paid out and are not cash withdrawals. So we are going to create a filtered dataframe *transations* from the *all_transactions* dataframe.

In [10]:
all_transactions.columns = map(str.lower, all_transactions.columns)
frauds.columns = map(str.lower, frauds.columns)

In [11]:
transactions = all_transactions.loc[(all_transactions['in_or_out'] == 'paid_out') & (all_transactions['category'] != 'Cash Withdrawal')]

In [61]:
transactions.set_index('transaction_id', inplace=True)

## Building the Initial Model

In [141]:
from pulp import *

# Global Constants
EXTERNAL_INVESTIGATOR_COST = {1: 40, 2: 60, 3: 100, 4: 150}
TIME_NEEDED = {1: 0.25, 2: 0.5, 3: 1, 4: 2}
INVESTIGATION_TEAM_SIZE = {"bank_a": 8, "bank_b": 12, "bank_c": 10, "bank_d": 10, "bank_e": 10}


def solve_initial_model(transactions_df, lambda_c, lambda_d, daily_budget):
    """
    Solves an optimization problem to maximize the expected value saved from fraud investigations.

    Parameters:
    - transactions_df (pd.DataFrame): DataFrame containing transaction details.
    - lambda_c (dict): Dictionary mapping categories to their respective investigation limits.
    - lambda_d (dict): Dictionary mapping descriptions to their respective investigation limits.
    - daily_budget (float): Budget available for external investigations.

    Returns:
    Tuple (list, list, float): Returns a tuple containing lists of transactions selected for
    internal and external investigations, and the expected value saved.
    """
    # Calculating combined probability, cost, and time for each transaction
    amount = transactions_df["amount"]
    prob = 0.5 * transactions_df["customer_prob"] + 0.3 * transactions_df["description_prob"] + 0.2 * transactions_df[
        "transac_prob"]
    cost = transactions_df["priority"].map(EXTERNAL_INVESTIGATOR_COST)
    investigation_time = transactions_df["priority"].map(TIME_NEEDED)

    # Initialize the linear programming model
    model = LpProblem("Fraud_Investigation", LpMaximize)

    # Define decision variables for internal (x), external (y), and team assignments (z)
    internally_investigate = LpVariable.dicts("x", transactions_df.index, cat="Binary")
    externally_investigate = LpVariable.dicts("y", transactions_df.index, cat="Binary")
    investigated_by_bank = LpVariable.dicts("z", [(b, i) for b in INVESTIGATION_TEAM_SIZE.keys() for i in transactions_df.index],
                                            cat="Binary")

    # Define the objective function
    model += lpSum([amount[i] * prob[i] * (internally_investigate[i] + externally_investigate[i]) - cost[i] * externally_investigate[i] for i in transactions_df.index])

    # Add constraints to the model
    # 1. Investigation Exclusivity Constraint
    for i in transactions_df.index:
        model += internally_investigate[i] + externally_investigate[i] <= 1

    # 2. Bank Capacity Constraints for Internal Investigation
    for b in INVESTIGATION_TEAM_SIZE:
        model += lpSum([investigation_time[i] * investigated_by_bank[(b, i)] for i in transactions_df.index]) <= INVESTIGATION_TEAM_SIZE[b]

    # 3. Bank Investigation Assignment Constraint
    for i in transactions_df.index:
        model += lpSum([investigated_by_bank[(b, i)] for b in INVESTIGATION_TEAM_SIZE]) == internally_investigate[i]

    # 4. Budget Constraint for External Investigation
    model += lpSum([externally_investigate[i] * cost[i] for i in transactions_df.index]) <= daily_budget

    # 5. Variable Investigation per Category Ratio Constraint
    for c in lambda_c:
        I_c = transactions_df[transactions_df["category"] == c].index
        model += lpSum([internally_investigate[i] + externally_investigate[i] for i in I_c]) <= lambda_c[c] * lpSum([internally_investigate[i] + externally_investigate[i] for i in transactions_df.index])

    for d in lambda_d:
        I_d = transactions_df[transactions_df["description"].str.contains(d)].index
        model += lpSum([internally_investigate[i] + externally_investigate[i] for i in I_d]) <= lambda_d[d] * lpSum([internally_investigate[i] + externally_investigate[i] for i in transactions_df.index])

    # Solve the model and extract results
    model.solve()
    internally_investigated = [i for i in transactions_df.index if internally_investigate[i].varValue == 1]
    externally_investigated = [i for i in transactions_df.index if externally_investigate[i].varValue == 1]
    expected_value_saved = pulp.value(model.objective)  # Corrected line

    return internally_investigated, externally_investigated, expected_value_saved

In [151]:
lambda_c = {'Utilities': 0.3, 'Shopping': 0.3, 'Holiday': 0.1}
lambda_d = {'Sport event tickets': 0.05, 'Facebook Marketplace Upfront payment': 0.05, 'Clothes Online additional posting payment': 0.05}
daily_budget = 10000

days = list(transactions['date'].unique())
df1 = transactions[transactions['date'] == days[1]]

internal, external, evs = solve_initial_model(df1, lambda_c, lambda_d, daily_budget)

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/uzaykaradag/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/903175befccb458f8e1ce729c2d329f8-pulp.mps max timeMode elapsed branch printingOptions all solution /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/903175befccb458f8e1ce729c2d329f8-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 2281 COLUMNS
At line 49826 RHS
At line 52103 BOUNDS
At line 60028 ENDATA
Problem MODEL has 2276 rows, 7924 columns and 29432 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 14127.7 - 0.09 seconds
Cgl0004I processed model has 2272 rows, 7924 columns (7924 integer (7924 of which binary)) and 20376 elements
Cbc0038I Initial state - 9 integers unsatisfied sum - 1.7
Cbc0038I Pass   1: suminf.    1.00000 (4) obj. -14104.9 it

In [158]:
def compute_real_value_saved(internally_investigated, externally_investigated, transactions_df, fraudulent_transactions):
    """
    Calculates the actual value saved by the investigations based on the transactions flagged as fraudulent.

    Parameters:
    - x_ids (list): List of transaction IDs selected for internal investigation.
    - y_ids (list): List of transaction IDs selected for external investigation.
    - transactions_df (pd.DataFrame): DataFrame containing transaction details.
    - fraudulent_transactions (set): Set of IDs of transactions that are actually fraudulent.

    Returns:
    float: The total value saved by the investigations.
    """
    cost = transactions_df["priority"].map(EXTERNAL_INVESTIGATOR_COST)

    total_value_saved = 0

    # Iterate over identified transactions and calculate total value saved
    for i in internally_investigated + externally_investigated:
        if i in fraudulent_transactions:
            print(i)
            amount = transactions_df.at[i, "amount"]  # Transaction value
            is_fraud = 1  # Flag indicating the transaction is fraudulent
            external_flag = 1 if i in externally_investigated else 0  # Flag for external investigation

            # Calculating value saved for each transaction
            total_value_saved += amount * is_fraud - cost[i] * external_flag

    return total_value_saved

In [159]:
# Example usage:
fraudulent_transactions = set(frauds['transaction_id'])

real_value_saved = compute_real_value_saved(internal, external, df1, fraudulent_transactions)

3692
2544


In [105]:
real_value_saved

193.95

In [168]:
from pulp import *

def solve_second_model(transactions_df, fraudulent_transactions, daily_budget):
    """
    Solves a second optimization model focusing on maximizing the value saved by investigating known fraudulent transactions.

    Parameters:
    - transactions_df (pd.DataFrame): DataFrame containing transaction details.
    - fraudulent_transactions (set): Set of transaction IDs known to be fraudulent.
    - daily_budget (float): Budget available for external investigations.

    Returns:
    Tuple (list, list, float): Returns a tuple containing lists of transactions selected for 
    internal and external investigations, and the value at stake.
    """
    # Extract necessary data
    amount = transactions_df["amount"]
    cost = transactions_df["priority"].map(EXTERNAL_INVESTIGATOR_COST)
    investigation_time = transactions_df["priority"].map(TIME_NEEDED)

    # Define the linear programming model
    model = LpProblem("Fraud_Investigation_Focused", LpMaximize)

    # Define decision variables for internal (x), external (y), and team assignments (z)
    x = LpVariable.dicts("internally_investigate", transactions_df.index, cat="Binary")
    y = LpVariable.dicts("externally_investigate", transactions_df.index, cat="Binary")
    z = LpVariable.dicts("investigated_by_bank",
                         [(b, i) for b in INVESTIGATION_TEAM_SIZE.keys() for i in transactions_df.index],
                         cat="Binary")

    # Define the objective function (only consider fraudulent transactions)
    model += lpSum([amount[i] * (x[i] + y[i]) - cost[i] * y[i] for i in transactions_df.index if i in fraudulent_transactions])

    # Add constraints similar to the first model
    for i in transactions_df.index:
        model += x[i] + y[i] <= 1
    for b in INVESTIGATION_TEAM_SIZE:
        model += lpSum([investigation_time[i] * z[(b, i)] for i in transactions_df.index]) <= INVESTIGATION_TEAM_SIZE[b]
    for i in transactions_df.index:
        model += lpSum([z[(b, i)] for b in INVESTIGATION_TEAM_SIZE]) == x[i]
    model += lpSum([y[i] * cost[i] for i in transactions_df.index]) <= daily_budget

    # Solve the model and extract results
    model.solve()
    internally_investigated = [i for i in transactions_df.index if x[i].varValue == 1]
    externally_investigated = [i for i in transactions_df.index if y[i].varValue == 1]
    value_at_stake = pulp.value(model.objective)

    return internally_investigated, externally_investigated, value_at_stake


In [170]:
x_ids, y_ids, value_at_stake = solve_second_model(df1, fraudulent_transactions, daily_budget)

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/uzaykaradag/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/b2f373acadf247cd9cace81ca7830b2a-pulp.mps max timeMode elapsed branch printingOptions all solution /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/b2f373acadf247cd9cace81ca7830b2a-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 2275 COLUMNS
At line 33984 RHS
At line 36255 BOUNDS
At line 44180 ENDATA
Problem MODEL has 2270 rows, 7924 columns and 15848 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 360.95 - 0.01 seconds
Cgl0004I processed model has 1137 rows, 5666 columns (5666 integer (5666 of which binary)) and 11326 elements
Cutoff increment increased from 1e-05 to 0.04995
Cbc0038I Initial state - 0 integers unsatisfied sum - 0
Cbc0038I Sol

In [171]:
value_at_stake - real_value_saved

167.00000000000006

In [117]:
df1['amount']

transaction_id
3350     70.00
2869     70.00
2871     40.75
2868     75.25
2875     55.25
         ...  
3652    249.99
3663     55.00
3664     85.50
3246    120.30
3307     45.00
Name: amount, Length: 1132, dtype: float64

In [120]:
import fraud_detection as fd

In [128]:
import numpy as np

fraudulent_transactions = set(frauds['transaction_id'])

# Define the base lambda_c
lambda_c_values = [{'Utilities': 0.5}]
lambda_d_values = [{'ClothesOnline additional posting payment': 0.05}]


# Example usage of calculate_loss function (assuming the rest of the setup is the same)
results = fd.calculate_loss(df1, fraudulent_transactions, daily_budget, lambda_c_values, lambda_d_values)

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/uzaykaradag/.pyenv/versions/3.12.0/lib/python3.12/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/c5e0a14dc0f644a2b1234e0a43b80502-pulp.mps max timeMode elapsed branch printingOptions all solution /var/folders/_g/3jdpr0d14q1g62hkp6vf5mlh0000gn/T/c5e0a14dc0f644a2b1234e0a43b80502-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 2277 COLUMNS
At line 40766 RHS
At line 43039 BOUNDS
At line 50964 ENDATA
Problem MODEL has 2272 rows, 7924 columns and 20376 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 14214.1 - 0.04 seconds
Cgl0004I processed model has 2272 rows, 7924 columns (7924 integer (7924 of which binary)) and 20376 elements
Cbc0038I Initial state - 0 integers unsatisfied sum - 9.29257e-14
Cbc0038I Solution found of -14214.1
Cbc0038I Befor

TypeError: 'Series' object is not callable