In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import shap
import matplotlib.pyplot as plt
import xgboost as xgb

# Load the dataset from a CSV file (path is relative to the working directory)
# into `df`, the frame every later cell in this notebook reads and mutates.
df = pd.read_csv('synthesized_donor_data.csv')

In [10]:
# Print a summary of the frame: column names, non-null counts, dtypes, memory.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 5 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Nama Pemohon/Institusi                    15000 non-null  object 
 1   Emel                                      15000 non-null  object 
 2   No. K/P (baru)/Polis/Tentera/No. Pasport  15000 non-null  object 
 3   No. Telefon Bimbit                        15000 non-null  object 
 4   Jumlah Pendapatan                         15000 non-null  float64
dtypes: float64(1), object(4)
memory usage: 586.1+ KB


In [11]:
# Preview the first rows of the frame (pandas default is 5 rows).
df.head()

Unnamed: 0,Nama Pemohon/Institusi,Emel,No. K/P (baru)/Polis/Tentera/No. Pasport,No. Telefon Bimbit,Jumlah Pendapatan
0,Yap bin Wei Jie,yap.wei.jie6@msn.com,890801-31-8371,012-0758418,7991.26
1,Fatimah binti Ali,fatimah.ali47@yahoo.com,910803-09-4788,014-2273286,7418.29
2,Arun bin Suresh,arun.suresh28@msn.com,910828-52-5695,010-8559286,9027.72
3,Khairul bin Ismail,khairul.ismail21@outlook.my,790204-07-6937,019-57099785,9734.97
4,Sharifah binti Ahmad,sharifah.ahmad29@hotmail.com,620417-51-7528,018-25342147,3225.15


In [4]:
import re
import json

# Function to validate email format
def is_valid_email(email):
    """Return True when ``email`` is a string matching a basic address pattern.

    Args:
        email: Value to check. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for an empty cell) is
            reported as invalid instead of raising ``TypeError``.

    Returns:
        bool: True if the entire string matches ``local@domain.tld``,
        otherwise False.
    """
    # Missing cells arrive as NaN/None, which `re` cannot process.
    if not isinstance(email, str):
        return False
    # Basic email validation regex
    regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch requires the match to consume the whole string, so an address
    # followed by a trailing newline is rejected (plain `$` would accept it).
    return re.fullmatch(regex, email) is not None

# Function to calculate total expenditure from JSON string
def calculate_total_expenditure(expense_json):
    """Sum the values of a JSON object describing expenses.

    Args:
        expense_json: JSON text expected to encode an object whose values
            are numeric amounts.

    Returns:
        The sum of the object's values, or 0 when the input is not a
        string/bytes value, is not valid JSON, does not decode to a JSON
        object, or contains values that cannot be summed.
    """
    try:
        expenses = json.loads(expense_json)  # Parse the JSON string
    except (json.JSONDecodeError, TypeError):
        return 0  # Missing cell (NaN/None) or malformed JSON

    # Valid JSON that is not an object (list, null, number, string) has no
    # categories to sum; treat it like any other unparseable input.
    if not isinstance(expenses, dict):
        return 0

    try:
        return sum(expenses.values())  # Sum all expense categories
    except TypeError:
        return 0  # Non-numeric amounts cannot be totalled

# Function to calculate total income from JSON string
def calculate_total_income(income_json):
    """Sum the values of a JSON object describing income sources.

    Args:
        income_json: JSON text expected to encode an object whose values
            are numeric amounts.

    Returns:
        The sum of the object's values, or 0 when the input is not a
        string/bytes value, is not valid JSON, does not decode to a JSON
        object, or contains values that cannot be summed.
    """
    try:
        incomes = json.loads(income_json)  # Parse the JSON string
    except (json.JSONDecodeError, TypeError):
        return 0  # Missing cell (NaN/None) or malformed JSON

    # Valid JSON that is not an object (list, null, number, string) has no
    # income sources to sum; treat it like any other unparseable input.
    if not isinstance(incomes, dict):
        return 0

    try:
        return sum(incomes.values())  # Sum all income sources
    except TypeError:
        return 0  # Non-numeric amounts cannot be totalled


# Function to check for multiple applications (example implementation)
def has_multiple_applications(id_number):
    """Tell whether ``id_number`` occurs more than once in the ID column of ``df``.

    Reads the notebook-level ``df``; the per-value tally is rebuilt on every
    call (mock implementation).
    """
    id_column = df['No. K/P (baru)/Polis/Tentera/No. Pasport']
    tally = id_column.value_counts()
    # IDs absent from the tally count as zero occurrences.
    occurrences = tally.get(id_number, 0)
    return occurrences > 1

def _is_blank(value):
    """Return True for a missing cell: None/NaN/NA or an empty or whitespace-only string."""
    if isinstance(value, str):
        return value.strip() == ''
    # pandas represents empty CSV cells as NaN, which never equals ''.
    return value is None or bool(pd.isna(value))


# Function to label fraud based on the defined rules
def label_fraud(row):
    """Apply the rule set to one application row and return a fraud label.

    Rules are evaluated in order and the first one that fires decides the
    label. Missing values are recognised whether they appear as ``''``,
    whitespace, ``None`` or NaN.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by the
            column names used below.

    Returns:
        int: 1 if any rule flags the row as fraud, otherwise 0.
    """
    applicant_id = row['No. K/P (baru)/Polis/Tentera/No. Pasport']

    # Rule 1: Missing or invalid identification
    if _is_blank(applicant_id):
        return 1  # Fraud: Missing ID
    email = row['Emel']
    # The blank check short-circuits so a NaN email never reaches the regex.
    if _is_blank(email) or not is_valid_email(email):
        return 1  # Fraud: Invalid email

    # Rule 2: Underage applicant reporting an occupation
    if row['Age'] < 18 and row['Pekerjaan'] != 'Tidak Bekerja':
        return 1  # Fraud: Underage working

    # Rule 3: Financial discrepancies
    total_expenditure = calculate_total_expenditure(row['Perbelanjaan Bulanan'])
    total_income = calculate_total_income(row['Sumber Pendapatan Bulanan'])

    if total_expenditure > total_income:
        return 1  # Fraud: High expenditure with income source

    # Rule 4: Behavioral indicators
    if row['Days Since'] < 30 and has_multiple_applications(applicant_id):
        return 1  # Fraud: Frequent applications

    # Rule 5: Relationship to staff
    if row['Hubungan kekeluargaan dengan kakitangan LZS?'] == 'YA' and _is_blank(row['Nama Kakitangan']):
        return 1  # Fraud: Close relationship without staff name

    # If no fraud indicators are found
    return 0  # Not fraud

# Run the rule-based labeller over every row and attach the result as a column
fraud_flags = df.apply(label_fraud, axis=1)
df['is_fraud'] = fraud_flags

# Print the ID and email columns next to the new fraud label
preview_columns = ['No. K/P (baru)/Polis/Tentera/No. Pasport', 'Emel', 'is_fraud']
print(df[preview_columns])

      No. K/P (baru)/Polis/Tentera/No. Pasport  \
0                               621211-55-6506   
1                               780813-48-6819   
2                               670221-54-1335   
3                               980221-14-6270   
4                               641212-56-7074   
...                                        ...   
14995                           830522-39-7770   
14996                           050414-02-5406   
14997                           690529-15-2690   
14998                           670406-41-5316   
14999                           751112-34-3119   

                                  Emel  is_fraud  
0            devi.ganesh55@hotmail.com         0  
1              lee.jia.hao16@yahoo.com         1  
2           abdullah.khairul54@msn.com         0  
3      noraini..abdullah17@hotmail.com         1  
4              fatimah.hafiz58@msn.com         0  
...                                ...       ...  
14995        goh.pei.shan17@outlook.my    

In [5]:
# Class balance check: number of rows per is_fraud label (0 = not fraud, 1 = fraud).
df["is_fraud"].value_counts()

is_fraud
0    10048
1     4952
Name: count, dtype: int64

In [7]:
# Frequency of each distinct value in the "Maklumat Isi Rumah" column.
# NOTE(review): the values look like JSON-encoded lists of household members,
# so most rows are probably unique — confirm this tally is the intended check.
df["Maklumat Isi Rumah"].value_counts()

Maklumat Isi Rumah
[{"Nama Penuh": "Goh bin Wei Jie", "No. Kad Pengenalan/Sijil Kelahiran": "030423-43-2975", "Hubungan": "Adik", "Umur": 21, "Status": "IPT/Kolej", "Kesihatan": "Sihat", "Pendapatan Kasar (RM)": 0.0, "Pendapatan Bersih (RM)": 0.0}, {"Nama Penuh": "Lim bin Wei Jie", "No. Kad Pengenalan/Sijil Kelahiran": "720907-03-4951", "Hubungan": "Anak", "Umur": 52, "Status": "Tidak Bekerja", "Kesihatan": "Sihat", "Pendapatan Kasar (RM)": 0.0, "Pendapatan Bersih (RM)": 0.0}, {"Nama Penuh": "Khairul bin Syazwan", "No. Kad Pengenalan/Sijil Kelahiran": "040202-37-2105", "Hubungan": "Tiada Kaitan", "Umur": 21, "Status": "IPT/Kolej", "Kesihatan": "Sihat", "Pendapatan Kasar (RM)": 0.0, "Pendapatan Bersih (RM)": 0.0}, {"Nama Penuh": "Abdullah bin Razak", "No. Kad Pengenalan/Sijil Kelahiran": "790917-43-1497", "Hubungan": "Datuk", "Umur": 45, "Status": "Tidak Bekerja", "Kesihatan": "Sihat", "Pendapatan Kasar (RM)": 0.0, "Pendapatan Bersih (RM)": 0.0}, {"Nama Penuh": "Tan bin Wei Lun", "No. K

In [6]:
# Write the current frame to labeled_data.csv in the working directory;
# index=False keeps the row index out of the file.
df.to_csv("labeled_data.csv", index=False)