In [13]:
# ==============================================================================
# PORTFOLIO PROJECT: AML ANOMALY DETECTION PROTOTYPE
# BY: Theodorus
# VERSION: 2.1 (Error fix in report generation)
# ==============================================================================

In [14]:
# Please run 'pip install pandas numpy' in your terminal if you haven't already
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import os

In [15]:
# ==============================================================================
# MISSION #2: PREPARE THE "CRIME SCENE DATA" (GENERATE FICTITIOUS DATA)
# ==============================================================================
# In this section, we will create a fake transaction dataset and save it to a .csv file
# inside the 'data/' directory. This keeps our project organized.

print("Starting Mission #2: Creating the fictitious transaction dataset...")

def generate_transaction_data(num_records=1000, output_folder='data'):
    """
    Generates fictitious transaction data and saves it to a CSV file.
    Intentionally injects anomalous data for detection purposes.
    Saves the output to the specified folder.
    """
    # --- Create the data directory if it doesn't exist ---
    if not os.path.exists(output_folder):
        print(f"Directory '{output_folder}' not found. Creating it now.")
        os.makedirs(output_folder)

    output_path = os.path.join(output_folder, 'transactions.csv')

    data = []
    user_ids = [f'user_{i}' for i in range(50)]
    base_date = datetime.now()

    # Generate normal data
    for i in range(num_records):
        data.append({
            'transaction_id': f'txn_{1000+i}',
            'user_id': random.choice(user_ids),
            'amount': round(random.uniform(10000, 5000000), 2),
            'timestamp': base_date - timedelta(days=random.randint(1, 30), hours=random.randint(0, 23)),
            'location_country': 'ID'
        })

    # --- Inject Anomalies ---

    # Anomaly 1: High-Frequency Transactions
    print("Injecting Anomaly #1: High-Frequency Transactions...")
    anomaly_user_1 = 'user_123'
    for i in range(6):
        data.append({
            'transaction_id': f'txn_hf_{i}',
            'user_id': anomaly_user_1,
            'amount': round(random.uniform(50000, 200000), 2),
            'timestamp': base_date - timedelta(minutes=i),
            'location_country': 'ID'
        })

    # Anomaly 2: High-Amount Transaction
    print("Injecting Anomaly #2: High-Amount Transaction...")
    data.append({
        'transaction_id': 'txn_ha_1',
        'user_id': 'user_456',
        'amount': 150000000.00,
        'timestamp': base_date - timedelta(days=2),
        'location_country': 'ID'
    })

    # Anomaly 3: Impossible Travel
    print("Injecting Anomaly #3: Impossible Travel...")
    anomaly_user_3 = 'user_789'
    data.append({
        'transaction_id': 'txn_it_1',
        'user_id': anomaly_user_3,
        'amount': 750000.00,
        'timestamp': base_date - timedelta(hours=5),
        'location_country': 'ID'
    })
    data.append({
        'transaction_id': 'txn_it_2',
        'user_id': anomaly_user_3,
        'amount': 1200000.00,
        'timestamp': base_date - timedelta(hours=4, minutes=30),
        'location_country': 'RU'
    })

    df = pd.DataFrame(data)
    # Save to CSV file inside the 'data' folder
    df.to_csv(output_path, index=False)
    print(f"Dataset '{output_path}' created successfully.")
    return df

# Run the function to create the data
df = generate_transaction_data()
print("\n" + "="*60 + "\n")


Starting Mission #2: Creating the fictitious transaction dataset...
Injecting Anomaly #1: High-Frequency Transactions...
Injecting Anomaly #2: High-Amount Transaction...
Injecting Anomaly #3: Impossible Travel...
Dataset 'data/transactions.csv' created successfully.




In [16]:
# ==============================================================================
# MISSION #3: BUILD THE "DETECTION ENGINE"
# ==============================================================================
# Here we will load the data from the 'data/' folder, apply our detection rules,
# and report the findings.

print("Starting Mission #3: Building the anomaly detection engine...")

# --- 1. Load & Clean Data ---
print("\nStep 1: Loading and exploring the data...")
data_path = 'data/transactions.csv'
df = pd.read_csv(data_path)
# Convert timestamp column to datetime data type
df['timestamp'] = pd.to_datetime(df['timestamp'])
print(f"Data loaded successfully from '{data_path}'. Here are the first 5 rows:")
print(df.head())
print("\nDataset Info:")
df.info()


Starting Mission #3: Building the anomaly detection engine...

Step 1: Loading and exploring the data...
Data loaded successfully from 'data/transactions.csv'. Here are the first 5 rows:
  transaction_id  user_id      amount                  timestamp  \
0       txn_1000  user_44  3134370.21 2025-06-17 13:44:09.720961   
1       txn_1001  user_18  1849730.66 2025-07-06 10:44:09.720961   
2       txn_1002  user_41  2413923.93 2025-07-12 11:44:09.720961   
3       txn_1003  user_30  3035717.93 2025-06-26 02:44:09.720961   
4       txn_1004  user_23  1450715.91 2025-07-13 08:44:09.720961   

  location_country  
0               ID  
1               ID  
2               ID  
3               ID  
4               ID  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   transaction_id    1009 non-null   object    

In [17]:
# --- 2. Implement Rule-Based Detection Logic ---
print("\nStep 2: Implementing detection rules...")

suspicious_transactions = []

# Rule #1: High-Frequency Detection
print("\n   -> Applying Rule #1: High-Frequency Detection...")
df_sorted_hf = df.sort_values(by=['user_id', 'timestamp'])
grouped = df_sorted_hf.groupby('user_id')['timestamp'].apply(list)
for user, timestamps in grouped.items():
    if len(timestamps) >= 6:
        for i in range(len(timestamps) - 5):
            if (timestamps[i+5] - timestamps[i]) < timedelta(minutes=10):
                suspicious_indices = df[
                    (df['user_id'] == user) &
                    (df['timestamp'] >= timestamps[i]) &
                    (df['timestamp'] <= timestamps[i+5])
                ].index
                for idx in suspicious_indices:
                    suspicious_transactions.append({'index': idx, 'reason': 'Suspicious - High Frequency'})



Step 2: Implementing detection rules...

   -> Applying Rule #1: High-Frequency Detection...


In [18]:
# Rule #2: High-Amount Detection
print("   -> Applying Rule #2: High-Amount Detection...")
high_amount_threshold = 50_000_000
high_amount_indices = df[df['amount'] > high_amount_threshold].index
for idx in high_amount_indices:
    suspicious_transactions.append({'index': idx, 'reason': 'Suspicious - High Amount'})


   -> Applying Rule #2: High-Amount Detection...


In [19]:
# Rule #3: Impossible Travel Detection
print("   -> Applying Rule #3: Impossible Travel Detection...")
df_sorted_it = df.sort_values(by=['user_id', 'timestamp']).copy()
df_sorted_it['prev_location'] = df_sorted_it.groupby('user_id')['location_country'].shift(1)
df_sorted_it['prev_timestamp'] = df_sorted_it.groupby('user_id')['timestamp'].shift(1)
impossible_travel_df = df_sorted_it[
    (df_sorted_it['location_country'] != df_sorted_it['prev_location']) &
    (df_sorted_it['prev_location'].notna()) &
    ((df_sorted_it['timestamp'] - df_sorted_it['prev_timestamp']) < timedelta(hours=1))
]
for idx in impossible_travel_df.index:
    suspicious_transactions.append({'index': idx, 'reason': 'Suspicious - Impossible Travel'})


   -> Applying Rule #3: Impossible Travel Detection...


In [20]:
# --- 3. Generate Final Report ---
print("\nStep 3: Creating the investigation report...")
if not suspicious_transactions:
    print("No suspicious transactions were found.")
else:
    # Create a DataFrame from the list of suspicious transaction dictionaries
    suspicious_df = pd.DataFrame(suspicious_transactions)
    # Remove duplicate flags for the same transaction and set the original index as the new index
    suspicious_df = suspicious_df.drop_duplicates(subset='index').set_index('index')

    # The join is done on the index, so the 'on' parameter is not needed.
    report_df = df.join(suspicious_df)

    # Filter out the rows that were not flagged (where 'reason' is NaN)
    # and rename the column for clarity.
    report_df = report_df[report_df['reason'].notna()].rename(columns={'reason': 'detection_reason'})

    print("\n" + "="*20 + " PRELIMINARY INVESTIGATION REPORT " + "="*20)
    print(f"Found {len(report_df)} transactions flagged as anomalies:")
    # Display the final report, ensuring the index is not shown for cleaner output
    print(report_df[['transaction_id', 'user_id', 'amount', 'timestamp', 'location_country', 'detection_reason']].to_string())
    print("="*68)

print("\nAnalysis complete. The preliminary investigation report has been generated.")



Step 3: Creating the investigation report...

Found 8 transactions flagged as anomalies:
     transaction_id   user_id        amount                  timestamp location_country                detection_reason
1000       txn_hf_0  user_123  1.320920e+05 2025-07-18 08:44:09.720961               ID     Suspicious - High Frequency
1001       txn_hf_1  user_123  1.680767e+05 2025-07-18 08:43:09.720961               ID     Suspicious - High Frequency
1002       txn_hf_2  user_123  7.559840e+04 2025-07-18 08:42:09.720961               ID     Suspicious - High Frequency
1003       txn_hf_3  user_123  7.771510e+04 2025-07-18 08:41:09.720961               ID     Suspicious - High Frequency
1004       txn_hf_4  user_123  1.471520e+05 2025-07-18 08:40:09.720961               ID     Suspicious - High Frequency
1005       txn_hf_5  user_123  1.007550e+05 2025-07-18 08:39:09.720961               ID     Suspicious - High Frequency
1006       txn_ha_1  user_456  1.500000e+08 2025-07-16 08:44:09.720961