In [13]:
# ==============================================================================
# PORTFOLIO PROJECT: AML ANOMALY DETECTION PROTOTYPE
# BY: Theodorus
# VERSION: 2.1 (Error fix in report generation)
# ==============================================================================

In [14]:
# Please run 'pip install pandas numpy' in your terminal if you haven't already
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import os

In [15]:
# ==============================================================================
# MISSION #2: PREPARE THE "CRIME SCENE DATA" (GENERATE FICTITIOUS DATA)
# ==============================================================================
# This section builds a synthetic (fake) transaction dataset and saves it as
# 'transactions.csv' inside the 'data/' directory, which keeps the project
# organized. A handful of deliberately anomalous transactions are mixed in with
# the normal ones so there is something for the detection logic to find.

print("Starting Mission #2: Creating the fictitious transaction dataset...")

def generate_transaction_data(num_records=1000, output_folder='data', seed=None, base_date=None):
    """
    Generates fictitious transaction data and saves it to a CSV file.

    Normal transactions are drawn at random for 50 users (`user_0` .. `user_49`),
    then three hand-crafted anomalies are appended for detection purposes:

    1. High frequency  - six transactions by `user_123`, one minute apart.
    2. High amount     - a single 150,000,000.00 transaction by `user_456`.
    3. Impossible travel - `user_789` transacts in 'ID' and, 30 minutes later,
       in 'RU'.

    Parameters
    ----------
    num_records : int, default 1000
        Number of normal (non-anomalous) transactions to generate.
    output_folder : str, default 'data'
        Directory that receives 'transactions.csv'; created if missing.
    seed : int or None, default None
        When given, all random draws come from a private `random.Random(seed)`
        so the same seed reproduces the same users and amounts, and the
        caller's global `random` state is left untouched. When None, the
        module-level `random` generator is used, as in earlier versions.
    base_date : datetime or None, default None
        Reference "now" from which every timestamp is offset backwards.
        Defaults to `datetime.now()`. Pass a fixed value together with `seed`
        to get a fully reproducible dataset.

    Returns
    -------
    pandas.DataFrame
        One row per transaction with columns `transaction_id`, `user_id`,
        `amount`, `timestamp`, `location_country`. The same frame is written
        to `<output_folder>/transactions.csv` (without the index).
    """
    # A dedicated generator keeps seeded runs isolated from global RNG state;
    # the module itself exposes the same choice/uniform/randint API.
    rng = random if seed is None else random.Random(seed)
    if base_date is None:
        base_date = datetime.now()

    # --- Create the data directory if it doesn't exist ---
    if not os.path.exists(output_folder):
        print(f"Directory '{output_folder}' not found. Creating it now.")
        # exist_ok=True: tolerate the directory appearing between the check
        # above and this call (check-then-create race).
        os.makedirs(output_folder, exist_ok=True)

    output_path = os.path.join(output_folder, 'transactions.csv')

    data = []
    user_ids = [f'user_{i}' for i in range(50)]

    # Generate normal data.
    # NOTE: the order of the random draws (user, amount, days, hours) is kept
    # stable on purpose so a given seed always yields the same dataset.
    for i in range(num_records):
        data.append({
            'transaction_id': f'txn_{1000+i}',
            'user_id': rng.choice(user_ids),
            'amount': round(rng.uniform(10000, 5000000), 2),
            'timestamp': base_date - timedelta(days=rng.randint(1, 30), hours=rng.randint(0, 23)),
            'location_country': 'ID'
        })

    # --- Inject Anomalies ---

    # Anomaly 1: High-Frequency Transactions
    # Six small transactions by one user within a five-minute window.
    print("Injecting Anomaly #1: High-Frequency Transactions...")
    anomaly_user_1 = 'user_123'
    for i in range(6):
        data.append({
            'transaction_id': f'txn_hf_{i}',
            'user_id': anomaly_user_1,
            'amount': round(rng.uniform(50000, 200000), 2),
            'timestamp': base_date - timedelta(minutes=i),
            'location_country': 'ID'
        })

    # Anomaly 2: High-Amount Transaction
    # 30x larger than the biggest amount a normal record can take.
    print("Injecting Anomaly #2: High-Amount Transaction...")
    data.append({
        'transaction_id': 'txn_ha_1',
        'user_id': 'user_456',
        'amount': 150000000.00,
        'timestamp': base_date - timedelta(days=2),
        'location_country': 'ID'
    })

    # Anomaly 3: Impossible Travel
    # Same user, two countries, only 30 minutes apart.
    print("Injecting Anomaly #3: Impossible Travel...")
    anomaly_user_3 = 'user_789'
    data.append({
        'transaction_id': 'txn_it_1',
        'user_id': anomaly_user_3,
        'amount': 750000.00,
        'timestamp': base_date - timedelta(hours=5),
        'location_country': 'ID'
    })
    data.append({
        'transaction_id': 'txn_it_2',
        'user_id': anomaly_user_3,
        'amount': 1200000.00,
        'timestamp': base_date - timedelta(hours=4, minutes=30),
        'location_country': 'RU'
    })

    df = pd.DataFrame(data)
    # Save to CSV file inside the 'data' folder
    df.to_csv(output_path, index=False)
    print(f"Dataset '{output_path}' created successfully.")
    return df

# Build the dataset with the function's default arguments and keep the
# returned DataFrame in `df`.
df = generate_transaction_data()

# Blank line, a 60-character rule, blank line: a visual divider in the log.
print("\n{}\n".format("=" * 60))


Starting Mission #2: Creating the fictitious transaction dataset...
Injecting Anomaly #1: High-Frequency Transactions...
Injecting Anomaly #2: High-Amount Transaction...
Injecting Anomaly #3: Impossible Travel...
Dataset 'data/transactions.csv' created successfully.


