1. Data Generation

In [178]:
#Merchant data generation
import random
from faker import Faker
from datetime import datetime, timedelta

faker = Faker()

def generate_merchant_profiles(count):       # generate_merchant_base
    """Generate simplified merchant profiles."""
    merchants = []
    for _ in range(count):
        merchant = {
            "merchant_id": f"M{random.randint(1000, 9999)}",
            "business_name": faker.company(),
            "business_type": random.choice(["Retail", "E-commerce"]),
            "registration_date": faker.date_this_decade(),
            "average_ticket_size": round(random.uniform(1000, 10000), 2),
            # Registration Details
            "gst_status": random.choice([True, False]),         # GST registration status
        }
        merchants.append(merchant)
    return merchants

# Example: Generate 10 merchants
merchants = generate_merchant_profiles(50)
print(merchants[:2])  # Display first 2 merchants


#Transactions Generation
#Creates and outputs transaction data for transactions held within the last 30 days assuming one transaction happens per day
from typing import List, Tuple
#Function to generate normal transactions
def generate_normal_transactions(merchant_id,  amount_range):
    """Generate normal transaction patterns."""
    transactions = []
    num_days = 30
    for _ in range(num_days):
        txn = {
            "transaction_id": f"T{random.randint(100000, 999999)}",
            "merchant_id": merchant_id,
            "timestamp": datetime.now().replace(microsecond=0) - timedelta(days=random.randint(0, num_days)),
            "amount": round(random.uniform(*amount_range), 2),
            "status": "completed",
            "customer_id": f"C{random.randint(1000, 9999)}",
            "is_anomalous": False
        }
        transactions.append(txn)
    return transactions

#Function to generate anomalous transactions 
def generate_anomalous_transactions(merchant_id, pattern):
    """Generate anomalous transaction patterns based on a given fraud type."""
    transactions = []
    num_days = 30
    Constcustomer_id = f"C{random.randint(1000, 9999)}"  # Random customer ID
    for _ in range(num_days):
        
        # For customer concentration, we generate multiple transactions for the same customer
        if pattern == "customer_concentration":
            customer_id = f"C{random.randint(1000, 9999)}"  # Random customer ID


            txn_time = datetime.now().replace(microsecond=0)
            amount = round(random.uniform(3000, 15000), 2)
            
            txn = {
                "transaction_id": f"T{random.randint(100000, 999999)}",
                "merchant_id": merchant_id,
                "customer_id": customer_id,  # Same customer repeats
                "timestamp": txn_time,
                "amount": amount,
                "status": "completed",
                "is_anomalous": True,
                "pattern": pattern
            }
            transactions.append(txn)

        # Handle other patterns (e.g., late_night, high_velocity) separately
        elif pattern == "late_night":
            
            hour = random.choice(list(range(23, 24)) + list(range(0, 5)))
            txn_time = datetime.now().replace(hour=hour, minute=random.randint(0, 59), second=0, microsecond=0)
            amount = round(random.uniform(5000, 20000), 2)
            
            txn = {
                "transaction_id": f"T{random.randint(100000, 999999)}",
                "merchant_id": merchant_id,
                "customer_id": f"C{random.randint(1000, 9999)}",  # Different customer ID
                "timestamp": txn_time,
                "amount": amount,
                "status": "completed",
                "is_anomalous": True,
                "pattern": pattern
            }
            transactions.append(txn)

        elif pattern == "high_velocity":
            txn_time = datetime.now().replace(microsecond=0)
            amount = round(random.uniform(2000, 10000), 2)
            
            txn = {
                "transaction_id": f"T{random.randint(100000, 999999)}",
                "merchant_id": merchant_id,
                "customer_id": f"C{random.randint(1000, 9999)}",  # Different customer ID
                "timestamp": txn_time,
                "amount": amount,
                "status": "completed",
                "is_anomalous": True,
                "pattern": pattern
            }
            transactions.append(txn)

        else:
        # Default case for any other patterns
            txn_time = datetime.now().replace(microsecond=0)
            amount = round(random.uniform(1000, 5000), 2)
            
            txn = {
                "transaction_id": f"T{random.randint(100000, 999999)}",
                "merchant_id": merchant_id,
                "customer_id": f"C{random.randint(1000, 9999)}",  # Different customer ID
                "timestamp": txn_time,
                "amount": amount,
                "status": "completed",
                "is_anomalous": True,
                "pattern": pattern
            }
            transactions.append(txn)

        return transactions
#Generate transactions for merchants


def generate_transactions_for_merchants(merchants):


    """Generate transactions for a given list of merchants."""
    total_merchants = len(merchants)
    num_normal = int(total_merchants * 0.8)  # 80% normal
    num_anomalous = total_merchants - num_normal  # Remaining 20% anomalous

    # Split merchants into normal and anomalous groups
    normal_merchants = merchants[:num_normal]
    anomalous_merchants = merchants[num_normal:]

 
    all_transactions = []
    for merchant in normal_merchants:
        # 80% normal transactions
        normal_txns = generate_normal_transactions(
            merchant_id=merchant["merchant_id"],
            amount_range=(100, 1000)
        )
        all_transactions.extend(normal_txns)

        # 20% anomalous transactions
        # Generate anomalous transactions
    for merchant in anomalous_merchants:
        anomaly_type = random.choice(["late_night", "high_velocity", "customer_concentration"])
        anomalous_txns = generate_anomalous_transactions(
            merchant_id=merchant["merchant_id"],
            pattern=anomaly_type
        )
        all_transactions.extend(anomalous_txns)
    return all_transactions



# Example usage:
merchants = generate_merchant_profiles(50)  # Generate 1000 merchants
transactions = generate_transactions_for_merchants(merchants)

# Display example
for txn in transactions[:1500]:
    print(txn)

[{'merchant_id': 'M3121', 'business_name': 'Chan-Miranda', 'business_type': 'E-commerce', 'registration_date': datetime.date(2020, 6, 4), 'average_ticket_size': 2098.14, 'gst_status': True}, {'merchant_id': 'M3560', 'business_name': 'Carrillo LLC', 'business_type': 'E-commerce', 'registration_date': datetime.date(2021, 4, 1), 'average_ticket_size': 6721.44, 'gst_status': False}]
{'transaction_id': 'T286672', 'merchant_id': 'M7160', 'timestamp': datetime.datetime(2024, 11, 4, 8, 56, 39), 'amount': 860.75, 'status': 'completed', 'customer_id': 'C4794', 'is_anomalous': False}
{'transaction_id': 'T792580', 'merchant_id': 'M7160', 'timestamp': datetime.datetime(2024, 11, 27, 8, 56, 39), 'amount': 265.04, 'status': 'completed', 'customer_id': 'C2224', 'is_anomalous': False}
{'transaction_id': 'T485503', 'merchant_id': 'M7160', 'timestamp': datetime.datetime(2024, 11, 25, 8, 56, 39), 'amount': 646.14, 'status': 'completed', 'customer_id': 'C1331', 'is_anomalous': False}
{'transaction_id': 'T7

2.Feature Engineering

In [179]:
#2.1. Calculate Merchant Features
#2.1.1 Transaction Velocity Metrics
#Transactions per day: 1 (It has been assumed that each merchant performs only 1 transaction in a day during Data Generation for simplicity)
#Average transactions per hour
import pandas as pd
# Convert data to DataFrame
df = pd.DataFrame(transactions)

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Extract hour from the timestamp
df['hour'] = df['timestamp'].dt.hour

# Group by merchant_id and hour to count transactions
hourly_transactions = df.groupby(['merchant_id', 'hour']).size().reset_index(name='transaction_count')
# Calculate average transactions per hour for each merchant
average_txns_per_hour = (
    hourly_transactions.groupby('merchant_id')['transaction_count']
    .mean()
    .reset_index(name='average_transactions_per_hour')
)

# Display results
print(average_txns_per_hour)

   merchant_id  average_transactions_per_hour
0        M1021                            1.0
1        M1103                           30.0
2        M1181                           30.0
3        M1271                           30.0
4        M2306                           30.0
5        M2321                           30.0
6        M2573                           30.0
7        M2642                           30.0
8        M2866                            1.0
9        M3426                           30.0
10       M3613                           30.0
11       M3630                           30.0
12       M3832                           30.0
13       M3871                            1.0
14       M3893                           30.0
15       M3901                            1.0
16       M3919                           30.0
17       M4099                           30.0
18       M4262                            1.0
19       M4510                           30.0
20       M4523                    

In [180]:
#Time between consecutive transactions
# Sort data by merchant_id and timestamp
time_based_df = df.sort_values(by=['merchant_id', 'timestamp'])

# Calculate time difference (in minutes) between consecutive transactions for each merchant
time_based_df['time_diff_minutes'] = time_based_df.groupby('merchant_id')['timestamp'].diff().dt.total_seconds() / 60

# Display results
print(time_based_df)

     transaction_id merchant_id           timestamp   amount     status  \
1200        T124921       M1021 2024-11-29 08:56:39  4511.55  completed   
330         T518836       M1103 2024-10-30 08:56:39   424.44  completed   
337         T697689       M1103 2024-10-30 08:56:39   996.47  completed   
346         T486436       M1103 2024-10-30 08:56:39   257.54  completed   
331         T863733       M1103 2024-10-31 08:56:39   242.94  completed   
...             ...         ...                 ...      ...        ...   
459         T707860       M9836 2024-11-23 08:56:39   660.26  completed   
460         T181705       M9836 2024-11-23 08:56:39   849.24  completed   
466         T453394       M9836 2024-11-24 08:56:39   188.08  completed   
475         T428672       M9836 2024-11-26 08:56:39   432.17  completed   
471         T395353       M9836 2024-11-29 08:56:39   565.52  completed   

     customer_id  is_anomalous        pattern  hour  time_diff_minutes  
1200       C7941          

In [181]:
#2.1.2 Time-Based Patterns
from collections import defaultdict, Counter
def get_peak_transaction_hour(transactions):
    """
    Identify the hour with the maximum transactions.
    Args:
        transactions (list): List of transaction dictionaries with 'timestamp'.
    Returns:
        int: The hour with the most transactions.
    """
    merchant_hours = defaultdict(list)

    # Group transaction hours by merchant
    for txn in transactions:
        merchant_id = txn['merchant_id']
        hour = txn['timestamp'].hour
        merchant_hours[merchant_id].append(hour)

    # Calculate the peak hour for each merchant
    data = []
    for merchant_id, hours in merchant_hours.items():
        hour_counts = Counter(hours)
        peak_hour = max(hour_counts, key=hour_counts.get)
        peak_count = hour_counts[peak_hour]
        data.append({"merchant_id": merchant_id, "peak_hour": peak_hour, "transaction_count": peak_count})

    # Create a DataFrame
    return pd.DataFrame(data)
peak_hours_df = get_peak_transaction_hour(transactions)

# Display results
print(peak_hours_df)


   merchant_id  peak_hour  transaction_count
0        M7160          8                 30
1        M1181          8                 30
2        M2642          8                 30
3        M4523          8                 30
4        M3832          8                 30
5        M3426          8                 30
6        M9726          8                 30
7        M9247          8                 30
8        M7133          8                 30
9        M9500          8                 30
10       M3613          8                 30
11       M1103          8                 30
12       M6067          8                 30
13       M7774          8                 30
14       M5487          8                 30
15       M9836          8                 30
16       M2306          8                 30
17       M2321          8                 30
18       M7383          8                 30
19       M6782          8                 30
20       M8500          8                 30
21       M

In [182]:
#Late-night transaction frequency
def calculate_late_night_frequency(transactions):
    """
    Calculate late-night transaction frequency for each merchant.

    Args:
        transactions (list): List of transaction dictionaries with 'merchant_id' and 'timestamp'.

    Returns:
        pd.DataFrame: DataFrame with merchant_id, total_transactions, late_night_transactions, and frequency.
    """
    # Define late-night hours
    LATE_NIGHT_HOURS = list(range(23, 24)) + list(range(0, 5))

    # Initialize a dictionary to store merchant-wise data
    merchant_data = {}

    for txn in transactions:
        merchant_id = txn["merchant_id"]
        txn_hour = txn["timestamp"].hour

        if merchant_id not in merchant_data:
            merchant_data[merchant_id] = {"total": 0, "late_night": 0}

        merchant_data[merchant_id]["total"] += 1
        if txn_hour in LATE_NIGHT_HOURS:
            merchant_data[merchant_id]["late_night"] += 1

    # Calculate frequency and prepare data for DataFrame
    data = []
    for merchant_id, counts in merchant_data.items():
        total_txns = counts["total"]
        late_night_txns = counts["late_night"]
        frequency = late_night_txns / total_txns if total_txns > 0 else 0
        data.append({
            "merchant_id": merchant_id,
            "total_transactions": total_txns,
            "late_night_transactions": late_night_txns,
            "late_night_frequency": round(frequency, 2)
        })

    # Convert to DataFrame
    return pd.DataFrame(data)

# Example usage:
# transactions = [...]  # Your list of transactions with 'merchant_id' and 'timestamp'
late_night_df = calculate_late_night_frequency(transactions)

# Display results
print(late_night_df)

   merchant_id  total_transactions  late_night_transactions  \
0        M7160                  30                        0   
1        M1181                  30                        0   
2        M2642                  30                        0   
3        M4523                  30                        0   
4        M3832                  30                        0   
5        M3426                  30                        0   
6        M9726                  30                        0   
7        M9247                  30                        0   
8        M7133                  30                        0   
9        M9500                  30                        0   
10       M3613                  30                        0   
11       M1103                  30                        0   
12       M6067                  30                        0   
13       M7774                  30                        0   
14       M5487                  30                     

In [183]:
#2.1.3 Amount Distributions
#Average transaction amount:
def calculate_average_transaction_amount(transactions):
    """
    Calculate the average transaction amount for each merchant.

    Args:
        transactions (list): List of transaction dictionaries with 'merchant_id' and 'amount'.

    Returns:
        pd.DataFrame: DataFrame with merchant_id and average_transaction_amount.
    """
    # Convert transaction list to DataFrame
    df = pd.DataFrame(transactions)

    # Group by merchant_id and calculate the average transaction amount
    avg_transaction_df = (
        df.groupby("merchant_id")["amount"]
        .mean()
        .reset_index()
        .rename(columns={"amount": "average_transaction_amount"})
    )

    return avg_transaction_df

average_transaction_df = calculate_average_transaction_amount(transactions)

# Display results
print(average_transaction_df)

   merchant_id  average_transaction_amount
0        M1021                 4511.550000
1        M1103                  460.603667
2        M1181                  521.760333
3        M1271                  448.627000
4        M2306                  611.016667
5        M2321                  586.900667
6        M2573                  591.393667
7        M2642                  677.852000
8        M2866                 9576.560000
9        M3426                  558.684667
10       M3613                  552.486333
11       M3630                  497.797000
12       M3832                  537.195000
13       M3871                 6126.560000
14       M3893                  536.588000
15       M3901                16890.900000
16       M3919                  494.699000
17       M4099                  539.680000
18       M4262                 5087.540000
19       M4510                  494.039667
20       M4523                  506.411667
21       M4733                12456.740000
22       M5

In [184]:
#Variance of transaction amount
def calculate_variance_in_transaction_amounts(transactions):
    """
    Calculate the variance in transaction amounts for each merchant.

    Args:
        transactions (list): List of transaction dictionaries with 'merchant_id' and 'amount'.

    Returns:
        pd.DataFrame: DataFrame with merchant_id and variance_in_transaction_amount.
    """
    # Convert transaction list to DataFrame
    df = pd.DataFrame(transactions)

    # Group by merchant_id and calculate the variance in transaction amount
    variance_transaction_df = (
        df.groupby("merchant_id")["amount"]
        .var()  # Compute variance of transaction amounts
        .reset_index()
        .rename(columns={"amount": "variance_in_transaction_amount"})
    )

    return variance_transaction_df
variance_transaction_df = calculate_variance_in_transaction_amounts(transactions)

# Display results
print(variance_transaction_df)

   merchant_id  variance_in_transaction_amount
0        M1021                             NaN
1        M1103                    67041.145783
2        M1181                    75185.950472
3        M1271                    56802.501429
4        M2306                    74899.686961
5        M2321                    59528.635682
6        M2573                    64454.780272
7        M2642                    66948.740844
8        M2866                             NaN
9        M3426                    59229.298702
10       M3613                    66033.596831
11       M3630                    83041.251215
12       M3832                    66859.108102
13       M3871                             NaN
14       M3893                    64694.483265
15       M3901                             NaN
16       M3919                    76595.366913
17       M4099                    61770.078434
18       M4262                             NaN
19       M4510                    62235.434486
20       M452

In [185]:
#High-value transaction ratio
def calculate_high_value_transaction_ratio(transactions, threshold):
    """
    Calculate the ratio of high-value transactions for each merchant.

    Args:
        transactions (list): List of transaction dictionaries with 'merchant_id' and 'amount'.
        threshold (float): The amount above which a transaction is considered high-value.

    Returns:
        pd.DataFrame: DataFrame with merchant_id and high_value_transaction_ratio.
    """
    # Convert transaction list to DataFrame
    df = pd.DataFrame(transactions)

    # Create a new column to identify high-value transactions
    df["is_high_value"] = df["amount"] > threshold

    # Group by merchant_id and calculate the high-value transaction ratio
    high_value_ratio_df = (
        df.groupby("merchant_id")["is_high_value"]
        .mean()  # Proportion of high-value transactions
        .reset_index()
        .rename(columns={"is_high_value": "high_value_transaction_ratio"})
    )

    return high_value_ratio_df

high_value_ratio_df = calculate_high_value_transaction_ratio(transactions, threshold=10000)# Assume a threshold of 10000

# Display results
print(high_value_ratio_df)

   merchant_id  high_value_transaction_ratio
0        M1021                           0.0
1        M1103                           0.0
2        M1181                           0.0
3        M1271                           0.0
4        M2306                           0.0
5        M2321                           0.0
6        M2573                           0.0
7        M2642                           0.0
8        M2866                           0.0
9        M3426                           0.0
10       M3613                           0.0
11       M3630                           0.0
12       M3832                           0.0
13       M3871                           0.0
14       M3893                           0.0
15       M3901                           1.0
16       M3919                           0.0
17       M4099                           0.0
18       M4262                           0.0
19       M4510                           0.0
20       M4523                           0.0
21       M

In [186]:
#2.1.4 Customer Concentration
#Unique customer count
def calculate_unique_customer_count(transactions):
    """
    Calculate the unique customer count for each merchant.

    Args:
        transactions (list): List of transaction dictionaries with 'merchant_id' and 'customer_id'.

    Returns:
        pd.DataFrame: DataFrame with merchant_id and unique_customer_count.
    """
    # Convert transaction list to DataFrame
    df = pd.DataFrame(transactions)

    # Group by merchant_id and calculate the unique customer count
    unique_customer_count_df = (
        df.groupby("merchant_id")["customer_id"]
        .nunique()  # Count of unique customer_ids for each merchant
        .reset_index()
        .rename(columns={"customer_id": "unique_customer_count"})
    )

    return unique_customer_count_df

unique_customer_count_df = calculate_unique_customer_count(transactions)

# Display results
print(unique_customer_count_df)

   merchant_id  unique_customer_count
0        M1021                      1
1        M1103                     30
2        M1181                     30
3        M1271                     30
4        M2306                     30
5        M2321                     30
6        M2573                     30
7        M2642                     30
8        M2866                      1
9        M3426                     29
10       M3613                     30
11       M3630                     29
12       M3832                     29
13       M3871                      1
14       M3893                     30
15       M3901                      1
16       M3919                     30
17       M4099                     30
18       M4262                      1
19       M4510                     30
20       M4523                     30
21       M4733                      1
22       M5215                     30
23       M5235                     30
24       M5373                     30
25       M54

In [187]:
#2.2. Create Feature Normalization Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

df = pd.DataFrame(transactions)
df_combined = df.merge(peak_hours_df, on='merchant_id', how='left') \
                        .merge(average_txns_per_hour, on='merchant_id',how='left') \
                        .merge(high_value_ratio_df, on='merchant_id',how='left') \
                        .merge(late_night_df, on='merchant_id',how='left') \
                        .merge(unique_customer_count_df, on='merchant_id',how='left') \
                        .merge(time_based_df, on='merchant_id',how='left') 


df = df_combined

# Features for normalization
features_to_normalize = [ 'peak_hour', 'average_transactions_per_hour', 'high_value_transaction_ratio','late_night_frequency','unique_customer_count','time_diff_minutes']

# Create a MinMaxScaler or StandardScaler for normalization
scaler = MinMaxScaler()

# Define the preprocessing steps as a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', scaler, features_to_normalize)
    ])

# Create a pipeline to normalize the features
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Apply the pipeline to the dataset
normalized_data = pipeline.fit_transform(df)

# Convert the normalized data back to a DataFrame
normalized_df = pd.DataFrame(normalized_data, columns=features_to_normalize)

# Add the merchant_id column back to the normalized data
normalized_df['merchant_id'] = df['merchant_id']

# Display the result
print(normalized_df)


       peak_hour  average_transactions_per_hour  high_value_transaction_ratio  \
0       0.318182                            1.0                           0.0   
1       0.318182                            1.0                           0.0   
2       0.318182                            1.0                           0.0   
3       0.318182                            1.0                           0.0   
4       0.318182                            1.0                           0.0   
...          ...                            ...                           ...   
36005   0.000000                            0.0                           0.0   
36006   0.318182                            0.0                           0.0   
36007   0.318182                            0.0                           1.0   
36008   1.000000                            0.0                           1.0   
36009   0.318182                            0.0                           0.0   

       late_night_frequency

3. Model Development

In [188]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.preprocessing import MinMaxScaler

# 3.1. Define Autoencoder Architecture
def build_autoencoder(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),  # Encoder layer
        Dense(32, activation='relu'),  # Bottleneck
        Dense(64, activation='relu'),  # Decoder layer
        Dense(input_dim, activation='sigmoid')  # Output layer
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# Data Preparation
data = df_combined  

# Separate normal and test data
normal_data = data.iloc[:1200]  # First 800 rows are normal behavior
test_data = data.iloc[1200:]    # Remaining rows are test data
#--->
# # Exclude 'merchant_id' and 'transaction_id' from features for processing
# features_to_exclude = ['merchant_id', 'transaction_id', 'timestamp','status','customer_id']
# normal_data_features = normal_data.drop(columns=features_to_exclude)
# test_data_features = test_data.drop(columns=features_to_exclude)
# 1. Define the columns to pass to the autoencoder
columns_to_use = [ 'peak_hour', 'average_transactions_per_hour', 'high_value_transaction_ratio','late_night_frequency','unique_customer_count','time_diff_minutes']

normal_data_to_normalize = normal_data[columns_to_use]
test_data_to_normalize = test_data[columns_to_use]
# Normalize the remaining features
scaler = MinMaxScaler()
normalized_normal_data = scaler.fit_transform(normal_data_to_normalize)
normalized_test_data = scaler.transform(test_data_to_normalize)

# print(f"Normalized Normal Data Shape: {normalized_normal_data.shape}")
# print(f"Normalized Test Data Shape: {normalized_test_data.shape}")

# 3.2. Train the autoencoder
autoencoder = build_autoencoder(input_dim=normalized_normal_data.shape[1])
autoencoder.fit(
    normalized_normal_data, normalized_normal_data,
    epochs=50, batch_size=32, shuffle=True, validation_split=0.2
)
# 3.3. Calculate Reconstruction Error Threshold
# Calculate reconstruction errors for test data
test_reconstructed = autoencoder.predict(normalized_test_data)
test_errors = np.mean(np.square(normalized_test_data - test_reconstructed), axis=1)

# Set threshold based on training data reconstruction errors
train_reconstructed = autoencoder.predict(normalized_normal_data)
train_errors = np.mean(np.square(normalized_normal_data - train_reconstructed), axis=1)
threshold = np.percentile(train_errors, 95)

# 3.4. Implement Anomaly Scoring
test_data = test_data.copy()
# Determine anomalies
test_data.loc[:, 'anomaly_score'] = test_errors
#test_data['anomaly_score'] = test_errors
test_data.loc[:, 'is_anomalous'] = test_errors > threshold
#test_data['is_anomalous'] = test_errors > threshold

# # Restore 'merchant_id' and 'transaction_id', 'timestamp' to the results
# test_data['merchant_id'] = test_data_to_normalize.loc[test_data.index, 'merchant_id']#index.map(normal_data['merchant_id'])
# test_data['transaction_id'] = test_data_to_normalize.loc[test_data.index, 'transaction_id']#index.map(normal_data['transaction_id'])
# test_data['timestamp'] = test_data_to_normalize.loc[test_data.index, 'timestamp']#index.map(normal_data['timestamp'])
# test_data['customer_id'] = test_data_to_normalize.loc[test_data.index, 'customer_id']#index.map(normal_data['customer_id'])
# View final output with anomaly information
print(test_data)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
      transaction_id_x merchant_id         timestamp_x  amount_x   status_x  \
1200           T834636       M1181 2024-11-22 08:56:39    272.07  completed   
1201           T834636       M1181 2024-11-22 08:56:39    272.07  completed   
1202           T834636       M1181 2024-11-22 08:56:39    272.07  completed   
1203           T834636       M1181 2024-11-22 08:56:39    272.07  completed   
1204          

4.Fraud Pattern Detection

In [189]:
import pandas as pd
import numpy as np

# Sample DataFrame with necessary columns
df = pd.DataFrame(transactions)

# 4.1 High Velocity Detection (Transactions per hour threshold)

# Group by 'merchant_id' and calculate the transaction frequency (velocity) per hour
df['hour'] = df['timestamp'].dt.hour
transaction_counts = df.groupby(['merchant_id', 'hour']).size().reset_index(name='transaction_count')

# Define a threshold for high velocity (e.g., more than 3 transactions per hour considered high velocity)
high_velocity_threshold = 3
high_velocity = transaction_counts[transaction_counts['transaction_count'] > high_velocity_threshold]

print("High Velocity Transactions:")
print(high_velocity)

# 4.2 Odd-Hour Pattern Detection (Transactions outside business hours)
# Let's assume normal business hours are 9 AM to 6 PM (odd hours are outside this range)

business_hours_start = 9
business_hours_end = 18

# Mark transactions occurring outside business hours as 'odd hour'
df['is_odd_hour'] = df['hour'].apply(lambda x: x < business_hours_start or x > business_hours_end)

print("\nOdd-Hour Pattern Detection:")
print(df[df['is_odd_hour'] == True])

# 4.3 Customer Concentration Analysis
# If a merchant has a high number of transactions from the same customer, it could indicate concentration risk.

# Count transactions per customer for each merchant
customer_concentration = df.groupby(['merchant_id', 'customer_id']).size().reset_index(name='customer_transaction_count')

# Define a threshold for high concentration (e.g., more than 3 transactions per customer considered high concentration)
high_concentration_threshold = 2
high_concentration = customer_concentration[customer_concentration['customer_transaction_count'] >= high_concentration_threshold]

print("\nCustomer Concentration Analysis:")
print(high_concentration)

# 4.4 Calculate Pattern-Specific Scores
# Example scoring based on the above patterns (high velocity, odd-hour, and customer concentration)

df['high_velocity_score'] = df['merchant_id'].map(lambda x: 1 if x in high_velocity['merchant_id'].values else 0)
df['odd_hour_score'] = df['is_odd_hour'].astype(int)  # 1 if in odd hours, 0 otherwise
df['customer_concentration_score'] = df['merchant_id'].map(lambda x: 1 if x in high_concentration['merchant_id'].values else 0)

# Calculate final anomaly score based on these rules
df['total_anomaly_score'] = df['high_velocity_score'] + df['odd_hour_score'] + df['customer_concentration_score']

print("\nPattern-Specific Scores:")
print(df[['merchant_id', 'transaction_id', 'high_velocity_score', 'odd_hour_score', 
          'customer_concentration_score', 'total_anomaly_score']])


High Velocity Transactions:
   merchant_id  hour  transaction_count
1        M1103     8                 30
2        M1181     8                 30
3        M1271     8                 30
4        M2306     8                 30
5        M2321     8                 30
6        M2573     8                 30
7        M2642     8                 30
9        M3426     8                 30
10       M3613     8                 30
11       M3630     8                 30
12       M3832     8                 30
14       M3893     8                 30
16       M3919     8                 30
17       M4099     8                 30
19       M4510     8                 30
20       M4523     8                 30
22       M5215     8                 30
23       M5235     8                 30
24       M5373     8                 30
25       M5487     8                 30
26       M6067     8                 30
27       M6782     8                 30
28       M6969     8                 30
29       M71