In [None]:
import pandas as pd
import numpy as np

def generate_fraud_detection_dataset(n_samples=10000, seed=42, output_file="fraud_detection_dataset.csv"):
    """
    Generates a synthetic fraud detection dataset with realistic features and fraud labeling logic.

    Parameters:
        n_samples (int): Number of samples to generate.
        seed (int): Random seed for reproducibility.
        output_file (str): Path to save the dataset as a CSV file.

    Returns:
        None: Saves the dataset to the specified CSV file.
    """
    # Set random seed for reproducibility
    np.random.seed(seed)

    # Generate features
    data = {
        "Transaction Amount": np.random.exponential(scale=500, size=n_samples),
        "Transaction Frequency": np.random.poisson(lam=3, size=n_samples),
        "Recipient Verification Status": np.random.choice(
            ["verified", "recently_registered", "suspicious"], n_samples, p=[0.6, 0.3, 0.1]
        ),
        "Recipient Blacklist Status": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "Device Fingerprinting": np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
        "VPN or Proxy Usage": np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
        "Geo-Location Flags": np.random.choice(
            ["normal", "high-risk", "unusual"], n_samples, p=[0.7, 0.2, 0.1]
        ),
        "Behavioral Biometrics": np.random.normal(loc=0, scale=1, size=n_samples),
        "Time Since Last Transaction": np.random.uniform(0, 30, n_samples),
        "Social Trust Score": np.random.uniform(0, 100, n_samples),
        "Account Age": np.random.uniform(0, 5, n_samples),
        "High-Risk Transaction Times": np.random.choice([0, 1], n_samples, p=[0.75, 0.25]),
        "Past Fraudulent Behavior Flags": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "Location-Inconsistent Transactions": np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
        "Normalized Transaction Amount": np.random.normal(loc=0.5, scale=0.2, size=n_samples),
        "Transaction Context Anomalies": np.random.normal(loc=0, scale=1, size=n_samples),
        "Fraud Complaints Count": np.random.poisson(lam=0.5, size=n_samples),
        "Merchant Category Mismatch": np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
        "User Daily Limit Exceeded": np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        "Recent High-Value Transaction Flags": np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
    }

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Labeling based on feature importance
    def label_transaction(row):
        fraud_score = 0
        # Feature weights
        fraud_score += row["Recipient Blacklist Status"] * 3
        fraud_score += row["Device Fingerprinting"] * 2
        fraud_score += row["VPN or Proxy Usage"] * 2
        fraud_score += 1 if row["Geo-Location Flags"] == "high-risk" else 0
        fraud_score += row["Past Fraudulent Behavior Flags"] * 3
        fraud_score += row["Location-Inconsistent Transactions"] * 2
        fraud_score += row["Merchant Category Mismatch"] * 1.5
        fraud_score += row["User Daily Limit Exceeded"] * 1.5

        # Threshold for fraud detection
        return 1 if fraud_score > 4 else 0

    # Apply labeling
    df["Label"] = df.apply(label_transaction, axis=1)

    # Save the dataset to a CSV file
    df.to_csv(output_file, index=False)
    print(f"Dataset saved to {output_file}")
    print(df["Label"].value_counts())  # Print label distribution for verification

# Example Usage
generate_fraud_detection_dataset(n_samples=10000, output_file="fraud_detection_dataset.csv")


Dataset saved to fraud_detection_dataset.csv
Label
0    8338
1    1662
Name: count, dtype: int64


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Load the dataset
file_path = 'path_to_your_file.csv'  # Replace with your file path
fraud_data = pd.read_csv(file_path)

# Initialize scalers and encoders
scaler = MinMaxScaler()
label_encoder = LabelEncoder()

# Continuous features to normalize
continuous_features = [
    'Transaction Amount', 'Behavioral Biometrics', 'Time Since Last Transaction',
    'Social Trust Score', 'Account Age', 'Normalized Transaction Amount',
    'Transaction Context Anomalies'
]

# Normalize continuous features
fraud_data[continuous_features] = scaler.fit_transform(fraud_data[continuous_features])

# Encode categorical features
categorical_feature = 'Geo-Location Flags'  # Replace with any other categorical features as needed
fraud_data[categorical_feature] = label_encoder.fit_transform(fraud_data[categorical_feature])

# Save the preprocessed dataset (optional)
output_path = 'preprocessed_fraud_data.csv'
fraud_data.to_csv(output_path, index=False)

# Display the first few rows to verify
print(fraud_data.head())


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_file.csv'

In [None]:
import pandas as pd
import numpy as np

def generate_refined_fraud_dataset(n_samples=10000, fraud_ratio=0.5, seed=42, output_csv="refined_fraud_dataset.csv"):
    """
    Generates a refined synthetic fraud detection dataset with balanced fraud cases and realistic features.

    Parameters:
        n_samples (int): Total number of samples to generate.
        fraud_ratio (float): Desired ratio of fraudulent transactions (0 to 1).
        seed (int): Random seed for reproducibility.
        output_csv (str): Path to save the generated dataset as a CSV file.

    Returns:
        pd.DataFrame: The refined fraud detection dataset.
    """
    np.random.seed(seed)

    # Generate base features
    data = {
        "Transaction Amount": np.random.exponential(scale=500, size=n_samples),
        "Transaction Frequency": np.random.poisson(lam=3, size=n_samples),
        "Recipient Verification Status": np.random.choice(
            ["verified", "recently_registered", "suspicious"], n_samples, p=[0.7, 0.2, 0.1]
        ),
        "Recipient Blacklist Status": np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
        "Device Fingerprinting": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "VPN or Proxy Usage": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "Geo-Location Flags": np.random.choice(
            ["normal", "high-risk", "unusual"], n_samples, p=[0.8, 0.15, 0.05]
        ),
        "Behavioral Biometrics": np.random.normal(loc=0, scale=1, size=n_samples),
        "Time Since Last Transaction": np.random.uniform(0, 30, n_samples),
        "Social Trust Score": np.random.uniform(0, 100, n_samples),
        "Account Age": np.random.uniform(0, 5, n_samples),
        "High-Risk Transaction Times": np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        "Past Fraudulent Behavior Flags": np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
        "Location-Inconsistent Transactions": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "Normalized Transaction Amount": np.random.normal(loc=0.5, scale=0.2, size=n_samples),
        "Transaction Context Anomalies": np.random.normal(loc=0, scale=1, size=n_samples),
        "Fraud Complaints Count": np.random.poisson(lam=0.5, size=n_samples),
        "Merchant Category Mismatch": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "User Daily Limit Exceeded": np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
        "Recent High-Value Transaction Flags": np.random.choice([0, 1], n_samples, p=[0.85, 0.15])
    }

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Cap extreme values for realism
    caps = {
        "Transaction Amount": 5000,
        "Behavioral Biometrics": 3,
        "Time Since Last Transaction": 30,
    }
    for feature, cap in caps.items():
        df[feature] = df[feature].clip(upper=cap)

    # Compute Risk Interaction Score
    df["Risk Interaction Score"] = (
        df["Transaction Amount"] / 1000 +  # Normalize Transaction Amount
        df["Recipient Blacklist Status"] * 2 +
        df["Past Fraudulent Behavior Flags"] * 2 +
        df["VPN or Proxy Usage"] +
        df["Geo-Location Flags"].apply(lambda x: 2 if x == "high-risk" else 0)
    )

    # Assign labels dynamically
    fraud_threshold = 5
    df["Label"] = (df["Risk Interaction Score"] > fraud_threshold).astype(int)

    # Balance fraud cases
    fraud_cases = df[df["Label"] == 1]
    non_fraud_cases = df[df["Label"] == 0]

    # Determine number of samples for each class based on fraud_ratio
    n_fraud = int(n_samples * fraud_ratio)
    n_non_fraud = n_samples - n_fraud

    # Oversample or undersample to achieve balance
    fraud_cases = fraud_cases.sample(n=n_fraud, replace=True, random_state=seed)
    non_fraud_cases = non_fraud_cases.sample(n=n_non_fraud, replace=True, random_state=seed)

    # Combine and shuffle
    balanced_df = pd.concat([fraud_cases, non_fraud_cases]).sample(frac=1, random_state=seed).reset_index(drop=True)

    # Save to CSV
    balanced_df.to_csv(output_csv, index=False)

    return balanced_df

    # Example Usage
refined_dataset = generate_refined_fraud_dataset(
    n_samples=20000,
    fraud_ratio=0.5,
    seed=42,
    output_csv="refined_fraud_dataset.csv"
)
print(refined_dataset.head())


   Transaction Amount  Transaction Frequency Recipient Verification Status  \
0           36.907066                      6                      verified   
1           37.879209                      0                      verified   
2         1250.985077                      2           recently_registered   
3         2972.505568                      3           recently_registered   
4          501.334960                      2                      verified   

   Recipient Blacklist Status  Device Fingerprinting  VPN or Proxy Usage  \
0                           0                      0                   0   
1                           1                      0                   1   
2                           1                      0                   0   
3                           0                      0                   0   
4                           0                      0                   0   

  Geo-Location Flags  Behavioral Biometrics  Time Since Last Transaction  

In [5]:
import json

# File paths to your JSON files
user_friendly_path = "/content/top_100_transactions.json"       # Replace with your actual path
model_processed_path = "/content/transactions.json"

# Load user-friendly JSON data
with open(user_friendly_path, "r") as f:
    user_friendly_data = json.load(f)

# Load model-processed JSON data
with open(model_processed_path, "r") as f:
    model_processed_data = json.load(f)

# Ensure both datasets have the same length
min_length = min(len(user_friendly_data), len(model_processed_data))

# Combine the two datasets into a list of paired objects
combined_data = [
    {"user_friendly": user_friendly_data[i], "model_processed": model_processed_data[i]}
    for i in range(min_length)
]

# Save the combined data to a new JSON file
with open("mapped_transactions.json", "w") as f:
    json.dump(combined_data, f, indent=4)

print("Combined JSON saved as mapped_final_transactions.json")

Combined JSON saved as mapped_final_transactions.json


In [None]:


import pandas as pd
import numpy as np

def generate_refined_fraud_dataset(n_samples=10000, fraud_ratio=0.5, seed=42, output_csv="refined_fraud_dataset.csv"):
    """
    Generates a refined synthetic fraud detection dataset with balanced fraud cases and realistic features.

    Parameters:
        n_samples (int): Total number of samples to generate.
        fraud_ratio (float): Desired ratio of fraudulent transactions (0 to 1).
        seed (int): Random seed for reproducibility.
        output_csv (str): Path to save the generated dataset as a CSV file.

    Returns:
        pd.DataFrame: The refined fraud detection dataset.
    """
    np.random.seed(seed)

    # Generate base features
    data = {
        "Transaction Amount": np.abs(np.random.exponential(scale=500, size=n_samples)),
        "Transaction Frequency": np.random.poisson(lam=3, size=n_samples),
        "Recipient Verification Status": np.random.choice(
            ["verified", "recently_registered", "suspicious"], n_samples, p=[0.7, 0.2, 0.1]
        ),
        "Recipient Blacklist Status": np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
        "Device Fingerprinting": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "VPN or Proxy Usage": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "Geo-Location Flags": np.random.choice(
            ["normal", "high-risk", "unusual"], n_samples, p=[0.8, 0.15, 0.05]
        ),
        "Behavioral Biometrics": np.abs(np.random.normal(loc=0, scale=1, size=n_samples)),
        "Time Since Last Transaction": np.random.uniform(0, 30, n_samples),
        "Social Trust Score": np.random.uniform(0, 100, n_samples),
        "Account Age": np.random.uniform(0, 5, n_samples),
        "High-Risk Transaction Times": np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        "Past Fraudulent Behavior Flags": np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
        "Location-Inconsistent Transactions": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "Normalized Transaction Amount": np.abs(np.random.normal(loc=0.5, scale=0.2, size=n_samples)),
        "Transaction Context Anomalies": np.abs(np.random.normal(loc=0, scale=1, size=n_samples)),
        "Fraud Complaints Count": np.random.poisson(lam=0.5, size=n_samples),
        "Merchant Category Mismatch": np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        "User Daily Limit Exceeded": np.random.choice([0, 1], n_samples, p=[0.85, 0.15]),
        "Recent High-Value Transaction Flags": np.random.choice([0, 1], n_samples, p=[0.85, 0.15])
    }

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Cap extreme values for realism
    caps = {
        "Transaction Amount": 5000,
        "Behavioral Biometrics": 3,
        "Time Since Last Transaction": 30,
    }
    for feature, cap in caps.items():
        df[feature] = df[feature].clip(upper=cap)

    # Assign labels dynamically
    def label_transaction(row):
        fraud_score = 0
        fraud_score += row["Transaction Amount"] / 1000  # Normalize Transaction Amount impact
        fraud_score += row["Recipient Blacklist Status"] * 2
        fraud_score += row["Past Fraudulent Behavior Flags"] * 2
        fraud_score += row["VPN or Proxy Usage"]
        fraud_score += 2 if row["Geo-Location Flags"] == "high-risk" else 0
        return 1 if fraud_score > 5 else 0

    df["Label"] = df.apply(label_transaction, axis=1)

    # Balance fraud cases
    fraud_cases = df[df["Label"] == 1]
    non_fraud_cases = df[df["Label"] == 0]

    # Determine number of samples for each class based on fraud_ratio
    n_fraud = int(n_samples * fraud_ratio)
    n_non_fraud = n_samples - n_fraud

    # Oversample or undersample to achieve balance
    fraud_cases = fraud_cases.sample(n=n_fraud, replace=True, random_state=seed)
    non_fraud_cases = non_fraud_cases.sample(n=n_non_fraud, replace=True, random_state=seed)

    # Combine and shuffle
    balanced_df = pd.concat([fraud_cases, non_fraud_cases]).sample(frac=1, random_state=seed).reset_index(drop=True)

    # Save to CSV
    balanced_df.to_csv(output_csv, index=False)

    return balanced_df

    # Example Usage
refined_dataset = generate_refined_fraud_dataset(
    n_samples=20000,
    fraud_ratio=0.5,
    seed=42,
    output_csv="fraud_dataset.csv"
)
print(refined_dataset.head())



   Transaction Amount  Transaction Frequency Recipient Verification Status  \
0           36.907066                      6                      verified   
1           37.879209                      0                      verified   
2         1250.985077                      2           recently_registered   
3         2972.505568                      3           recently_registered   
4          501.334960                      2                      verified   

   Recipient Blacklist Status  Device Fingerprinting  VPN or Proxy Usage  \
0                           0                      0                   0   
1                           1                      0                   1   
2                           1                      0                   0   
3                           0                      0                   0   
4                           0                      0                   0   

  Geo-Location Flags  Behavioral Biometrics  Time Since Last Transaction  

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Input JSON data
raw_data = {
    "features": [
        36.90706552196081, 6, "verified", 0, 0, 0, "normal",
        0.3572874504464022, 23.82594842084447, 17.22576154101364, 3.934289851050419,
        0, 0, 0, 0.5205771170664302, 0.747143472779124, 0, 0, 0, 0
    ]
}

# Define column names (match original training data order)
numerical_cols = [
    "Transaction Amount", "Transaction Frequency", "Behavioral Biometrics",
    "Time Since Last Transaction", "Social Trust Score", "Account Age",
    "High-Risk Transaction Times", "Normalized Transaction Amount",
    "Transaction Context Anomalies", "Fraud Complaints Count",
    "Merchant Category Mismatch", "User Daily Limit Exceeded",
    "Recent High-Value Transaction Flags"
]

categorical_cols = ["Recipient Verification Status", "Geo-Location Flags"]

# Map the input data to a DataFrame
raw_features = pd.DataFrame([raw_data["features"]], columns=numerical_cols + categorical_cols)

# Initialize the scaler (you must fit it using the training data MinMaxScaler)
scaler = MinMaxScaler()

# Example of fitting the scaler (use your original training data for fitting)
# scaler.fit(X_train[numerical_cols])

# Scale numerical features
raw_features[numerical_cols] = scaler.transform(raw_features[numerical_cols])

# One-hot encode categorical features
raw_features = pd.get_dummies(raw_features, columns=categorical_cols, drop_first=True)

# Ensure the feature order matches the training data
# This step requires you to know the final column order after training preprocessing
final_order = [...]  # Replace with the correct column order from your training set
raw_features = raw_features.reindex(columns=final_order, fill_value=0)

# Convert to a list for model input
preprocessed_features = raw_features.iloc[0].tolist()

print(preprocessed_features)

ValueError: 15 columns passed, passed data had 20 columns