In [31]:
import pandas as pd
import numpy as np
import json

In [32]:
# Total number of rows to generate across all visitor profiles.
NUM_SAMPLES = 100000

# Seed NumPy's global generator so that a fresh "Restart Kernel -> Run All"
# reproduces the same synthetic dataset (the generators below draw from
# np.random.binomial).
SEED = 42
np.random.seed(SEED)

# Per-profile generation parameters. Each top-level key is a visitor profile;
# the generator functions read its "traffic_pct", its label probabilities and
# its "attributes" mapping.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open('sample_parameters.json', 'r', encoding='utf-8') as fp:
    SAMPLE_DATA = json.load(fp)

# The attribute names of the "normal_visitor" profile define the attribute
# columns; every other profile is indexed with these same names.
ATTRIBUTES = SAMPLE_DATA["normal_visitor"]["attributes"]
# Label columns, generated before the attribute columns.
LABELS = ["manual_fraud", "bot_fraud", "visitor_profile"]

In [33]:
def _profile_sample_count(key, sample_data, num_samples):
    """Return how many rows the visitor profile `key` contributes.

    The count is ``traffic_pct * num_samples`` truncated to an integer. The
    product is rounded to 6 decimals first so that binary floating-point error
    cannot drop a row: ``0.29 * 100000`` evaluates to ``28999.999999999996``,
    which a bare ``int()`` would truncate to 28999. Genuinely fractional
    counts (e.g. 2.7) are still truncated.
    """
    return int(round(sample_data[key]["traffic_pct"] * num_samples, 6))


def generate_label(label, key, sample_data=None, num_samples=None):
    """Generate a label vector for a particular visitor profile.

    Args:
        label: Name of the label column. ``"visitor_profile"`` produces the
            profile name itself; any other label is looked up as
            ``sample_data[key][label]`` and used as a Bernoulli probability.
        key: Visitor profile name (a top-level key of ``sample_data``).
        sample_data: Profile parameters. Defaults to the module-level
            ``SAMPLE_DATA``.
        num_samples: Total dataset size the profile's ``traffic_pct`` is
            applied to. Defaults to the module-level ``NUM_SAMPLES``.

    Returns:
        For ``"visitor_profile"``, a list containing `key` repeated once per
        row; otherwise a NumPy array of 0/1 draws, one per row.

    Raises:
        KeyError: If `key`, ``"traffic_pct"`` or `label` is missing from
            ``sample_data``.
    """
    if sample_data is None:
        sample_data = SAMPLE_DATA
    if num_samples is None:
        num_samples = NUM_SAMPLES

    count = _profile_sample_count(key, sample_data, num_samples)
    if label == "visitor_profile":
        return [key] * count
    return np.random.binomial(1, sample_data[key][label], count)


def generate_attribute(attribute, key, sample_data=None, num_samples=None):
    """Generate an attribute vector for a particular visitor profile.

    Args:
        attribute: Name of the attribute column, looked up as
            ``sample_data[key]["attributes"][attribute]``.
        key: Visitor profile name (a top-level key of ``sample_data``).
        sample_data: Profile parameters. Defaults to the module-level
            ``SAMPLE_DATA``.
        num_samples: Total dataset size the profile's ``traffic_pct`` is
            applied to. Defaults to the module-level ``NUM_SAMPLES``.

    Returns:
        A NumPy array of 0/1 draws when the profile defines a probability for
        the attribute, or a list of ``None`` (one per row) when the stored
        value is ``None``.

    Raises:
        KeyError: If `key`, ``"traffic_pct"``, ``"attributes"`` or `attribute`
            is missing from ``sample_data``.
    """
    if sample_data is None:
        sample_data = SAMPLE_DATA
    if num_samples is None:
        num_samples = NUM_SAMPLES

    count = _profile_sample_count(key, sample_data, num_samples)
    probability = sample_data[key]["attributes"][attribute]
    if probability is None:
        # The profile has no value for this attribute: emit explicit missing
        # entries so the column keeps the same length as the others.
        return count * [None]
    return np.random.binomial(1, probability, count)

def generate_data():
    """Build the sample dataset described by the SAMPLE_DATA json file.

    Every label in LABELS and every attribute in ATTRIBUTES becomes one
    column. A column is the concatenation, in SAMPLE_DATA key order, of the
    per-profile vectors returned by generate_label / generate_attribute.

    Returns:
        A pandas DataFrame with the label columns first, followed by the
        attribute columns.
    """
    profiles = list(SAMPLE_DATA.keys())
    frame = pd.DataFrame()

    # Labels are generated first, then attributes; each group is paired with
    # the function that produces one profile's slice of a column.
    column_groups = (
        (LABELS, generate_label),
        (ATTRIBUTES, generate_attribute),
    )
    for names, make_column in column_groups:
        for name in names:
            frame[name] = [
                value
                for profile in profiles
                for value in make_column(name, profile)
            ]

    return frame


In [34]:
# Build the synthetic dataset once; the following cells inspect `sample_df`.
sample_df = generate_data()

In [35]:
# Display the generated frame (the rendering is truncated to the first and
# last rows/columns, with "..." marking what is hidden).
sample_df

Unnamed: 0,manual_fraud,bot_fraud,visitor_profile,hasUnusualLocation,hasUntrustedCountry,hasInsufficientRiskBudget,hasUnusualVisitTime,hasShortVisitorLife,hasSuspiciousIP,hasUnusualTimingCharacteristics,...,hasUnusualSessionTimeLength,hasShiftingLocation,hasTooManySimilarEvents,hasLargePurchaseSize,hasTooManyCardFingerprints,hasCardIssuerBankIPLocationMismatch,hasCardIssuerBankBillingLocationMismatch,hasCardIssuerBankShippingLocationMismatch,hasBillingToShippingCountryMismatch,hasSuspiciousEmail
0,0,0,normal_visitor,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0,0,normal_visitor,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0,0,normal_visitor,0,0,1,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0,0,normal_visitor,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,0,normal_visitor,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,1,bot_smart_card_cracking,1,1,1,1,1,1,1,...,1,0,0,0.0,1.0,0.0,1.0,1.0,0.0,1
99996,0,0,bot_smart_card_cracking,1,0,1,0,0,0,0,...,1,1,1,0.0,1.0,1.0,1.0,1.0,1.0,1
99997,0,0,bot_smart_card_cracking,1,0,1,1,0,1,1,...,1,0,1,0.0,1.0,1.0,1.0,1.0,0.0,1
99998,0,1,bot_smart_card_cracking,1,1,1,1,1,1,1,...,1,1,1,0.0,1.0,0.0,0.0,1.0,1.0,1


In [36]:
# Check the frame's dimensions as (rows, columns).
sample_df.shape

(100000, 21)