In [1]:
import time 


import pandas as pd
import numpy as np 
from collections import OrderedDict
from faker import Factory
from random import sample

In [2]:
fake = Factory.create()

In [3]:
companies = pd.read_csv('companies.csv', 
                        index_col=False)

In [4]:
# get company names 
company_names = list(companies['name'].values)

In [5]:
def simulate_customer(n):
    
    """simulate n customers, return a dataframe.
       Each customer has id, age, name, address, compromised: 0 or 1 
       where 1 if compromised and value is set by later function."""
    
    data = OrderedDict()
    
    data['id'] = np.arange(n)
    data['name'] = [fake.name() for n in range(n)]
    data['uuid'] = [fake.uuid4() for n in range(n)]
    data['age'] = np.random.randint(18, 85, size=n)
    data['addresses'] = np.array([fake.address().replace('\n', ', ') for n in range(n)])
    data['compromised'] = np.zeros(n)
    data['compromised_time'] = np.zeros(n)
    
    df = pd.DataFrame(data)
    return df

In [6]:
simulate_customer(3)

Unnamed: 0,id,name,uuid,age,addresses,compromised,compromised_time
0,0,Steven Johnson,06e182cc-2b04-402d-a972-9c428baf9512,54,"01860 John Prairie Apt. 449, West Charlesview,...",0.0,0.0
1,1,Tony Galloway,358b4247-5350-3cd8-a311-30dcc9d32684,21,"19588 Heather Summit, North Gregory, IN 05388-...",0.0,0.0
2,2,Denise Miller,7cc59a4c-72ad-2f49-caa6-bf79f4f528f3,41,"293 Megan Glen, East Brian, TN 58477",0.0,0.0


In [7]:
def simulate_business(n, company, fraud_prob = 0.1):
    
    data = OrderedDict()
    
    data['id'] = np.arange(n)
    data['company_name'] =sample(company, n)
    data['addresses'] = [fake.address().replace('\n', ', ') for n in range(n)]
    data['fraudsters'] = np.where(np.random.sample(size=n) < fraud_prob, 1, 0)
    
    df = pd.DataFrame(data)
    return df
                                        

In [8]:
def simulate_good_transaction(ppl, companies, max_trans = 50,
                              fraud_prob = 1,  earliest='-1mon',
                              latest = 'now', amin=10, amax=500):
    
    transactions = []
    
    for p in range(ppl.shape[0]):
    
        person = ppl.iloc[p]
        num_trans = np.random.randint(1, max_trans+1)
    
        for i in range(num_trans):
        
            time = fake.date_time_between(start_date="-2mon", end_date="now", tzinfo=None)
            company = companies.sample(1)
            amount = np.round(np.random.uniform(amin, amax), decimals=2)
        
            # if fraudsters are present, they may or may not steal
            # the users information
        
            if company['fraudsters'].values[0] == 1:
                
                if np.random.sample() < fraud_prob:
                    ppl.at[person['id'], 'compromised'] =  1
                    ppl.at[person['id'], 'compromised_time'] =  time
            
            transaction = '{time} uuid={uuid} user="{user}" business="{business}" amount={amount} disputed=false'.format(
                    time = time,
                    uuid = person['uuid'],
                    user = person['name'],
                    business = company['company_name'].values[0],
                    amount = amount)

            transactions.append(transaction)
            
    return transactions

In [27]:

def fraud(person, companies, time, amin = 5,amax=3000):

    business = companies.sample(1)
    amount = np.round(np.random.uniform(amin, amax), decimals=2)

    transaction = '{time} uuid={uuid} user="{user}" business="{business}" amount={amount} disputed=true'\
    .format(
            time = time,
            uuid = person['uuid'],
            user = person['name'],
            business = companies['name'].values[0],
            amount = amount
    )
    
    return transaction

In [28]:
def simulate_bad_transaction(ppl, companies,max_trans = 50, 
                              customer_fraud_detect_prob = 0.01,
                              latest = '+20d',amin = 5, amix=3000):
    
    transactions = []
    compromised = ppl[ppl['compromised'] ==1]
    

    for i in range(compromised.shape[0]):
        
        person = compromised.iloc[i]
        earliest = person['compromised_time']
        time = fake.date_time_between(start_date=earliest, end_date=latest, tzinfo=None)
        num_trans = np.random.randint(3, max_trans+1)
        
        for j in range(num_trans):
             
            if j == 1:
                transaction = fraud(person, companies, time)
                transactions.append(transaction)
                
            # Each fradulent transaction has +1% chance of being cause from the user
            # Imaginging they are maybe monitoring their transaction history
            # Or happen to be checking one thing or another
            if np.random.sample() < j/100:
                transaction = fraud(person, companies, time)
                transactions.append(transaction)
                
                
    return transactions
        

In [29]:
businesses = simulate_business(100, company_names)

In [30]:
customers = simulate_customer(50)

In [31]:
normal_trans = simulate_good_transaction(customers, businesses)

In [32]:
normal_trans[:10]

['2018-04-15 23:39:33 uuid=741596b9-657a-5135-6092-9db8d57ac628 user="Christopher Sparks" business="Two Harbors Investments Corp" amount=404.04 disputed=false',
 '2018-04-15 23:40:22 uuid=741596b9-657a-5135-6092-9db8d57ac628 user="Christopher Sparks" business="Western Asset Mortgage Defined Opportunity Fund Inc" amount=363.12 disputed=false',
 '2018-04-15 23:41:15 uuid=741596b9-657a-5135-6092-9db8d57ac628 user="Christopher Sparks" business="YuMe, Inc." amount=96.82 disputed=false',
 '2018-04-15 23:40:57 uuid=741596b9-657a-5135-6092-9db8d57ac628 user="Christopher Sparks" business="Two Harbors Investments Corp" amount=113.64 disputed=false',
 '2018-04-15 23:39:39 uuid=741596b9-657a-5135-6092-9db8d57ac628 user="Christopher Sparks" business="Zhaopin Limited" amount=203.08 disputed=false',
 '2018-04-15 23:40:28 uuid=741596b9-657a-5135-6092-9db8d57ac628 user="Christopher Sparks" business="Marine Harvest ASA" amount=160.55 disputed=false',
 '2018-04-15 23:39:41 uuid=741596b9-657a-5135-6092-9d

In [33]:
fraud_trans = simulate_bad_transaction(customers, companies)

In [34]:
fraud_trans[:10]

['2018-04-23 03:24:28 uuid=0edc815e-4ac3-e5b4-f50b-5bb5f8eff9d3 user="Renee Lee" business="3D Systems Corporation" amount=2375.92 disputed=true',
 '2018-04-23 03:24:28 uuid=0edc815e-4ac3-e5b4-f50b-5bb5f8eff9d3 user="Renee Lee" business="3D Systems Corporation" amount=2593.49 disputed=true',
 '2018-04-23 03:24:28 uuid=0edc815e-4ac3-e5b4-f50b-5bb5f8eff9d3 user="Renee Lee" business="3D Systems Corporation" amount=2408.14 disputed=true',
 '2018-04-23 03:24:28 uuid=0edc815e-4ac3-e5b4-f50b-5bb5f8eff9d3 user="Renee Lee" business="3D Systems Corporation" amount=2079.99 disputed=true',
 '2018-04-23 03:24:28 uuid=0edc815e-4ac3-e5b4-f50b-5bb5f8eff9d3 user="Renee Lee" business="3D Systems Corporation" amount=377.39 disputed=true',
 '2018-04-23 03:24:28 uuid=0edc815e-4ac3-e5b4-f50b-5bb5f8eff9d3 user="Renee Lee" business="3D Systems Corporation" amount=1216.69 disputed=true',
 '2018-04-23 03:24:28 uuid=0edc815e-4ac3-e5b4-f50b-5bb5f8eff9d3 user="Renee Lee" business="3D Systems Corporation" amount=163