In [12]:
import time 


import pandas as pd
import numpy as np 
from collections import OrderedDict
from faker import Factory
from random import sample

In [13]:
fake = Factory.create()

In [14]:
companies = pd.read_csv('companies.csv', 
                        index_col=False)

In [15]:
# get company names 
company_names = list(companies['name'].values)

In [16]:
def simulate_customer(n):
    
    """simulate n customers, return a dataframe.
       Each customer has id, age, name, address, compromised: 0 or 1 
       where 1 if compromised and value is set by later function."""
    
    data = OrderedDict()
    
    data['id'] = np.arange(n)
    data['name'] = [fake.name() for n in range(n)]
    data['uuid'] = [fake.uuid4() for n in range(n)]
    data['age'] = np.random.randint(18, 85, size=n)
    data['addresses'] = np.array([fake.address().replace('\n', ', ') for n in range(n)])
    data['compromised'] = np.zeros(n)
    data['compromised_time'] = np.zeros(n)
    
    df = pd.DataFrame(data)
    return df

In [17]:
simulate_customer(3)

Unnamed: 0,id,name,uuid,age,addresses,compromised,compromised_time
0,0,Bryce Brooks,d0d3b935-2b64-381c-9c10-0ee84a497c4a,46,"37794 Molina Rest, North Brett, NV 99138-4310",0.0,0.0
1,1,Brittany Massey,29cff95a-0f13-130c-9adc-d607a1866b56,47,"23492 Jacob Dam Apt. 679, West Elizabethbury, ...",0.0,0.0
2,2,David Contreras,fa5bb5ab-4520-c77d-a982-748f4e711a32,83,"USS Gregory, FPO AA 97016-2124",0.0,0.0


In [18]:
def simulate_business(n, company, fraud_prob = 0.1):  # can adjust the fraud_prob
    
    data = OrderedDict()
    
    data['id'] = np.arange(n)
    data['company_name'] =sample(company, n)
    data['addresses'] = [fake.address().replace('\n', ', ') for n in range(n)]
    data['fraudsters'] = np.where(np.random.sample(size=n) < fraud_prob, 1, 0)
    
    df = pd.DataFrame(data)
    return df
                                        

In [42]:
def simulate_good_transaction_df(ppl, companies, max_trans = 50,
                              fraud_prob = 1,  earliest='-1mon',
                              latest = 'now', amin=10, amax=500):
    
    transactions = []
    transactions.append(['users', 'uuids', 'time', 'business',
                         'amount', 'dispute'])
    
    
    for p in range(ppl.shape[0]):
                        
        person = ppl.iloc[p]
        num_trans = np.random.randint(1, max_trans+1)
    
        for i in range(num_trans):
            
            transaction = []
        
            time = fake.date_time_between(start_date="-2mon", end_date="now", tzinfo=None)
            company = companies.sample(1)
            amount = np.round(np.random.uniform(amin, amax), decimals=2)
                        
            transaction.append(person['name'])
            transaction.append(person['uuid'])
            transaction.append(time),
            transaction.append(company['company_name'].values[0])
            transaction.append(amount)
            transaction.append(0)    # disputed is always zero for good transactions
            transactions.append(transaction)
        
            # if fraudsters are present, they may or may not steal
            # the users information
        
            if company['fraudsters'].values[0] == 1:
                
                if np.random.sample() < fraud_prob:
                    ppl.at[person['id'], 'compromised'] =  1
                    ppl.at[person['id'], 'compromised_time'] =  time
                    
    header = transactions.pop(0)
    df = pd.DataFrame(transactions, columns = header)      
    return df

In [50]:

def fraud(person, companies, time, amin = 5,amax=3000):

    business = companies.sample(1)
    amount = np.round(np.random.uniform(amin, amax), decimals=2)

    transaction = []
            
    transaction.append(person['name'])
    transaction.append(person['uuid'])
    transaction.append(time)
    transaction.append(companies['name'].values[0])
    transaction.append(amount)
    transaction.append(1) #disputed for bad transaction is always 1
    
    return transaction

In [51]:
def simulate_bad_transaction_df(ppl, companies,max_trans = 50, 
                              customer_fraud_detect_prob = 0.01,
                              latest = '+20d',amin = 5, amix=3000):
    
    transactions = []
    compromised = ppl[ppl['compromised'] ==1]
    

    for i in range(compromised.shape[0]):
        
        person = compromised.iloc[i]
        earliest = person['compromised_time']
        time = fake.date_time_between(start_date=earliest, end_date=latest, tzinfo=None)
        num_trans = np.random.randint(3, max_trans+1)
        
        for j in range(num_trans):
             
            if j == 1:
                transaction = fraud(person, companies, time)
                transactions.append(transaction)
                
            # Each fradulent transaction has +1% chance of being cause from the user
            # Imaginging they are maybe monitoring their transaction history
            # Or happen to be checking one thing or another
            if np.random.sample() < j/100:
                transaction = fraud(person, companies, time)
                transactions.append(transaction)
    
    header = transactions.pop(0)
    df = pd.DataFrame(transactions, columns = header)  
    
    return df
        

In [43]:
businesses = simulate_business(100, company_names)

In [44]:
customers = simulate_customer(50)

In [56]:
normal_trans_df = simulate_good_transaction_df(customers, businesses)
normal_trans_df.shape

(1290, 6)

In [57]:
normal_trans_df.head()

Unnamed: 0,users,uuids,time,business,amount,dispute
0,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-18 12:57:50,Agrium Inc.,151.89,0
1,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-18 12:57:27,Welltower Inc.,233.1,0
2,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-18 12:57:54,Imax Corporation,486.27,0
3,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-18 12:56:22,Baytex Energy Corp,32.24,0
4,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-18 12:57:21,Lennar Corporation,15.46,0


In [55]:
fraud_trans_df = simulate_bad_transaction_df(customers, companies)
fraud_trans_df.shape

(200, 6)

In [58]:
fraud_trans_df.head()

Unnamed: 0,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-24 18:17:10,3D Systems Corporation,1439.13,1
0,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-24 18:17:10,3D Systems Corporation,1776.64,1
1,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-24 18:17:10,3D Systems Corporation,1234.16,1
2,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-24 18:17:10,3D Systems Corporation,2354.3,1
3,Steven Lucas,2091806c-ff95-da79-6081-603e95dd00bc,2018-04-24 18:17:10,3D Systems Corporation,841.95,1
4,Carrie Kirk,b2cb2bf1-8ac8-d996-6cb6-4d405ebe0743,2018-05-01 10:50:15,3D Systems Corporation,2244.44,1
