In [1]:
import time 


import pandas as pd
import numpy as np 
from collections import OrderedDict
from faker import Factory
from random import sample

In [2]:
fake = Factory.create()

In [3]:
companies = pd.read_csv('companies.csv', 
                        index_col=False)

In [4]:
# get company names 
company_names = list(companies['name'].values)

In [5]:
def simulate_customer(n):
    
    """simulate n customers, return a dataframe.
       Each customer has id, age, name, address, compromised: 0 or 1 
       where 1 if compromised and value is set by later function."""
    
    data = OrderedDict()
    
    data['id'] = np.arange(n)
    data['name'] = [fake.name() for n in range(n)]
    data['uuid'] = [fake.uuid4() for n in range(n)]
    data['age'] = np.random.randint(18, 85, size=n)
    data['addresses'] = np.array([fake.address().replace('\n', ', ') for n in range(n)])
    data['compromised'] = np.zeros(n)
    data['compromised_time'] = np.zeros(n)
    
    df = pd.DataFrame(data)
    return df

In [7]:
def simulate_business(n, company, fraud_prob = 0.1):  # can adjust the fraud_prob
    
    data = OrderedDict()
    
    data['id'] = np.arange(n)
    data['company_name'] =sample(company, n)
    data['addresses'] = [fake.address().replace('\n', ', ') for n in range(n)]
    data['fraudsters'] = np.where(np.random.sample(size=n) < fraud_prob, 1, 0)
    
    df = pd.DataFrame(data)
    return df
                                        

In [8]:
def simulate_good_transaction_df(ppl, companies, max_trans = 50,
                              fraud_prob = 1,  earliest='-1mon',
                              latest = 'now', amin=10, amax=500):
    
    transactions = []
    transactions.append(['users', 'uuids', 'time', 'business',
                         'amount', 'dispute'])
    
    
    for p in range(ppl.shape[0]):
                        
        person = ppl.iloc[p]
        num_trans = np.random.randint(1, max_trans+1)
    
        for i in range(num_trans):
            
            transaction = []
        
            time = fake.date_time_between(start_date="-2mon", end_date="now", tzinfo=None)
            company = companies.sample(1)
            amount = np.round(np.random.uniform(amin, amax), decimals=2)
                        
            transaction.append(person['name'])
            transaction.append(person['uuid'])
            transaction.append(time),
            transaction.append(company['company_name'].values[0])
            transaction.append(amount)
            transaction.append(0)    # disputed is always zero for good transactions
            transactions.append(transaction)
        
            # if fraudsters are present, they may or may not steal
            # the users information
        
            if company['fraudsters'].values[0] == 1:
                
                if np.random.sample() < fraud_prob:
                    ppl.at[person['id'], 'compromised'] =  1
                    ppl.at[person['id'], 'compromised_time'] =  time
                    
    header = transactions.pop(0)
    df = pd.DataFrame(transactions, columns = header)      
    return df

In [9]:

def fraud(person, companies, time, amin = 5,amax=3000):

    business = companies.sample(1)
    amount = np.round(np.random.uniform(amin, amax), decimals=2)

    transaction = []
            
    transaction.append(person['name'])
    transaction.append(person['uuid'])
    transaction.append(time)
    transaction.append(companies['name'].values[0])
    transaction.append(amount)
    transaction.append(1) #disputed for bad transaction is always 1
    
    return transaction

In [10]:
def simulate_bad_transaction_df(ppl, companies,max_trans = 50, 
                              customer_fraud_detect_prob = 0.01,
                              latest = '+20d',amin = 5, amix=3000):
    
    transactions = []
    transactions.append(['users', 'uuids', 'time', 'business',
                         'amount', 'dispute'])
    
    compromised = ppl[ppl['compromised'] ==1]
    

    for i in range(compromised.shape[0]):
        
        person = compromised.iloc[i]
        earliest = person['compromised_time']
        time = fake.date_time_between(start_date=earliest, end_date=latest, tzinfo=None)
        num_trans = np.random.randint(3, max_trans+1)
        
        for j in range(num_trans):
             
            if j == 1:
                transaction = fraud(person, companies, time)
                transactions.append(transaction)
                
            # Each fradulent transaction has +1% chance of being cause from the user
            # Imaginging they are maybe monitoring their transaction history
            # Or happen to be checking one thing or another
            if np.random.sample() < j/100:
                transaction = fraud(person, companies, time)
                transactions.append(transaction)
    
    header = transactions.pop(0)
    df = pd.DataFrame(transactions, columns = header)  
    
    return df
        

In [11]:
businesses = simulate_business(100, company_names)

In [12]:
customers = simulate_customer(50)

In [13]:
normal_trans_df = simulate_good_transaction_df(customers, businesses)
normal_trans_df.shape

(1390, 6)

In [14]:
normal_trans_df.head()

Unnamed: 0,users,uuids,time,business,amount,dispute
0,Michael Mercado,1776f23c-ebfa-37b3-397f-18784791e578,2018-04-18 13:02:21,Haemonetics Corporation,387.54,0
1,Michael Mercado,1776f23c-ebfa-37b3-397f-18784791e578,2018-04-18 13:03:07,Unisys Corporation,320.93,0
2,Lawrence Strickland,364ddd21-15e6-17d1-5729-0213db374e2c,2018-04-18 13:02:20,"St. Jude Medical, Inc.",99.6,0
3,Lawrence Strickland,364ddd21-15e6-17d1-5729-0213db374e2c,2018-04-18 13:02:22,Blackrock New York Municipal Income Quality Trust,63.59,0
4,Lawrence Strickland,364ddd21-15e6-17d1-5729-0213db374e2c,2018-04-18 13:02:56,Qwest Corporation,33.28,0


In [15]:
fraud_trans_df = simulate_bad_transaction_df(customers, companies)
fraud_trans_df.shape

(175, 6)

In [16]:
fraud_trans_df.head()

Unnamed: 0,users,uuids,time,business,amount,dispute
0,Lawrence Strickland,364ddd21-15e6-17d1-5729-0213db374e2c,2018-04-23 03:57:20,3D Systems Corporation,1801.05,1
1,Jamie Greene,f735104f-8ab6-521f-e856-eb12fffd4cbf,2018-04-26 10:12:57,3D Systems Corporation,1992.92,1
2,Jamie Greene,f735104f-8ab6-521f-e856-eb12fffd4cbf,2018-04-26 10:12:57,3D Systems Corporation,2901.24,1
3,Juan Butler,76143cff-5e4d-e7f2-e446-184626809a54,2018-05-04 17:20:55,3D Systems Corporation,443.88,1
4,Juan Butler,76143cff-5e4d-e7f2-e446-184626809a54,2018-05-04 17:20:55,3D Systems Corporation,1754.47,1
