In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the transaction table and discard the 'isFlaggedFraud' column,
# then preview the first rows.
original_data = (
    pd.read_csv('original_data.csv')
    .drop(columns=['isFlaggedFraud'])
)
original_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0


In [3]:
# Number of rows for each value of the 'isFraud' label.
original_data.loc[:, 'isFraud'].value_counts()

isFraud
0    6354407
1       8213
Name: count, dtype: int64

## Split Fraud Data

In [4]:
# Keep only the rows labelled as fraud (isFraud == 1) and preview them.
fraud_data = original_data.loc[original_data['isFraud'].eq(1)]
fraud_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
251,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1
252,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1
680,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,1


In [5]:
# Draw a fixed-size, seeded sample of the fraud rows so that every run of the
# notebook writes the same fraud_data.csv.
#
# The sample is taken from original_data rather than from the current
# fraud_data, so re-running this cell on its own does not re-sample a frame
# that was already sampled.
#
# Note: sample() without replacement raises ValueError when fewer than
# n rows are available.
fraud_data = (
    original_data[original_data['isFraud'] == 1]
    .sample(n=8000, random_state=42)
)
fraud_data.to_csv('fraud_data.csv', index=False)

## Split Normal Data

In [6]:
# Keep only the rows labelled as non-fraud (isFraud == 0) and preview them.
normal_data = original_data.loc[original_data['isFraud'].eq(0)]
normal_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0


In [7]:
# Write three batches of 8000 normal rows each to normal_1.csv .. normal_3.csv.
#
# All rows are drawn in one seeded sample and then sliced, which gives two
# guarantees the per-iteration unseeded sample() calls did not:
#   * the files are identical on every run (fixed random_state);
#   * no normal row lands in more than one batch (sampling is without
#     replacement across the whole pool).
n_batches = 3
batch_size = 8000
normal_pool = normal_data.sample(n=n_batches * batch_size, random_state=42)

for i in range(1, n_batches + 1):
    # Rows [(i-1)*batch_size, i*batch_size) of the shuffled pool form batch i.
    normal_batch = normal_pool.iloc[(i - 1) * batch_size:i * batch_size]
    normal_batch.to_csv('normal_{}.csv'.format(i), index=False)

## Concat Small Batches

In [9]:
# Load the fraud sample back from the CSV written earlier and display it.
fraud_data = pd.read_csv(filepath_or_buffer='fraud_data.csv')
fraud_data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,266,TRANSFER,10000000.00,C1749184895,25494862.93,15494862.93,C1827220106,0.00,0.00,1
1,419,CASH_OUT,426323.38,C1078529507,426323.38,0.00,C1957980970,102216.61,528539.99,1
2,664,CASH_OUT,84771.55,C528998506,84771.55,0.00,C1971161098,1016318.82,1101090.37,1
3,506,CASH_OUT,10000000.00,C1180822515,10000000.00,0.00,C758711155,1920358.24,11920358.24,1
4,355,CASH_OUT,883366.69,C149173089,883366.69,0.00,C504582825,2029536.87,2912903.56,1
...,...,...,...,...,...,...,...,...,...,...
7995,46,CASH_OUT,71372.00,C313736303,71372.00,0.00,C935310781,613464.04,684836.04,1
7996,686,TRANSFER,1411271.23,C1426773355,1411271.23,0.00,C285961012,0.00,0.00,1
7997,439,TRANSFER,203119.08,C1294508143,203119.08,0.00,C1542262415,0.00,0.00,1
7998,356,TRANSFER,5952739.30,C1569009550,5952739.30,0.00,C1429463996,0.00,0.00,1


In [10]:
# For each normal batch file, stack the fraud sample on top of it
# (fraud rows first, normal rows after) and save the result as batch_<i>.csv.
for i in (1, 2, 3):
    source_name = 'normal_{}.csv'.format(i)
    target_name = 'batch_{}.csv'.format(i)

    normal_batch = pd.read_csv(source_name)
    batch = pd.concat((fraud_data, normal_batch))
    batch.to_csv(target_name, index=False)

## Check Balance in Batch

In [11]:
# Load the first combined batch from disk and display it.
batch_1 = pd.read_csv(filepath_or_buffer='batch_1.csv')
batch_1

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,266,TRANSFER,10000000.00,C1749184895,25494862.93,15494862.93,C1827220106,0.00,0.00,1
1,419,CASH_OUT,426323.38,C1078529507,426323.38,0.00,C1957980970,102216.61,528539.99,1
2,664,CASH_OUT,84771.55,C528998506,84771.55,0.00,C1971161098,1016318.82,1101090.37,1
3,506,CASH_OUT,10000000.00,C1180822515,10000000.00,0.00,C758711155,1920358.24,11920358.24,1
4,355,CASH_OUT,883366.69,C149173089,883366.69,0.00,C504582825,2029536.87,2912903.56,1
...,...,...,...,...,...,...,...,...,...,...
15995,325,CASH_OUT,372055.05,C883067999,100221.00,0.00,C465916758,590011.08,962066.13,0
15996,303,CASH_OUT,540169.00,C683026612,204911.00,0.00,C1294695056,14516.73,554685.74,0
15997,209,TRANSFER,1125992.27,C771882649,0.00,0.00,C458873217,1300468.96,2426461.23,0
15998,305,PAYMENT,12505.76,C263457100,383886.74,371380.97,M758191903,0.00,0.00,0


In [12]:
# Number of rows for each value of the 'isFraud' label in the first batch.
batch_1.loc[:, 'isFraud'].value_counts()

isFraud
1    8000
0    8000
Name: count, dtype: int64