step - mapeia uma unidade de tempo no mundo real. Neste caso, 1 passo equivale a 1 hora. Total de passos: 744 (simulação de cerca de 30 dias; 744 horas equivalem a 31 dias).

type - tipo da transação: CASH_IN, CASH_OUT, DEBIT, PAYMENT e TRANSFER.

amount - valor (montante) da transação em moeda local.

nameOrig – cliente que iniciou a transação

oldbalanceOrg - saldo inicial antes da transação

newbalanceOrig - novo saldo após a transação

nameDest - cliente que é o destinatário da transação

oldbalanceDest - saldo inicial do destinatário antes da transação. Observe que não há informações para clientes cujo identificador começa com M (Comerciantes).

newbalanceDest - novo saldo do destinatário após a transação. Observe que não há informações para clientes cujo identificador começa com M (Comerciantes).

isFraud - São as transações realizadas pelos agentes fraudulentos dentro da simulação. Neste conjunto de dados específico, o comportamento fraudulento dos agentes visa lucrar assumindo o controle das contas dos clientes e tentando esvaziar os fundos transferindo para outra conta e depois sacando do sistema.

isFlaggedFraud – O modelo de negócios visa controlar transferências massivas de uma conta para outra e sinalizar tentativas ilegais. Uma tentativa ilegal neste conjunto de dados é uma tentativa de transferir mais de 200.000 em uma única transação.

In [50]:
# Basic Tools
import numpy as np
import pandas as pd
from datetime import datetime as dt

import inflection
import re


# File/ OS Tools
import os
import sys
from watermark import watermark

In [51]:
def rename_columns(dataframe):
    """Return a copy of ``dataframe`` whose column labels are snake_case.

    Every label goes through ``inflection.titleize``, has its spaces
    removed, is passed to ``inflection.transliterate`` and is finally
    converted with ``inflection.underscore``. The input frame itself is
    not modified.
    """
    def to_snake_case(label):
        # Same step order as always: titleize -> drop spaces ->
        # transliterate -> underscore.
        compact = inflection.titleize(label).replace(" ", "")
        return inflection.underscore(inflection.transliterate(compact))

    renamed = dataframe.copy()
    renamed.columns = [to_snake_case(label) for label in renamed.columns]
    return renamed

In [52]:
# Load the raw transactions and normalise the column labels to snake_case.
data_raw = rename_columns(
    pd.read_csv('../data/raw/data_raw.csv', low_memory=False)
)
# Work on a copy so that data_raw keeps the frame exactly as loaded.
data = data_raw.copy()

# Missing Data

In [53]:
data.isna().sum()

step                0
type                0
amount              0
name_orig           0
oldbalance_org      0
newbalance_orig     0
name_dest           0
oldbalance_dest     0
newbalance_dest     0
is_fraud            0
is_flagged_fraud    0
dtype: int64

# TIPO DE DADOS 

In [54]:
data.dtypes

step                  int64
type                 object
amount              float64
name_orig            object
oldbalance_org      float64
newbalance_orig     float64
name_dest            object
oldbalance_dest     float64
newbalance_dest     float64
is_fraud              int64
is_flagged_fraud      int64
dtype: object

In [55]:
data.head()

Unnamed: 0,step,type,amount,name_orig,oldbalance_org,newbalance_orig,name_dest,oldbalance_dest,newbalance_dest,is_fraud,is_flagged_fraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [56]:
data['type'].value_counts(normalize=True)

type
CASH_OUT    0.351663
PAYMENT     0.338146
CASH_IN     0.219923
TRANSFER    0.083756
DEBIT       0.006512
Name: proportion, dtype: float64

In [57]:
data['is_fraud'].value_counts(normalize=True)

is_fraud
0    0.998709
1    0.001291
Name: proportion, dtype: float64

In [58]:
data['is_fraud'].value_counts(normalize=True,ascending=True)

is_fraud
1    0.001291
0    0.998709
Name: proportion, dtype: float64

In [59]:
data.loc[data['is_fraud'] == 1].sample(5,ignore_index=True)

Unnamed: 0,step,type,amount,name_orig,oldbalance_org,newbalance_orig,name_dest,oldbalance_dest,newbalance_dest,is_fraud,is_flagged_fraud
0,67,CASH_OUT,42513.08,C263309564,42513.08,0.0,C1511183581,0.0,42513.08,1,0
1,260,TRANSFER,175597.64,C851065850,175597.64,0.0,C1996124584,0.0,0.0,1,0
2,589,CASH_OUT,164296.85,C337181561,164296.85,0.0,C701761965,1371866.69,1536163.54,1,0
3,394,TRANSFER,49950.63,C1012366293,49950.63,0.0,C1028846036,0.0,0.0,1,0
4,122,CASH_OUT,91637.51,C1556025752,91637.51,0.0,C1407010141,7336697.67,7428335.18,1,0


In [60]:
(data['is_fraud'] == 1).sum()

8213

In [61]:
# Split the columns by dtype: int64/float64 columns are the numeric
# attributes; everything else (datetimes excluded) is treated as categorical.
num_attributes = data.select_dtypes(include=['int64', 'float64'])
cat_attributes = data.select_dtypes(
    exclude=['int64', 'float64', 'datetime64[ns]'],
)

In [62]:
# Descriptive statistics of the numeric attributes.
# The reductions use pandas' vectorized column methods instead of pushing
# Python builtins (min/max) through DataFrame.apply, which iterates every
# value of every column at interpreter speed.

# Central tendency - mean, median
ct1 = pd.DataFrame(num_attributes.mean()).T
ct2 = pd.DataFrame(num_attributes.median()).T

# Dispersion - std, min, max, range, skew, kurtosis
# ddof=0 keeps the population standard deviation that np.std computes
# (pandas' own default would be the sample estimate, ddof=1).
d1 = pd.DataFrame(num_attributes.std(ddof=0)).T
d2 = pd.DataFrame(num_attributes.min()).T
d3 = pd.DataFrame(num_attributes.max()).T
d4 = pd.DataFrame(num_attributes.max() - num_attributes.min()).T
d5 = pd.DataFrame(num_attributes.skew()).T
d6 = pd.DataFrame(num_attributes.kurtosis()).T

# Concatenate: after the transpose there is one row per attribute and one
# column per statistic, in the order named below.
m = pd.concat([d2, d3, d4, ct1, ct2, d1, d5, d6]).T.reset_index()
m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,step,1.0,743.0,742.0,243.3972,239.0,142.332,0.375177,0.329071
1,amount,0.0,92445520.0,92445520.0,179861.9,74871.94,603858.2,30.993949,1797.956705
2,oldbalance_org,0.0,59585040.0,59585040.0,833883.1,14208.0,2888242.0,5.249136,32.964879
3,newbalance_orig,0.0,49585040.0,49585040.0,855113.7,0.0,2924048.0,5.176884,32.066985
4,oldbalance_dest,0.0,356015900.0,356015900.0,1100702.0,132705.665,3399180.0,19.921758,948.674125
5,newbalance_dest,0.0,356179300.0,356179300.0,1224996.0,214661.44,3674129.0,19.352302,862.156508
6,is_fraud,0.0,1.0,1.0,0.00129082,0.0,0.03590479,27.779538,769.702982
7,is_flagged_fraud,0.0,1.0,1.0,2.514687e-06,0.0,0.001585775,630.603629,397659.0625


In [76]:
# Integer code assigned to each transaction type label.
map_type = {
    'PAYMENT': 0,
    'TRANSFER': 1,
    'CASH_OUT': 2,
    'DEBIT': 3,
    'CASH_IN': 4,
}
# Series.map yields NaN for any label that is not a key of map_type.
data['type'] = data['type'].map(map_type)

In [77]:
def remove_1_letra(df, column_name):
    """Drop the first character of every string in ``df[column_name]``.

    The column is overwritten in place on ``df``; nothing is returned.
    """
    trimmed = df[column_name].str.slice(start=1)
    df[column_name] = trimmed




In [78]:
# Strip the leading letter from both account-id columns.
# NOTE(review): that letter is what separates customer ('C...') from
# merchant ('M...') ids in the raw data - confirm that dropping it is intended.
for id_column in ('name_orig', 'name_dest'):
    remove_1_letra(data, id_column)

In [79]:
data.head()

Unnamed: 0,step,type,amount,name_orig,oldbalance_org,newbalance_orig,name_dest,oldbalance_dest,newbalance_dest,is_fraud,is_flagged_fraud
0,1,0,9839.64,1231006815,170136.0,160296.36,1979787155,0.0,0.0,0,0
1,1,0,1864.28,1666544295,21249.0,19384.72,2044282225,0.0,0.0,0,0
2,1,1,181.0,1305486145,181.0,0.0,553264065,0.0,0.0,1,0
3,1,2,181.0,840083671,181.0,0.0,38997010,21182.0,0.0,1,0
4,1,0,11668.14,2048537720,41554.0,29885.86,1230701703,0.0,0.0,0,0


In [82]:
# Store the account ids as integers; they are expected to be digit-only
# strings once the leading letter has been stripped.
for id_column in ('name_dest', 'name_orig'):
    data[id_column] = data[id_column].astype('int64')

In [83]:
data.dtypes

step                  int64
type                  int64
amount              float64
name_orig             int64
oldbalance_org      float64
newbalance_orig     float64
name_dest             int64
oldbalance_dest     float64
newbalance_dest     float64
is_fraud              int64
is_flagged_fraud      int64
dtype: object

In [84]:
# Persist the processed frame; index=False keeps the row index out of the file.
data.to_csv("../data/processed/data_processed.csv", index=False)