step - mapeia uma unidade de tempo no mundo real. Neste caso, 1 passo equivale a 1 hora. Total de passos: 744 (simulação de 30 dias; observe que 744 horas correspondem a 31 dias).

type - tipo da transação: CASH_IN, CASH_OUT, DEBIT, PAYMENT e TRANSFER.

amount - valor (montante) da transação em moeda local.

nameOrig – cliente que iniciou a transação

oldbalanceOrg - saldo inicial antes da transação

newbalanceOrig - novo saldo após a transação

nameDest - cliente que é o destinatário da transação

oldbalanceDest - saldo inicial do destinatário antes da transação. Observe que não há informações para clientes cujo identificador começa com M (Comerciantes).

newbalanceDest - novo saldo do destinatário após a transação. Observe que não há informações para clientes cujo identificador começa com M (Comerciantes).

isFraud - São as transações realizadas pelos agentes fraudulentos dentro da simulação. Neste conjunto de dados específico, o comportamento fraudulento dos agentes visa lucrar assumindo o controle das contas dos clientes e tentando esvaziar os fundos transferindo para outra conta e depois sacando do sistema.

isFlaggedFraud – O modelo de negócios visa controlar transferências massivas de uma conta para outra e sinalizar tentativas ilegais. Uma tentativa ilegal neste conjunto de dados é uma tentativa de transferir mais de 200.000 em uma única transação.

In [1]:
# Basic Tools
import numpy as np
import pandas as pd
from datetime import datetime as dt

import inflection



# File/ OS Tools
import os
import sys
from watermark import watermark

In [None]:
def rename_columns(dataframe):
    """Return a copy of ``dataframe`` with normalised column labels.

    Each label is passed through ``inflection.titleize``, has its spaces
    removed, is transliterated with ``inflection.transliterate`` and is
    finally converted with ``inflection.underscore``. The input frame is
    left unmodified; the renamed copy is returned.
    """
    def _normalise(label):
        # Same pipeline order as before: titleize -> strip spaces ->
        # transliterate -> underscore.
        titled = inflection.titleize(label)
        compact = titled.replace(" ", "")
        return inflection.underscore(inflection.transliterate(compact))

    renamed = dataframe.copy()
    renamed.columns = [_normalise(label) for label in renamed.columns]
    return renamed

In [4]:
# Load the raw CSV and pass it through rename_columns.
# NOTE(review): cells further down index columns by their original camelCase
# labels (e.g. data['isFraud']) -- confirm whether the renamed labels are
# the ones intended for the rest of the notebook.
data_raw = pd.read_csv('../data/raw/data_raw.csv',low_memory=False)
data_raw = rename_columns(data_raw)
# Later cells operate on this separate copy rather than on data_raw itself
data = data_raw.copy()

NameError: name 'rename_columns' is not defined

# Missing Data

In [None]:
# Count the missing values in each column
data.isna().sum()

# TIPO DE DADOS 

In [5]:
# Show the dtype of every column
data.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

In [22]:
# Integer code assigned to each transaction-type label.
# NOTE(review): not referenced by any cell visible here -- confirm where it is applied.
map_type = {'PAYMENT': 0,'TRANSFER':1,'CASH_OUT': 2,'DEBIT': 3,'CASH_IN': 4}

In [6]:
# Relative frequency of each transaction type
data['type'].value_counts(normalize=True)

type
CASH_OUT    0.351663
PAYMENT     0.338146
CASH_IN     0.219923
TRANSFER    0.083756
DEBIT       0.006512
Name: proportion, dtype: float64

In [7]:
# Relative frequency of each isFraud value
data['isFraud'].value_counts(normalize=True)

isFraud
0    0.998709
1    0.001291
Name: proportion, dtype: float64

In [8]:
# Same proportions, ordered from the least to the most frequent value
data['isFraud'].value_counts(normalize=True,ascending=True)

isFraud
1    0.001291
0    0.998709
Name: proportion, dtype: float64

In [9]:
# Random sample of 5 rows with isFraud == 1; ignore_index relabels them 0..4
data.loc[data['isFraud'] == 1].sample(5,ignore_index=True)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,723,CASH_OUT,6199009.0,C935658927,6199009.0,0.0,C1938522935,938804.47,7137813.47,1,0
1,242,CASH_OUT,3139423.35,C1627766514,3139423.35,0.0,C1840253816,2593641.94,5733065.29,1,0
2,266,CASH_OUT,1911239.21,C951381310,1911239.21,0.0,C1006459134,188720.75,2099959.96,1,0
3,185,CASH_OUT,63009.43,C289965777,63009.43,0.0,C268959010,1057339.93,1120349.36,1,0
4,629,TRANSFER,547672.2,C1661851556,547672.2,0.0,C1014616442,0.0,0.0,1,0


In [10]:
# Number of rows with isFraud == 1
(data['isFraud'] == 1).sum()

8213

In [11]:
# Split the frame by dtype: int64/float64 columns on one side, every other
# column (apart from datetime64[ns]) on the other.
numeric_dtypes = ['int64', 'float64']
num_attributes = data.select_dtypes(include=numeric_dtypes)
cat_attributes = data.select_dtypes(exclude=numeric_dtypes + ['datetime64[ns]'])

In [12]:
def _stat_row(func):
    """Apply ``func`` to every column of ``num_attributes``; return a one-row frame."""
    return pd.DataFrame(num_attributes.apply(func)).T


# Central tendency: mean and median
ct1 = _stat_row(np.mean)
ct2 = _stat_row(np.median)

# Dispersion: std, min, max, range, skew, kurtosis
d1 = _stat_row(np.std)
d2 = _stat_row(min)
d3 = _stat_row(max)
d4 = _stat_row(lambda col: col.max() - col.min())
d5 = _stat_row(lambda col: col.skew())
d6 = _stat_row(lambda col: col.kurtosis())

# Stack the one-row frames, then transpose so each row describes one attribute
stat_rows = [d2, d3, d4, ct1, ct2, d1, d5, d6]
m = pd.concat(stat_rows).T.reset_index()
m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,step,1.0,743.0,742.0,243.3972,239.0,142.332,0.375177,0.329071
1,amount,0.0,92445520.0,92445520.0,179861.9,74871.94,603858.2,30.993949,1797.956705
2,oldbalanceOrg,0.0,59585040.0,59585040.0,833883.1,14208.0,2888242.0,5.249136,32.964879
3,newbalanceOrig,0.0,49585040.0,49585040.0,855113.7,0.0,2924048.0,5.176884,32.066985
4,oldbalanceDest,0.0,356015900.0,356015900.0,1100702.0,132705.665,3399180.0,19.921758,948.674125
5,newbalanceDest,0.0,356179300.0,356179300.0,1224996.0,214661.44,3674129.0,19.352302,862.156508
6,isFraud,0.0,1.0,1.0,0.00129082,0.0,0.03590479,27.779538,769.702982
7,isFlaggedFraud,0.0,1.0,1.0,2.514687e-06,0.0,0.001585775,630.603629,397659.0625


In [13]:
# Preview the first rows of the frame
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [15]:
# Summary statistics from describe(), rendered through a Styler with a background gradient
desc = data.describe().style.background_gradient()
desc

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.397246,179861.903549,833883.104074,855113.668579,1100701.66652,1224996.398202,0.001291,3e-06
std,142.331971,603858.231463,2888242.673038,2924048.502954,3399180.112994,3674128.94212,0.035905,0.001586
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.665,214661.44,0.0,0.0
75%,335.0,208721.4775,107315.175,144258.41,943036.7075,1111909.25,0.0,0.0
max,743.0,92445516.64,59585040.37,49585040.37,356015889.35,356179278.92,1.0,1.0


In [14]:
# Write the frame to the processed-data CSV, leaving out the index column
data.to_csv("../data/processed/data_processed.csv", index=False)

In [16]:
# Absolute count of each isFraud value
data['isFraud'].value_counts()

isFraud
0    6354407
1       8213
Name: count, dtype: int64