In [52]:
# Basic Tools
import pandas as pd
import numpy as np


from sklearn.preprocessing import StandardScaler


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, cross_validate



In [107]:
data = pd.read_csv('../data/processed/data_processed.csv', low_memory=False)


In [108]:
data_prep = data.copy()

In [109]:
data_prep.shape

(6362620, 18)

In [110]:
data_prep.isna().sum()

step                0
type                0
amount              0
name_orig           0
oldbalance_org      0
newbalance_orig     0
name_dest           0
oldbalance_dest     0
newbalance_dest     0
is_fraud            0
is_flagged_fraud    0
length_name_orig    0
length_name_dest    0
type_dest           0
dia                 0
hora                0
origin_missing      0
dest_missing        0
dtype: int64

In [111]:
data_prep.head()

Unnamed: 0,step,type,amount,name_orig,oldbalance_org,newbalance_orig,name_dest,oldbalance_dest,newbalance_dest,is_fraud,is_flagged_fraud,length_name_orig,length_name_dest,type_dest,dia,hora,origin_missing,dest_missing
0,1,PAYMENT,9839.64,1231006815,170136.0,160296.36,1979787155,0.0,0.0,0,0,11,11,M,1,0,0.0,9839.64
1,1,PAYMENT,1864.28,1666544295,21249.0,19384.72,2044282225,0.0,0.0,0,0,11,11,M,1,0,0.0,1864.28
2,1,TRANSFER,181.0,1305486145,181.0,0.0,553264065,0.0,0.0,1,0,11,10,C,1,0,0.0,181.0
3,1,CASH_OUT,181.0,840083671,181.0,0.0,38997010,21182.0,0.0,1,0,10,9,C,1,0,0.0,-21001.0
4,1,PAYMENT,11668.14,2048537720,41554.0,29885.86,1230701703,0.0,0.0,0,0,11,11,M,1,0,0.0,11668.14


In [112]:
data_prep.isna().sum()

step                0
type                0
amount              0
name_orig           0
oldbalance_org      0
newbalance_orig     0
name_dest           0
oldbalance_dest     0
newbalance_dest     0
is_fraud            0
is_flagged_fraud    0
length_name_orig    0
length_name_dest    0
type_dest           0
dia                 0
hora                0
origin_missing      0
dest_missing        0
dtype: int64

In [113]:
data_prep.columns

Index(['step', 'type', 'amount', 'name_orig', 'oldbalance_org',
       'newbalance_orig', 'name_dest', 'oldbalance_dest', 'newbalance_dest',
       'is_fraud', 'is_flagged_fraud', 'length_name_orig', 'length_name_dest',
       'type_dest', 'dia', 'hora', 'origin_missing', 'dest_missing'],
      dtype='object')

In [114]:
data_prep.dtypes

step                  int64
type                 object
amount              float64
name_orig             int64
oldbalance_org      float64
newbalance_orig     float64
name_dest             int64
oldbalance_dest     float64
newbalance_dest     float64
is_fraud              int64
is_flagged_fraud      int64
length_name_orig      int64
length_name_dest      int64
type_dest            object
dia                   int64
hora                  int64
origin_missing      float64
dest_missing        float64
dtype: object

# Encoding

In [115]:
map_type = {'PAYMENT': 0,'TRANSFER':1,'CASH_OUT': 2,'DEBIT': 3,'CASH_IN': 4}
data_prep['type'] = data_prep['type'].map(map_type)

In [116]:
map_type_dest = {'C': 0,'M':1}
data_prep['type_dest'] = data_prep['type_dest'].map(map_type_dest)

In [117]:
data['type_dest'].value_counts()

type_dest
C    4211125
M    2151495
Name: count, dtype: int64

In [118]:
data.shape

(6362620, 18)

# Feature Selection

In [91]:
SELECTED_FEATURES = ['step', 'type', 'amount', 'name_orig', 'oldbalance_org',
       'newbalance_orig', 'name_dest', 'oldbalance_dest', 'newbalance_dest',
       'is_flagged_fraud', 'length_name_orig', 'length_name_dest',
       'type_dest', 'dia', 'hora', 'origin_missing', 'dest_missing']



TARGET = 'is_fraud'

In [92]:
data_prep[SELECTED_FEATURES]

Unnamed: 0,type,name_orig,oldbalance_org,newbalance_orig,name_dest,oldbalance_dest,newbalance_dest,hora,dia,origin_missing,dest_missing,length_name_orig,length_name_dest
0,0,1231006815,170136.00,160296.36,1979787155,0.00,0.00,0,1,0.0,9839.64,11,11
1,0,1666544295,21249.00,19384.72,2044282225,0.00,0.00,0,1,0.0,1864.28,11,11
2,1,1305486145,181.00,0.00,553264065,0.00,0.00,0,1,0.0,181.00,11,10
3,2,840083671,181.00,0.00,38997010,21182.00,0.00,0,1,0.0,-21001.00,10,9
4,0,2048537720,41554.00,29885.86,1230701703,0.00,0.00,0,1,0.0,11668.14,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,2,786484425,339682.13,0.00,776919290,0.00,339682.13,22,31,0.0,679364.26,10,10
6362616,1,1529008245,6311409.28,0.00,1881841831,0.00,0.00,22,31,0.0,6311409.28,11,11
6362617,2,1162922333,6311409.28,0.00,1365125890,68488.84,6379898.11,22,31,0.0,12622818.55,11,11
6362618,1,1685995037,850002.52,0.00,2080388513,0.00,0.00,22,31,0.0,850002.52,11,11


# Normalization

Métodos:
* Feature Scaling

* Standardization

* Encoding

* Discretization

* Handling missing values

In [93]:
log_columns = data_prep[SELECTED_FEATURES].skew().sort_values(ascending=False)
log_columns = log_columns.loc[log_columns > 0.75]
print('Columns/Skew\n',log_columns)   

Columns/Skew
 dest_missing       30.331092
origin_missing     30.074746
oldbalance_dest    19.921758
newbalance_dest    19.352302
oldbalance_org      5.249136
newbalance_orig     5.176884
dtype: float64


In [94]:
cols_to_normalize = log_columns.index

In [95]:
cols_to_normalize

Index(['dest_missing', 'origin_missing', 'oldbalance_dest', 'newbalance_dest',
       'oldbalance_org', 'newbalance_orig'],
      dtype='object')

In [96]:


# # Crie uma instância do StandardScaler
# scaler = StandardScaler()

# # Ajuste o scaler apenas às colunas selecionadas e transforme essas colunas
# data_prep[cols_to_normalize] = scaler.fit_transform(data_prep[cols_to_normalize])


In [99]:
data_prep.isna().sum()

step                    0
type                    0
amount                  0
name_orig               0
oldbalance_org          0
newbalance_orig         0
name_dest               0
oldbalance_dest         0
newbalance_dest         0
is_fraud                0
is_flagged_fraud        0
length_name_orig        0
length_name_dest        0
type_dest               0
dia                     0
hora                    0
origin_missing          0
dest_missing        57366
dtype: int64

In [98]:
data_prep[cols_to_normalize] = np.log1p( data_prep[cols_to_normalize] )

  result = func(self.values, **kwargs)


In [100]:
data_prep['dest_missing'].fillna(0, inplace = True)

In [101]:
data_prep.head()

Unnamed: 0,step,type,amount,name_orig,oldbalance_org,newbalance_orig,name_dest,oldbalance_dest,newbalance_dest,is_fraud,is_flagged_fraud,length_name_orig,length_name_dest,type_dest,dia,hora,origin_missing,dest_missing
0,1,0,9839.64,1231006815,12.044359,11.984786,1979787155,0.0,0.0,0,0,11,11,1,1,0,0.0,9.194276
1,1,0,1864.28,1666544295,9.964112,9.872292,2044282225,0.0,0.0,0,0,11,11,1,1,0,0.0,7.531166
2,1,1,181.0,1305486145,5.204007,0.0,553264065,0.0,0.0,1,0,11,10,0,1,0,0.0,5.204007
3,1,2,181.0,840083671,5.204007,0.0,38997010,9.960954,0.0,1,0,10,9,0,1,0,0.0,0.0
4,1,0,11668.14,2048537720,10.634773,10.305174,1230701703,0.0,0.0,0,0,11,11,1,1,0,0.0,9.364703


In [102]:
y = data_prep[TARGET]
X = data_prep[SELECTED_FEATURES]

In [103]:
X.isna().sum()

type                0
name_orig           0
oldbalance_org      0
newbalance_orig     0
name_dest           0
oldbalance_dest     0
newbalance_dest     0
hora                0
dia                 0
origin_missing      0
dest_missing        0
length_name_orig    0
length_name_dest    0
dtype: int64

In [104]:
X.head()

Unnamed: 0,type,name_orig,oldbalance_org,newbalance_orig,name_dest,oldbalance_dest,newbalance_dest,hora,dia,origin_missing,dest_missing,length_name_orig,length_name_dest
0,0,1231006815,12.044359,11.984786,1979787155,0.0,0.0,0,1,0.0,9.194276,11,11
1,0,1666544295,9.964112,9.872292,2044282225,0.0,0.0,0,1,0.0,7.531166,11,11
2,1,1305486145,5.204007,0.0,553264065,0.0,0.0,0,1,0.0,5.204007,11,10
3,2,840083671,5.204007,0.0,38997010,9.960954,0.0,0,1,0.0,0.0,10,9
4,0,2048537720,10.634773,10.305174,1230701703,0.0,0.0,0,1,0.0,9.364703,11,11


In [105]:
seed=7
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state=seed,stratify=y)
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((5090096, 13), (5090096,), (1272524, 13), (1272524,))

In [106]:
data_prep['is_fraud'].value_counts()

is_fraud
0    6354407
1       8213
Name: count, dtype: int64