# Part 02 Feature Engineering

# 0.0. Imports

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Statistics
from scipy import stats
from pandas_profiling import ProfileReport

# Graphs
import matplotlib.pyplot as plt
import seaborn as sns

# Load images
from IPython.display import Image

# Warning
import warnings
warnings.filterwarnings( 'ignore' )

# 3.0. Feature Engineering
Theoretically, the difference between a previous transaction and a subsequent one would be equal to the total transferred:

>amount = oldbalanceOrg - newbalanceOrig

The same could be said for Destinations:

>newbalanceDest = amount + oldbalanceDest

In [2]:
# Load dataset
feature_dtypes = {'step': 'int16',
                  'type': 'category',
                  'amount': 'float32',
                  'nameOrig': 'object',
                  'oldbalanceOrg': 'float32',
                  'newbalanceOrig': 'float32',
                  'nameDest': 'object',
                  'oldbalanceDest': 'float32',
                  'newbalanceDest': 'float32',
                  'isFraud': 'uint8',
                  'isFlaggedFraud': 'uint8'}

df1 = pd.read_csv('dataset\df1.csv', dtype=feature_dtypes)

Unnamed: 0,step,type,amount,name_orig,oldbalance_orig,newbalance_orig,name_dest,oldbalance_dest,newbalance_dest,is_fraud,is_flagged_fraud
0,1,PAYMENT,9839.639648,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.280029,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.139648,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
# Origin balance
df1['error_orig'] = df1.oldbalance_orig - df1.newbalance_orig - df1.amount

# Destination balance
df1['error_dest'] = df1.oldbalance_dest + df1.amount - df1.newbalance_dest

A new feature classifying the type of customers.

In [4]:
# Customer type for name origin
# df1['orig_type'] = df1['name_orig'].str.replace(r'[0-9]', regex=True, repl='')

# Customer type for name destination
df1['dest_type'] = df1['name_dest'].str.replace(r'[0-9]', regex=True, repl='')

The `orig_type` feature has only one customer type, that column will be removed.

In [5]:
df1.dest_type.value_counts()

C    4211125
M    2151495
Name: dest_type, dtype: int64

Feature day will be created in order to ease the analysis.

In [6]:
# Day column
df1['days'] = df1.step.apply(lambda x: np.ceil(x/24).astype('int'))

In [11]:
# Saving dataset
df1.to_csv('dataset\df2.csv', index=False)

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 15 columns):
 #   Column            Dtype   
---  ------            -----   
 0   step              int16   
 1   type              category
 2   amount            float32 
 3   name_orig         object  
 4   oldbalance_orig   float64 
 5   newbalance_orig   float64 
 6   name_dest         object  
 7   oldbalance_dest   float64 
 8   newbalance_dest   float64 
 9   is_fraud          int64   
 10  is_flagged_fraud  int64   
 11  error_orig        float64 
 12  error_dest        float64 
 13  dest_type         object  
 14  days              int64   
dtypes: category(1), float32(1), float64(6), int16(1), int64(3), object(3)
memory usage: 625.0+ MB


In [10]:
df1.describe()

Unnamed: 0,step,amount,oldbalance_orig,newbalance_orig,oldbalance_dest,newbalance_dest,is_fraud,is_flagged_fraud,error_orig,error_dest,days
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,177988.3,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06,-201092.5,55567.17,10.49191
std,142.332,599135.8,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775,606650.5,441528.8,5.921812
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-92445520.0,-75885720.0,1.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0,-249641.1,0.000625,7.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0,-68677.25,3500.49,10.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0,-2954.197,29353.02,14.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0,4.240234,13191230.0,31.0
