In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw transactions CSV into a DataFrame used by every cell below.
df = pd.read_csv(filepath_or_buffer='./Data/Fraud.csv')

In [4]:
# Starting With Data Analysis

In [5]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [6]:
df.shape 
# Returns a (rows, columns) tuple giving the dimensions of the DataFrame.

(6362620, 11)

In [7]:
df.size
# Total number of elements (cells) in the DataFrame, i.e. rows x columns.
# This is a count of cells, not a count of distinct values.

69988820

In [8]:
# Preview the first 5 rows (transactions) of the data.
df.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [9]:
# Inspect the data type of every column in order to identify the object-typed features,
# so that they can be converted into numerical form during preprocessing, before
# training machine learning models.
df.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

In [10]:
# Render every floating-point number in pandas output with exactly four
# decimal places (fixed-point notation, no scientific notation).
pd.set_option('display.float_format', '{:.4f}'.format)

In [11]:
# Summary statistics of the numeric feature columns, one row per feature,
# rounded to two decimal places.
(
    df.loc[:, ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
               'oldbalanceDest', 'newbalanceDest']]
    .describe()
    .transpose()
    .round(2)
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
step,6362620.0,243.4,142.33,1.0,156.0,239.0,335.0,743.0
amount,6362620.0,179861.9,603858.23,0.0,13389.57,74871.94,208721.48,92445516.64
oldbalanceOrg,6362620.0,833883.1,2888242.67,0.0,0.0,14208.0,107315.18,59585040.37
newbalanceOrig,6362620.0,855113.67,2924048.5,0.0,0.0,0.0,144258.41,49585040.37
oldbalanceDest,6362620.0,1100701.67,3399180.11,0.0,0.0,132705.66,943036.71,356015889.35
newbalanceDest,6362620.0,1224996.4,3674128.94,0.0,0.0,214661.44,1111909.25,356179278.92


In [12]:
# Pairwise correlation between the numeric input features and the target
# columns; used to judge which features are worth keeping when training a
# machine learning model.
(
    df.loc[:, ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
               'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']]
    .corr()
)

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
step,1.0,0.0224,-0.0101,-0.0103,0.0277,0.0259,0.0316,0.0033
amount,0.0224,1.0,-0.0028,-0.0079,0.2941,0.4593,0.0767,0.0123
oldbalanceOrg,-0.0101,-0.0028,1.0,0.9988,0.0662,0.042,0.0102,0.0038
newbalanceOrig,-0.0103,-0.0079,0.9988,1.0,0.0678,0.0418,-0.0081,0.0038
oldbalanceDest,0.0277,0.2941,0.0662,0.0678,1.0,0.9766,-0.0059,-0.0005
newbalanceDest,0.0259,0.4593,0.042,0.0418,0.9766,1.0,0.0005,-0.0005
isFraud,0.0316,0.0767,0.0102,-0.0081,-0.0059,0.0005,1.0,0.0441
isFlaggedFraud,0.0033,0.0123,0.0038,0.0038,-0.0005,-0.0005,0.0441,1.0


In [13]:
# Observation from the correlation matrix above: the pairs ('oldbalanceDest', 'newbalanceDest') and
# ('oldbalanceOrg', 'newbalanceOrig') are highly correlated (coefficients of 0.9766 and 0.9988, close to 1),
# i.e. the two features in each pair are almost linearly dependent. This could lead to issues in a regression model.

# So, to fix this, we need to remove one of the highly correlated variables from each pair.

In [14]:
# Drop one feature from each highly correlated pair.
# `errors='ignore'` keeps this cell idempotent: because the result is assigned
# back to `df`, re-running the cell on the same kernel would otherwise raise a
# KeyError (the columns are already gone after the first run).
df = df.drop(columns=['newbalanceDest', 'newbalanceOrig'], errors='ignore')

In [15]:
# Select the transactions whose amount is exactly zero.
df.loc[df['amount'].eq(0)]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,nameDest,oldbalanceDest,isFraud,isFlaggedFraud
2736447,212,CASH_OUT,0.0,C1510987794,0.0,C1696624817,0.0,1,0
3247298,250,CASH_OUT,0.0,C521393327,0.0,C480398193,0.0,1,0
3760289,279,CASH_OUT,0.0,C539112012,0.0,C1106468520,538547.63,1,0
5563714,387,CASH_OUT,0.0,C1294472700,0.0,C1325541393,7970766.57,1,0
5996408,425,CASH_OUT,0.0,C832555372,0.0,C1462759334,76759.9,1,0
5996410,425,CASH_OUT,0.0,C69493310,0.0,C719711728,2921531.34,1,0
6168500,554,CASH_OUT,0.0,C10965156,0.0,C1493336195,230289.66,1,0
6205440,586,CASH_OUT,0.0,C1303719003,0.0,C900608348,1328472.86,1,0
6266414,617,CASH_OUT,0.0,C1971175979,0.0,C1352345416,0.0,1,0
6281483,646,CASH_OUT,0.0,C2060908932,0.0,C1587892888,0.0,1,0


In [16]:
# Observation: The transactions that have 'amount' = 0 are all fraud transactions. One way to understand this:
# if someone has 0 balance in their account, then there is no sense in transferring money. That is, probably some
# fraudster is targeting random accounts and may not be aware of the account balance.

In [17]:
# Select the transactions that the system flagged as fraud.
df.loc[df['isFlaggedFraud'].eq(1)]


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,nameDest,oldbalanceDest,isFraud,isFlaggedFraud
2736446,212,TRANSFER,4953893.08,C728984460,4953893.08,C639921569,0.0,1,1
3247297,250,TRANSFER,1343002.08,C1100582606,1343002.08,C1147517658,0.0,1,1
3760288,279,TRANSFER,536624.41,C1035541766,536624.41,C1100697970,0.0,1,1
5563713,387,TRANSFER,4892193.09,C908544136,4892193.09,C891140444,0.0,1,1
5996407,425,TRANSFER,10000000.0,C689608084,19585040.37,C1392803603,0.0,1,1
5996409,425,TRANSFER,9585040.37,C452586515,19585040.37,C1109166882,0.0,1,1
6168499,554,TRANSFER,3576297.1,C193696150,3576297.1,C484597480,0.0,1,1
6205439,586,TRANSFER,353874.22,C1684585475,353874.22,C1770418982,0.0,1,1
6266413,617,TRANSFER,2542664.27,C786455622,2542664.27,C661958277,0.0,1,1
6281482,646,TRANSFER,10000000.0,C19004745,10399045.08,C1806199534,0.0,1,1


In [18]:
# Observation: The transactions that are flagged as fraud by the system (isFlaggedFraud = 1) are actually fraud transactions (isFraud = 1).

In [19]:
# Select the fraudulent transactions whose amount equals the origin
# account's balance before the transaction.
df.loc[df['amount'].eq(df['oldbalanceOrg']) & df['isFraud'].eq(1)]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,nameDest,oldbalanceDest,isFraud,isFlaggedFraud
2,1,TRANSFER,181.0000,C1305486145,181.0000,C553264065,0.0000,1,0
3,1,CASH_OUT,181.0000,C840083671,181.0000,C38997010,21182.0000,1,0
251,1,TRANSFER,2806.0000,C1420196421,2806.0000,C972765878,0.0000,1,0
252,1,CASH_OUT,2806.0000,C2101527076,2806.0000,C1007251739,26202.0000,1,0
680,1,TRANSFER,20128.0000,C137533655,20128.0000,C1848415041,0.0000,1,0
...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.1300,C786484425,339682.1300,C776919290,0.0000,1,0
6362616,743,TRANSFER,6311409.2800,C1529008245,6311409.2800,C1881841831,0.0000,1,0
6362617,743,CASH_OUT,6311409.2800,C1162922333,6311409.2800,C1365125890,68488.8400,1,0
6362618,743,TRANSFER,850002.5200,C1685995037,850002.5200,C2080388513,0.0000,1,0


In [20]:
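# Observation: If the transaction amount is the same as the balance present in the account before the transaction
# happened, then the transaction is likely a fraudulent one. In this case the fraudsters apparently know how much
# money is present in the victim's account, and they directly clean out the whole balance of the account.
# Note: the filter above already restricts the rows to isFraud == 1, so on its own it shows that this pattern is
# common among frauds; checking the same condition without the isFraud filter is needed to confirm the rule.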