In [1]:
import pandas as pd

In [2]:
pd.__version__

'0.24.1'

In [3]:
dataset = pd.read_csv('C:\\Users\\Vivek\\Datasets\\FraudDetection\\PS_20174392719_1491204439457_log.csv')

In [4]:
dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


#### nameOrig and nameDest could be dropped from the dataset since they don't seem to have any useful information

#### isFlaggedFraud is not important either since it flags illegal attempts to transfer more than 200.000 in a single transaction

In [5]:
dataset = dataset.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis = 1)

In [6]:
dataset.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0


In [7]:
dataset.size

50900960

#### Since we have way too many rows, we can reduce the size of the dataset as Machine Learning algorithms take quite a lot of time to work with huge datasets

In [8]:
dataset_fraud = dataset[dataset['isFraud'] == 1]

dataset_nofraud = dataset[dataset['isFraud'] == 0]

In [9]:
print("Fraudulent transactions", dataset_fraud.size)
print("Non-fraudulent transactions", dataset_nofraud.size)

Fraudulent transactions 65704
Non-fraudulent transactions 50835256


In [10]:
dataset_fraud.tail()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
6362615,743,CASH_OUT,339682.13,339682.13,0.0,0.0,339682.13,1
6362616,743,TRANSFER,6311409.28,6311409.28,0.0,0.0,0.0,1
6362617,743,CASH_OUT,6311409.28,6311409.28,0.0,68488.84,6379898.11,1
6362618,743,TRANSFER,850002.52,850002.52,0.0,0.0,0.0,1
6362619,743,CASH_OUT,850002.52,850002.52,0.0,6510099.11,7360101.63,1


In [11]:
dataset_nofraud.tail()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
6362319,718,PAYMENT,8634.29,518802.0,510167.71,0.0,0.0,0
6362320,718,CASH_OUT,159188.22,3859.0,0.0,0.0,159188.22,0
6362321,718,CASH_OUT,186273.84,168046.0,0.0,24893.67,211167.51,0
6362322,718,TRANSFER,82096.45,13492.0,0.0,0.0,82096.45,0
6362323,718,DEBIT,1864.24,20426.0,18561.76,188746.0,190610.24,0


#### Dataset could be downloaded from: https://www.kaggle.com/ntnu-testimon/paysim1/version/2

In [12]:
dataset_fraud = dataset_fraud.head(10000)

In [13]:
dataset_nofraud = dataset_nofraud.head(15000)

In [14]:
dataset = pd.concat([dataset_fraud, dataset_nofraud], axis=0)

In [15]:
dataset.size

185704

In [16]:
len(dataset)

23213

In [17]:
dataset.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
251,1,TRANSFER,2806.0,2806.0,0.0,0.0,0.0,1
252,1,CASH_OUT,2806.0,2806.0,0.0,26202.0,0.0,1
680,1,TRANSFER,20128.0,20128.0,0.0,0.0,0.0,1


In [18]:
dataset.type.unique()

array(['TRANSFER', 'CASH_OUT', 'PAYMENT', 'DEBIT', 'CASH_IN'],
      dtype=object)

#### Here the 'type' column has text. Most Machine Learning algorithms do not work well with columns that are categorical in nature. So they should be encoded into numbers.

In [20]:
dataset.dtypes

step                int64
type               object
amount            float64
oldbalanceOrg     float64
newbalanceOrig    float64
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
dtype: object

In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

#### Since the type of 'type' column is object, let us change the type to 'category'

In [22]:
dataset['type'] = dataset['type'].astype('category')

#### Now use 'LabelEncoder' to encode the values in the 'type' column

In [25]:
labelEncoder = LabelEncoder()

In [26]:
dataset['type'] = labelEncoder.fit_transform(dataset['type'])

#### There are two main types of categorical variables - Ordinal and Nominal. In case of Ordinal categorical variable, the label/data has some order/hierarchy. For example, the course grades (A, B, C, D, S) could be considered an ordinal variable. 

#### Here they 'type' column is a nominal variable. So we have to encode it further using 'OneHotEncoder' after using 'LabelEncoder'

In [27]:
oneHotEncoder = OneHotEncoder()

In [28]:
oneHotEncodedArray = oneHotEncoder.fit_transform(dataset['type'].values.reshape(-1, 1)).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


#### Numpy allows the user to give one shape parameter as -1. It basically means that the user wants Numpy to figure out the dimension. The first parameter is for rows and the second one is for columns.
#### <font color = 'red'>reshape(-1,-1) is not allowed in Numpy</font>

In [29]:
type(oneHotEncodedArray)

numpy.ndarray

In [31]:
oneHotEncodedTypeColumn = pd.DataFrame(oneHotEncodedArray, 
                                       columns=["type_"+str(int(i)) for i in range(oneHotEncodedArray.shape[1])])

In [32]:
dataset = pd.concat([dataset, oneHotEncodedTypeColumn], axis = 1)

In [33]:
dataset = dataset.drop('type', axis = 1)

In [34]:
dataset.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_0,type_1,type_2,type_3,type_4
0,1.0,9839.64,170136.0,160296.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1864.28,21249.0,19384.72,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,181.0,181.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,181.0,181.0,0.0,21182.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,11668.14,41554.0,29885.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
