# Import Dataset

In [124]:
import pandas as pd
import numpy as np
from os import path

data_path = path.join('..','dataset','financialData.csv')
data = pd.read_csv(data_path)
print(data.head()) 

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


# Check for Null values

In [88]:
print(data.isnull().sum())

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


# See distribution of transaction types

In [89]:
data_type = data["type"].value_counts()
transactions = data_type.index
quantity = data_type.values


import plotly.express as px
figure = px.pie(data, 
             values=quantity, 
             names=transactions,hole = 0.5, 
             title="Distribution of Transaction Type")
figure.show()

# See feature correlation with target column (isFraud)

In [90]:
# Checking correlation
correlation = data.corr()
print(correlation["isFraud"].sort_values(ascending=False))

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


# Transforming categorical feature (type) to numeric representation 

## Import sklearn libraries

In [91]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

## Perform transformation

In [96]:
# LABEL ENCODING
# le = LabelEncoder()
# data['type'] = le.fit_transform(data['type'])
# data.head()

In [125]:
# ONE HOT ENCODING

# generate binary balues using get_dummies
dum_df = pd.get_dummies(data['type'],prefix='transactionType',prefix_sep=':')
dum_df.head()

Unnamed: 0,transactionType:CASH_IN,transactionType:CASH_OUT,transactionType:DEBIT,transactionType:PAYMENT,transactionType:TRANSFER
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,0,1
3,0,1,0,0,0
4,0,0,0,1,0


In [126]:
# replacing the type column with the columns generated above
del data['type']
data.head()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [127]:
data = data.join(dum_df)

In [128]:
# change isFraud to text
data["isFraud"] = data["isFraud"].map({0: "No Fraud", 1: "Fraud"})

In [129]:
data.head()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,transactionType:CASH_IN,transactionType:CASH_OUT,transactionType:DEBIT,transactionType:PAYMENT,transactionType:TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,No Fraud,0,0,0,0,1,0
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,No Fraud,0,0,0,0,1,0
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,0,0,0,0,0,1
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,0,0,1,0,0,0
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,No Fraud,0,0,0,0,1,0


# Splitting the data

## import libraries

In [145]:
from sklearn.model_selection import train_test_split

In [146]:
# first I will select the features that will be used by the model by dropping some of the unwanted columns
labels = ['step','nameOrig','nameDest','oldbalanceDest','newbalanceDest','isFlaggedFraud']
dataForModel = data.drop(columns=labels)

In [147]:
dataForModel.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,isFraud,transactionType:CASH_IN,transactionType:CASH_OUT,transactionType:DEBIT,transactionType:PAYMENT,transactionType:TRANSFER
0,9839.64,170136.0,160296.36,No Fraud,0,0,0,1,0
1,1864.28,21249.0,19384.72,No Fraud,0,0,0,1,0
2,181.0,181.0,0.0,Fraud,0,0,0,0,1
3,181.0,181.0,0.0,Fraud,0,1,0,0,0
4,11668.14,41554.0,29885.86,No Fraud,0,0,0,1,0


In [148]:
target = dataForModel.pop('isFraud')

In [149]:
dataForModel.insert(len(dataForModel.columns),target.name,target)
dataForModel.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,transactionType:CASH_IN,transactionType:CASH_OUT,transactionType:DEBIT,transactionType:PAYMENT,transactionType:TRANSFER,isFraud
0,9839.64,170136.0,160296.36,0,0,0,1,0,No Fraud
1,1864.28,21249.0,19384.72,0,0,0,1,0,No Fraud
2,181.0,181.0,0.0,0,0,0,0,1,Fraud
3,181.0,181.0,0.0,0,1,0,0,0,Fraud
4,11668.14,41554.0,29885.86,0,0,0,1,0,No Fraud


In [150]:
X = dataForModel.iloc[:, :-1].values
y = dataForModel.iloc[:, -1].values

In [153]:
print(X)

[[9.83964000e+03 1.70136000e+05 1.60296360e+05 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [1.86428000e+03 2.12490000e+04 1.93847200e+04 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [1.81000000e+02 1.81000000e+02 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]
 ...
 [6.31140928e+06 6.31140928e+06 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [8.50002520e+05 8.50002520e+05 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]
 [8.50002520e+05 8.50002520e+05 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


In [156]:
print(y)

['No Fraud' 'No Fraud' 'Fraud' ... 'Fraud' 'Fraud' 'Fraud']


In [157]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# DECISION TREE: Initialize and train model  

## import libraries

In [162]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

## Train Model

In [160]:
model = DecisionTreeClassifier()
model.fit(X_train,y_train)
print(model.score(X_test,y_test))

0.9996793773634132


In [163]:
acc = accuracy_score(y_test,  model.predict(X_test))
print(acc)

0.9996793773634132


# KNN: Train Model

## import libraries