In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix,roc_auc_score,accuracy_score,plot_confusion_matrix,classification_report,plot_roc_curve

In [2]:
df = pd.read_csv(r"D:\pendrive\Fraud.csv")

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
df.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [5]:
df.shape

(6362620, 11)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [7]:
pd.value_counts(df.isFraud, normalize = True)

0    0.998709
1    0.001291
Name: isFraud, dtype: float64

In [8]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [9]:
print("step:",df['step'].nunique())
print("type:",df['type'].nunique())

step: 743
type: 5


In [10]:
df.drop('step',inplace=True,axis = 1)
df.drop('nameOrig',inplace=True,axis = 1)
df.drop('nameDest',inplace=True,axis = 1)
df.drop('isFlaggedFraud',inplace=True,axis = 1)

In [11]:
df.isna().sum()

type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64

In [12]:
df.describe()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082
std,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,13389.57,0.0,0.0,0.0,0.0,0.0
50%,74871.94,14208.0,0.0,132705.7,214661.4,0.0
75%,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0
max,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0


In [13]:
df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [14]:
df['type'].unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [15]:
type_map = {'PAYMENT':0,'TRANSFER':1,'CASH_OUT':2,'DEBIT':3,'CASH_IN':4}
df['type'] = df['type'].map(type_map)
df.head(2)

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,0,9839.64,170136.0,160296.36,0.0,0.0,0
1,0,1864.28,21249.0,19384.72,0.0,0.0,0


In [16]:
scalar = MinMaxScaler()

In [17]:
df[df.columns] = scalar.fit_transform(df[df.columns])
df.head()

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,0.0,0.000106,0.002855,0.003233,0.0,0.0,0.0
1,0.0,2e-05,0.000357,0.000391,0.0,0.0,0.0
2,0.25,2e-06,3e-06,0.0,0.0,0.0,1.0
3,0.5,2e-06,3e-06,0.0,5.9e-05,0.0,1.0
4,0.0,0.000126,0.000697,0.000603,0.0,0.0,0.0


In [18]:
x = df.drop('isFraud',axis=1)
y = df['isFraud']

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9)
smote = SMOTE(random_state=42)

x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)


In [20]:
x_train_resampled.shape

(10167064, 6)

### Logistic Regression

In [21]:
model = LogisticRegression(random_state=42)

In [22]:
model.fit(x_train_resampled,y_train_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [23]:
y_hat = model.predict(x_test)

In [24]:
y_scores = accuracy_score(y_test,y_hat)
y_scores

0.9686614947930255

In [25]:
confusion_matrix(y_test,y_hat)

array([[1231363,   39512],
       [    367,    1282]], dtype=int64)

In [26]:
classification_report = classification_report(y_test,y_hat)
print(classification_report)

              precision    recall  f1-score   support

         0.0       1.00      0.97      0.98   1270875
         1.0       0.03      0.78      0.06      1649

    accuracy                           0.97   1272524
   macro avg       0.52      0.87      0.52   1272524
weighted avg       1.00      0.97      0.98   1272524



### Naive Bayes

In [27]:
from sklearn.naive_bayes import GaussianNB

In [28]:
nv  = GaussianNB()

In [29]:
nv.fit(x_train_resampled,y_train_resampled)

GaussianNB()

In [30]:
y_train_resampled.value_counts()

0.0    5083532
1.0    5083532
Name: isFraud, dtype: int64

In [31]:
y_pred = nv.predict(x_test)

In [32]:
accuracy_score(y_test,y_pred)

0.9713797146458535

In [33]:
confusion_matrix(y_test,y_pred)

array([[1235416,   35459],
       [    961,     688]], dtype=int64)