In [1]:
# Loading necessary libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score

# Extract the zip data

In [3]:
from zipfile import ZipFile
data='/content/archive.zip'

with ZipFile(data,'r') as zip:
  zip.extractall()
  print('Data extracted successfully')

Data extracted successfully


In [4]:
credit_card_df=pd.read_csv('creditcard.csv')

# getting rows and columns of the dataset

In [5]:
credit_card_df.shape

(284807, 31)

In [6]:
credit_card_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Check for missing values

In [7]:
credit_card_df.isna().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [8]:
credit_card_df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


# Data is highly imbalanced

In [9]:
#  separating fradulent and legit transactions
#  Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise
#  fraud ------> 1  (492)
#  legit ------> 0  (284315)

In [10]:
legit=credit_card_df[credit_card_df.Class==0]
fraud=credit_card_df[credit_card_df.Class==1]

In [11]:
legit

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [12]:
legit_samples=legit.sample(n=492)

In [13]:
legit_samples

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
130755,79424.0,-0.912525,0.575782,2.541088,-2.099605,-0.147841,-0.243264,0.640160,-0.217357,1.646832,...,-0.109465,0.428960,-0.407290,0.098295,0.518636,-0.781917,0.152482,-0.250144,1.00,0
136390,81682.0,-0.820358,-2.014112,0.903973,-2.243043,-2.323282,1.280925,0.799731,0.357481,0.419350,...,-0.049142,-0.165412,1.269776,-0.369998,-0.746899,-0.217739,0.042226,0.121127,471.54,0
219329,141685.0,-1.682545,-0.476311,-0.554200,-0.358974,3.055978,-2.371444,0.051157,0.056029,-0.311857,...,0.344852,0.440206,-0.163774,-0.589972,0.349903,-0.209544,0.061689,0.221421,11.38,0
238876,149863.0,-0.675325,0.954084,0.594262,-0.593124,0.102537,0.097802,-0.126504,0.702286,-0.136932,...,0.312434,0.865232,-0.174400,0.664773,-0.422385,0.518891,0.018264,0.123278,4.90,0
1632,1264.0,-11.140706,-9.612726,-12.389545,6.013346,-32.092129,21.393069,34.303177,-7.520784,-1.925732,...,-4.709977,1.366110,-2.925888,0.843551,0.746267,0.801387,3.852046,4.157934,7712.43,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69112,53270.0,-0.308441,1.100015,0.923822,-0.382631,1.008514,-0.126832,1.121149,-0.447564,0.097719,...,-0.481957,-0.869880,-0.253581,-1.055267,0.047189,0.104576,0.026389,-0.294494,10.99,0
232585,147227.0,-0.888610,0.910986,-1.054118,-1.676935,3.632955,3.140325,0.750647,0.880528,-0.803831,...,-0.232634,-0.716378,-0.370023,0.620062,0.874215,0.573414,0.204839,0.119138,16.50,0
84581,60374.0,-1.890137,-1.431123,1.722514,-2.357184,-1.127685,0.067047,-1.072459,0.819504,-2.430908,...,0.121594,0.060163,0.153599,-0.350633,0.256292,-0.265308,0.177425,-0.077811,152.65,0
74996,55850.0,-0.389776,1.080905,1.693269,0.931013,-0.715821,-0.366998,0.027457,-0.051774,0.013124,...,0.018232,0.074949,0.056376,0.406153,-0.660062,0.369355,-0.064798,0.135137,6.47,0


In [14]:
credit_card_newdf=pd.concat([legit_samples,fraud])

In [15]:
credit_card_newdf.shape

(984, 31)

In [16]:
credit_card_newdf.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
130755,79424.0,-0.912525,0.575782,2.541088,-2.099605,-0.147841,-0.243264,0.64016,-0.217357,1.646832,...,-0.109465,0.42896,-0.40729,0.098295,0.518636,-0.781917,0.152482,-0.250144,1.0,0
136390,81682.0,-0.820358,-2.014112,0.903973,-2.243043,-2.323282,1.280925,0.799731,0.357481,0.41935,...,-0.049142,-0.165412,1.269776,-0.369998,-0.746899,-0.217739,0.042226,0.121127,471.54,0
219329,141685.0,-1.682545,-0.476311,-0.5542,-0.358974,3.055978,-2.371444,0.051157,0.056029,-0.311857,...,0.344852,0.440206,-0.163774,-0.589972,0.349903,-0.209544,0.061689,0.221421,11.38,0
238876,149863.0,-0.675325,0.954084,0.594262,-0.593124,0.102537,0.097802,-0.126504,0.702286,-0.136932,...,0.312434,0.865232,-0.1744,0.664773,-0.422385,0.518891,0.018264,0.123278,4.9,0
1632,1264.0,-11.140706,-9.612726,-12.389545,6.013346,-32.092129,21.393069,34.303177,-7.520784,-1.925732,...,-4.709977,1.36611,-2.925888,0.843551,0.746267,0.801387,3.852046,4.157934,7712.43,0


In [17]:
credit_card_newdf.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


In [18]:
credit_card_newdf['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,492
1,492


In [19]:
X=credit_card_newdf.drop(columns='Class',axis=1)
Y=credit_card_newdf['Class']

In [20]:
X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
130755,79424.0,-0.912525,0.575782,2.541088,-2.099605,-0.147841,-0.243264,0.64016,-0.217357,1.646832,...,0.300204,-0.109465,0.42896,-0.40729,0.098295,0.518636,-0.781917,0.152482,-0.250144,1.0
136390,81682.0,-0.820358,-2.014112,0.903973,-2.243043,-2.323282,1.280925,0.799731,0.357481,0.41935,...,0.42045,-0.049142,-0.165412,1.269776,-0.369998,-0.746899,-0.217739,0.042226,0.121127,471.54
219329,141685.0,-1.682545,-0.476311,-0.5542,-0.358974,3.055978,-2.371444,0.051157,0.056029,-0.311857,...,0.316235,0.344852,0.440206,-0.163774,-0.589972,0.349903,-0.209544,0.061689,0.221421,11.38
238876,149863.0,-0.675325,0.954084,0.594262,-0.593124,0.102537,0.097802,-0.126504,0.702286,-0.136932,...,-0.171139,0.312434,0.865232,-0.1744,0.664773,-0.422385,0.518891,0.018264,0.123278,4.9
1632,1264.0,-11.140706,-9.612726,-12.389545,6.013346,-32.092129,21.393069,34.303177,-7.520784,-1.925732,...,-11.748689,-4.709977,1.36611,-2.925888,0.843551,0.746267,0.801387,3.852046,4.157934,7712.43


In [21]:
Y.head()

Unnamed: 0,Class
130755,0
136390,0
219329,0
238876,0
1632,0


In [22]:
print(credit_card_newdf.shape)
print(X.shape)
print(Y.shape)

(984, 31)
(984, 30)
(984,)


In [23]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [24]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(984, 30)
(787, 30)
(197, 30)


In [25]:
scalar=StandardScaler()
X_train_stand=scalar.fit_transform(X_train)
X_test_stand=scalar.fit_transform(X_test)

In [26]:
X_train_stand

array([[-0.4733985 ,  0.30471199, -0.46376279, ...,  0.09242754,
        -0.30064239,  1.18538822],
       [ 1.67981298,  0.36167395, -0.19324246, ...,  0.11916627,
        -0.08786115, -0.32737471],
       [-0.18149494,  0.32678475, -0.23407748, ..., -0.20963583,
        -0.02117079, -0.26396962],
       ...,
       [ 1.08204714,  0.07645694,  0.52236256, ...,  0.39757391,
        -0.18428592, -0.33008456],
       [-1.01079948, -2.30361771,  1.68777009, ..., -1.23780219,
        -1.21949183, -0.23850251],
       [-1.62324147, -0.15630129,  0.44260146, ...,  0.31154152,
         0.13164638, -0.32270159]])

# Logistic Regression classifier

In [27]:
LR_model=LogisticRegression(max_iter=1000)

In [28]:
LR_model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
#training accuracy of Logistic Regression
Y_pred_train=LR_model.predict(X_train)
train_acc=accuracy_score(Y_pred_train,Y_train)
print("Training accuracy of Logistic Regression: ",train_acc*100)

Training accuracy of Logistic Regression:  94.28208386277002


In [30]:
#testing accuracy of Logistic Regression
Y_pred_test=LR_model.predict(X_test)
test_acc=accuracy_score(Y_pred_test,Y_test)
print("Testing accuracy of Logistic Regression: ",test_acc*100)

Testing accuracy of Logistic Regression:  92.89340101522842


# Making a predictive system for Credit card fraud detection using trained Logistic Regression Classifier

In [50]:
input=(55032,-0.595875529046345,-0.609379328360504,1.65080388673875,-2.61940007910157,-0.760974206560508,0.190372350329731,-0.230436108532633,0.128748580759828,-2.20534691651146,0.592568999919234,-1.42348237286434,-1.95262953245832,-0.343486659577294,-0.411630682848407,0.602878248976977,-0.122307218439705,0.236210600185585,0.350863066428094,-0.941084358514863,-0.211310514839991,0.0483448328987812,0.331969633001,-0.251703211900471,-0.805222347042049,0.508628413675121,-0.0206636053332559,0.0437806866710002,0.0551284177519563,78,)
#converting to numpy array
convert_to_numpy_array=np.asarray(input)
#reshaping the input data
reshape_data=convert_to_numpy_array.reshape(1,-1)
#standardizing the input data
scalar.transform(reshape_data)

prediction=LR_model.predict(reshape_data)

if(prediction[0]==0):
  print("Transaction is Legit")
else:
  print("Transaction is Fraudulent")

Transaction is Legit




# SVM Classifier

In [32]:
svm_model=svm.SVC(kernel='linear')

In [33]:
svm_model.fit(X_train,Y_train)

In [34]:
#training accuracy of SVM
Y_pred_train=svm_model.predict(X_train)
train_acc=accuracy_score(Y_pred_train,Y_train)
print("Training accuracy of SVM: ",train_acc*100)

Training accuracy of SVM:  89.58068614993647


In [35]:
#testing accuracy of SVM
Y_pred_test=svm_model.predict(X_test)
test_acc=accuracy_score(Y_pred_test,Y_test)
print("Testing accuracy of SVM: ",test_acc*100)

Testing accuracy of SVM:  89.34010152284264


# Decision Tree

In [36]:
dt_model=DecisionTreeClassifier()

In [37]:
dt_model.fit(X_train,Y_train)

In [38]:
#training accuracy of DT
Y_pred_train=dt_model.predict(X_train)
train_acc=accuracy_score(Y_pred_train,Y_train)
print("Training accuracy of DT: ",train_acc*100)

Training accuracy of DT:  100.0


In [39]:
#testing accuracy of DT
Y_pred_test=dt_model.predict(X_test)
test_acc=accuracy_score(Y_pred_test,Y_test)
print("Testing accuracy of DT: ",test_acc*100)

Testing accuracy of DT:  92.38578680203045


In [40]:
models=[LogisticRegression(max_iter=100),svm.SVC(kernel='linear'),DecisionTreeClassifier()]

for model in models:
  model.fit(X_train,Y_train)
  Y_pred=model.predict(X_train)
  test_acc=accuracy_score(Y_pred,Y_train)
  print("Training accuracy of model: ",model,"= ",test_acc*100)

Training accuracy of model:  LogisticRegression() =  94.28208386277002


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training accuracy of model:  SVC(kernel='linear') =  89.58068614993647
Training accuracy of model:  DecisionTreeClassifier() =  100.0


In [41]:
models=[LogisticRegression(max_iter=2000),svm.SVC(kernel='linear'),DecisionTreeClassifier()]

for model in models:
  model.fit(X_train,Y_train)
  Y_pred=model.predict(X_test)
  test_acc=accuracy_score(Y_pred,Y_test)
  print("Testing accuracy of model: ",model,"= ",test_acc*100)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Testing accuracy of model:  LogisticRegression(max_iter=2000) =  92.89340101522842
Testing accuracy of model:  SVC(kernel='linear') =  89.34010152284264
Testing accuracy of model:  DecisionTreeClassifier() =  92.38578680203045


In [42]:
LRmodel=LogisticRegression()

# GridSearchCV for Logistric Regression

In [43]:
LR_params={
    'C':[0.01,0.1,1,10,100],
    'penalty':['l2','l1'],
    'solver':['liblinear','saga'],
    'max_iter':[10,100,200,500,1000],
    'class_weight':['balanced',None]
}

In [44]:
gridsearch=GridSearchCV(estimator=LRmodel,param_grid=LR_params,cv=5,scoring='accuracy')
gridsearch.fit(X_train,Y_train)



In [45]:
print(gridsearch.best_params_)
print(gridsearch.best_score_)

{'C': 0.1, 'class_weight': 'balanced', 'max_iter': 10, 'penalty': 'l1', 'solver': 'liblinear'}
0.9364911714907684


In [46]:
bestmodel=gridsearch.best_estimator_

In [47]:
Y_pred=bestmodel.predict(X_test)

In [48]:
print("Accuracy score: ",accuracy_score(Y_test,Y_pred)*100)

Accuracy score:  93.90862944162437
