**Importing all the necessary dependencies**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [None]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [None]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
120896,75998,1.501845,-1.193147,0.066789,-1.600847,-1.259865,-0.268558,-1.122205,0.040104,-1.79506,...,-0.22643,-0.409194,-0.074263,-0.565449,0.40435,-0.168445,0.010981,0.001626,30.0,0.0
120897,75999,-0.340455,1.201756,1.314094,0.053134,0.111616,-0.959891,0.780107,-0.128473,-0.541652,...,-0.252458,-0.609959,-0.025552,0.360654,-0.127871,0.068601,0.251848,0.10038,3.67,0.0
120898,75999,-1.089161,0.115364,1.501978,0.700866,0.493027,-0.220268,0.744468,0.128396,-0.983044,...,0.391619,0.776285,0.17939,0.218712,-0.009381,-0.353213,0.082478,0.14327,130.8,0.0
120899,76000,0.213729,0.623136,-0.259558,-0.152127,2.262088,3.579712,-0.18722,0.87758,-0.697998,...,-0.193588,-0.663299,0.088675,1.005326,-0.676412,0.251661,0.105658,0.088634,1.29,0.0
120900,76001,-0.367988,1.108999,0.787889,1.043825,-0.336914,-0.554318,0.307948,0.402319,-0.651386,...,0.226379,0.551456,0.058382,0.394346,,,,,,


In [None]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120901 entries, 0 to 120900
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    120901 non-null  int64  
 1   V1      120901 non-null  float64
 2   V2      120901 non-null  float64
 3   V3      120901 non-null  float64
 4   V4      120901 non-null  float64
 5   V5      120901 non-null  float64
 6   V6      120901 non-null  float64
 7   V7      120901 non-null  float64
 8   V8      120901 non-null  float64
 9   V9      120901 non-null  float64
 10  V10     120901 non-null  float64
 11  V11     120901 non-null  float64
 12  V12     120901 non-null  float64
 13  V13     120901 non-null  float64
 14  V14     120901 non-null  float64
 15  V15     120901 non-null  float64
 16  V16     120901 non-null  float64
 17  V17     120901 non-null  float64
 18  V18     120901 non-null  float64
 19  V19     120901 non-null  float64
 20  V20     120901 non-null  float64
 21  V21     12

In [None]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [None]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Class
0.0    120651
1.0       249
Name: count, dtype: int64

This dataset is highly unbalanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [21]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [None]:
print(legit.shape)
print(fraud.shape)

(120651, 31)
(249, 31)


In [None]:
# statistical measures of the data
legit.Amount.describe()

count    120651.000000
mean         94.185676
std         255.770139
min           0.000000
25%           6.885000
50%          24.990000
75%          84.815000
max       19656.530000
Name: Amount, dtype: float64

In [None]:
fraud.Amount.describe()

count     249.000000
mean      119.180482
std       251.160611
min         0.000000
25%         1.000000
50%        11.390000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [None]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,47510.269886,-0.242842,-0.016464,0.695109,0.143967,-0.275556,0.091375,-0.101951,0.057796,-0.073721,...,0.042749,-0.038022,-0.113385,-0.035057,0.011751,0.130599,0.027544,-0.000211,0.001646,94.185676
1.0,40280.433735,-5.8466,4.076891,-7.453278,4.641314,-4.167167,-1.511639,-6.201243,1.559279,-2.696182,...,0.259111,1.324185,-0.31484,-0.103567,-0.108411,0.199329,0.074713,0.522683,0.076517,119.180482


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [23]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [24]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [25]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
6723,8422,0.417572,-2.198511,0.779381,-0.090823,-1.789522,0.483759,-0.871198,0.127745,0.956041,...,0.498631,0.725987,-0.459158,0.016831,0.149545,-0.117868,-0.051405,0.081529,453.3,0.0
108904,71127,1.185258,-0.20176,0.39032,-0.16322,-0.639903,-0.515969,-0.311286,0.136478,0.244008,...,-0.034125,-0.184632,0.104309,0.248335,0.047903,0.965084,-0.083777,-0.010573,8.84,0.0
110551,71870,0.475785,-2.012197,-0.453746,-0.051186,-1.428114,-0.903882,0.37713,-0.388998,-0.770725,...,-0.164197,-0.986202,-0.421961,0.454555,0.238126,1.042053,-0.163366,0.08064,487.98,0.0
7321,9823,-0.870857,1.252308,1.892115,-0.248112,0.287272,-0.970828,0.803264,-0.203943,0.766284,...,-0.374944,-1.011478,-0.154804,0.256508,0.179785,-0.039644,-0.172852,0.067909,3.99,0.0
17264,28580,0.929156,-1.49973,0.919232,-0.569157,-1.473876,0.781494,-1.302138,0.474007,-0.163835,...,0.491941,1.109592,-0.173686,-0.266347,0.17368,-0.015934,0.040927,0.032225,163.58,0.0


In [26]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
118308,75033,-0.43033,0.985633,0.645789,0.317131,0.616332,-1.347462,1.078234,-0.161518,-0.492856,...,-0.016378,-0.207609,-0.164119,0.25528,0.454798,-0.505032,-0.039456,-0.006358,30.14,1.0
119714,75556,-0.734303,0.435519,-0.530866,-0.47112,0.643214,0.713832,-1.234572,-2.551412,-2.057724,...,-1.004877,1.150354,-0.152555,-1.386745,0.004716,0.219146,-0.058257,0.158048,29.95,1.0
119781,75581,-2.866364,2.346949,-4.053307,3.983359,-3.463186,-1.280953,-4.474764,1.216655,-2.309829,...,1.049732,0.47584,0.40448,0.28203,-0.506901,-0.371741,0.615257,0.803163,124.53,1.0
120505,75851,-4.793667,3.418911,-5.074445,4.035987,-3.527875,-1.923242,-5.065981,1.996885,-3.097379,...,1.168618,0.289531,-0.371888,0.144761,0.084735,-0.197431,0.328672,0.835395,99.85,1.0
120837,75978,-5.140723,3.568751,-5.896245,4.16472,-4.091193,-1.98996,-5.472436,2.422821,-2.909735,...,1.13113,0.118022,-0.332704,0.139941,0.324758,-0.180769,0.17781,0.661555,99.9,1.0


In [27]:
new_dataset['Class'].value_counts()

Class
0.0    492
1.0    249
Name: count, dtype: int64

In [28]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,48228.077236,-0.330593,0.027357,0.726413,0.179829,-0.284101,0.083263,-0.180987,-0.088002,-0.087064,...,0.04292,-0.066026,-0.093986,-0.046594,-0.002022,0.081332,0.031079,-0.010562,0.006631,94.868455
1.0,40280.433735,-5.8466,4.076891,-7.453278,4.641314,-4.167167,-1.511639,-6.201243,1.559279,-2.696182,...,0.259111,1.324185,-0.31484,-0.103567,-0.108411,0.199329,0.074713,0.522683,0.076517,119.180482


Splitting the data into Features & Targets

In [29]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [30]:
print(X)

         Time        V1        V2        V3        V4        V5        V6  \
6723     8422  0.417572 -2.198511  0.779381 -0.090823 -1.789522  0.483759   
108904  71127  1.185258 -0.201760  0.390320 -0.163220 -0.639903 -0.515969   
110551  71870  0.475785 -2.012197 -0.453746 -0.051186 -1.428114 -0.903882   
7321     9823 -0.870857  1.252308  1.892115 -0.248112  0.287272 -0.970828   
17264   28580  0.929156 -1.499730  0.919232 -0.569157 -1.473876  0.781494   
...       ...       ...       ...       ...       ...       ...       ...   
118308  75033 -0.430330  0.985633  0.645789  0.317131  0.616332 -1.347462   
119714  75556 -0.734303  0.435519 -0.530866 -0.471120  0.643214  0.713832   
119781  75581 -2.866364  2.346949 -4.053307  3.983359 -3.463186 -1.280953   
120505  75851 -4.793667  3.418911 -5.074445  4.035987 -3.527875 -1.923242   
120837  75978 -5.140723  3.568751 -5.896245  4.164720 -4.091193 -1.989960   

              V7        V8        V9  ...       V20       V21       V22  \


In [31]:
print(Y)

6723      0.0
108904    0.0
110551    0.0
7321      0.0
17264     0.0
         ... 
118308    1.0
119714    1.0
119781    1.0
120505    1.0
120837    1.0
Name: Class, Length: 741, dtype: float64


Split the data into Training data & Testing Data

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [33]:
print(X.shape, X_train.shape, X_test.shape)

(741, 30) (592, 30) (149, 30)


Model Training

Logistic Regression

In [34]:
model = LogisticRegression()

In [42]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [37]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [38]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9577702702702703


In [39]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [40]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9328859060402684
