Importing the dependencies

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
 # Read the raw credit-card transactions CSV into a pandas DataFrame.
credit_card_data = pd.read_csv(filepath_or_buffer='/content/creditcard.csv')

In [7]:
 # Preview the first five records to check the column layout.
credit_card_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [8]:
# Preview the last five records of the dataset.
credit_card_data.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
271282,164503.0,1.922044,-0.72332,-1.831078,-1.16938,0.803403,1.266419,-0.362358,0.388426,1.736604,...,0.208709,0.985213,-0.024339,-0.922875,0.301456,-0.720864,0.086756,-0.070876,22.16,0.0
271283,164504.0,-0.322881,1.164874,0.308291,-0.433456,0.969553,-0.60354,1.004928,-0.07384,-0.725387,...,0.203458,0.628991,-0.662551,-0.461772,1.079087,0.114642,-0.050358,-0.055391,2.0,0.0
271284,164504.0,-0.717754,1.033083,-0.146104,-1.06462,1.133477,0.989543,0.410626,0.830524,-0.577077,...,-0.099268,-0.163939,0.050914,-1.63462,-0.745244,0.249702,0.274035,0.147029,0.99,0.0
271285,164504.0,2.037487,-0.045018,-3.057337,0.246962,2.94468,3.298262,-0.003798,0.675096,0.045968,...,0.037895,0.228836,0.036609,0.707028,0.513329,-0.471069,0.002749,-0.069212,1.78,0.0
271286,164505.0,1.79263,-1.247581,-2.565785,-0.535199,0.251652,-0.379364,0.427056,-0.35,,...,,,,,,,,,,


In [9]:
# Summarise the DataFrame: index range, column names, non-null counts and dtypes.
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271287 entries, 0 to 271286
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    271287 non-null  float64
 1   V1      271287 non-null  float64
 2   V2      271287 non-null  float64
 3   V3      271287 non-null  float64
 4   V4      271287 non-null  float64
 5   V5      271287 non-null  float64
 6   V6      271287 non-null  float64
 7   V7      271287 non-null  float64
 8   V8      271287 non-null  float64
 9   V9      271286 non-null  float64
 10  V10     271286 non-null  float64
 11  V11     271286 non-null  float64
 12  V12     271286 non-null  float64
 13  V13     271286 non-null  float64
 14  V14     271286 non-null  float64
 15  V15     271286 non-null  float64
 16  V16     271286 non-null  float64
 17  V17     271286 non-null  float64
 18  V18     271286 non-null  float64
 19  V19     271286 non-null  float64
 20  V20     271286 non-null  float64
 21  V21     27

In [11]:
# Count the missing (NaN) entries per column.
credit_card_data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [12]:
# Distribution of the target label: how many legit (0) vs. fraudulent (1)
# transactions the dataset contains. Rows whose Class is missing are not
# counted, because value_counts() drops NaN by default.
credit_card_data['Class'].value_counts()

0.0    270805
1.0       481
Name: Class, dtype: int64

This dataset is highly imbalanced: 270,805 legitimate transactions versus only 481 fraudulent ones.

0---> legit transaction

1---> fraudulent transaction

In [13]:
# Partition the transactions by label: Class 0 -> legit, Class 1 -> fraud.
legit = credit_card_data.loc[credit_card_data['Class'].eq(0)]
fraud = credit_card_data.loc[credit_card_data['Class'].eq(1)]

In [14]:
# Show (rows, columns) for the legit frame, then for the fraud frame, one per line.
print(legit.shape, fraud.shape, sep='\n')

(270805, 31)
(481, 31)


In [16]:
# Summary statistics of the transaction amount for legit transactions.
legit['Amount'].describe()

count    270805.000000
mean         89.256037
std         247.420484
min           0.000000
25%           5.950000
50%          22.610000
75%          78.590000
max       19656.530000
Name: Amount, dtype: float64

In [17]:
# Summary statistics of the transaction amount for fraudulent transactions.
fraud['Amount'].describe()

count     481.000000
mean      121.239605
std       257.722080
min         0.000000
25%         1.000000
50%         8.640000
75%       104.810000
max      2125.870000
Name: Amount, dtype: float64

In [18]:
# Per-class mean of every feature, to contrast legit and fraudulent transactions.
credit_card_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,91177.538506,0.000672,-0.01915,0.050389,-7.4e-05,-0.009064,0.009098,0.003286,-0.001187,0.004646,...,0.000542,-0.001493,-0.003805,-0.001714,0.000904,0.007281,0.000388,-0.000572,0.000389,89.256037
1.0,78754.848233,-4.843897,3.709169,-7.124813,4.604331,-3.241169,-1.403691,-5.66008,0.613744,-2.620426,...,0.370551,0.737419,0.004536,-0.051545,-0.110216,0.047133,0.049161,0.164155,0.074541,121.239605


Under-Sampling

Build a sample dataset containing an equal number of legitimate and fraudulent transactions.

Number of Fraudulent Transactions -> 481

In [19]:
# Under-sample the majority class: draw as many legit rows as there are fraud
# rows so both classes end up the same size. The count is derived from `fraud`
# rather than hard-coded, so it cannot go stale if the data changes, and the
# fixed seed makes the sample (and every downstream result) reproducible on
# Restart & Run All.
legit_sample = legit.sample(n=len(fraud), random_state=2)

Concatenating two DataFrames

In [20]:
# Stack the sampled legit rows on top of the fraud rows (row-wise concatenation).
new_dataset = pd.concat([legit_sample, fraud], axis='index')

In [21]:
# First five rows of the combined frame (these come from the legit sample).
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
241340,150984.0,-0.420512,0.791774,-0.06792,-0.522942,1.011483,-1.454983,1.131044,-0.251177,-0.17781,...,0.330574,0.909559,-0.244763,-0.103601,-0.078273,-0.196925,0.152904,0.207002,30.9,0.0
160284,113197.0,2.032326,-0.241231,-1.449939,0.105174,0.021605,-0.689327,-0.090975,-0.022033,0.833096,...,0.294065,0.847448,0.03629,0.725566,0.236779,-0.467913,-0.015036,-0.061335,1.0,0.0
130960,79467.0,1.144895,-0.598569,0.943488,-0.822231,-1.25693,-0.197669,-0.895436,0.362078,1.870854,...,0.055527,0.301824,0.032428,0.189048,0.297114,-0.679706,0.092471,0.019131,1.0,0.0
4251,3754.0,1.349502,-0.569443,0.624953,-0.48479,-1.079634,-0.579746,-0.872499,0.000418,0.874982,...,-0.241831,-0.674323,0.198367,-0.072912,0.049101,-0.482856,-0.017657,0.009046,17.0,0.0
114684,73579.0,1.272071,0.403513,-1.09861,0.450086,0.773322,-0.035635,0.089967,0.138674,-0.093761,...,-0.170857,-0.512817,-0.190264,-1.307132,0.56623,0.459229,-0.030636,0.013805,0.76,0.0


In [22]:
# Last five rows of the combined frame (these come from the fraud rows).
new_dataset.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
263080,160791.0,2.132386,0.705608,-3.530759,0.514779,1.527175,-1.716268,1.132791,-0.574214,0.128904,...,0.163739,0.70391,-0.245076,0.460049,0.920281,-0.216586,-0.026219,-0.025001,1.0,1.0
263274,160870.0,-0.644278,5.002352,-8.252739,7.756915,-0.216267,-2.751496,-3.358857,1.406268,-4.403852,...,0.587728,-0.605759,0.033746,-0.75617,-0.008172,0.532772,0.66397,0.192067,0.77,1.0
263324,160895.0,-0.84829,2.719882,-6.19907,3.044437,-3.30191,-1.992117,-3.734902,1.520079,-2.548788,...,1.125229,0.805258,0.199119,0.035206,0.012159,0.601658,0.137468,-0.171397,127.14,1.0
263877,161154.0,-3.387601,3.977881,-6.978585,1.657766,-1.1005,-3.599487,-3.686651,1.942252,-3.065089,...,1.043587,0.262189,-0.479224,-0.326638,-0.156939,0.113807,0.354124,0.287592,0.38,1.0
268375,163181.0,-5.238808,0.623013,-5.784507,1.678889,-0.364432,-0.477295,-4.276132,-0.695173,-2.971644,...,-0.32614,1.509239,-0.215966,-0.245727,0.893041,0.865758,0.854657,-0.964482,39.98,1.0


In [23]:
# Confirm the two classes are now equally represented.
new_dataset.Class.value_counts()

0.0    481
1.0    481
Name: Class, dtype: int64

In [25]:
# Features: every column except the label. Target: the label column itself.
X = new_dataset.drop(columns=['Class'])
Y = new_dataset.loc[:, 'Class']

In [26]:
# Plain-text dump of the feature matrix (pandas truncates long frames in the repr).
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
241340  150984.0 -0.420512  0.791774 -0.067920 -0.522942  1.011483 -1.454983   
160284  113197.0  2.032326 -0.241231 -1.449939  0.105174  0.021605 -0.689327   
130960   79467.0  1.144895 -0.598569  0.943488 -0.822231 -1.256930 -0.197669   
4251      3754.0  1.349502 -0.569443  0.624953 -0.484790 -1.079634 -0.579746   
114684   73579.0  1.272071  0.403513 -1.098610  0.450086  0.773322 -0.035635   
...          ...       ...       ...       ...       ...       ...       ...   
263080  160791.0  2.132386  0.705608 -3.530759  0.514779  1.527175 -1.716268   
263274  160870.0 -0.644278  5.002352 -8.252739  7.756915 -0.216267 -2.751496   
263324  160895.0 -0.848290  2.719882 -6.199070  3.044437 -3.301910 -1.992117   
263877  161154.0 -3.387601  3.977881 -6.978585  1.657766 -1.100500 -3.599487   
268375  163181.0 -5.238808  0.623013 -5.784507  1.678889 -0.364432 -0.477295   

              V7        V8        V9  .

In [28]:
# Plain-text dump of the target labels (0.0 = legit, 1.0 = fraudulent).
print(Y)

241340    0.0
160284    0.0
130960    0.0
4251      0.0
114684    0.0
         ... 
263080    1.0
263274    1.0
263324    1.0
263877    1.0
268375    1.0
Name: Class, Length: 962, dtype: float64


Split the data into Training data & Testing Data

In [29]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class ratio
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [30]:
# Shapes of the full feature matrix, the training split and the test split.
print(*(part.shape for part in (X, X_train, X_test)))

(962, 30) (769, 30) (193, 30)


Model Training

Logistic Regression

In [31]:
# Instantiate a logistic-regression classifier with scikit-learn's default
# hyper-parameters.
# NOTE(review): the features are fed in unscaled (Time and Amount span much
# larger ranges than V1..V28), so the default solver may not converge within
# its default iteration budget — confirm no ConvergenceWarning is raised.
model = LogisticRegression()

In [34]:
# Fit the logistic-regression classifier on the training split.
model.fit(X=X_train, y=Y_train)

Model Evaluation

Accuracy Score

In [35]:
# Accuracy on the training data.
# Despite the `X_` prefix, X_train_prediction holds the predicted *labels*
# for the training rows.
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but
# keeping the documented order avoids silent errors if this line is ever
# adapted to an order-sensitive metric (precision, recall, ...).
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [36]:
# Report the fraction of training rows whose label was predicted correctly.
print('Accuracy on Training data :', training_data_accuracy)

Accuracy on Training data : 0.94148244473342


In [37]:
# Accuracy on the held-out test data.
# Despite the `X_` prefix, X_test_prediction holds the predicted *labels*
# for the test rows.
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but
# keeping the documented order avoids silent errors if this line is ever
# adapted to an order-sensitive metric (precision, recall, ...).
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [39]:
# Report the fraction of held-out test rows whose label was predicted correctly.
print('Accuracy on Testing data :', test_data_accuracy)

Accuracy on Testing data : 0.9481865284974094
