In [61]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [62]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [63]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [64]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
79313,57966,-0.527571,-3.483241,-0.206444,-0.583181,-2.067419,-0.129826,0.454118,-0.149176,1.863281,...,0.691203,0.22447,-0.875993,0.204226,0.079542,0.029189,-0.10922,0.178251,912.79,0.0
79314,57966,-0.541422,1.341092,0.576882,0.733835,0.259657,-0.233639,0.712069,0.190731,-1.067355,...,0.189997,0.493842,-0.141807,-0.406546,-0.043605,-0.28751,0.045194,0.100293,49.33,0.0
79315,57966,-0.830218,0.971781,0.974916,-1.348723,-0.230572,-0.778253,0.395464,0.109896,0.234581,...,-0.115767,-0.33595,-0.068152,-0.109514,-0.204347,0.7035,-0.38353,-0.22378,14.6,0.0
79316,57966,1.104838,-1.141651,0.942113,-0.661271,-1.591229,-0.056758,-1.19257,0.275125,-0.4479,...,0.508825,1.146845,-0.117753,0.241582,0.212189,-0.054096,0.023744,0.026206,99.0,0.0
79317,57967,-3.560685,3.485801,-0.070146,2.191571,-0.429913,1.075498,-0.935968,-2.147517,-1.165398,...,,,,,,,,,,


In [65]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79318 entries, 0 to 79317
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    79318 non-null  int64  
 1   V1      79318 non-null  float64
 2   V2      79318 non-null  float64
 3   V3      79318 non-null  float64
 4   V4      79318 non-null  float64
 5   V5      79318 non-null  float64
 6   V6      79318 non-null  float64
 7   V7      79318 non-null  float64
 8   V8      79318 non-null  float64
 9   V9      79318 non-null  float64
 10  V10     79318 non-null  float64
 11  V11     79317 non-null  float64
 12  V12     79317 non-null  float64
 13  V13     79317 non-null  float64
 14  V14     79317 non-null  float64
 15  V15     79317 non-null  float64
 16  V16     79317 non-null  float64
 17  V17     79317 non-null  float64
 18  V18     79317 non-null  float64
 19  V19     79317 non-null  float64
 20  V20     79317 non-null  float64
 21  V21     79317 non-null  float64
 22

In [66]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [67]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

0.0    79126
1.0      191
Name: Class, dtype: int64

In [68]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [69]:
print(legit.shape)
print(fraud.shape)

(79126, 31)
(191, 31)


In [70]:
# statistical measures of the data
legit.Amount.describe()

count    79126.000000
mean        97.773259
std        269.765878
min          0.000000
25%          7.700000
50%         26.925000
75%         89.000000
max      19656.530000
Name: Amount, dtype: float64

In [71]:
fraud.Amount.describe()

count     191.000000
mean       95.142251
std       214.122813
min         0.000000
25%         1.000000
50%         7.520000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [72]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,37164.532164,-0.241928,-0.047016,0.701122,0.152361,-0.264869,0.100851,-0.097874,0.046041,0.000222,...,0.041575,-0.030941,-0.10567,-0.03743,0.007877,0.134383,0.026147,0.000715,0.002542,97.773259
1.0,32448.565445,-6.660361,4.685496,-8.847226,5.220761,-4.885698,-2.012799,-7.016277,3.141649,-3.135469,...,0.378115,0.779382,-0.158471,-0.220629,-0.091717,0.23757,0.096534,0.58607,0.053246,95.142251


In [73]:
legit_sample = legit.sample(n=492)

In [74]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [75]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
63090,50549,1.238769,0.063438,0.360251,0.445389,-0.419586,-0.522507,-0.16656,0.061871,0.072878,...,-0.247271,-0.835703,0.104439,-0.049942,0.171698,0.103275,-0.044786,0.001795,1.98,0.0
27323,34502,-1.379,0.232794,2.6266,0.39198,0.284864,-0.18069,0.210553,0.022154,0.729753,...,-0.02517,0.283123,-0.480528,0.407437,0.455121,-0.395355,-0.28194,-0.149592,8.0,0.0
39800,39972,1.264068,1.274004,-1.953181,1.394859,1.484954,-0.867329,0.721149,-0.22833,-0.483248,...,-0.231105,-0.496493,-0.289724,-1.075149,0.91681,-0.214503,0.059242,0.086327,1.0,0.0
57199,47802,-0.586511,0.784236,1.95233,0.351215,0.383172,1.63109,-0.43249,-0.543681,0.062516,...,1.098739,0.958778,-0.247469,-1.322793,-0.43675,-0.190748,-0.019716,-0.05066,2.3,0.0
60928,49567,0.894202,-0.560954,0.865197,1.483545,-0.691903,0.708645,-0.502758,0.300284,1.097386,...,-0.121023,-0.372682,-0.118628,-0.475493,0.35441,-0.388462,0.057464,0.04751,135.0,0.0


In [76]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
76929,56806,0.016828,2.400826,-4.22036,3.462217,-0.624142,-1.294303,-2.986028,0.751883,-1.606672,...,0.285832,-0.771508,-0.2652,-0.873077,0.939776,-0.219085,0.874494,0.470434,1.0,1.0
77099,56887,-0.075483,1.812355,-2.566981,4.127549,-1.628532,-0.805895,-3.390135,1.019353,-2.451251,...,0.794372,0.270471,-0.143624,0.013566,0.634203,0.213693,0.773625,0.387434,5.0,1.0
77348,57007,-1.271244,2.462675,-2.851395,2.32448,-1.372245,-0.948196,-3.065234,1.166927,-2.268771,...,0.652941,0.081931,-0.221348,-0.523582,0.224228,0.756335,0.6328,0.250187,0.01,1.0
77387,57027,-2.335655,2.22538,-3.37945,2.178538,-3.568264,0.316814,-1.734948,1.449139,-1.980033,...,0.78554,0.297412,0.308536,-0.598416,-0.12185,-0.491018,0.701606,0.206966,444.17,1.0
77682,57163,-10.363049,4.543672,-9.795898,5.508003,-6.037156,-0.133493,-11.724346,-3.198346,-4.767842,...,-2.457145,1.687257,0.977178,-0.543369,-0.289125,-0.107586,0.330642,0.163577,1.0,1.0


In [77]:
new_dataset['Class'].value_counts()

0.0    492
1.0    191
Name: Class, dtype: int64

In [78]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,38246.01626,-0.172051,-0.031942,0.75581,0.049213,-0.251566,0.10477,-0.069438,0.010098,-0.00124,...,0.01465,0.047296,-0.054592,-0.065892,-0.006835,0.142174,0.005349,-0.007835,-0.008479,94.054431
1.0,32448.565445,-6.660361,4.685496,-8.847226,5.220761,-4.885698,-2.012799,-7.016277,3.141649,-3.135469,...,0.378115,0.779382,-0.158471,-0.220629,-0.091717,0.23757,0.096534,0.58607,0.053246,95.142251


In [79]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [80]:
print(X)

        Time         V1        V2        V3        V4        V5        V6  \
63090  50549   1.238769  0.063438  0.360251  0.445389 -0.419586 -0.522507   
27323  34502  -1.379000  0.232794  2.626600  0.391980  0.284864 -0.180690   
39800  39972   1.264068  1.274004 -1.953181  1.394859  1.484954 -0.867329   
57199  47802  -0.586511  0.784236  1.952330  0.351215  0.383172  1.631090   
60928  49567   0.894202 -0.560954  0.865197  1.483545 -0.691903  0.708645   
...      ...        ...       ...       ...       ...       ...       ...   
76929  56806   0.016828  2.400826 -4.220360  3.462217 -0.624142 -1.294303   
77099  56887  -0.075483  1.812355 -2.566981  4.127549 -1.628532 -0.805895   
77348  57007  -1.271244  2.462675 -2.851395  2.324480 -1.372245 -0.948196   
77387  57027  -2.335655  2.225380 -3.379450  2.178538 -3.568264  0.316814   
77682  57163 -10.363049  4.543672 -9.795898  5.508003 -6.037156 -0.133493   

              V7        V8        V9  ...       V20       V21       V22  \


In [81]:
print(Y)

63090    0.0
27323    0.0
39800    0.0
57199    0.0
60928    0.0
        ... 
76929    1.0
77099    1.0
77348    1.0
77387    1.0
77682    1.0
Name: Class, Length: 683, dtype: float64


In [82]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [83]:
print(X.shape, X_train.shape, X_test.shape)

(683, 30) (546, 30) (137, 30)


In [84]:
model = LogisticRegression()

In [85]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [86]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [87]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9706959706959707


In [88]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [89]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.927007299270073


In [91]:
input_data = (58060,-2.63059770314604,5.12575923008793,-6.09225454651378,5.52739306209755,1.60514485494011,-2.31988352953118,-3.20707570564086,-1.48258321071708,-5.07487063178104,-6.77833138248069,5.72525459149638,-6.94917182528041,-3.12779501198771,-11.090424813263,-0.800272569362332,-1.70750085360527,-1.68547301699574,0.274890838662414,-2.02888535240153,0.948863770149954,-0.527474204141788,0.220546094514546,-1.37110989476897,-0.504898994517705,0.382306792598482,0.395527722278956,0.782035950782542,0.62852786395406,1)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predicting for one datapoint
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print(' legitimate ')

else:
  print('fraudulent')

[1.]
fraudulent




In [92]:
import pickle

In [93]:
filename = 'credit_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [94]:
# loading the saved model
loaded_model = pickle.load(open('credit_model.sav', 'rb'))

In [95]:
for column in X.columns:
  print(column)

Time
V1
V2
V3
V4
V5
V6
V7
V8
V9
V10
V11
V12
V13
V14
V15
V16
V17
V18
V19
V20
V21
V22
V23
V24
V25
V26
V27
V28
Amount
