Importing the requirements


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the credit-card transactions CSV into a pandas DataFrame.
csv_path = '/content/creditcard.csv'
credit_card_data = pd.read_csv(csv_path)
# Preview the first 5 rows.
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [None]:
# Show the last 5 rows.
# NOTE(review): the rendered output shows the final row is almost entirely
# NaN, so the CSV looks truncated -- confirm the download is complete.
credit_card_data.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
269307,163622.0,0.508904,0.649664,0.973601,1.054893,-0.406338,-0.343784,0.018162,-0.061663,0.30135,...,0.071984,0.468318,0.265016,0.028976,-1.791758,-0.690011,0.418209,0.315569,1.0,0.0
269308,163623.0,1.969238,-0.386206,-0.422364,0.151434,-0.550257,-0.269741,-0.632417,0.143496,1.233122,...,-0.093816,-0.204126,0.319091,-0.481361,-0.401275,-0.914828,0.052024,-0.043066,1.0,0.0
269309,163623.0,2.143922,-0.681218,-1.32892,-0.786185,-0.457053,-1.277814,-0.080907,-0.473492,-0.66279,...,0.067409,0.222521,0.153918,0.031472,-0.000407,-0.283674,-0.035735,-0.055022,47.55,0.0
269310,163623.0,1.997843,-0.260107,-0.283649,0.346443,-0.733479,-0.853783,-0.453823,-0.096166,1.351108,...,-0.120094,-0.213117,0.356999,-0.017812,-0.374293,-0.926075,0.058296,-0.027203,1.0,0.0
269311,163624.0,2.369304,,,,,,,,,...,,,,,,,,,,


In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269312 entries, 0 to 269311
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    269312 non-null  float64
 1   V1      269312 non-null  float64
 2   V2      269311 non-null  float64
 3   V3      269311 non-null  float64
 4   V4      269311 non-null  float64
 5   V5      269311 non-null  float64
 6   V6      269311 non-null  float64
 7   V7      269311 non-null  float64
 8   V8      269311 non-null  float64
 9   V9      269311 non-null  float64
 10  V10     269311 non-null  float64
 11  V11     269311 non-null  float64
 12  V12     269311 non-null  float64
 13  V13     269311 non-null  float64
 14  V14     269311 non-null  float64
 15  V15     269311 non-null  float64
 16  V16     269311 non-null  float64
 17  V17     269311 non-null  float64
 18  V18     269311 non-null  float64
 19  V19     269311 non-null  float64
 20  V20     269311 non-null  float64
 21  V21     26

In [None]:
# Class labels used in this dataset:
#   0 --> legit (genuine) transaction
#   1 --> fraudulent transaction
# Count the missing values in each column.
credit_card_data.isna().sum()

In [None]:
# Distribution of legit vs. fraud transactions (row count per Class value).
credit_card_data.Class.value_counts()

0.0    268830
1.0       481
Name: Class, dtype: int64

In [None]:
# Separate the data by label for analysis: rows whose Class is 0 are stored
# (entire row) in `legit`, rows whose Class is 1 are stored in `fraud`.
class_labels = credit_card_data['Class']
legit = credit_card_data.loc[class_labels.eq(0)]
fraud = credit_card_data.loc[class_labels.eq(1)]

In [None]:
# Report (rows, columns) for the legit subset, then for the fraud subset.
for subset in (legit, fraud):
    print(subset.shape)

(268830, 31)
(481, 31)


In [None]:
# Summary statistics of the Amount column for legit transactions.
legit['Amount'].describe()


count    268830.000000
mean         89.452002
std         248.011506
min           0.000000
25%           5.990000
50%          22.700000
75%          78.897500
max       19656.530000
Name: Amount, dtype: float64

In [None]:
# Summary statistics of the Amount column for fraud transactions.
fraud['Amount'].describe()

count     481.000000
mean      121.239605
std       257.722080
min         0.000000
25%         1.000000
50%         8.640000
75%       104.810000
max      2125.870000
Name: Amount, dtype: float64

In [None]:
# Compare the two kinds of transaction: mean of every column, per Class value.
credit_card_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,90642.349295,-0.001811,-0.020296,0.055355,0.001446,-0.010964,0.010154,0.002839,-0.001283,0.004659,...,0.000882,-0.001455,-0.00398,-0.002036,0.001193,0.008436,0.000814,-0.000568,0.000444,89.452002
1.0,78754.848233,-4.843897,3.709169,-7.124813,4.604331,-3.241169,-1.403691,-5.66008,0.613744,-2.620426,...,0.370551,0.737419,0.004536,-0.051545,-0.110216,0.047133,0.049161,0.164155,0.074541,121.239605


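The per-class means above differ markedly between legit and fraud transactions for many features (for example V1, V3 and V4); these differences are what the model can use to tell the two classes apart.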

Because the classes are highly imbalanced (268,830 legit vs. 481 fraud transactions), we will build a sample dataset from the main dataset containing a similar number of legit and fraud transactions.


UNDER SAMPLING

Building a sample dataset containing a similar number of normal (legit) and fraudulent transactions.

In [None]:
# Under-sample the legit class: draw exactly as many legit rows as there are
# fraud rows, so the two classes are balanced whatever the size of the loaded
# file (a hard-coded 492 does not match the 481 fraud rows actually present).
# A fixed random_state makes the draw -- and every result derived from it --
# reproducible across kernel restarts.
legit_sample = legit.sample(n=len(fraud), random_state=0)

In [None]:
# Combine the sampled legit rows and all fraud rows for further evaluation.
# Concatenation is row-wise (axis 0, the pandas default), so the fraud rows
# are appended below the legit sample; axis 1 would join column-wise instead.
balanced_parts = [legit_sample, fraud]
new_dataset = pd.concat(balanced_parts)

In [None]:
# First 5 rows of the combined dataset.
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
193903,130347.0,1.795719,-0.391344,-2.106538,1.174832,0.64212,-0.154975,0.473801,-0.044963,0.301293,...,0.111226,0.090669,-0.112152,0.166873,0.384708,-0.517678,-0.061944,-0.054015,128.09,0.0
49681,44161.0,-1.080087,1.668525,-0.476599,2.803964,-0.108232,-0.24605,0.146677,0.838073,-1.756628,...,0.204499,0.516322,0.050888,0.016483,-0.947367,0.111413,0.421069,0.126093,62.4,0.0
37460,38963.0,-8.164069,4.635965,-3.910798,-0.110987,-5.057069,-0.775761,-3.550786,5.305288,0.359102,...,0.113622,-0.217779,-0.095296,0.611372,0.738774,0.37245,-0.132885,0.016547,49.74,0.0
74742,55733.0,1.184561,-0.202022,-0.699687,-0.012383,0.030743,-0.921186,0.487977,-0.315552,0.041715,...,-0.052641,-0.399791,-0.267204,-0.381238,0.593334,1.114471,-0.140171,-0.001816,108.95,0.0
9001,12486.0,-1.303586,1.797254,0.402989,0.912417,-0.295206,-0.373796,-0.128221,0.838366,0.400168,...,0.041922,0.283304,-0.095552,-0.052431,-0.115518,-0.344066,0.250413,0.11907,15.0,0.0


In [None]:
# Last 5 rows of the combined dataset.
new_dataset.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
263080,160791.0,2.132386,0.705608,-3.530759,0.514779,1.527175,-1.716268,1.132791,-0.574214,0.128904,...,0.163739,0.70391,-0.245076,0.460049,0.920281,-0.216586,-0.026219,-0.025001,1.0,1.0
263274,160870.0,-0.644278,5.002352,-8.252739,7.756915,-0.216267,-2.751496,-3.358857,1.406268,-4.403852,...,0.587728,-0.605759,0.033746,-0.75617,-0.008172,0.532772,0.66397,0.192067,0.77,1.0
263324,160895.0,-0.84829,2.719882,-6.19907,3.044437,-3.30191,-1.992117,-3.734902,1.520079,-2.548788,...,1.125229,0.805258,0.199119,0.035206,0.012159,0.601658,0.137468,-0.171397,127.14,1.0
263877,161154.0,-3.387601,3.977881,-6.978585,1.657766,-1.1005,-3.599487,-3.686651,1.942252,-3.065089,...,1.043587,0.262189,-0.479224,-0.326638,-0.156939,0.113807,0.354124,0.287592,0.38,1.0
268375,163181.0,-5.238808,0.623013,-5.784507,1.678889,-0.364432,-0.477295,-4.276132,-0.695173,-2.971644,...,-0.32614,1.509239,-0.215966,-0.245727,0.893041,0.865758,0.854657,-0.964482,39.98,1.0


In [None]:
# Row count per Class value in the under-sampled dataset.
new_dataset.Class.value_counts()

0.0    492
1.0    481
Name: Class, dtype: int64

In [None]:
# Per-class column means of the under-sampled dataset. If these stay
# approximately equal to the means of the original dataset, the sampling has
# not changed the nature of the data.
new_dataset.groupby(by='Class').mean()


Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,86788.300813,0.093432,0.069852,0.026361,-0.036968,0.088681,0.055353,-0.007676,0.093076,0.024358,...,-0.050336,-0.028692,-0.026571,-0.035123,-0.0049,0.035831,0.014736,0.008712,0.009158,64.549065
1.0,78754.848233,-4.843897,3.709169,-7.124813,4.604331,-3.241169,-1.403691,-5.66008,0.613744,-2.620426,...,0.370551,0.737419,0.004536,-0.051545,-0.110216,0.047133,0.049161,0.164155,0.074541,121.239605


splitting the data into features and targets

In [None]:
# Features: every column except the label. `columns=` already targets the
# column axis, so no separate axis argument is needed.
X = new_dataset.drop(columns='Class')
# Target: the Class label column.
Y = new_dataset['Class']

In [None]:
# Print the feature frame followed by the target series (plain-text form).
print(X, Y, sep='\n')

            Time        V1        V2        V3        V4        V5        V6  \
193903  130347.0  1.795719 -0.391344 -2.106538  1.174832  0.642120 -0.154975   
49681    44161.0 -1.080087  1.668525 -0.476599  2.803964 -0.108232 -0.246050   
37460    38963.0 -8.164069  4.635965 -3.910798 -0.110987 -5.057069 -0.775761   
74742    55733.0  1.184561 -0.202022 -0.699687 -0.012383  0.030743 -0.921186   
9001     12486.0 -1.303586  1.797254  0.402989  0.912417 -0.295206 -0.373796   
...          ...       ...       ...       ...       ...       ...       ...   
263080  160791.0  2.132386  0.705608 -3.530759  0.514779  1.527175 -1.716268   
263274  160870.0 -0.644278  5.002352 -8.252739  7.756915 -0.216267 -2.751496   
263324  160895.0 -0.848290  2.719882 -6.199070  3.044437 -3.301910 -1.992117   
263877  161154.0 -3.387601  3.977881 -6.978585  1.657766 -1.100500 -3.599487   
268375  163181.0 -5.238808  0.623013 -5.784507  1.678889 -0.364432 -0.477295   

              V7        V8        V9  .

In [None]:
# Split the data into training data and testing data.
#   test_size=0.3   -> 30% of the rows are held out as testing data
#   stratify=Y      -> train and test keep the same legit/fraud proportion
#   random_state=0  -> fixed seed, so the shuffle (and the split) is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=0
)

In [None]:
# Shapes of the full feature set, the training split and the testing split.
print(*(frame.shape for frame in (X, X_train, X_test)))

(973, 30) (681, 30) (292, 30)


MODEL TRAINING

LOGISTIC REGRESSION


In [None]:
# Create the logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled (the displayed Time and Amount
# values are orders of magnitude larger than V1..V28), so the default solver
# may stop before converging -- consider scaling the features or raising
# max_iter; confirm by checking for a ConvergenceWarning when fitting.
model = LogisticRegression()

In [None]:
# Train the logistic regression model on the training data.
model.fit(X=X_train, y=Y_train)

In [None]:
# Predict the class label for every row of the testing data.
Y_predict = model.predict(X=X_test)

In [None]:
# Fraction of testing rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_predict)

In [None]:
# Report the test-set accuracy (spelling of "accuracy" corrected in the message).
print("accuracy of the model is -> ", accuracy)

acuuracy of the model is ->  0.9246575342465754
