In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix

In [4]:
# Reading and analysing data
data = pd.read_csv("../Datasets/creditcard.csv") # data -> Pandas dataframe
print(data)    
data.describe()

            Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  0.098698  0.363787  ... -0.01830

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [5]:
# Accessing value by columns
print("Time column in the dataframe", data["Time"])
print("Class column in the dataframe", data["Class"])

Time column in the dataframe 0              0.0
1              0.0
2              1.0
3              1.0
4              2.0
            ...   
284802    172786.0
284803    172787.0
284804    172788.0
284805    172788.0
284806    172792.0
Name: Time, Length: 284807, dtype: float64
Class column in the dataframe 0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64


In [6]:
# Scaling
std_scalar = StandardScaler()

data['Amount'] = std_scalar.fit_transform(data['Amount'].values.reshape(-1,1))
data['Time'] = std_scalar.fit_transform(data['Time'].values.reshape(-1,1))
print(data)

            Time         V1         V2        V3        V4        V5  \
0      -1.996583  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1      -1.996583   1.191857   0.266151  0.166480  0.448154  0.060018   
2      -1.996562  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3      -1.996562  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4      -1.996541  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  1.641931 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  1.641952  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  1.641974   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  1.641974  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  1.642058  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  0.098698  0.363787  ... -0.01830

In [7]:
# Anaylsing the distribution of classes
# data["column_name"].value_counts(), returns unique values in that column

print(data['Class'].value_counts())
print('No Frauds', round(data['Class'].value_counts()[0]/len(data) * 100,2), '% of the dataset')
print('Frauds', round(data['Class'].value_counts()[1]/len(data) * 100,2), '% of the dataset')

0    284315
1       492
Name: Class, dtype: int64
No Frauds 99.83 % of the dataset
Frauds 0.17 % of the dataset


In [8]:
fraud_df = data.loc[data['Class'] == 1] # smaples which have fraud
non_fraud_df = data.loc[data['Class'] == 0][:492] # no fruad 1: 493  

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle dataframe rows
data = normal_distributed_df.sample(frac=1, random_state=42)

print(data['Class'].value_counts()/len(data))

1    0.5
0    0.5
Name: Class, dtype: float64


In [9]:
X = data.drop('Class', axis=1) # data.drop -> removes one columns
y = data['Class']
 
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y , test_size=0.4)  # Batch training (not-mini batch)

train_unique_label, train_counts_label = np.unique(ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(ytest, return_counts=True)

print("Number of training samples", Xtrain.shape[0])
print("Number of testing samples", Xtest.shape[0])

print(train_counts_label, test_counts_label)

Number of training samples 590
Number of testing samples 394
[285 305] [207 187]


In [10]:
# Logistic Regression

clf = LogisticRegression().fit(Xtrain, ytrain)

# OOP
# LogisticRegression is a class defined in sklearn 
# Class 
# methods 
# Initiliaze a class -> Object 

# Classifying one sample
print("Xtest", Xtest.values[0, :])
prob = clf.predict_proba(Xtest.values[0, :].reshape(1, -1))

print("Probabilities", prob)
predicted = prob.argmax() 
# argmax  argmax([0.4, 0.56, 0.87, 0.1, 0.09, -0.9]) = 2 (Returns location of max value)
print("Predicted result", predicted)
print("Original class", ytest.values[0])

predicted = clf.predict(Xtest.values[0, :].reshape(1, -1))
print("Predicted result", predicted)
print("Original class", ytest.values[0])

Xtest [ 1.12110576  1.26132424  2.72680029 -5.43501891  5.34275901  1.44704302
 -1.44258409 -0.89870185  0.12306198 -2.7484959  -3.20243612  1.99136063
 -3.98641622  0.57720684 -8.48579451 -0.79478227 -0.66613427 -1.37262938
 -0.10431296 -1.46691055  0.31333163  0.20908614 -0.42593761 -0.15443953
 -0.0188195   0.63223395  0.19292162  0.46818089  0.28048647 -0.34687244]
Probabilities [[1.59974589e-09 9.99999998e-01]]
Predicted result 1
Original class 1
Predicted result [1]
Original class 1




In [11]:
score = clf.score(Xtest, ytest) 
print("Precision on entire test dataset", score)

Precision on entire test dataset 0.9873096446700508


In [12]:
y_pred = clf.predict(Xtest.values)

# Confusion Matrix
#                | Positive Prediction | Negative Prediction
# Positive Class | True Positive (TP)  | False Negative (FN)
# Negative Class | False Positive (FP) | True Negative (TN)

# Number of correct predictions = TP + TN
# Number of wrong  predictions = FP + FN

# Number of positive samples in the dataset = TP + FN
# Number of negative samples in the dataset = TN + FP

conf_mat = confusion_matrix(y_pred, ytest)
print(conf_mat)

[[205   3]
 [  2 184]]




In [13]:
prec = precision_score(ytest, y_pred)
print("Precision score", prec)

recall = recall_score(ytest, y_pred) # TP/(TP + FN)
print("Recall score", recall)

Precision score 0.989247311827957
Recall score 0.983957219251337
