In [1]:
import pandas as pd
from sklearn import svm, datasets
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, f1_score
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Locations of the pre-split training and test CSV files.
train_url = "./Data/Train/train_data.csv"
test_url = "./Data/Test/test_data.csv"

# Read both splits into DataFrames.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_url, test_url))

# Print (rows, columns) for each split, one per line, then preview the training rows.
print(train_data.shape, test_data.shape, sep="\n")
train_data.head()

(800000, 8)
(200000, 8)


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,5.572427,4.850337,1.14987,1.0,0.0,0.0,0.0,0.0
1,5.32753,3.933291,1.682739,1.0,1.0,0.0,1.0,0.0
2,78.945633,0.023222,2.740935,1.0,1.0,0.0,1.0,0.0
3,10.119026,26.548445,1.945075,1.0,0.0,0.0,0.0,0.0
4,33.646388,1.90924,0.888281,1.0,0.0,0.0,0.0,0.0


In [3]:
# Separate the label column ("fraud") from the remaining feature columns.
train_X = train_data.drop(columns="fraud")
train_Y = train_data["fraud"]
test_X = test_data.drop(columns="fraud")
test_Y = test_data["fraud"]

# Report the shapes and the class balance of each split.
print("Train Data Shape", train_X.shape, train_Y.shape)
print(train_Y.value_counts())
print("Test Data Shape", test_X.shape, test_Y.shape)
print(test_Y.value_counts())

# Preview the feature matrix.
train_X.head()

Train Data Shape (800000, 7) (800000,)
0.0    730181
1.0     69819
Name: fraud, dtype: int64
Test Data Shape (200000, 7) (200000,)
0.0    182416
1.0     17584
Name: fraud, dtype: int64


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
0,5.572427,4.850337,1.14987,1.0,0.0,0.0,0.0
1,5.32753,3.933291,1.682739,1.0,1.0,0.0,1.0
2,78.945633,0.023222,2.740935,1.0,1.0,0.0,1.0
3,10.119026,26.548445,1.945075,1.0,0.0,0.0,0.0
4,33.646388,1.90924,0.888281,1.0,0.0,0.0,0.0


In [4]:
# Sanity check: taking the first 4 rows keeps every feature column.
train_X.head(4).shape

(4, 7)

# Support Vector Machine Classifier

## SVM with Default Hyperparameters

The dataset is too large for SVM training, so only the first 100,000 rows of the training set (and of the test set) are used.

In [5]:
# Number of leading rows taken from the training set and from the test set
# for every SVM experiment below.
sample_size = 100_000

In [6]:
# Fit an SVC with library-default hyperparameters on the first `sample_size`
# training rows.
# NOTE(review): the features are passed in unscaled; SVC is sensitive to
# feature scale, so consider standardising them - confirm with the author.
svc_default = SVC()
svc_default.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [7]:
# Predict labels for the first `sample_size` test rows with the default SVC.
y_pred_default = svc_default.predict(X=test_X.head(sample_size))

In [8]:
# Report accuracy of the default SVC on the evaluated test rows (4 decimals).
print(
    'Model accuracy score with default hyperparameters: {0:0.4f}'.format(
        accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_default)
    )
)

Model accuracy score with default hyperparameters: 0.9300


In [9]:
# Confusion matrix for the default SVC: rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_default)

array([[90388,   785],
       [ 6214,  2613]], dtype=int64)

## SVM with C = 100 (a higher C penalizes misclassified training points more heavily, so fewer are tolerated)

In [10]:
# Fit an SVC with C = 100 (all other hyperparameters left at their defaults)
# on the first `sample_size` training rows.
svc_highC = SVC(C=100.0)
svc_highC.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [11]:
# Predict labels for the first `sample_size` test rows with the C = 100 model.
y_pred_highC = svc_highC.predict(X=test_X.head(sample_size))

In [12]:
# Report accuracy of the C = 100 model on the evaluated test rows (4 decimals).
print(
    'Model accuracy score with High C = 100 Value: {0:0.4f}'.format(
        accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_highC)
    )
)

Model accuracy score with High C = 100 Value: 0.9835


In [13]:
# Confusion matrix for the C = 100 model: rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_highC)

array([[90863,   310],
       [ 1336,  7491]], dtype=int64)

## SVM with C = 1000 (a higher C penalizes misclassified training points more heavily, so fewer are tolerated)

In [14]:
# Fit an SVC with C = 1000 (all other hyperparameters left at their defaults)
# on the first `sample_size` training rows.
svc_higherC = SVC(C=1000.0)
svc_higherC.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [15]:
# Predict labels for the first `sample_size` test rows with the C = 1000 model.
y_pred_higherC = svc_higherC.predict(X=test_X.head(sample_size))

In [16]:
# Report accuracy of the C = 1000 model on the evaluated test rows (4 decimals).
print(
    'Model accuracy score with Higher C = 1000 Value: {0:0.4f}'.format(
        accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_higherC)
    )
)

Model accuracy score with Higher C = 1000 Value: 0.9901


In [17]:
# Confusion matrix for the C = 1000 model: rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_higherC)

array([[90934,   239],
       [  747,  8080]], dtype=int64)

## SVM with linear kernel and C = 1.0

In [18]:
# Fit a linear-kernel SVC with C = 1 on the first `sample_size` training rows.
linear_C1 = SVC(C=1.0, kernel='linear')
linear_C1.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [19]:
# Predict labels for the first `sample_size` test rows (linear kernel, C = 1).
y_pred_linear_C1 = linear_C1.predict(X=test_X.head(sample_size))

In [20]:
# Fraction of the evaluated test rows classified correctly (linear kernel, C = 1).
accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_linear_C1)

0.96031

In [21]:
# Confusion matrix (linear kernel, C = 1): rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_linear_C1)

array([[90530,   643],
       [ 3326,  5501]], dtype=int64)

## SVM with linear kernel and C = 100.0

In [22]:
# Fit a linear-kernel SVC with C = 100 on the first `sample_size` training rows.
linear_C100 = SVC(C=100.0, kernel='linear')
linear_C100.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [23]:
# Predict labels for the first `sample_size` test rows (linear kernel, C = 100).
y_pred_linear_C100 = linear_C100.predict(X=test_X.head(sample_size))

In [24]:
# Fraction of the evaluated test rows classified correctly (linear kernel, C = 100).
accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_linear_C100)

0.96495

In [25]:
# Confusion matrix (linear kernel, C = 100): rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_linear_C100)

array([[90462,   711],
       [ 2794,  6033]], dtype=int64)

## SVM with linear kernel and C = 1000.0

In [26]:
# Fit a linear-kernel SVC with C = 1000 on the first `sample_size` training rows.
linear_C1000 = SVC(C=1000.0, kernel='linear')
linear_C1000.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [27]:
# Predict labels for the first `sample_size` test rows (linear kernel, C = 1000).
y_pred_linear_C1000 = linear_C1000.predict(X=test_X.head(sample_size))

In [28]:
# Fraction of the evaluated test rows classified correctly (linear kernel, C = 1000).
accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_linear_C1000)

0.96314

In [29]:
# Confusion matrix (linear kernel, C = 1000): rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_linear_C1000)

array([[90349,   824],
       [ 2862,  5965]], dtype=int64)

## SVM with polynomial kernel and C = 1.0

In [30]:
# Fit a polynomial-kernel SVC with C = 1 on the first `sample_size` training rows.
poly_C1 = SVC(C=1.0, kernel='poly')
poly_C1.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [31]:
# Predict labels for the first `sample_size` test rows (polynomial kernel, C = 1).
y_pred_poly_C1 = poly_C1.predict(X=test_X.head(sample_size))

In [32]:
# Fraction of the evaluated test rows classified correctly (polynomial kernel, C = 1).
accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_poly_C1)

0.92563

In [33]:
# Confusion matrix (polynomial kernel, C = 1): rows are true classes, columns
# are predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_poly_C1)

array([[91042,   131],
       [ 7306,  1521]], dtype=int64)

## SVM with polynomial kernel and C = 100.0

In [34]:
# Fit a polynomial-kernel SVC with C = 100 on the first `sample_size` training rows.
poly_C100 = SVC(C=100.0, kernel='poly')
poly_C100.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [35]:
# Predict labels for the first `sample_size` test rows (polynomial kernel, C = 100).
y_pred_poly_C100 = poly_C100.predict(X=test_X.head(sample_size))

In [36]:
# Fraction of the evaluated test rows classified correctly (polynomial kernel, C = 100).
accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_poly_C100)

0.9087

In [37]:
# Confusion matrix (polynomial kernel, C = 100): rows are true classes, columns
# are predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_poly_C100)

array([[87184,  3989],
       [ 5141,  3686]], dtype=int64)

## SVM with polynomial kernel and C = 1000.0

In [38]:
# Fit a polynomial-kernel SVC with C = 1000 on the first `sample_size` training rows.
poly_C1000 = SVC(C=1000.0, kernel='poly')
poly_C1000.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [39]:
# Predict labels for the first `sample_size` test rows (polynomial kernel, C = 1000).
y_pred_poly_C1000 = poly_C1000.predict(X=test_X.head(sample_size))

In [40]:
# Fraction of the evaluated test rows classified correctly (polynomial kernel, C = 1000).
accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_poly_C1000)

0.90172

In [41]:
# Confusion matrix (polynomial kernel, C = 1000): rows are true classes, columns
# are predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_poly_C1000)

array([[86331,  4842],
       [ 4986,  3841]], dtype=int64)

## SVM with sigmoid kernel and C = 1.0

In [42]:
# Fit a sigmoid-kernel SVC with C = 1 on the first `sample_size` training rows.
sig_C1 = SVC(C=1.0, kernel='sigmoid')
sig_C1.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [43]:
# Predict labels for the first `sample_size` test rows (sigmoid kernel, C = 1).
y_pred_sig_C1 = sig_C1.predict(X=test_X.head(sample_size))

In [44]:
# Fraction of the evaluated test rows classified correctly (sigmoid kernel, C = 1).
accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_sig_C1)

0.84558

In [45]:
# Confusion matrix (sigmoid kernel, C = 1): rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_sig_C1)

array([[83686,  7487],
       [ 7955,   872]], dtype=int64)

## SVM with sigmoid kernel and C = 100.0

In [46]:
# Fit a sigmoid-kernel SVC with C = 100 on the first `sample_size` training rows.
sig_C100 = SVC(C=100.0, kernel='sigmoid')
sig_C100.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [47]:
# Predict labels for the first `sample_size` test rows (sigmoid kernel, C = 100).
y_pred_sig_C100 = sig_C100.predict(X=test_X.head(sample_size))

In [48]:
# Fraction of the evaluated test rows classified correctly (sigmoid kernel, C = 100).
accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_sig_C100)

0.84262

In [49]:
# Confusion matrix (sigmoid kernel, C = 100): rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_sig_C100)

array([[83384,  7789],
       [ 7949,   878]], dtype=int64)

## SVM with sigmoid kernel and C = 1000.0

In [50]:
# Fit a sigmoid-kernel SVC with C = 1000 on the first `sample_size` training rows.
sig_C1000 = SVC(C=1000.0, kernel='sigmoid')
sig_C1000.fit(X=train_X.head(sample_size), y=train_Y.head(sample_size))

In [51]:
# Predict labels for the first `sample_size` test rows (sigmoid kernel, C = 1000).
y_pred_sig_C1000 = sig_C1000.predict(X=test_X.head(sample_size))

In [52]:
# Fraction of the evaluated test rows classified correctly (sigmoid kernel, C = 1000).
accuracy_score(y_true=test_Y.head(sample_size), y_pred=y_pred_sig_C1000)

0.84259

In [53]:
# Confusion matrix (sigmoid kernel, C = 1000): rows are true classes, columns
# are predicted classes.
confusion_matrix(y_true=test_Y.head(sample_size), y_pred=y_pred_sig_C1000)

array([[83381,  7792],
       [ 7949,   878]], dtype=int64)

## Best Model

In [54]:
# Show the best model.
# Every candidate above was scored on the same first `sample_size` test rows,
# so their predictions are directly comparable. Candidates are ranked by F1 on
# the fraud class rather than accuracy, because the classes are imbalanced and
# accuracy is dominated by the majority (non-fraud) class.
best_y_true = test_Y.iloc[:sample_size]
candidate_predictions = {
    "default kernel, C=1 (defaults)": y_pred_default,
    "default kernel, C=100": y_pred_highC,
    "default kernel, C=1000": y_pred_higherC,
    "linear kernel, C=1": y_pred_linear_C1,
    "linear kernel, C=100": y_pred_linear_C100,
    "linear kernel, C=1000": y_pred_linear_C1000,
    "polynomial kernel, C=1": y_pred_poly_C1,
    "polynomial kernel, C=100": y_pred_poly_C100,
    "polynomial kernel, C=1000": y_pred_poly_C1000,
    "sigmoid kernel, C=1": y_pred_sig_C1,
    "sigmoid kernel, C=100": y_pred_sig_C100,
    "sigmoid kernel, C=1000": y_pred_sig_C1000,
}
candidate_f1 = {
    label: f1_score(best_y_true, y_pred, pos_label=1)
    for label, y_pred in candidate_predictions.items()
}
best_label = max(candidate_f1, key=candidate_f1.get)
best_y_pred = candidate_predictions[best_label]

# Show the F1 score (plus accuracy, for comparison with the cells above).
print("Best model: SVM with {}".format(best_label))
print("F1 score: {0:0.4f}".format(candidate_f1[best_label]))
print("Accuracy: {0:0.4f}".format(accuracy_score(best_y_true, best_y_pred)))

# Show the confusion matrix as a coloured image. confusion_matrix orders the
# classes by sorted label value, so 0.0 (not fraud) comes before 1.0 (fraud).
best_cm_display = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(best_y_true, best_y_pred),
    display_labels=["not fraud", "fraud"],
)
best_cm_display.plot(cmap="Blues", values_format="d")
best_cm_display.ax_.set_title("Confusion matrix: SVM with " + best_label);