In [1]:
#importing basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('Training Dataset.arff', header=None, comment='@')
# data = pd.read_csv("urldata.csv")
# data = data.drop(['Domain'], axis = 1).copy()

In [3]:
data.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
dtype: int64

In [4]:
# shuffling the rows in the dataset so that when splitting the train and test set are equally distributed
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,1,-1,1,1,1,-1,-1,-1,1,1,...,1,1,1,-1,-1,-1,1,0,1,-1
1,1,-1,1,1,1,-1,-1,-1,-1,1,...,1,1,-1,1,1,-1,1,0,1,1
2,-1,-1,1,1,1,-1,1,0,-1,1,...,1,1,-1,1,0,1,1,1,1,-1
3,1,-1,1,1,1,-1,0,1,1,1,...,1,1,-1,1,0,-1,1,0,1,-1
4,1,-1,1,1,1,-1,-1,1,-1,1,...,1,1,-1,1,1,-1,1,0,1,1


In [5]:
# Sepratating & assigning features and target columns to X & y
# y = data['Label']
# X = data.drop('Label',axis=1)
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [6]:
X.shape, y.shape

column_name=list(X.columns)

In [7]:
# Splitting the dataset into train and test sets: 80-20 split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2, random_state = 12)
X_train.shape, X_test.shape

((8844, 30), (2211, 30))

# SVM

In [8]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

svm=SVC(probability=True)
svm.fit(X_train, y_train)

y_train_svm = svm.predict(X_train)
y_test_svm =svm.predict(X_test)


#computing the accuracy of the model performance
acc_train_svm= accuracy_score(y_train,y_train_svm)
acc_test_svm = accuracy_score(y_test,y_test_svm)

print("SVM: Accuracy on training Data: {:.3f}".format(acc_train_svm*100))
print("SVM: Accuracy on test Data: {:.3f}".format(acc_test_svm*100))

print(classification_report(y_test, y_test_svm))

SVM: Accuracy on training Data: 95.375
SVM: Accuracy on test Data: 95.251
              precision    recall  f1-score   support

          -1       0.96      0.93      0.95       983
           1       0.94      0.97      0.96      1228

    accuracy                           0.95      2211
   macro avg       0.95      0.95      0.95      2211
weighted avg       0.95      0.95      0.95      2211



In [9]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, random_state=0, shuffle=True)
results = cross_val_score(svm, X_train, y_train, cv=kfold)
print("K-Fold Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

K-Fold Accuracy: 94.505% (0.681%)


In [10]:
from sklearn.feature_selection import RFECV

# initialize SVM model
svm = SVC(kernel='linear', probability=True)

# initialize RFECV
rfecv = RFECV(estimator=svm, step=1, cv=20, scoring='accuracy')

rfecv.fit(X_train, y_train)

for i in range(X.shape[1]):
    print(column_name[i],': Selected %s, Rank: %.3f' % (rfecv.support_[i], rfecv.ranking_[i]))

0 : Selected True, Rank: 1.000
1 : Selected False, Rank: 2.000
2 : Selected True, Rank: 1.000
3 : Selected True, Rank: 1.000
4 : Selected True, Rank: 1.000
5 : Selected True, Rank: 1.000
6 : Selected True, Rank: 1.000
7 : Selected True, Rank: 1.000
8 : Selected False, Rank: 5.000
9 : Selected True, Rank: 1.000
10 : Selected True, Rank: 1.000
11 : Selected True, Rank: 1.000
12 : Selected True, Rank: 1.000
13 : Selected True, Rank: 1.000
14 : Selected True, Rank: 1.000
15 : Selected True, Rank: 1.000
16 : Selected True, Rank: 1.000
17 : Selected True, Rank: 1.000
18 : Selected True, Rank: 1.000
19 : Selected False, Rank: 6.000
20 : Selected False, Rank: 3.000
21 : Selected True, Rank: 1.000
22 : Selected False, Rank: 4.000
23 : Selected True, Rank: 1.000
24 : Selected True, Rank: 1.000
25 : Selected True, Rank: 1.000
26 : Selected True, Rank: 1.000
27 : Selected True, Rank: 1.000
28 : Selected True, Rank: 1.000
29 : Selected True, Rank: 1.000


In [11]:
X_train.columns[rfecv.support_]

Int64Index([ 0,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            21, 23, 24, 25, 26, 27, 28, 29],
           dtype='int64')

In [12]:
print('Optimal number of features: {}'.format(rfecv.n_features_))

Optimal number of features: 25


In [13]:
feature_importance = list(zip(column_name, rfecv.support_))
new_features = []
for key,value in enumerate(feature_importance):
    if(value[1]) == True:
        new_features.append(value[0])
        
print(new_features)

[0, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 23, 24, 25, 26, 27, 28, 29]


In [14]:
X_new = data[new_features]
X_new.head()

X_train_new, X_test_new, y_train, y_test = train_test_split(X_new, y, 
                                                    test_size = 0.2, random_state = 12)

In [15]:
svm = SVC(random_state=0, probability=True)

svm.fit(X_train_new,y_train)

y_train_svm = svm.predict(X_train_new)
y_test_svm =svm.predict(X_test_new)


#computing the accuracy of the model performance
acc_train_svm= accuracy_score(y_train,y_train_svm)
acc_test_svm = accuracy_score(y_test,y_test_svm)

# print("SVM: Accuracy on training Data: {:.3f}".format(acc_train_svm))
print("SVM: Feature Selection Accuracy: {:.3f}".format(acc_test_svm*100))

print(classification_report(y_test, y_test_svm))

SVM: Feature Selection Accuracy: 95.070
              precision    recall  f1-score   support

          -1       0.96      0.93      0.94       983
           1       0.94      0.97      0.96      1228

    accuracy                           0.95      2211
   macro avg       0.95      0.95      0.95      2211
weighted avg       0.95      0.95      0.95      2211



In [16]:
from sklearn.model_selection import GridSearchCV, cross_val_score
param_grid = {
         'C':[0.1,0.5,1],
         'gamma':[0.1,0.5,1],
         'kernel':['rbf','linear']
        }
grid = GridSearchCV(estimator = svm, param_grid = param_grid , scoring = 'accuracy', verbose = 1, n_jobs = -1, cv = 20)
grid.fit(X_train_new,y_train)
print("Best Score:" + str(grid.best_score_))
print("Best Parameters: " + str(grid.best_params_))
best_parameters = grid.best_params_
print(best_parameters)

Fitting 20 folds for each of 18 candidates, totalling 360 fits
Best Score:0.9551099047015923
Best Parameters: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


In [17]:
svm = SVC(**best_parameters, random_state=0, probability=True)

svm.fit(X_train_new,y_train)

y_train_svm = svm.predict(X_train_new)
y_test_svm =svm.predict(X_test_new)


#computing the accuracy of the model performance
acc_train_svm= accuracy_score(y_train,y_train_svm)
acc_test_svm = accuracy_score(y_test,y_test_svm)

# print("SVM: Accuracy on training Data: {:.3f}".format(acc_train_svm))
print("SVM: Hyper Parameter Tuning Accuracy:{:.3f}".format(acc_test_svm*100.0))

print(classification_report(y_test, y_test_svm))

SVM: Hyper Parameter Tuning Accuracy:95.975
              precision    recall  f1-score   support

          -1       0.97      0.94      0.95       983
           1       0.95      0.97      0.96      1228

    accuracy                           0.96      2211
   macro avg       0.96      0.96      0.96      2211
weighted avg       0.96      0.96      0.96      2211



In [None]:
# Dump Model
import pickle

with open('svm.pkl', 'wb') as file:
    pickle.dump(svm, file)