In [1]:
#importing basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the URL feature table from disk, then discard the 'Domain' column;
# everything else (the feature columns plus 'Label') is kept.
# Alternative source that was tried earlier:
# data = pd.read_csv('Training Dataset.arff', header=None, comment='@')
raw_urls = pd.read_csv("urldata.csv")
data = raw_urls.drop(columns=['Domain']).copy()

In [3]:
# Per-column count of missing values (isna is the same check as isnull)
data.isna().sum()

Have_IP          0
Have_At          0
URL_Length       0
URL_Depth        0
Redirection      0
https_Domain     0
TinyURL          0
Prefix/Suffix    0
DNS_Record       0
Web_Traffic      0
Domain_Age       0
Domain_End       0
iFrame           0
Mouse_Over       0
Right_Click      0
Web_Forwards     0
Google_Index     0
count_dot        0
count_www        0
count_per        0
count_ques       0
count_hyphen     0
count_equal      0
Label            0
dtype: int64

In [4]:
# Shuffle the rows so that any ordering present in the CSV does not carry over
# into the train/test split. The shuffle is seeded: the split, the models and
# the K-fold in the following cells are all seeded already, so an unseeded
# shuffle here would make every downstream number change between
# Restart & Run All executions.
data = data.sample(frac=1, random_state=0).reset_index(drop=True)
data.head()

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,...,Right_Click,Web_Forwards,Google_Index,count_dot,count_www,count_per,count_ques,count_hyphen,count_equal,Label
0,0,0,1,2,0,0,0,0,0,0,...,1,0,1,1,0,0,0,8,0,0
1,0,0,0,0,0,0,0,1,0,1,...,1,1,1,1,0,0,0,1,0,1
2,0,0,1,1,0,0,0,0,0,1,...,1,0,1,2,0,0,0,12,0,0
3,0,0,0,0,0,0,0,1,0,1,...,1,0,1,2,0,0,0,1,0,1
4,0,0,1,3,0,0,0,0,0,0,...,1,0,1,1,0,16,0,0,0,0


In [5]:
# Separating the target column from the features and assigning them to y and X
y = data['Label']
X = data.drop(columns='Label')

# Feature names in column order; used later to label the RFECV results
column_name = X.columns.tolist()

# Show both shapes as the cell output (a bare expression is only displayed
# when it is the last statement of the cell)
X.shape, y.shape


In [6]:
# Splitting the dataset into train and test sets: 80-20 split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=12)

# Row/column counts of the two feature partitions
X_train.shape, X_test.shape

((8000, 23), (2000, 23))

In [7]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import warnings

# Silences every warning from this point on in the session
warnings.filterwarnings('ignore')

# Baseline: XGBoost with only the seed set, trained on all feature columns
xg = XGBClassifier(random_state=0)
xg.fit(X_train, y_train)

# Predictions on the rows the model was fitted on and on the held-out rows
y_train_xg, y_test_xg = xg.predict(X_train), xg.predict(X_test)

# Fraction of correct predictions on each partition
acc_train_xg = accuracy_score(y_train, y_train_xg)
acc_test_xg = accuracy_score(y_test, y_test_xg)

# Report both accuracies as percentages, then the per-class test metrics
for report_line, accuracy in (
    ("XGBoost Accuracy on training Data: {:.3f}", acc_train_xg),
    ("XGBoost Accuracy on test Data: {:.3f}", acc_test_xg),
):
    print(report_line.format(accuracy*100))

print(classification_report(y_test, y_test_xg))

XGBoost Accuracy on training Data: 99.125
XGBoost Accuracy on test Data: 98.600
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1014
           1       0.98      0.99      0.99       986

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [8]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold cross-validation of the current model on the training partition,
# using shuffled folds with a fixed seed
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
results = cross_val_score(xg, X_train, y_train, cv=kfold)

# Mean and standard deviation of the fold scores, shown as percentages
fold_summary = (results.mean()*100.0, results.std()*100.0)
print("K-Fold Accuracy: %.3f%% (%.3f%%)" % fold_summary)

K-Fold Accuracy: 98.125% (0.244%)


In [9]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 10-fold cross-validation: one feature is
# removed per iteration (step=1) and candidate subsets are scored by accuracy
rfecv = RFECV(XGBClassifier(random_state=0), step=1, cv=10, scoring='accuracy')
rfecv.fit(X_train, y_train)

# One line per feature: whether it was kept and its elimination rank
# (selected features have rank 1)
for feature, selected, rank in zip(column_name, rfecv.support_, rfecv.ranking_):
    print(feature, ': Selected %s, Rank: %.3f' % (selected, rank))


Have_IP : Selected False, Rank: 6.000
Have_At : Selected True, Rank: 1.000
URL_Length : Selected True, Rank: 1.000
URL_Depth : Selected True, Rank: 1.000
Redirection : Selected False, Rank: 2.000
https_Domain : Selected False, Rank: 3.000
TinyURL : Selected True, Rank: 1.000
Prefix/Suffix : Selected True, Rank: 1.000
DNS_Record : Selected True, Rank: 1.000
Web_Traffic : Selected True, Rank: 1.000
Domain_Age : Selected True, Rank: 1.000
Domain_End : Selected True, Rank: 1.000
iFrame : Selected True, Rank: 1.000
Mouse_Over : Selected True, Rank: 1.000
Right_Click : Selected False, Rank: 4.000
Web_Forwards : Selected True, Rank: 1.000
Google_Index : Selected False, Rank: 5.000
count_dot : Selected True, Rank: 1.000
count_www : Selected True, Rank: 1.000
count_per : Selected True, Rank: 1.000
count_ques : Selected True, Rank: 1.000
count_hyphen : Selected True, Rank: 1.000
count_equal : Selected True, Rank: 1.000


In [10]:
# Names of the training columns that RFECV kept
X_train.loc[:, rfecv.support_].columns

Index(['Have_At', 'URL_Length', 'URL_Depth', 'TinyURL', 'Prefix/Suffix',
       'DNS_Record', 'Web_Traffic', 'Domain_Age', 'Domain_End', 'iFrame',
       'Mouse_Over', 'Web_Forwards', 'count_dot', 'count_www', 'count_per',
       'count_ques', 'count_hyphen', 'count_equal'],
      dtype='object')

In [11]:
# Number of features RFECV ended up selecting
optimal_msg = 'Optimal number of features: {}'
print(optimal_msg.format(rfecv.n_features_))

Optimal number of features: 18


In [12]:
# Pair every feature name with its RFECV keep/drop flag. Despite the variable
# name, these are boolean selection flags, not importance scores.
feature_importance = list(zip(column_name, rfecv.support_))

# Keep only the names whose flag is set (truthiness check instead of `== True`,
# and no unused enumerate index)
new_features = [name for name, selected in feature_importance if selected]

print(new_features)

['Have_At', 'URL_Length', 'URL_Depth', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over', 'Web_Forwards', 'count_dot', 'count_www', 'count_per', 'count_ques', 'count_hyphen', 'count_equal']


In [13]:
# Full dataset restricted to the features RFECV selected
X_new = data[new_features]

# Reuse the existing train/test partitions instead of splitting a second time:
# this guarantees the reduced sets contain exactly the rows RFECV was fitted on
# (train) and never saw (test), without relying on repeating the same seed.
# y_train and y_test from the original split remain valid because the rows are
# unchanged.
X_train_new = X_train[new_features]
X_test_new = X_test[new_features]

# Preview as the cell output (only the last expression of a cell is displayed)
X_new.head()

In [14]:
# Retrain XGBoost (seed only, otherwise default settings) on the selected features
xg = XGBClassifier(random_state=0)
xg.fit(X_train_new, y_train)

# Predictions for both partitions
y_train_xg, y_test_xg = xg.predict(X_train_new), xg.predict(X_test_new)

# Accuracy on each partition (only the test figure is printed below)
acc_train_xg = accuracy_score(y_train, y_train_xg)
acc_test_xg = accuracy_score(y_test, y_test_xg)

selection_msg = "Feature Selection Accuracy: {:.3f}"
print(selection_msg.format(acc_test_xg*100))

print(classification_report(y_test, y_test_xg))

Feature Selection Accuracy: 98.550
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1014
           1       0.98      0.99      0.99       986

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [15]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Candidate values for the exhaustive search: 4 x 3 x 5 = 60 combinations
param_grid = {
    'max_depth': [5, 10, 15, 20],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
}

# 10-fold cross-validated grid search over the selected-feature training set,
# scored by accuracy and run with n_jobs=-1
grid = GridSearchCV(
    estimator=xg,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    cv=10,
)
grid.fit(X_train_new, y_train)

# Best mean cross-validated score and the parameter combination behind it
best_parameters = grid.best_params_
print("Best Score:" + str(grid.best_score_))
print("Best Parameters: " + str(grid.best_params_))
print(best_parameters)

Fitting 10 folds for each of 60 candidates, totalling 600 fits
Best Score:0.9818749999999999
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}
{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}


In [16]:
# Refit XGBoost on the selected features using the best grid-search parameters
xg = XGBClassifier(**best_parameters, random_state=0)
xg.fit(X_train_new, y_train)

# Predictions for both partitions
y_train_xg, y_test_xg = xg.predict(X_train_new), xg.predict(X_test_new)

# Accuracy on each partition (only the test figure is printed below)
acc_train_xg = accuracy_score(y_train, y_train_xg)
acc_test_xg = accuracy_score(y_test, y_test_xg)

tuned_msg = "Hyper Parameter Accuracy {:.3f}"
print(tuned_msg.format(acc_test_xg*100))

print(classification_report(y_test, y_test_xg))

Hyper Parameter Accuracy 98.550
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1014
           1       0.98      0.99      0.99       986

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [17]:
import pickle

# Serialize the current `xg` model to 'xg.pkl' so it can be reloaded later
# without retraining
with open('xg.pkl', 'wb') as model_file:
    pickle.dump(xg, model_file)