In [1]:
#importing basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('Training Dataset.arff', header=None, comment='@')
# data = pd.read_csv("urldata.csv")
# data = data.drop(['Domain'], axis = 1).copy()

In [3]:
data.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
dtype: int64

In [4]:
# shuffling the rows in the dataset so that when splitting the train and test set are equally distributed
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,1,-1,1,1,1,-1,0,0,1,1,...,1,1,-1,1,-1,-1,1,0,1,-1
1,1,-1,1,1,1,-1,0,1,1,1,...,1,1,-1,1,-1,1,1,0,1,-1
2,-1,-1,1,1,1,-1,-1,-1,1,1,...,1,1,1,1,1,-1,1,1,1,-1
3,-1,-1,1,1,1,-1,0,1,-1,1,...,1,1,-1,1,0,-1,1,0,1,-1
4,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,...,-1,-1,-1,-1,-1,1,-1,1,1,-1


In [5]:
# Sepratating & assigning features and target columns to X & y
# y = data['Label']
# X = data.drop('Label',axis=1)
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [6]:
X.shape, y.shape

column_name=list(X.columns)

In [7]:
# Splitting the dataset into train and test sets: 80-20 split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.2, random_state = 12)
X_train.shape, X_test.shape

((8844, 30), (2211, 30))

# Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

dt=DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

y_train_dt = dt.predict(X_train)
y_test_dt =dt.predict(X_test)


#computing the accuracy of the model performance
acc_train_dt= accuracy_score(y_train,y_train_dt)
acc_test_dt = accuracy_score(y_test,y_test_dt)

print("Decision Tree: Accuracy on training Data: {:.3f}".format(acc_train_dt*100.0))
print("Decision Tree: Accuracy on test Data: {:.3f}".format(acc_test_dt*100.0))

print(classification_report(y_test, y_test_dt))

Decision Tree: Accuracy on training Data: 99.050
Decision Tree: Accuracy on test Data: 97.060
              precision    recall  f1-score   support

          -1       0.97      0.97      0.97       989
           1       0.97      0.97      0.97      1222

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211



In [9]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=5, random_state=0, shuffle=True)
results = cross_val_score(dt, X_train, y_train, cv=kfold)
print("Decision Tree K-Fold Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Decision Tree K-Fold Accuracy: 96.190% (0.389%)


In [10]:
from sklearn.feature_selection import RFECV

## create ranking among all features by selecting only one
rfecv = RFECV(estimator=dt, step=1, cv=20, scoring='accuracy')
rfecv.fit(X_train, y_train)

for i in range(X.shape[1]):
    print(column_name[i],': Selected %s, Rank: %.3f' % (rfecv.support_[i], rfecv.ranking_[i]))


0 : Selected True, Rank: 1.000
1 : Selected True, Rank: 1.000
2 : Selected True, Rank: 1.000
3 : Selected True, Rank: 1.000
4 : Selected False, Rank: 2.000
5 : Selected True, Rank: 1.000
6 : Selected True, Rank: 1.000
7 : Selected True, Rank: 1.000
8 : Selected True, Rank: 1.000
9 : Selected False, Rank: 3.000
10 : Selected False, Rank: 4.000
11 : Selected True, Rank: 1.000
12 : Selected True, Rank: 1.000
13 : Selected True, Rank: 1.000
14 : Selected True, Rank: 1.000
15 : Selected True, Rank: 1.000
16 : Selected True, Rank: 1.000
17 : Selected True, Rank: 1.000
18 : Selected True, Rank: 1.000
19 : Selected True, Rank: 1.000
20 : Selected True, Rank: 1.000
21 : Selected True, Rank: 1.000
22 : Selected True, Rank: 1.000
23 : Selected True, Rank: 1.000
24 : Selected True, Rank: 1.000
25 : Selected True, Rank: 1.000
26 : Selected True, Rank: 1.000
27 : Selected True, Rank: 1.000
28 : Selected True, Rank: 1.000
29 : Selected True, Rank: 1.000


In [11]:
X_train.columns[rfecv.support_]

Int64Index([ 0,  1,  2,  3,  5,  6,  7,  8, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
           dtype='int64')

In [12]:
print('Decision Tree Optimal number of features: {}'.format(rfecv.n_features_))

Decision Tree Optimal number of features: 27


In [13]:
feature_importance = list(zip(column_name, rfecv.support_))
new_features = []
for key,value in enumerate(feature_importance):
    if(value[1]) == True:
        new_features.append(value[0])
        
print(new_features)

[0, 1, 2, 3, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


In [14]:
X_new = data[new_features]
X_new.head()

X_train_new, X_test_new, y_train, y_test = train_test_split(X_new, y, 
                                                    test_size = 0.2, random_state = 12)

In [15]:
from sklearn.model_selection import GridSearchCV, cross_val_score
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5,10,15,20,25],
    'min_samples_split': [2, 3],
    'max_features': ['auto', 'sqrt', 'log2'],
    }
grid = GridSearchCV(estimator = dt, param_grid = param_grid , scoring = 'accuracy', verbose = 1, n_jobs = -1, cv =20)
grid.fit(X_train_new,y_train)
print("Best Score:" + str(grid.best_score_))
print("Best Parameters: " + str(grid.best_params_))
best_parameters = grid.best_params_
print(best_parameters)

Fitting 20 folds for each of 60 candidates, totalling 1200 fits
Best Score:0.9637082111886253
Best Parameters: {'criterion': 'entropy', 'max_depth': 25, 'max_features': 'auto', 'min_samples_split': 2}
{'criterion': 'entropy', 'max_depth': 25, 'max_features': 'auto', 'min_samples_split': 2}


In [16]:
dt = DecisionTreeClassifier(random_state=0)

dt.fit(X_train_new,y_train)

y_train_dt = dt.predict(X_train_new)
y_test_dt =dt.predict(X_test_new)


#computing the accuracy of the model performance
acc_train_dt= accuracy_score(y_train,y_train_dt)
acc_test_dt = accuracy_score(y_test,y_test_dt)

# print("Decision Tree: Accuracy on training Data: {:.3f}".format(acc_train_dt))
print("Decsision Tree Feature Selection Accuracy: {:.3f}".format(acc_test_dt*100.0))

print(classification_report(y_test, y_test_dt))

Decsision Tree Feature Selection Accuracy: 96.834
              precision    recall  f1-score   support

          -1       0.97      0.96      0.96       989
           1       0.97      0.97      0.97      1222

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211



In [17]:
dt = DecisionTreeClassifier(**best_parameters, random_state=0)

dt.fit(X_train_new,y_train)

y_train_dt = dt.predict(X_train_new)
y_test_dt =dt.predict(X_test_new)


#computing the accuracy of the model performance
acc_train_dt= accuracy_score(y_train,y_train_dt)
acc_test_dt = accuracy_score(y_test,y_test_dt)

# print("Decision Tree: Accuracy on training Data: {:.3f}".format(acc_train_dt))
print("Decision Tree: Hyper Parameter Tuning Accuracy:{:.3f}".format(acc_test_dt*100.0))


print(classification_report(y_test, y_test_dt))

Decision Tree: Hyper Parameter Tuning Accuracy:96.834
              precision    recall  f1-score   support

          -1       0.97      0.96      0.96       989
           1       0.97      0.98      0.97      1222

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211



In [None]:
# Dump Model
import pickle

with open('dt.pkl', 'wb') as file:
    pickle.dump(dt, file)