In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.ensemble import StackingClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

In [2]:
data = pd.read_csv('URL-Data.csv')

In [3]:
data.head()

Unnamed: 0,url,length_url,ip,nb_dots,nb_hyphens,nb_at,nb_slash,nb_dslash,https_token,ratio_digits_url,...,right_clic,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,https://www.cranepi.com/en,26,0,2,0,0,3,0,0,0.0,...,0,1,0,1242,2409,1324300,0,0,3,0
1,http://www.ijo.in,17,0,2,0,0,2,0,1,0.0,...,0,1,0,214,-1,219997,0,0,5,0
2,https://www.outdoorxl.be/,25,0,2,0,0,3,0,0,0.0,...,0,0,1,0,5687,0,0,0,1,0
3,http://www.wikiwand.com/en/MultiMediaCard,41,0,2,0,0,4,0,1,0.0,...,0,1,0,165,2391,1652,0,0,6,0
4,https://login-outlook-office365.el.r.appspot.com/,49,0,4,2,0,3,0,0,0.061224,...,0,1,0,217,5627,0,0,1,5,1


In [4]:
data.columns

Index(['url', 'length_url', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_slash',
       'nb_dslash', 'https_token', 'ratio_digits_url', 'prefix_suffix',
       'shortening_service', 'nb_hyperlinks', 'iframe', 'right_clic',
       'domain_with_copyright', 'whois_registered_domain',
       'domain_registration_length', 'domain_age', 'web_traffic', 'dns_record',
       'google_index', 'page_rank', 'status'],
      dtype='object')

In [5]:
df = data.drop(['url'], axis = 1).copy()
df.head()

Unnamed: 0,length_url,ip,nb_dots,nb_hyphens,nb_at,nb_slash,nb_dslash,https_token,ratio_digits_url,prefix_suffix,...,right_clic,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,26,0,2,0,0,3,0,0,0.0,0,...,0,1,0,1242,2409,1324300,0,0,3,0
1,17,0,2,0,0,2,0,1,0.0,0,...,0,1,0,214,-1,219997,0,0,5,0
2,25,0,2,0,0,3,0,0,0.0,0,...,0,0,1,0,5687,0,0,0,1,0
3,41,0,2,0,0,4,0,1,0.0,0,...,0,1,0,165,2391,1652,0,0,6,0
4,49,0,4,2,0,3,0,0,0.061224,0,...,0,1,0,217,5627,0,0,1,5,1


In [6]:
X = df.drop('status', axis=1)
Y = df['status']
X.shape, Y.shape

((11430, 22), (11430,))

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 0)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((10287, 22), (1143, 22), (10287,), (1143,))

In [8]:
knn = KNeighborsClassifier(n_neighbors=7, metric='euclidean')
rf = RandomForestClassifier(max_depth=5)
xgb = XGBClassifier(learning_rate=0.3,max_depth=7)

models = [('knn', knn), ('rf', rf), ('xgb', xgb)]

In [9]:
stk = StackingClassifier(estimators=models, final_estimator=SVC(kernel='rbf'))

In [10]:
knn.fit(X_train, Y_train)

In [11]:
rf.fit(X_train, Y_train)

In [12]:
xgb.fit(X_train, Y_train)

In [13]:
stk.fit(X_train, Y_train)

In [14]:
stk_pred = stk.predict(X_test)

In [15]:
acc = accuracy_score(Y_test, stk_pred)
prec = precision_score(Y_test, stk_pred)
f1 = f1_score(Y_test, stk_pred)
rec = recall_score(Y_test, stk_pred)

print("Accuracy of Stacking Classifier: "+ "%.2f" % (acc*100))
print("Precision of Stacking Classifier: "+ "%.2f" % (prec*100))
print("F1-Score of Stacking Classifier: "+ "%.2f" % (f1*100))
print("Recall of Stacking Classifier: "+ "%.2f" % (rec*100))


Accuracy of Stacking Classifier: 97.46
Precision of Stacking Classifier: 97.46
F1-Score of Stacking Classifier: 97.38
Recall of Stacking Classifier: 97.29


In [16]:
#import pickle
#pickle.dump(stk, open("StackingClassifier.pkl", "wb"))