In [507]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.ensemble import StackingClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

In [508]:
data = pd.read_csv('Final_URL_Data.csv')

In [509]:
data.head()

Unnamed: 0,url,length_url,ip,nb_dots,nb_hyphens,nb_at,nb_slash,nb_dslash,https_token,ratio_digits_url,...,right_clic,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,0,3,0,0,3,0,1,0.0,...,0,1,0,45,-1,0,1,1,4,0
1,http://shadetreetechnology.com/V4/validation/a...,77,1,1,0,0,5,0,1,0.220779,...,0,0,0,77,5767,0,0,1,2,1
2,https://support-appleld.com.secureupdate.duila...,126,1,4,1,0,5,0,0,0.150794,...,0,0,0,14,4004,5828815,0,1,0,1
3,http://rgipt.ac.in,18,0,2,0,0,2,0,1,0.0,...,0,0,0,62,-1,107721,0,0,3,0
4,http://www.iracing.com/tracks/gateway-motorspo...,55,0,2,2,0,5,0,1,0.0,...,0,1,0,224,8175,8725,0,0,6,0


In [510]:
data.columns

Index(['url', 'length_url', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_slash',
       'nb_dslash', 'https_token', 'ratio_digits_url', 'prefix_suffix',
       'shortening_service', 'nb_hyperlinks', 'iframe', 'right_clic',
       'domain_with_copyright', 'whois_registered_domain',
       'domain_registration_length', 'domain_age', 'web_traffic', 'dns_record',
       'google_index', 'page_rank', 'status'],
      dtype='object')

In [511]:
df = data.drop(['url'], axis = 1).copy()
df.head()

Unnamed: 0,length_url,ip,nb_dots,nb_hyphens,nb_at,nb_slash,nb_dslash,https_token,ratio_digits_url,prefix_suffix,...,right_clic,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,37,0,3,0,0,3,0,1,0.0,0,...,0,1,0,45,-1,0,1,1,4,0
1,77,1,1,0,0,5,0,1,0.220779,0,...,0,0,0,77,5767,0,0,1,2,1
2,126,1,4,1,0,5,0,0,0.150794,1,...,0,0,0,14,4004,5828815,0,1,0,1
3,18,0,2,0,0,2,0,1,0.0,0,...,0,0,0,62,-1,107721,0,0,3,0
4,55,0,2,2,0,5,0,1,0.0,0,...,0,1,0,224,8175,8725,0,0,6,0


In [512]:
X = df.drop('status', axis=1)
Y = df['status']
X.shape, Y.shape

((11430, 22), (11430,))

In [513]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((9144, 22), (2286, 22), (9144,), (2286,))

In [514]:
knn = KNeighborsClassifier(n_neighbors=7, metric='euclidean')
rf = RandomForestClassifier(max_depth=5)
xgb = XGBClassifier(learning_rate=0.3,max_depth=7)

models = [('knn', knn), ('rf', rf), ('xgb', xgb)]

In [515]:
stk = StackingClassifier(estimators=models, final_estimator=SVC(kernel='rbf'))

In [516]:
knn.fit(X_train, Y_train)

In [517]:
rf.fit(X_train, Y_train)

In [518]:
xgb.fit(X_train, Y_train)

In [519]:
stk.fit(X_train, Y_train)

In [520]:
knn_pred = knn.predict(X_test)
rf_pred = rf.predict(X_test)
xgb_pred = xgb.predict(X_test)

stk_pred = stk.predict(X_test)

In [521]:
pred_list = {'KNN': knn_pred, 'RandomForest': rf_pred, 'XGBoost': xgb_pred, 'StackingClassifier': stk_pred}

In [522]:
for model, pred in pred_list.items():
    
    acc = accuracy_score(Y_test, pred)
    prec = precision_score(Y_test, pred)
    f1 = f1_score(Y_test, pred)
    rec = recall_score(Y_test, pred)

    print("Accuracy of "+model+": "+ "%.2f" % (acc*100))
    print("Precision of "+model+": "+ "%.2f" % (prec*100))
    print("F1-Score of "+model+": "+ "%.2f" % (f1))
    print("Recall of "+model+": "+ "%.2f" % (rec))
    print(" ")

Accuracy of KNN: 83.95
Precision of KNN: 83.33
F1-Score of KNN: 0.84
Recall of KNN: 0.84
 
Accuracy of RandomForest: 93.66
Precision of RandomForest: 94.02
F1-Score of RandomForest: 0.94
Recall of RandomForest: 0.93
 
Accuracy of XGBoost: 96.63
Precision of XGBoost: 96.47
F1-Score of XGBoost: 0.97
Recall of XGBoost: 0.97
 
Accuracy of StackingClassifier: 96.76
Precision of StackingClassifier: 96.81
F1-Score of StackingClassifier: 0.97
Recall of StackingClassifier: 0.97
 
