In [67]:
import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer

from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier, plot_importance
from tpot.export_utils import set_param_recursive

In [54]:
# Select the interactive widget (ipympl) backend so the figures below render as interactive canvases.
%matplotlib widget

## Domain Only

In [2]:
# Build the domain-only dataset: legitimate rows first (label 0), phishing rows after (label 1).
# Each CSV has rows with missing values dropped and is then capped at 49000 rows.
# NOTE: the builtin `int` replaces `np.int`, an alias deprecated in NumPy 1.20 and removed in 1.24.
X_train = pd.read_csv('data/domain_legitimates.csv').dropna()[:49000]
y_train = pd.Series(np.zeros((len(X_train),)), dtype=int)

# Append the phishing rows; the number of new labels is the growth of X_train over y_train.
X_train = pd.concat([X_train, pd.read_csv('data/domain_phishings.csv').dropna()[:49000]]).reset_index(drop=True)
y_train = pd.concat([y_train, pd.Series(np.ones((len(X_train) - len(y_train),)), dtype=int)], ignore_index=True)

In [3]:
# Cast the flag columns to plain integers before training
# (presumably they are parsed as booleans from the CSV -- confirm against the data).
# The builtin `int` replaces the removed `np.int` alias.
bool_cols = ['domain_is_ip', 'path_percent20_in', 'path_single_char_dir_in', 'path_upper_dir_in']
for col in bool_cols:
    X_train[col] = X_train[col].astype(int)
X_train.tail()

Unnamed: 0,url_len,url_n_alpha,url_n_ampersand,url_n_digit,url_n_dot,url_n_equal,url_n_question_mark,url_n_semicolon,url_n_sp_char,url_n_underscore,...,query_n_digit,name_len,name_n_digit,name_rate_digit,ratio_domain_url,ratio_path_domain,ratio_path_url,ratio_query_domain,ratio_query_path,ratio_query_url
95770,42,32,0,3,2,0,0,0,7,0,...,0,0,0,0,2,1,1,34,1,42
95771,34,28,0,0,2,0,0,0,6,0,...,0,0,0,0,2,1,1,26,1,34
95772,30,24,0,0,2,0,0,0,6,0,...,0,0,0,0,2,1,1,22,1,30
95773,26,19,0,0,3,0,0,0,7,0,...,0,0,0,0,2,1,1,18,1,26
95774,34,29,0,0,1,0,0,0,5,0,...,0,0,0,0,2,1,1,26,1,34


In [4]:
# Hold out a test set (default split proportions). A fixed seed keeps the split reproducible.
# NOTE: this rebinds X_train / y_train, so re-running the cell splits the already-split
# training data again -- re-run the loading cells first.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42)

In [70]:
# Average CV score on the training set was: 0.90008
# Two-stage pipeline: an SGD classifier wrapped in TPOT's StackingEstimator
# (which forwards its predictions as an extra feature), followed by XGBoost.
exported_pipeline_domain = make_pipeline(
    StackingEstimator(
        estimator=SGDClassifier(
            alpha=0.0,
            eta0=1.0,
            fit_intercept=False,
            l1_ratio=0.0,
            learning_rate="invscaling",
            loss="squared_hinge",
            penalty="elasticnet",
            power_t=0.5,
        )
    ),
    XGBClassifier(
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=4,
        n_estimators=100,
        nthread=1,
        subsample=0.6500000000000001,
    ),
)

In [None]:
# Fit the pipeline on the training split, then predict the held-out split
# (fit returns the pipeline itself, so the two calls can be chained).
results_domain = exported_pipeline_domain.fit(X_train, y_train).predict(X_test)

In [None]:
# Pull the fitted XGBoost stage out of the pipeline
# (make_pipeline names each step after its lowercased class name).
model_domain = exported_pipeline_domain.named_steps['xgbclassifier']
# The StackingEstimator stage prepends the SGD classifier's predicted class as one extra
# leading column, so the booster saw one more feature than X_train has columns.
# Name that synthetic column explicitly; otherwise every importance label is shifted by one.
model_domain.get_booster().feature_names = ['sgd_prediction'] + X_train.columns.to_list()

In [60]:
# Plot the feature importances of the domain-only XGBoost model (height controls bar thickness).
plot_importance(model_domain, height=0.5)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x23274a74e88>

## URL Only

In [39]:
# Build the URL-only dataset: legitimate rows first (label 0), phishing rows after (label 1).
# Each CSV has rows with missing values dropped and is then capped at 99000 rows.
# NOTE: the builtin `int` replaces `np.int`, an alias deprecated in NumPy 1.20 and removed in 1.24.
X_train = pd.read_csv('data/url_legitimates.csv').dropna()[:99000]
y_train = pd.Series(np.zeros((len(X_train),)), dtype=int)

# Append the phishing rows; the number of new labels is the growth of X_train over y_train.
X_train = pd.concat([X_train, pd.read_csv('data/url_phishings.csv').dropna()[:99000]]).reset_index(drop=True)
y_train = pd.concat([y_train, pd.Series(np.ones((len(X_train) - len(y_train),)), dtype=int)], ignore_index=True)

In [40]:
# Cast the flag columns to plain integers before training
# (presumably they are parsed as booleans from the CSV -- confirm against the data).
# The builtin `int` replaces the removed `np.int` alias.
bool_cols = ['domain_is_ip', 'path_percent20_in', 'path_single_char_dir_in', 'path_upper_dir_in']
for col in bool_cols:
    X_train[col] = X_train[col].astype(int)
X_train.tail()

Unnamed: 0,url_len,url_n_alpha,url_n_ampersand,url_n_digit,url_n_dot,url_n_equal,url_n_question_mark,url_n_semicolon,url_n_sp_char,url_n_underscore,...,query_n_digit,name_len,name_n_digit,name_rate_digit,ratio_domain_url,ratio_path_domain,ratio_path_url,ratio_query_domain,ratio_query_path,ratio_query_url
199995,48,38,0,0,4,0,0,0,10,0,...,0,10,0,0.0,1,1,24,17,24,48
199996,33,26,0,1,2,0,0,0,6,0,...,0,10,1,0.1,3,1,11,15,11,33
199997,40,19,0,11,4,0,0,0,10,0,...,0,9,0,0.0,2,1,1,14,19,40
199998,80,36,0,34,1,1,0,0,9,0,...,0,0,0,0.0,5,1,2,15,58,80
199999,51,27,0,15,1,0,0,0,7,0,...,0,0,0,0.0,3,1,1,15,29,51


In [41]:
# Hold out a test set (default split proportions). A fixed seed keeps the split reproducible.
# NOTE: this rebinds X_train / y_train, so re-running the cell splits the already-split
# training data again -- re-run the loading cells first.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42)

In [42]:
# Gradient-boosted tree classifier for the URL-only feature set.
model_url = XGBClassifier(
    learning_rate=0.4,
    max_depth=10,
    min_child_weight=13,
    n_estimators=100,
    nthread=1,
    subsample=0.8,
)

In [43]:
# Train the URL-only model and predict the held-out split.
model_url.fit(X_train, y_train)
# Predict with model_url itself. The original called `model`, which is only defined in a
# later cell: a NameError on a fresh kernel, or a silent use of the wrong model with stale state.
results_url = model_url.predict(X_test)



In [44]:
# Label the booster's features with the training frame's column names so the importance plot is readable.
model_url.get_booster().feature_names = list(X_train.columns)

In [61]:
# Plot the feature importances of the URL-only XGBoost model (height controls bar thickness).
plot_importance(model_url, height=0.5)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x2327dddd688>

## URL + Domain

In [63]:
# Build the combined URL + domain dataset. Rows are stacked in this order:
#   URL legitimates (0), domain legitimates (0), URL phishings (1), domain phishings (1).
# Each CSV has rows with missing values dropped and is then capped (99000 URL / 49000 domain rows).
# After every append, the number of new labels is the growth of X_train over y_train.
# NOTE: the builtin `int` replaces `np.int`, an alias deprecated in NumPy 1.20 and removed in 1.24.
X_train = pd.read_csv('data/url_legitimates.csv').dropna()[:99000]
y_train = pd.Series(np.zeros((len(X_train),)), dtype=int)

X_train = pd.concat([X_train, pd.read_csv('data/domain_legitimates.csv').dropna()[:49000]]).reset_index(drop=True)
y_train = pd.concat([y_train, pd.Series(np.zeros((len(X_train) - len(y_train),)), dtype=int)], ignore_index=True)

X_train = pd.concat([X_train, pd.read_csv('data/url_phishings.csv').dropna()[:99000]]).reset_index(drop=True)
y_train = pd.concat([y_train, pd.Series(np.ones((len(X_train) - len(y_train),)), dtype=int)], ignore_index=True)

X_train = pd.concat([X_train, pd.read_csv('data/domain_phishings.csv').dropna()[:49000]]).reset_index(drop=True)
y_train = pd.concat([y_train, pd.Series(np.ones((len(X_train) - len(y_train),)), dtype=int)], ignore_index=True)

In [64]:
# Cast the flag columns to plain integers before training
# (presumably they are parsed as booleans from the CSV -- confirm against the data).
# The builtin `int` replaces the removed `np.int` alias.
bool_cols = ['domain_is_ip', 'path_percent20_in', 'path_single_char_dir_in', 'path_upper_dir_in']
for col in bool_cols:
    X_train[col] = X_train[col].astype(int)
X_train

Unnamed: 0,url_len,url_n_alpha,url_n_ampersand,url_n_digit,url_n_dot,url_n_equal,url_n_question_mark,url_n_semicolon,url_n_sp_char,url_n_underscore,...,query_n_digit,name_len,name_n_digit,name_rate_digit,ratio_domain_url,ratio_path_domain,ratio_path_url,ratio_query_domain,ratio_query_path,ratio_query_url
0,58,49,0,0,2,0,0,0,9,0,...,0,25,0,0.000000,1,5,1,15,35,58
1,74,55,0,9,2,0,0,0,10,0,...,0,35,2,0.057143,1,3,1,15,51,74
2,82,49,0,23,2,0,0,0,10,0,...,0,32,17,0.531250,1,15,2,15,60,82
3,111,84,0,12,3,0,0,0,12,1,...,0,26,6,0.230769,3,1,1,15,88,111
4,56,45,0,0,2,0,0,0,11,0,...,0,11,0,0.000000,1,1,2,15,34,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179995,28,21,0,0,2,0,0,0,7,0,...,0,0,0,0.000000,4,1,1,20,1,28
179996,32,26,0,0,2,0,0,0,6,0,...,0,0,0,0.000000,8,1,1,24,1,32
179997,45,35,0,1,5,0,0,0,9,0,...,0,0,0,0.000000,1,1,1,37,1,45
179998,41,35,0,0,2,0,0,0,6,0,...,0,0,0,0.000000,1,1,1,33,1,41


In [65]:
# Hold out a test set (default split proportions). A fixed seed keeps the split reproducible.
# The frames are converted to NumPy arrays first, so column names are not carried into the model.
# NOTE: this rebinds X_train / y_train to arrays, so the cell cannot be re-run without
# re-running the loading cells first (arrays have no .to_numpy()).
X_train, X_test, y_train, y_test = train_test_split(X_train.to_numpy(), y_train.to_numpy(), random_state=42)

In [81]:
# Gradient-boosted tree classifier for the combined URL + domain feature set.
model = XGBClassifier(
    learning_rate=0.4,
    max_depth=10,
    min_child_weight=13,
    n_estimators=100,
    nthread=1,
    subsample=0.8,
)

In [82]:
# Train the combined model, then predict the held-out split
# (fit returns the estimator itself, so the two calls can be chained).
results = model.fit(X_train, y_train).predict(X_test)

In [89]:
# The model was trained on bare NumPy arrays, so the column names are recovered from a CSV header.
# nrows=0 parses only the header line instead of loading the whole file for its column list.
# NOTE(review): assumes the URL and domain CSVs share the same columns in the same order -- confirm.
model.get_booster().feature_names = pd.read_csv('data/domain_legitimates.csv', nrows=0).columns.to_list()

In [90]:
# Plot the feature importances of the combined URL + domain XGBoost model (height controls bar thickness).
plot_importance(model, height=0.5)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x2327e3e3708>