In [1]:
import numpy as np
import pandas as pd

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

In [8]:
def get_model_scores(t):
    """Tabulate every pipeline a fitted TPOT object evaluated, best CV score first.

    Parameters
    ----------
    t : object
        Any object exposing an ``evaluated_individuals_`` mapping of
        pipeline string -> info dict (e.g. a fitted ``TPOTClassifier``).

    Returns
    -------
    pandas.DataFrame
        One row per evaluated pipeline with the columns ``cv_score``,
        ``model`` and ``model_info``, sorted by ``cv_score`` in descending
        order. ``cv_score`` is the info dict's ``'internal_cv_score'`` entry
        (missing when the key is absent), pulled out into its own column so
        the table is sortable. The index holds each pipeline's position in
        ``evaluated_individuals_``. An empty mapping yields an empty frame
        that still carries the three columns.
    """
    # Collect all rows first and build the frame once: `DataFrame.append` was
    # removed in pandas 2.0, and appending inside a loop copied the whole frame
    # on every iteration.
    records = [
        {
            'cv_score': model_info.get('internal_cv_score'),
            'model': model_name,
            'model_info': model_info,
        }
        for model_name, model_info in t.evaluated_individuals_.items()
    ]
    # Passing the columns explicitly keeps the layout stable and lets the sort
    # below succeed even when no pipelines were evaluated.
    model_scores = pd.DataFrame(records, columns=['cv_score', 'model', 'model_info'])
    return model_scores.sort_values('cv_score', ascending=False)

# URL + Domain

In [2]:
# Assemble the combined URL + domain dataset: up to ROWS_PER_SOURCE complete
# rows from each CSV, labelled 0 for the "legitimates" files and 1 for the
# "phishings" files.
ROWS_PER_SOURCE = 40000
labelled_sources = [
    ('data/url_legitimates.csv', 0),
    ('data/domain_legitimates.csv', 0),
    ('data/url_phishings.csv', 1),
    ('data/domain_phishings.csv', 1),
]

frames, labels = [], []
for csv_path, label in labelled_sources:
    # Drop rows with missing values first, then cap how many rows this file contributes.
    frame = pd.read_csv(csv_path).dropna().iloc[:ROWS_PER_SOURCE]
    frames.append(frame)
    # Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
    labels.append(np.full(len(frame), label, dtype=int))

# Features and labels are concatenated in the same source order, so row i of
# X_train always corresponds to entry i of y_train.
X_train = pd.concat(frames).reset_index(drop=True)
y_train = pd.Series(np.concatenate(labels), dtype=int)

In [3]:
# Cast the flag columns listed in `bool_cols` to integers.
bool_cols = ['domain_is_ip', 'path_percent20_in', 'path_single_char_dir_in', 'path_upper_dir_in']
# Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24;
# a single astype() with a column -> dtype mapping converts all columns at once.
X_train = X_train.astype({col: int for col in bool_cols})
X_train

Unnamed: 0,url_len,url_n_alpha,url_n_ampersand,url_n_digit,url_n_dot,url_n_equal,url_n_question_mark,url_n_semicolon,url_n_sp_char,url_n_underscore,...,query_n_digit,name_len,name_n_digit,name_rate_digit,ratio_domain_url,ratio_path_domain,ratio_path_url,ratio_query_domain,ratio_query_path,ratio_query_url
0,58,49,0,0,2,0,0,0,9,0,...,0,25,0,0.000000,1,5,1,15,35,58
1,74,55,0,9,2,0,0,0,10,0,...,0,35,2,0.057143,1,3,1,15,51,74
2,82,49,0,23,2,0,0,0,10,0,...,0,32,17,0.531250,1,15,2,15,60,82
3,111,84,0,12,3,0,0,0,12,1,...,0,26,6,0.230769,3,1,1,15,88,111
4,56,45,0,0,2,0,0,0,11,0,...,0,11,0,0.000000,1,1,2,15,34,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,28,21,0,0,2,0,0,0,7,0,...,0,0,0,0.000000,4,1,1,20,1,28
159996,32,26,0,0,2,0,0,0,6,0,...,0,0,0,0.000000,8,1,1,24,1,32
159997,45,35,0,1,5,0,0,0,9,0,...,0,0,0,0.000000,1,1,1,37,1,45
159998,41,35,0,0,2,0,0,0,6,0,...,0,0,0,0.000000,1,1,1,33,1,41


In [4]:
# Hold out a test split using train_test_split's default proportions. The fixed
# random_state makes the split, and therefore every score computed from it,
# reproducible across kernel restarts.
# NOTE: this rebinds X_train / y_train from pandas objects to NumPy arrays, so
# the cell cannot be re-run without first re-running the loading cells.
X_train, X_test, y_train, y_test = train_test_split(
    X_train.to_numpy(), y_train.to_numpy(), random_state=42
)

In [None]:
# Run the TPOT pipeline search on the URL + domain training split.
# random_state=42 matches the other TPOT runs in this notebook; without it the
# search result differs from run to run.
tpot = TPOTClassifier(generations=10, verbosity=2, n_jobs=4, random_state=42)
tpot.fit(X_train, y_train)

# Report the score on the held-out split, then export the selected pipeline.
print(tpot.score(X_test, y_test))
tpot.export('tpot_pipelines/tpot_url_domain_pipeline.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=1100.0, style=ProgressStyle(d…

Generation 1 - Current best internal CV score: 0.9240583333333333


In [6]:
# Score the fitted TPOT object on the held-out split; as the cell's last
# expression, the value is rendered as the cell output.
tpot.score(X_test, y_test)

0.927075

In [None]:
# NOTE(review): these two statements repeat the tail of the training cell
# (print the held-out score, then export the pipeline to the same path) —
# presumably kept so they can be re-run without refitting; confirm or remove.
print(tpot.score(X_test, y_test))
tpot.export('tpot_pipelines/tpot_url_domain_pipeline.py')

In [9]:
# Tabulate every pipeline TPOT evaluated (sorted by internal CV score, best
# first), persist the table to CSV, and preview the top rows.
model_scores = get_model_scores(tpot)
model_scores.to_csv('tpot_scores/tpot_url_domain_model_scores_2.csv')
model_scores.head()

Unnamed: 0,cv_score,model,model_info
525,0.926608,"XGBClassifier(input_matrix, XGBClassifier__lea...","{'generation': 'INVALID', 'mutation_count': 2,..."
571,0.924675,"XGBClassifier(input_matrix, XGBClassifier__lea...","{'generation': 'INVALID', 'mutation_count': 2,..."
349,0.924592,"XGBClassifier(input_matrix, XGBClassifier__lea...","{'generation': 'INVALID', 'mutation_count': 1,..."
595,0.924417,"XGBClassifier(input_matrix, XGBClassifier__lea...","{'generation': 'INVALID', 'mutation_count': 3,..."
449,0.924308,"XGBClassifier(VarianceThreshold(input_matrix, ...","{'generation': 'INVALID', 'mutation_count': 1,..."


# Only Domain

In [2]:
# Assemble the domain-only dataset: up to ROWS_PER_SOURCE complete rows from
# each CSV, labelled 0 for the "legitimates" file and 1 for the "phishings" file.
ROWS_PER_SOURCE = 50000
labelled_sources = [
    ('data/domain_legitimates.csv', 0),
    ('data/domain_phishings.csv', 1),
]

frames, labels = [], []
for csv_path, label in labelled_sources:
    # Drop rows with missing values first, then cap how many rows this file contributes.
    frame = pd.read_csv(csv_path).dropna().iloc[:ROWS_PER_SOURCE]
    frames.append(frame)
    # Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
    labels.append(np.full(len(frame), label, dtype=int))

# Features and labels are concatenated in the same source order, so row i of
# X_train always corresponds to entry i of y_train.
X_train = pd.concat(frames).reset_index(drop=True)
y_train = pd.Series(np.concatenate(labels), dtype=int)

In [3]:
# Cast the flag columns listed in `bool_cols` to integers.
bool_cols = ['domain_is_ip', 'path_percent20_in', 'path_single_char_dir_in', 'path_upper_dir_in']
# Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24;
# a single astype() with a column -> dtype mapping converts all columns at once.
X_train = X_train.astype({col: int for col in bool_cols})
X_train

Unnamed: 0,url_len,url_n_alpha,url_n_ampersand,url_n_digit,url_n_dot,url_n_equal,url_n_question_mark,url_n_semicolon,url_n_sp_char,url_n_underscore,...,query_n_digit,name_len,name_n_digit,name_rate_digit,ratio_domain_url,ratio_path_domain,ratio_path_url,ratio_query_domain,ratio_query_path,ratio_query_url
0,22,15,0,1,2,0,0,0,6,0,...,0,0,0,0,2,1,1,14,1,22
1,19,14,0,0,1,0,0,0,5,0,...,0,0,0,0,1,1,1,11,1,19
2,18,13,0,0,1,0,0,0,5,0,...,0,0,0,0,9,1,1,9,1,18
3,27,19,0,3,1,0,0,0,5,0,...,0,0,0,0,1,1,1,19,1,27
4,31,22,0,3,2,0,0,0,6,0,...,0,0,0,0,1,1,1,23,1,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,23,16,0,0,2,0,0,0,7,0,...,0,0,0,0,1,1,1,15,1,23
49996,35,27,0,0,4,0,0,0,8,0,...,0,0,0,0,1,1,1,27,1,35
49997,33,25,0,0,4,0,0,0,8,0,...,0,0,0,0,1,1,1,25,1,33
49998,21,13,0,2,2,0,0,0,6,0,...,0,0,0,0,1,1,1,13,1,21


In [4]:
# Hold out a test split using train_test_split's default proportions. The fixed
# random_state makes the split, and therefore every score computed from it,
# reproducible across kernel restarts.
# NOTE: this rebinds X_train / y_train from pandas objects to NumPy arrays, so
# the cell cannot be re-run without first re-running the loading cells.
X_train, X_test, y_train, y_test = train_test_split(
    X_train.to_numpy(), y_train.to_numpy(), random_state=42
)

In [5]:
# Run the TPOT pipeline search for 10 generations on the domain-only training
# split; random_state=42 fixes the seed handed to TPOT.
tpot = TPOTClassifier(generations=10, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
# Report the score on the held-out split, then export the selected pipeline.
print(tpot.score(X_test, y_test))
tpot.export('tpot_pipelines/tpot_domain_pipeline.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=1100.0, style=ProgressStyle(d…

Generation 1 - Current best internal CV score: 0.8974933333333333
Generation 2 - Current best internal CV score: 0.8981333333333333
Generation 3 - Current best internal CV score: 0.8982933333333334
Generation 4 - Current best internal CV score: 0.8982933333333334
Generation 5 - Current best internal CV score: 0.8992533333333335
Generation 6 - Current best internal CV score: 0.8992533333333335
Generation 7 - Current best internal CV score: 0.90008
Generation 8 - Current best internal CV score: 0.90008
Generation 9 - Current best internal CV score: 0.90008
Generation 10 - Current best internal CV score: 0.90008

Best pipeline: XGBClassifier(SGDClassifier(input_matrix, alpha=0.0, eta0=1.0, fit_intercept=False, l1_ratio=0.0, learning_rate=invscaling, loss=squared_hinge, penalty=elasticnet, power_t=0.5), learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=100, nthread=1, subsample=0.6500000000000001)
0.8996


In [6]:
# Tabulate every pipeline TPOT evaluated for the domain-only run (sorted by
# internal CV score, best first), persist the table to CSV, and preview the
# top rows.
model_scores = get_model_scores(tpot)
# The file name now follows this section's 'tpot_domain_*' naming; it was
# previously written under a 'url_path' name, which mislabels domain-only
# results as belonging to the URL experiment.
model_scores.to_csv('tpot_scores/tpot_domain_model_scores.csv')
model_scores.head()

Unnamed: 0,cv_score,model,model_info
986,0.900080,"XGBClassifier(SGDClassifier(input_matrix, SGDC...","{'generation': 'INVALID', 'mutation_count': 6,..."
737,0.900080,"XGBClassifier(SGDClassifier(input_matrix, SGDC...","{'generation': 'INVALID', 'mutation_count': 5,..."
1066,0.899920,"XGBClassifier(input_matrix, XGBClassifier__lea...","{'generation': 'INVALID', 'mutation_count': 5,..."
831,0.899840,GradientBoostingClassifier(ZeroCount(input_mat...,"{'generation': 'INVALID', 'mutation_count': 4,..."
734,0.899733,"GradientBoostingClassifier(input_matrix, Gradi...","{'generation': 'INVALID', 'mutation_count': 3,..."
...,...,...,...
507,-inf,"XGBClassifier(PolynomialFeatures(input_matrix,...","{'generation': 'INVALID', 'mutation_count': 5,..."
415,-inf,GradientBoostingClassifier(SGDClassifier(Polyn...,"{'generation': 'INVALID', 'mutation_count': 4,..."
483,-inf,ExtraTreesClassifier(PolynomialFeatures(input_...,"{'generation': 'INVALID', 'mutation_count': 2,..."
344,-inf,XGBClassifier(PolynomialFeatures(MultinomialNB...,"{'generation': 'INVALID', 'mutation_count': 1,..."


# Only URL

In [1]:
import numpy as np
import pandas as pd

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Assemble the URL-only dataset: up to ROWS_PER_SOURCE complete rows from each
# CSV, labelled 0 for the "legitimates" file and 1 for the "phishings" file.
ROWS_PER_SOURCE = 50000
labelled_sources = [
    ('data/url_legitimates.csv', 0),
    ('data/url_phishings.csv', 1),
]

frames, labels = [], []
for csv_path, label in labelled_sources:
    # Drop rows with missing values first, then cap how many rows this file contributes.
    frame = pd.read_csv(csv_path).dropna().iloc[:ROWS_PER_SOURCE]
    frames.append(frame)
    # Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
    labels.append(np.full(len(frame), label, dtype=int))

# Features and labels are concatenated in the same source order, so row i of
# X_train always corresponds to entry i of y_train.
X_train = pd.concat(frames).reset_index(drop=True)
y_train = pd.Series(np.concatenate(labels), dtype=int)

In [3]:
# Cast the flag columns listed in `bool_cols` to integers.
bool_cols = ['domain_is_ip', 'path_percent20_in', 'path_single_char_dir_in', 'path_upper_dir_in']
# Builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24;
# a single astype() with a column -> dtype mapping converts all columns at once.
X_train = X_train.astype({col: int for col in bool_cols})
X_train

Unnamed: 0,url_len,url_n_alpha,url_n_ampersand,url_n_digit,url_n_dot,url_n_equal,url_n_question_mark,url_n_semicolon,url_n_sp_char,url_n_underscore,...,query_n_digit,name_len,name_n_digit,name_rate_digit,ratio_domain_url,ratio_path_domain,ratio_path_url,ratio_query_domain,ratio_query_path,ratio_query_url
0,58,49,0,0,2,0,0,0,9,0,...,0,25,0,0.000000,1,5,1,15,35,58
1,74,55,0,9,2,0,0,0,10,0,...,0,35,2,0.057143,1,3,1,15,51,74
2,82,49,0,23,2,0,0,0,10,0,...,0,32,17,0.531250,1,15,2,15,60,82
3,111,84,0,12,3,0,0,0,12,1,...,0,26,6,0.230769,3,1,1,15,88,111
4,56,45,0,0,2,0,0,0,11,0,...,0,11,0,0.000000,1,1,2,15,34,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,55,44,0,1,3,0,0,0,10,0,...,0,10,1,0.100000,1,6,5,18,30,55
49996,81,70,0,0,2,0,0,0,11,0,...,0,9,0,0.000000,1,2,1,22,52,81
49997,36,30,0,0,2,0,0,0,6,0,...,0,9,0,0.000000,1,1,2,19,10,36
49998,59,49,0,0,1,0,0,0,10,0,...,0,0,0,0.000000,1,1,1,17,35,59


In [4]:
# Hold out a test split using train_test_split's default proportions. The fixed
# random_state makes the split, and therefore every score computed from it,
# reproducible across kernel restarts.
# NOTE: this rebinds X_train / y_train from pandas objects to NumPy arrays, so
# the cell cannot be re-run without first re-running the loading cells.
X_train, X_test, y_train, y_test = train_test_split(
    X_train.to_numpy(), y_train.to_numpy(), random_state=42
)

In [None]:
# Run the TPOT pipeline search for 8 generations on the URL-only training
# split, using 2 parallel jobs; random_state=42 fixes the seed handed to TPOT.
tpot = TPOTClassifier(generations=8, verbosity=2, n_jobs=2, random_state=42)
tpot.fit(X_train, y_train)
# Report the score on the held-out split, then export the selected pipeline.
print(tpot.score(X_test, y_test))
tpot.export('tpot_pipelines/tpot_url_path_pipeline.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=900.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.9577866666666667
Generation 2 - Current best internal CV score: 0.9577866666666667
Generation 3 - Current best internal CV score: 0.9577866666666667
Generation 4 - Current best internal CV score: 0.9577866666666667
Generation 5 - Current best internal CV score: 0.9584533333333333
Generation 6 - Current best internal CV score: 0.9584533333333333
Generation 7 - Current best internal CV score: 0.9615466666666667


In [8]:
# Tabulate every pipeline TPOT evaluated for the URL-only run (sorted by
# internal CV score, best first), persist the table to CSV, and preview the
# top rows.
model_scores = get_model_scores(tpot)
model_scores.to_csv('tpot_scores/tpot_url_model_scores.csv')
model_scores.head()

Unnamed: 0,cv_score,model,model_info
774,0.964987,"ExtraTreesClassifier(PCA(input_matrix, PCA__it...","{'generation': 'INVALID', 'mutation_count': 5,..."
679,0.961547,RandomForestClassifier(PCA(RobustScaler(input_...,"{'generation': 'INVALID', 'mutation_count': 3,..."
807,0.961440,ExtraTreesClassifier(PCA(RobustScaler(input_ma...,"{'generation': 'INVALID', 'mutation_count': 4,..."
857,0.960320,RandomForestClassifier(PCA(MinMaxScaler(input_...,"{'generation': 'INVALID', 'mutation_count': 4,..."
545,0.958453,"RandomForestClassifier(LinearSVC(input_matrix,...","{'generation': 'INVALID', 'mutation_count': 1,..."
...,...,...,...
531,-inf,RandomForestClassifier(GaussianNB(PCA(input_ma...,"{'generation': 'INVALID', 'mutation_count': 2,..."
783,-inf,ExtraTreesClassifier(CombineDFs(PolynomialFeat...,"{'generation': 'INVALID', 'mutation_count': 3,..."
286,-inf,"RandomForestClassifier(FastICA(input_matrix, F...","{'generation': 'INVALID', 'mutation_count': 2,..."
538,-inf,RandomForestClassifier(SelectPercentile(PCA(in...,"{'generation': 'INVALID', 'mutation_count': 3,..."
