In [1]:
import numpy as np
import pandas as pd

import autokeras as ak
from sklearn.model_selection import train_test_split

In [2]:
# Assemble the feature frame and label vector from four CSV files.
# Each entry is (csv path, class label); the legitimate files are labelled 0
# and the phishing files are labelled 1.
labelled_sources = [
    ('dataset/url_legitimates.csv', 0),
    ('dataset/domain_legitimates.csv', 0),
    ('dataset/url_phishings.csv', 1),
    ('dataset/domain_phishings.csv', 1),
]

frames, labels = [], []
for csv_path, label in labelled_sources:
    # Drop rows with missing values first, then cap each source at 40000 rows.
    frame = pd.read_csv(csv_path).dropna()[:40000]
    frames.append(frame)
    # One label per surviving row. Builtin `int` replaces the `np.int` alias,
    # which was removed from NumPy (it was only ever an alias for `int`).
    labels.append(np.full(len(frame), label, dtype=int))

# Rows keep source order; both objects end up with a fresh 0..n-1 index.
X_train = pd.concat(frames).reset_index(drop=True)
y_train = pd.Series(np.concatenate(labels), dtype=int)

In [3]:
# Columns to cast to integers before the frame is converted to a NumPy array
# (presumably boolean flags in the CSVs, going by the variable name -- confirm
# against the data).
bool_cols = ['domain_is_ip', 'path_percent20_in', 'path_single_char_dir_in', 'path_upper_dir_in']
# Builtin `int` replaces the `np.int` alias, which was removed from NumPy;
# casting all listed columns at once avoids the per-column loop.
X_train[bool_cols] = X_train[bool_cols].astype(int)

In [4]:
# Hold out a test set (scikit-learn's default split size is used).
# A fixed random_state makes the shuffle, and therefore every downstream
# metric, reproducible across kernel restarts.
# NOTE: this rebinds X_train / y_train from pandas objects to NumPy arrays, so
# the cell cannot be re-run without first re-running the cells above.
X_train, X_test, y_train, y_test = train_test_split(
    X_train.to_numpy(),
    y_train.to_numpy(),
    random_state=42,
)

In [5]:
def text_to_int_sequence_classification(X_train, X_test, y_train, y_test):
    """Build an AutoKeras text classifier based on integer token sequences.

    The graph is wired as:
    TextInput -> TextToIntSequence -> Embedding(max_features=20001)
    -> ConvBlock(separable=True) -> SpatialReduction -> DenseBlock
    -> ClassificationHead.

    Args:
        X_train, X_test, y_train, y_test: accepted but not used here; the
            function only constructs the model and does not fit or evaluate it.

    Returns:
        An ``ak.AutoModel`` connecting the text input to the classification head.
    """
    text_in = ak.TextInput()

    # Each stage consumes the node produced by the previous one.
    token_ids = ak.TextToIntSequence()(text_in)
    embedded = ak.Embedding(max_features=20001)(token_ids)
    conv_features = ak.ConvBlock(separable=True)(embedded)
    reduced = ak.SpatialReduction()(conv_features)
    dense_features = ak.DenseBlock()(reduced)
    class_head = ak.ClassificationHead()(dense_features)

    return ak.AutoModel(inputs=text_in, outputs=class_head)

def text_to_ngram_vector_classicfication(X_train, X_test, y_train, y_test):
    """Build an AutoKeras text classifier based on n-gram vectors.

    The graph is wired as:
    TextInput -> TextToNgramVector -> DenseBlock -> ClassificationHead.

    Args:
        X_train, X_test, y_train, y_test: accepted but not used here; the
            function only constructs the model and does not fit or evaluate it.

    Returns:
        An ``ak.AutoModel`` connecting the text input to the classification head.
    """
    text_in = ak.TextInput()

    ngram_vector = ak.TextToNgramVector()(text_in)
    # No convolutional stage: the n-gram vector feeds the dense block directly
    # (a separable ConvBlock at this point is disabled).
    dense_features = ak.DenseBlock()(ngram_vector)
    class_head = ak.ClassificationHead()(dense_features)

    return ak.AutoModel(inputs=text_in, outputs=class_head)

In [6]:
# Run an AutoKeras structured-data search with max_trials=10 on the training
# split. The two text-model builder functions defined earlier are not used here.
clf = ak.StructuredDataClassifier(max_trials=10)
# No epoch count is passed; the recorded log below shows "Epoch n/1000" and the
# run ends in a KeyboardInterrupt, so the evaluate call never ran in that session.
clf.fit(X_train, y_train, verbose=2)

# Score the fitted classifier on the held-out split; as the last expression in
# the cell, its return value becomes the cell output.
clf.evaluate(X_test, y_test)

Train for 3000 steps, validate for 750 steps
Epoch 1/1000
3000/3000 - 34s - loss: 1.6300 - accuracy: 0.6257 - val_loss: 0.6238 - val_accuracy: 0.7150
Epoch 2/1000
3000/3000 - 32s - loss: 0.7244 - accuracy: 0.6717 - val_loss: 0.5960 - val_accuracy: 0.7165
Epoch 3/1000
3000/3000 - 31s - loss: 0.6545 - accuracy: 0.6863 - val_loss: 0.6335 - val_accuracy: 0.7039
Epoch 4/1000
3000/3000 - 32s - loss: 0.6139 - accuracy: 0.6968 - val_loss: 0.5422 - val_accuracy: 0.7243
Epoch 5/1000
3000/3000 - 32s - loss: 0.5783 - accuracy: 0.7146 - val_loss: 0.5599 - val_accuracy: 0.7289
Epoch 6/1000
3000/3000 - 32s - loss: 0.5477 - accuracy: 0.7312 - val_loss: 0.6625 - val_accuracy: 0.6800
Epoch 7/1000
3000/3000 - 33s - loss: 0.5248 - accuracy: 0.7427 - val_loss: 0.5125 - val_accuracy: 0.7512
Epoch 8/1000
3000/3000 - 31s - loss: 0.5120 - accuracy: 0.7483 - val_loss: 0.5085 - val_accuracy: 0.7396
Epoch 9/1000
3000/3000 - 32s - loss: 0.4967 - accuracy: 0.7566 - val_loss: 0.4767 - val_accuracy: 0.7729
Epoch 10/1

Train for 3000 steps, validate for 750 steps
Epoch 1/1000
3000/3000 - 34s - loss: 52.5221 - accuracy: 0.4778 - val_loss: 42.3346 - val_accuracy: 0.4904
Epoch 2/1000
3000/3000 - 32s - loss: 34.0202 - accuracy: 0.5047 - val_loss: 28.9405 - val_accuracy: 0.5139
Epoch 3/1000
3000/3000 - 32s - loss: 24.2482 - accuracy: 0.5205 - val_loss: 22.1653 - val_accuracy: 0.5200
Epoch 4/1000
3000/3000 - 32s - loss: 19.3034 - accuracy: 0.5280 - val_loss: 18.4052 - val_accuracy: 0.5266
Epoch 5/1000
3000/3000 - 31s - loss: 16.2562 - accuracy: 0.5307 - val_loss: 15.8612 - val_accuracy: 0.5256
Epoch 6/1000
3000/3000 - 32s - loss: 14.0903 - accuracy: 0.5290 - val_loss: 13.9117 - val_accuracy: 0.5280
Epoch 7/1000
3000/3000 - 32s - loss: 12.3878 - accuracy: 0.5366 - val_loss: 12.3503 - val_accuracy: 0.5403
Epoch 8/1000
3000/3000 - 32s - loss: 11.0462 - accuracy: 0.5476 - val_loss: 11.1202 - val_accuracy: 0.5487
Epoch 9/1000
3000/3000 - 32s - loss: 10.0056 - accuracy: 0.5549 - val_loss: 10.1625 - val_accuracy:

KeyboardInterrupt: 