### Extract Features

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Directory that the cells below read the per-class CSV files from (and write the
# structured CSVs back to). Relative to the notebook's working directory.
data_path: Path = Path('data/merged/')

In [8]:
# Load the raw per-class URL tables, one CSV per class, from data_path.
df_benign = pd.read_csv(data_path.joinpath('benign.csv'))
df_malware = pd.read_csv(data_path.joinpath('malware.csv'))
df_phishing = pd.read_csv(data_path.joinpath('phishing.csv'))

# Display the row count of each table as a (benign, malware, phishing) tuple.
tuple(len(frame) for frame in (df_benign, df_malware, df_phishing))

(6152013, 304285, 2041031)

In [9]:
from feature_extractor import feature_extract

In [10]:
# Hand every benign URL to feature_extract together with the target name
# 'benign_structured.csv'.
# NOTE(review): later cells read data_path / 'benign_structured.csv' — confirm
# that feature_extract resolves this bare filename to that location.
feature_extract(df_benign['url'].tolist(), 'benign_structured.csv')

In [11]:
# Hand every malware URL to feature_extract together with the target name
# 'malware_structured.csv'.
# NOTE(review): later cells read data_path / 'malware_structured.csv' — confirm
# that feature_extract resolves this bare filename to that location.
feature_extract(df_malware['url'].tolist(), 'malware_structured.csv')

In [12]:
# Hand every phishing URL to feature_extract together with the target name
# 'phishing_structured.csv'.
# NOTE(review): later cells read data_path / 'phishing_structured.csv' — confirm
# that feature_extract resolves this bare filename to that location.
feature_extract(df_phishing['url'].tolist(), 'phishing_structured.csv')

### Read Features

In [5]:
# Load the extracted-feature tables, one CSV per class, from data_path.
# This rebinds df_benign / df_malware / df_phishing, replacing the raw URL tables.
# NOTE(review): the truncated warning in this cell's recorded output looks like a
# pandas DtypeWarning (mixed-type columns) — confirm and, if so, pin dtypes or
# pass low_memory=False.
df_benign = pd.read_csv(data_path.joinpath('benign_structured.csv'))
df_malware = pd.read_csv(data_path.joinpath('malware_structured.csv'))
df_phishing = pd.read_csv(data_path.joinpath('phishing_structured.csv'))

# Display the row count of each table as a (benign, malware, phishing) tuple.
tuple(len(frame) for frame in (df_benign, df_malware, df_phishing))

  interactivity=interactivity, compiler=compiler, result=result)


(6151055, 304241, 2037945)

In [11]:
# Write each feature table back to its structured CSV under data_path, without
# the index column.
# NOTE(review): no visible cell modifies these frames between the read and this
# write; presumably a cleaning step ran in between (execution counts jump from
# 5 to 11) — confirm, otherwise this rewrite is a no-op.
df_benign.to_csv(data_path.joinpath('benign_structured.csv'), index=False)
df_malware.to_csv(data_path.joinpath('malware_structured.csv'), index=False)
df_phishing.to_csv(data_path.joinpath('phishing_structured.csv'), index=False)

In [12]:
# Down-sample each class to a fixed number of rows (without replacement) using
# the same seed, so the subset is reproducible.
# Each name is rebound to its sample, so re-running this cell samples from the
# already reduced frame.
df_benign = df_benign.sample(300_000, random_state=119)
df_malware = df_malware.sample(30_000, random_state=119)
df_phishing = df_phishing.sample(100_000, random_state=119)

In [13]:
# Tag every row with its integer class id: 0 = benign, 1 = malware, 2 = phishing.
# pandas broadcasts a scalar to the whole column, so there is no need to build a
# per-row Python list and wrap it in an array first; the column is integer-typed
# even when a frame is empty.
df_benign['CLASS_LABEL'] = 0
df_malware['CLASS_LABEL'] = 1
df_phishing['CLASS_LABEL'] = 2

In [14]:
# Stack the three labelled class frames into one table with a fresh 0..n-1 index.
X_train = pd.concat(
    objs=(df_benign, df_malware, df_phishing),
    ignore_index=True,
)
# Detach the label column: pop removes it from X_train in place and returns it.
y_train = X_train.pop('CLASS_LABEL')

In [15]:
# Hold out 10% of the rows for evaluation, with a fixed seed for reproducibility.
# The classes are imbalanced (sampled at 300000 / 30000 / 100000 rows), so the
# split is stratified on the label to keep the same class proportions in the
# training and test partitions.
# X_train / y_train are rebound to the training partition, so re-running this
# cell splits the already reduced training set again.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=119,
    stratify=y_train,
)

In [16]:
# Drop the per-class frame names; the rows were already combined by pd.concat
# into the train/test tables, so these references are no longer needed.
del df_benign
del df_malware
del df_phishing

### Train Model

In [17]:
from tpot import TPOTClassifier
from tpot.config import classifier_config_dict

# Operators to remove from a copy of TPOT's default classifier configuration.
_dropped_operators = (
    'sklearn.naive_bayes.GaussianNB',
    'sklearn.naive_bayes.BernoulliNB',
    'sklearn.naive_bayes.MultinomialNB',
    'sklearn.tree.DecisionTreeClassifier',
    'sklearn.ensemble.ExtraTreesClassifier',
    'sklearn.ensemble.GradientBoostingClassifier',
    'sklearn.ensemble.RandomForestClassifier',
    'sklearn.neighbors.KNeighborsClassifier',
    'sklearn.svm.LinearSVC',
    'sklearn.linear_model.SGDClassifier',
    'sklearn.linear_model.LogisticRegression',
)

# Shallow copy, so the imported classifier_config_dict itself is not mutated.
d = dict(classifier_config_dict)
for _operator in _dropped_operators:
    # Raises KeyError if the operator is not present in the installed config.
    del d[_operator]

# NOTE(review): no later visible cell references `d` — confirm whether it was
# meant to be passed to TPOTClassifier.
d

{'penalty': ['l1', 'l2'],
 'C': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
 'dual': [True, False]}

In [18]:
from tpot import TPOTClassifier

In [19]:
# Create the export directory before the search starts, so the final export
# cannot fail on a missing folder after the optimization has already run.
Path('tpot_pipelines').mkdir(parents=True, exist_ok=True)

# Search for a pipeline over 16 generations using 8 parallel jobs. The search is
# seeded with the same value used for sampling and splitting so that a re-run
# explores the same pipelines.
# NOTE(review): the customised operator dict `d` built in an earlier cell is not
# passed here, so TPOT searches its default configuration — pass config_dict=d
# if the restriction was intended.
tpot = TPOTClassifier(generations=16, verbosity=2, n_jobs=8, random_state=119)
tpot.fit(X_train, y_train)

# Report the score of the best pipeline on the held-out rows, then write that
# pipeline out as a standalone Python script.
print(tpot.score(X_test, y_test))
tpot.export('tpot_pipelines/tpot_structured_pipeline.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=1700.0, style=ProgressStyle(d…



RuntimeError: There was an error in the TPOT optimization process. This could be because the data was not formatted properly, or because data for a regression problem was provided to the TPOTClassifier object. Please make sure you passed the data to TPOT correctly.