In [1]:
# Shell out to pip to install/upgrade imbalanced-learn before the analysis runs.
# NOTE(review): no visible cell imports `imblearn` — confirm this is still needed.
!pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /usr/local/lib/python3.6/dist-packages (0.6.2)


### Read Dataset

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Directory (relative to the notebook's working directory) holding the merged per-class CSVs.
data_path = Path('data', 'merged')

In [4]:
# Load one CSV per class. Only the `url` column is consumed later in the
# notebook (the feature matrix is built from `X_train['url']`), so restrict
# parsing to that column: this avoids materialising every other column for
# millions of rows. Row counts -- and therefore the seeded row sampling that
# follows -- are unaffected by which columns are loaded.
df_benign = pd.read_csv(data_path / 'benign.csv', usecols=['url'])
df_malware = pd.read_csv(data_path / 'malware.csv', usecols=['url'])
df_phishing = pd.read_csv(data_path / 'phishing.csv', usecols=['url'])

# Display the number of rows available per class.
len(df_benign), len(df_malware), len(df_phishing)

(6152013, 304285, 2041031)

### Sample and Split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Downsample every class to a fixed number of rows; the fixed seed makes the
# draw repeatable. Each name is rebound to its sampled subset.
df_benign = df_benign.sample(random_state=119, n=300_000)
df_malware = df_malware.sample(random_state=119, n=30_000)
df_phishing = df_phishing.sample(random_state=119, n=100_000)

In [7]:
# Attach the integer target to every row: 0 = benign, 1 = malware, 2 = phishing.
df_benign['CLASS_LABEL'] = np.full(len(df_benign), 0)
df_malware['CLASS_LABEL'] = np.full(len(df_malware), 1)
df_phishing['CLASS_LABEL'] = np.full(len(df_phishing), 2)

In [8]:
# Stack the three class frames into one table with a fresh 0..n-1 index.
X_train = pd.concat(
    [df_benign, df_malware, df_phishing],
    ignore_index=True,
)
# Separate the target column from the single input column (the raw URL text).
# Both right-hand sides are evaluated against the combined frame before rebinding.
y_train, X_train = X_train['CLASS_LABEL'], X_train['url']

In [9]:
# Hold out 10% of the rows for evaluation. The three classes were sampled at
# very different sizes (300k / 30k / 100k), so stratify on the label to keep
# the same class proportions in both the train and the test partition.
# NOTE: this cell rebinds X_train/y_train to the training partition, so it is
# not idempotent -- re-run the preceding cell before re-running this one.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=119,
    stratify=y_train,
)

In [10]:
# Drop the per-class frame names now that their rows live in the train/test
# objects, so the frames can be garbage-collected.
del df_benign
del df_malware
del df_phishing

### Extract Features

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [12]:
# TF-IDF over the URL strings, with the vocabulary capped at 500 features and
# terms seen in fewer than 2 training documents discarded.
tfidf = TfidfVectorizer(min_df=2, max_features=500)

# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer so no test information leaks into the features.
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [13]:
# Convert both feature matrices to dense arrays via `.toarray()`.
# The two conversions are independent of each other.
# NOTE: not re-runnable on its own -- once converted, the objects no longer
# expose `.toarray()`.
X_test = X_test.toarray()
X_train = X_train.toarray()

### Train Model

In [14]:
from tpot import TPOTClassifier
from tpot.config import classifier_config_dict

In [15]:
# Work on a shallow copy so the dict imported from tpot.config is not modified.
d = dict(classifier_config_dict)

# Entries removed from the search space. `.pop` without a default raises
# KeyError if a key is absent, so a changed config is noticed immediately.
for _excluded_key in (
    'sklearn.naive_bayes.GaussianNB',
    'sklearn.naive_bayes.BernoulliNB',
    'sklearn.naive_bayes.MultinomialNB',
    'sklearn.tree.DecisionTreeClassifier',
    'sklearn.ensemble.ExtraTreesClassifier',
    'sklearn.ensemble.GradientBoostingClassifier',
    'sklearn.ensemble.RandomForestClassifier',
    'sklearn.neighbors.KNeighborsClassifier',
    'sklearn.svm.LinearSVC',
    'sklearn.linear_model.SGDClassifier',
    'sklearn.linear_model.LogisticRegression',
):
    d.pop(_excluded_key)

# Show what is left in the configuration.
d

{'xgboost.XGBClassifier': {'n_estimators': [100],
  'max_depth': range(1, 11),
  'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0],
  'subsample': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
         0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]),
  'min_child_weight': range(1, 21),
  'nthread': [1]},
 'sklearn.preprocessing.Binarizer': {'threshold': array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
         0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])},
 'sklearn.decomposition.FastICA': {'tol': array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
         0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])},
 'sklearn.cluster.FeatureAgglomeration': {'linkage': ['ward',
   'complete',
   'average'],
  'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']},
 'sklearn.preprocessing.MaxAbsScaler': {},
 'sklearn.preprocessing.MinMaxScaler': {},
 'sklearn.preprocessing.Normalizer': {'

In [None]:
# Make sure the export directory exists *before* starting the search: the
# export call opens the target file for writing and would otherwise fail with
# a missing-directory error only after the full 50-generation run has finished.
export_path = Path('tpot_pipelines/tpot_structured_pipeline.py')
export_path.parent.mkdir(parents=True, exist_ok=True)

# Evolutionary pipeline search restricted to the pruned config `d`;
# the fixed random_state makes the search repeatable.
tpot = TPOTClassifier(generations=50, verbosity=2, n_jobs=16, random_state=119, config_dict=d)
tpot.fit(X_train, y_train)

# Score the best pipeline on the held-out split, then write it out as a
# standalone Python script.
print(tpot.score(X_test, y_test))
tpot.export(str(export_path))