In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Folder that holds the per-class CSV files read below; the path is relative,
# so it is resolved against the notebook's working directory.
data_path = Path('data/merged/')

## Read URLs

In [3]:
# Load one CSV per class from data_path, in a fixed class order.
raw_files = ['benign.csv', 'malware.csv', 'phishing.csv', 'attacked.csv', 'spam.csv']
df_benign, df_malware, df_phishing, df_attacked, df_spam = [
    pd.read_csv(data_path / file_name) for file_name in raw_files
]

# Row count per class, in the same order as the frames above.
tuple(len(frame) for frame in (df_benign, df_malware, df_phishing, df_attacked, df_spam))

(6486097, 315525, 2287232, 95308, 11921)

## Or: Read Structured Features

In [11]:
# Structured feature column names, kept here for use when needed.
# The order is significant and is grouped below by the part of the URL each name refers to.
columns = [
    # whole URL
    'url_len', 'url_n_alpha', 'url_n_ampersand', 'url_n_digit', 'url_n_dot', 'url_n_equal',
    'url_n_question_mark', 'url_n_semicolon', 'url_n_sp_char', 'url_n_underscore',
    'url_rate_digit', 'url_ratio_digit_letter',
    # domain
    'domain_is_ip', 'domain_len', 'domain_n_at_sign', 'domain_n_digit', 'domain_n_hyphen',
    'domain_n_nonalnum',
    # primary domain
    'primary_domain_entropy', 'primary_domain_len', 'primary_domain_n_at_sign',
    'primary_domain_n_digit', 'primary_domain_n_hyphen', 'primary_domain_n_nonalnum',
    # subdomain
    'subdomain_len', 'subdomain_n', 'subdomain_n_dot',
    # path
    'path_avglen_dir', 'path_dir_rate_digit', 'path_len', 'path_maxlen_dir', 'path_n_digit',
    'path_n_dir', 'path_n_double_slash', 'path_n_sp_char', 'path_n_zero', 'path_percent20_in',
    'path_rate_digit', 'path_ratio_upper_lower', 'path_single_char_dir_in', 'path_upper_dir_in',
    # params and query
    'params_len', 'query_len', 'query_n', 'query_n_digit',
    # name
    'name_len', 'name_n_digit', 'name_rate_digit',
    # ratios between URL parts
    'ratio_domain_url', 'ratio_path_domain', 'ratio_path_url', 'ratio_query_domain',
    'ratio_query_path', 'ratio_query_url',
]

In [17]:
# Load the structured-feature CSV of every class from data_path, in a fixed class order.
structured_files = [
    'benign_structured.csv',
    'malware_structured.csv',
    'phishing_structured.csv',
    'attacked_structured.csv',
    'spam_structured.csv',
]
df_benign, df_malware, df_phishing, df_attacked, df_spam = [
    pd.read_csv(data_path / file_name) for file_name in structured_files
]

# Row count per class, in the same order as the frames above.
tuple(len(frame) for frame in (df_benign, df_malware, df_phishing, df_attacked, df_spam))

(6485136, 315480, 2285553, 95308, 11921)

In [15]:
# One-off clean-up: rewrite the flag columns as 0/1 integers and save the frames back
# over the *_structured.csv files they were loaded from.
#
# Changes relative to the earlier version of this cell:
#   * `np.int` no longer exists in NumPy >= 1.24; the builtin `int` is the same type.
#   * Values that are already boolean or 0/1 are preserved. Comparing such a column with
#     the string 'True' yields all-False, so a second run (or a CSV that pandas parsed as
#     bool) used to zero every flag and then overwrite the files with the damaged data.
bool_cols = ['domain_is_ip', 'path_percent20_in', 'path_single_char_dir_in', 'path_upper_dir_in']

# Text forms treated as "set": the original 'True' string (also str(True)) plus the
# forms an already-converted column takes when read back as int or float.
_TRUTHY_TEXT = {'True', '1', '1.0'}


def _flag_to_int(flags):
    """Return ``flags`` as a Series of 0/1 integers.

    Each value is converted to text first; values whose text is 'True', '1' or '1.0'
    become 1 and every other value (including missing ones) becomes 0.
    """
    return flags.astype(str).isin(_TRUTHY_TEXT).astype(int)


# Output file name -> frame; each frame is written back under the name it was read from.
structured_frames = {
    'benign_structured.csv': df_benign,
    'malware_structured.csv': df_malware,
    'phishing_structured.csv': df_phishing,
    'attacked_structured.csv': df_attacked,
    'spam_structured.csv': df_spam,
}

# Convert every flag column of every frame before anything is written to disk.
for frame in structured_frames.values():
    for col in bool_cols:
        frame[col] = _flag_to_int(frame[col])

for file_name, frame in structured_frames.items():
    frame.to_csv(data_path / file_name, index=False)

### Sampling

In [4]:
# Draw a fixed-size random sample from each class frame; every draw uses random_state=119.
df_benign, df_malware, df_phishing, df_attacked, df_spam = [
    frame.sample(n=n_rows, random_state=119)
    for frame, n_rows in [
        (df_benign, 9000),
        (df_malware, 500),
        (df_phishing, 9000),
        (df_attacked, 9000),
        (df_spam, 9000),
    ]
]

In [5]:
# Tag every row with its integer class id:
# 0 = benign, 1 = malware, 2 = phishing, 3 = attacked, 4 = spam.
for class_id, frame in enumerate([df_benign, df_malware, df_phishing, df_attacked, df_spam]):
    frame['CLASS_LABEL'] = [class_id] * len(frame)

### Split

In [6]:
# Stack the five class frames into one table with a fresh 0..n-1 index, then move the
# label column out of it (pop removes 'CLASS_LABEL' from the feature frame).
# At this point X_train / y_train hold every row of the stacked table.
class_frames = [df_benign, df_malware, df_phishing, df_attacked, df_spam]
X_train = pd.concat(class_frames, ignore_index=True)
y_train = X_train.pop('CLASS_LABEL')

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=119,
)

### Or: Extract TF-IDF Features from the URLs

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Represent each URL as TF-IDF weights with the vocabulary capped at 100 features.
# The vectorizer is fitted on the training URLs only and then applied unchanged to
# the test URLs; both sparse results are converted to dense arrays.
tfidf = TfidfVectorizer(max_features=100)
X_train_tfidf = tfidf.fit_transform(X_train['url']).toarray()
X_test_tfidf = tfidf.transform(X_test['url']).toarray()

# np.asarray accepts a Series as well as an ndarray, so this cell can be re-run;
# a second .to_numpy() call would fail because the labels are arrays by then.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

### Train Model

In [10]:
import copy

from tpot import TPOTClassifier
from tpot.config import classifier_config_dict

# Work on a deep copy: dict(...) copies only the top level, so editing the nested PCA
# entry through a shallow copy would also modify TPOT's own classifier_config_dict.
d = copy.deepcopy(classifier_config_dict)

# Operators removed from the copied configuration.
removed_operators = [
    'sklearn.naive_bayes.GaussianNB',
    'sklearn.naive_bayes.BernoulliNB',
    'sklearn.naive_bayes.MultinomialNB',
    'sklearn.tree.DecisionTreeClassifier',
    'sklearn.ensemble.ExtraTreesClassifier',
    'sklearn.ensemble.GradientBoostingClassifier',
    'sklearn.ensemble.RandomForestClassifier',
    'sklearn.neighbors.KNeighborsClassifier',
    'sklearn.svm.LinearSVC',
    'sklearn.linear_model.SGDClassifier',
    'sklearn.linear_model.LogisticRegression',
]
for operator_name in removed_operators:
    # No default on purpose: a missing key raises KeyError, as the individual pops did.
    d.pop(operator_name)

# Candidate values 1..20 for PCA's iterated_power.
d['sklearn.decomposition.PCA']['iterated_power'] = range(1, 21)

# NOTE(review): in the visible cells `d` is not passed to TPOTClassifier
# (classifier_config_sparse is) — confirm whether this configuration is still needed.

In [11]:
def _unit_grid(start=0.05):
    """Return a fresh array from ``start`` towards 1.01 (exclusive) in steps of 0.05."""
    return np.arange(start, 1.01, 0.05)


def _f_classif_score_func():
    """Return a fresh ``score_func`` entry mapping the f_classif path to None."""
    return {'sklearn.feature_selection.f_classif': None}


def _extra_trees_estimator():
    """Return a fresh ``estimator`` entry holding the ExtraTreesClassifier grid."""
    return {
        'sklearn.ensemble.ExtraTreesClassifier': {
            'n_estimators': [100],
            'criterion': ['gini', 'entropy'],
            'max_features': _unit_grid(),
        }
    }


# TPOT configuration dictionary: operator import path -> {parameter name: candidate values}.
# Every helper call above builds a new object, so no array or dict is shared between entries.
classifier_config_sparse = {
    # --- preprocessing ---
    'tpot.builtins.OneHotEncoder': {
        'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25],
    },

    # --- classifiers ---
    'sklearn.ensemble.RandomForestClassifier': {
        'n_estimators': [100],
        'criterion': ['gini', 'entropy'],
        'max_features': _unit_grid(),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'bootstrap': [True, False],
    },
    'xgboost.XGBClassifier': {
        'n_estimators': [100],
        'max_depth': range(1, 11),
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'subsample': _unit_grid(),
        'min_child_weight': range(1, 21),
        'nthread': [1],
    },

    # --- feature selectors ---
    'sklearn.feature_selection.SelectFwe': {
        'alpha': np.arange(0, 0.05, 0.001),
        'score_func': _f_classif_score_func(),
    },
    'sklearn.feature_selection.SelectPercentile': {
        'percentile': range(1, 100),
        'score_func': _f_classif_score_func(),
    },
    'sklearn.feature_selection.VarianceThreshold': {
        'threshold': _unit_grid(),
    },
    'sklearn.feature_selection.RFE': {
        'step': _unit_grid(),
        'estimator': _extra_trees_estimator(),
    },
    'sklearn.feature_selection.SelectFromModel': {
        'threshold': _unit_grid(0),
        'estimator': _extra_trees_estimator(),
    },
}

In [None]:
# Run TPOT's pipeline search on the TF-IDF features, restricted to classifier_config_sparse,
# then report the score on the held-out set and export the chosen pipeline as a script.
export_path = 'tpot_pipelines/tpot_tfidf_pipeline.py'

# Create the output folder before the search starts, so a missing directory cannot make
# the export fail only after the whole fit has already run.
Path(export_path).parent.mkdir(parents=True, exist_ok=True)

tpot = TPOTClassifier(
    generations=4,
    verbosity=2,
    n_jobs=8,
    random_state=119,
    config_dict=classifier_config_sparse,
)
tpot.fit(X_train_tfidf, y_train)

print(tpot.score(X_test_tfidf, y_test))
tpot.export(export_path)