In [1]:
from os.path import expanduser, join
import pathlib

# Absolute, symlink-resolved form of the current working directory.
root = pathlib.Path('.').resolve()

In [2]:
from os import listdir

# If the current directory contains an entry named 't.ex-Graph', use that
# entry as the project root instead.
# The listing is checked inline rather than stored in a global called `dir`,
# which would shadow the builtin dir() for the rest of the kernel session.
if 't.ex-Graph' in listdir(root):
  # os.path.join returns a str, so `root` stops being a pathlib.Path here.
  root = join(root, 't.ex-Graph')

In [3]:
import sys
# Make the modules in <root>/lib importable. Inserting at index 1 leaves the
# existing sys.path[0] entry first and places lib ahead of all later entries.
sys.path.insert(1, join(root, 'lib'))

import config
import functions
import data
import model
import export

In [4]:
# One entry per input CSV under <root>/data: a display label plus whatever
# data.read returns for that file.
# NOTE(review): later cells treat 'data' as a pandas-like frame (.columns,
# column assignment) — confirm against data.read.
datasets = [
  {'label': label, 'data': data.read(join(root, 'data', filename))}
  for label, filename in (
    ('HTTP/S Graph (SLDs)', 'graph-data-sld.csv'),
    ('HTTP/S Graph (FQDN)', 'graph-data-fqdn.csv'),
  )
]

In [5]:
# Feature columns are taken from the first dataset: every column except the
# identifier and the two columns used as prediction targets further down
# (compared case-insensitively).
features = [
  column
  for column in datasets[0]['data'].columns
  if column.lower() not in {'id', 'weight', 'tracker'}
]

In [6]:
from sklearn.preprocessing import LabelEncoder

# Replace each dataset's 'tracker' column with integer class codes.
# A fresh LabelEncoder is fitted per dataset, so the codes depend only on the
# values present in that dataset's column. The frames are modified in place.
for entry in datasets:
  frame = entry['data']
  frame['tracker'] = LabelEncoder().fit_transform(frame['tracker'])

In [7]:
# Add a ' 50/50' companion for every base dataset.
# NOTE(review): data.sample_equal_distribution presumably resamples so both
# 'tracker' classes are equally represented — confirm in lib/data.
#
# The guard keeps the cell idempotent: entries that are themselves ' 50/50'
# variants, or whose variant is already in `datasets`, are skipped, so running
# the cell again does not append ' 50/50 50/50' copies or duplicates.
known_labels = {entry['label'] for entry in datasets}

extension = [
  {
    'label': entry['label'] + ' 50/50',
    'data': data.sample_equal_distribution(entry['data'], 'tracker')
  }
  for entry in datasets
  if not entry['label'].endswith(' 50/50')
  and entry['label'] + ' 50/50' not in known_labels
]

datasets.extend(extension)

In [8]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier

# Estimators grouped by the kind of target they predict:
#   'continuous' -> regressors, 'category' -> classifiers.
# Every estimator with a random component is seeded so a Restart & Run All
# reproduces the same results.
models = {
  'continuous': [
    LinearRegression(n_jobs=-1),
    RandomForestRegressor(n_estimators=200, random_state=0, n_jobs=-1)
  ],
  'category':[
    # Seeded like the random forest above: the tree permutes features at each
    # split, so an unseeded tree can differ between runs.
    DecisionTreeClassifier(random_state=0),
    LogisticRegression(solver='lbfgs', max_iter=1000, n_jobs=-1)
  ]
}

In [9]:
# Select matplotlib's non-interactive Agg backend for this kernel.
%matplotlib agg

# Hand all datasets, both model groups and the feature names to the project's
# `model` module in a single call.
# NOTE(review): 'weight' and 'tracker' are the columns excluded from
# `features`; presumably they are the targets for the 'continuous' and
# 'category' model groups respectively — confirm in model.compute_results.
results = model.compute_results(datasets, models, features, ['weight', 'tracker'])

KeyboardInterrupt: 

In [None]:
# Pass the computed results and the project root to the exporter.
# NOTE(review): presumably writes its output somewhere under `root` — confirm
# in export.classification_results.
export.classification_results(results, root)

In [None]:
# Receives only `root`, not `results`.
# NOTE(review): presumably aggregates output that already exists under `root`
# (e.g. from the previous export step) — confirm in the export module.
export.aggregated_classification_results(root)

In [None]:
# ncols is the size of the larger of the two model groups.
ncols = max(map(len, (models['continuous'], models['category'])))
# NOTE(review): the literal 2 is presumably the row count (one row per model
# group) — confirm against export.feature_importances.
export.feature_importances(2, ncols, results, root)