In [14]:
from os.path import expanduser, join
root = join(expanduser("~"), 't.ex-Graph')

In [15]:
import sys
sys.path.insert(1, join(root, 'lib'))

import config
import functions
import data
import model
import export

In [16]:
datasets = [{
    'label': 'HTTP/S Graph (SLDs)',
    'data': data.read(join(root, 'data', 'graph-data-sld.csv'))
  }, {
    'label': 'HTTP/S Graph (FQDN)',
    'data': data.read(join(root, 'data', 'graph-data-fqdn.csv'))
  }
]

In [17]:
features = [col for col in list(datasets[0].get('data').columns) if col.lower() not in ['id', 'weight', 'tracker']]

In [18]:
from sklearn.preprocessing import LabelEncoder

for dataset in datasets:
  dataset.get('data')['tracker'] = LabelEncoder().fit_transform(dataset.get('data')['tracker'])

In [19]:
extension = []
for dataset in datasets:
  extension.append({
    'label': dataset.get('label') + ' 50/50',
    'data': data.sample_equal_distribution(dataset.get('data'), 'tracker')
  })

datasets.extend(extension)

In [20]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier

models = {
  'continuous': [
    LinearRegression(n_jobs=-1),
    RandomForestRegressor(n_estimators=200, random_state=0, n_jobs=-1)
  ],
  'category':[
    DecisionTreeClassifier(),
    LogisticRegression(solver='lbfgs', max_iter=1000, n_jobs=-1)
  ]
}

In [21]:
%matplotlib agg

results = model.compute_results(
  datasets, 
  models, 
  features, 
  ['weight', 'tracker']
)

In [22]:
export.classification_results(results, root)

In [23]:
export.aggregated_classification_results(root)

In [25]:
for key in results.keys():
  for model in results.get(key):
    print (results.get(key))

{'LinearRegression': {'continuous': True, 'train_test': {'r2': 0.4196722529212139, 'mse': 0.2746998789011599, 'mae': 0.1995881916445514}, 'feature_importance': None, 'cross_validation': None}, 'RandomForestRegressor': {'continuous': True, 'train_test': {'r2': 0.8247079355292348, 'mse': 0.15097426638746608, 'mae': 0.0653291239193551}, 'feature_importance': None, 'cross_validation': None}, 'DecisionTreeClassifier': {'continuous': False, 'train_test': {'accuracy': 0.9973209501696731, 'precision': 0.9972657722366461, 'recall': 0.9972003761383046, 'f1_score': 0.9972330391825175}, 'feature_importance': None, 'cross_validation': None}, 'LogisticRegression': {'continuous': False, 'train_test': {'accuracy': 0.8720307197713878, 'precision': 0.8721675391698621, 'recall': 0.8620759120998094, 'f1_score': 0.8661614228757031}, 'feature_importance': None, 'cross_validation': None}}
{'LinearRegression': {'continuous': True, 'train_test': {'r2': 0.4196722529212139, 'mse': 0.2746998789011599, 'mae': 0.19