In [91]:
import pandas as pd
import sys
import sklearn
import time
import itertools
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, plot_importance
import numpy as np
from numpy import sort
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm

In [138]:
# dataset path
dataset_path = './csv/diff_dataset.csv'

In [139]:
# read dataset
dataset = pd.read_csv(dataset_path, index_col=None, header=0)

In [140]:
dataset.shape

(1862, 661)

In [142]:
dataset.drop(columns=['Unnamed: 0.1'], inplace=True)
dataset.head()

Unnamed: 0.1,Unnamed: 0,p_/computes0/service/id,p_/computes0/vcpus_used,p_/computes0/vcpus,p_/computes0/memory_mb_used,p_/computes0/memory_mb,p_/computes0/cpu_info/topology/cores,p_/computes0/cpu_info/topology/cells,p_/computes0/cpu_info/topology/threads,p_/computes0/cpu_info/topology/sockets,...,v_/ports#link-tr-tr-a-1-x/metrics/network-outgoing-packets-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-bytes,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-bytes-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-packets,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-packets-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-bytes,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-bytes-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-packets,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-packets-rate,v_type_code
0,0,,,,,,,,,,...,,,,,,,,,,0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1e-05,4332.0,0.880124,36.0,0.000333,4332.0,2.713499,36.0,-0.000637,1
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.000679,4172.0,-0.568714,36.0,-1.7e-05,4448.0,-4.418877,38.0,0.016548,2
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.016052,3986.0,0.93373,35.0,0.016481,3986.0,2.805528,35.0,-0.01662,1


In [145]:
# drop unnamed column
dataset.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Delete the following states
# 0: ixnetwork-traffic-start
# 2: node-up
# 4: interface-up
# 6: tap-loss-stop
# 8: tap-delay-stop
# 10: ixnetwork-bgp-injection-stop
# 12: ixnetwork-bgp-hijacking-stop
data_drop_index = dataset[dataset['v_type_code'] % 2 == 0].index.tolist()
dataset.drop(index=data_drop_index, axis=0, inplace=True)
# Try to combine 5 and 7 into one type.
dataset['v_type_code'] = dataset['v_type_code'].apply(lambda x: 57 if x == 5 or x == 7 else x)

In [None]:
# train test split
column = dataset.columns
X_train = dataset[column[:-1]]
y_train = dataset[column[-1]]
X = pd.concat([X_train, X_test], axis=0, ignore_index=True, sort=False)

In [None]:
def random_forest(X_train, y_train, X_test, y_test, show=False, estimators=100):
    last_time = time.time()
    rf = RandomForestClassifier(n_estimators=estimators, max_depth=None, min_samples_split=2, random_state=0)

    rf.fit(X_train, y_train)
    middle_time = time.time()

    y_pred = rf.predict(X_test)

    current_time = time.time()
    print("n_estimators: %d" % estimators)
    print("RF Accuracy: %.2f" % accuracy_score(y_test, y_pred))
    print("train time： {}".format(middle_time - last_time))
    print("test time： {}".format(current_time - middle_time))
    if show:
        cm = confusion_matrix(y_test, y_pred)
        print('confusion matrix rf:')
        print(cm)
        print('classification report rf:')
        print(classification_report(y_test, y_pred))
    
    return rf

# Testset

In [150]:
testset_path = './csv/diff_sample_2799.csv'
testset = pd.read_csv(testset_path, index_col=None, header=0)
testset.drop(columns=['Unnamed: 0'], inplace=True)

In [151]:
test_drop_index = testset[testset['v_type_code'] % 2 == 0].index.tolist()
testset.drop(index=test_drop_index, axis=0, inplace=True)
testset['v_type_code'] = testset['v_type_code'].apply(lambda x: 57 if x == 5 or x == 7 else x)

print('dataset:')
print(dataset.shape)
print('testset:')
print(testset.shape)

dataset:
(930, 659)
testset:
(95, 661)


In [152]:
X_test = testset[column[:-1]]
y_test = testset[column[-1]]
Y = pd.concat([y_train, y_test], axis=0, ignore_index=True, sort=False)
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print(X.shape, Y.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1025, 658) (1025,)
(930, 658) (930,)
(95, 658) (95,)


In [153]:
rf = random_forest(X_train, y_train, X_test, y_test, show=True)

n_estimators: 100
RF Accuracy: 0.15
train time： 0.9458940029144287
test time： 0.014408111572265625
confusion matrix rf:
[[14  0  0]
 [ 0  0  0]
 [68 13  0]]
classification report rf:
              precision    recall  f1-score   support

           3       0.17      1.00      0.29        14
           9       0.00      0.00      0.00         0
          57       0.00      0.00      0.00        81

    accuracy                           0.15        95
   macro avg       0.06      0.33      0.10        95
weighted avg       0.03      0.15      0.04        95



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
