In [1]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics, tree
import pandas as pd
import collections
import json

In [2]:
# Read the input file and preprocess it
# - convert result dictionaries to sorted tuples, so that they are hashable
#   and do not depend on the key order of the original dictionaries
# - convert software versions to software names
#
# NOTE(review): the indexing below implies one JSON object per line, shaped
# like {software_version: {round: {testcase: {field: value}}}} -- confirm
# against the producer of the signatures file.

input_data = list()

# JSON text is UTF-8; do not rely on the platform default encoding
with open("../signatures/signatures_all.json", "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue
        # Load the entry
        entry = json.loads(line)
        for software, rounds in entry.items():
            # Keep the software name: the text before the first "-"
            software_name = software.split("-")[0]
            # Go over each query round. The loop variable is deliberately NOT
            # called `round`: cell variables are notebook globals and that
            # name would shadow the built-in round() for all later cells.
            for round_name, testcases in rounds.items():
                # Every test result becomes a tuple of (field, value) pairs sorted by field
                round_tests = {
                    testcase: tuple(sorted(testresult.items()))
                    for testcase, testresult in testcases.items()
                }
                # One record per round: {software_name: {testcase: result}}.
                # A round without any testcase contributes an empty record.
                input_data.append({software_name: round_tests} if round_tests else {})

In [3]:
# Several software names can produce exactly the same signature (the full set
# of test results of one round). A classifier cannot tell such names apart,
# so all names sharing a signature are merged into one label joined by "|".

# signature -> every software name observed with it (one item per occurrence)
signature_labels = collections.defaultdict(list)
for record in input_data:
    for software_name, tests in record.items():
        signature_labels[tuple(sorted(tests.items()))].append(software_name)

input_data_processed = list()
for signature, observed_names in signature_labels.items():
    # Deduplicated, alphabetically ordered names form the merged label
    labels_merged = "|".join(sorted(set(observed_names)))
    # Back from the tuple of (testcase, result) pairs to a dictionary
    signature_dictionnary = dict(signature)
    # Emit one record per original occurrence, so the sample count is unchanged
    input_data_processed.extend(
        {labels_merged: signature_dictionnary} for _ in observed_names
    )

In [4]:
# Flatten the records into rows and load them into a DataFrame

# The feature columns are the testcase names of the first record; every other
# record is indexed with the same names when the rows are built below
column_names_features = list()
for first_tests in input_data_processed[0].values():
    column_names_features.extend(first_tests)
column_names_all = ["label"] + column_names_features

# One row per (record, label): the label followed by the result of each testcase
input_data_flat = [
    [label if column == "label" else tests[column] for column in column_names_all]
    for record in input_data_processed
    for label, tests in record.items()
]

# NOTE(review): every row is duplicated here and the notebook does not say why.
# With a random train/test split, copies of one row can land on both sides of
# the split, which inflates the measured accuracy -- confirm this is intended.
input_data_flat = input_data_flat + input_data_flat

# Load as a data frame
df = pd.DataFrame(input_data_flat, columns=column_names_all)
# One-hot encode every feature column; the label column is left untouched
df_one_hot = pd.get_dummies(data=df, columns=column_names_features)

In [5]:
# The target is the label column, the features are all the remaining columns
y = df_one_hot["label"]
X = df_one_hot.drop(columns=["label"])
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [6]:
# Build the decision tree classifier (seeded) and train it on the training set
clf = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
# Predict the labels of the held-out test set
y_pred = clf.predict(X_test)

In [7]:
# Get some metrics
# The recorded run of this notebook prints an accuracy of 1.0.
# NOTE(review): the rows are duplicated before the train/test split, so the
# test set may contain copies of training rows -- confirm that this score
# says anything about unseen signatures.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [8]:
# The one-hot encoded column names embed the repr of a tuple of tuples and are
# too long to read, so two shorter variants are derived from them
features_one_hot = df_one_hot.columns.difference(["label"], sort=False).tolist()

# Ordered (old, new) substitutions: drop the quotes, join the pairs with "_",
# join key and value with "-", separate testcase and result with "---", and
# finally strip the closing brackets
replacements = (
    ("'", ""),
    ("), (", "_"),
    (", ", "-"),
    ("_((", "---"),
    ("))", ""),
    ("),)", ""),
)

# Shortened full names (testcase plus result)
features_short = list()
# Testcase names only
features_testcases = list()
for feature in features_one_hot:
    shortened = feature
    for old, new in replacements:
        shortened = shortened.replace(old, new)
    features_short.append(shortened)
    # Everything before the first "(", minus the "_" that precedes it
    before_bracket = feature[:feature.index("(")]
    features_testcases.append(before_bracket[:-1])

In [9]:
# Render the tree as text, labelling the nodes with the shortened feature names.
# max_depth is set to the number of features, presumably so that no branch
# gets truncated in the output.
text_representation = tree.export_text(
    clf,
    feature_names=features_short,
    max_depth=len(features_one_hot),
)
print(text_representation)

|--- baseline_A_HS_IQUERY_TC_RA---AA-0_ANCOUNT-0_ARCOUNT-0_NSCOUNT-0_Opcode-IQUERY_QDCOUNT-0_QR-1_RA-0_RCODE-NOTIMP_RD-0_TC-0 <= 0.50
|   |--- baseline_A_ANY_QUERY_AA---AA-0_ANCOUNT-0_ARCOUNT-0_NSCOUNT-0_Opcode-QUERY_QDCOUNT-1_QR-1_RA-1_RCODE-NOERROR_RD-0_TC-0 <= 0.50
|   |   |--- baseline_A_NONE_QUERY_AA_TC---AA-1_ANCOUNT-0_ARCOUNT-0_NSCOUNT-0_Opcode-QUERY_QDCOUNT-1_QR-1_RA-0_RCODE-FORMERR_RD-0_TC-0 <= 0.50
|   |   |   |--- baseline_A_IN_QUERY_AA---error-Timeout after 5 seconds <= 0.50
|   |   |   |   |--- baseline_A_IN_STATUS_AA_TC_RD_RA---AA-0_ANCOUNT-3_ARCOUNT-0_NSCOUNT-0_Opcode-STATUS_QDCOUNT-1_QR-1_RA-1_RCODE-NOERROR_RD-1_TC-0 <= 0.50
|   |   |   |   |   |--- baseline_A_RESERVED0_IQUERY_TC_RD_RA---error-Timeout after 5 seconds <= 0.50
|   |   |   |   |   |   |--- class: technitium
|   |   |   |   |   |--- baseline_A_RESERVED0_IQUERY_TC_RD_RA---error-Timeout after 5 seconds >  0.50
|   |   |   |   |   |   |--- class: windows
|   |   |   |   |--- baseline_A_IN_STATUS_AA_TC_RD_RA---

In [10]:
# Render the tree as text, labelling the nodes with the testcase names only.
# max_depth is set to the number of features, presumably so that no branch
# gets truncated in the output.
text_representation = tree.export_text(
    clf,
    feature_names=features_testcases,
    max_depth=len(features_one_hot),
)
print(text_representation)

|--- baseline_A_HS_IQUERY_TC_RA <= 0.50
|   |--- baseline_A_ANY_QUERY_AA <= 0.50
|   |   |--- baseline_A_NONE_QUERY_AA_TC <= 0.50
|   |   |   |--- baseline_A_IN_QUERY_AA <= 0.50
|   |   |   |   |--- baseline_A_IN_STATUS_AA_TC_RD_RA <= 0.50
|   |   |   |   |   |--- baseline_A_RESERVED0_IQUERY_TC_RD_RA <= 0.50
|   |   |   |   |   |   |--- class: technitium
|   |   |   |   |   |--- baseline_A_RESERVED0_IQUERY_TC_RD_RA >  0.50
|   |   |   |   |   |   |--- class: windows
|   |   |   |   |--- baseline_A_IN_STATUS_AA_TC_RD_RA >  0.50
|   |   |   |   |   |--- class: knot
|   |   |   |--- baseline_A_IN_QUERY_AA >  0.50
|   |   |   |   |--- class: maradns
|   |   |--- baseline_A_NONE_QUERY_AA_TC >  0.50
|   |   |   |--- class: unbound
|   |--- baseline_A_ANY_QUERY_AA >  0.50
|   |   |--- class: pdns
|--- baseline_A_HS_IQUERY_TC_RA >  0.50
|   |--- class: bind9



In [11]:
# Feature importance: which one-hot features the fitted tree relies on
# (non-zero importance) and which ones it ignores (zero importance)
feature_importances = pd.DataFrame(
    {"importance": clf.feature_importances_}, index=X_train.columns
)

features_used = feature_importances[feature_importances["importance"] != 0]
features_unused = feature_importances[feature_importances["importance"] == 0]

print(f"The total number of features: {len(feature_importances)}")
print(f"  Important features: {len(features_used)}")
print(f"  Not important features: {len(features_unused)}")

# Aggregate by testcase: the testcase name is the part of the column name
# that precedes "_((", i.e. the start of the encoded result
testcases_all = {name.split("_((")[0] for name in feature_importances.index}
testcases_important = {name.split("_((")[0] for name in features_used.index}
testcases_not_important_all = {name.split("_((")[0] for name in features_unused.index}
# A testcase is unimportant only if none of its one-hot features is used
testcases_not_important_unique = testcases_all.difference(testcases_important)

print("---")
print(f"The total number of testcases: {len(testcases_all)}")
print(f"  Important testcases: {len(testcases_important)}")
print(f"  Not important testcases: {len(testcases_not_important_unique)}")

# Compare all the names found in the labels with the labels that consist of a
# single name, i.e. were not merged with another one by a "|"
labels_all = set(df_one_hot["label"])
labels_individual = [label for label in labels_all if "|" not in label]
versions_all = {name for label in labels_all for name in label.split("|")}

print("---")
print(f"All versions: {len(versions_all)}")
print(f"    Individual versions: {len(labels_individual)}")

The total number of features: 3564
  Important features: 6
  Not important features: 3558
---
The total number of testcases: 768
  Important testcases: 6
  Not important testcases: 762
---
All versions: 7
    Individual versions: 7
