In [36]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics, tree
import pandas as pd
import collections
import json

In [37]:
# Read the input file and preprocess it
# - convert result dictionaries to tuples for determinism
# - convert software versions to software names

# Substring of the versioned software identifier -> software name without the version.
# Entries are checked in order and the first match wins.
_SOFTWARE_NAMES = (
    ("bind9", "bind9"),
    ("knot", "knot-resolver"),
    ("maradns", "maradns"),
    ("pdns", "pdns-recursor"),
    ("technitium", "technitium"),
    ("unbound", "unbound"),
    ("windows", "windows-server"),
)


def _canonical_software_name(software):
    """Return the software name (without version) for a versioned identifier.

    Raises:
        ValueError: if the identifier matches none of the known software names.
            Failing loudly prevents an unknown entry from being silently filed
            under the name of whichever software was processed just before it.
    """
    for needle, name in _SOFTWARE_NAMES:
        if needle in software:
            return name
    raise ValueError(f"Unrecognised software identifier: {software!r}")


def _freeze_testresult(testresult):
    """Convert one test result dictionary into a sorted tuple of (key, value) pairs.

    - results containing an "error" key keep all of their top-level items;
    - results containing an "answer" key keep the "header" and "answer" items;
    - any other result keeps only its "header" items.
    """
    if "error" in testresult:
        pairs = list(testresult.items())
    elif "answer" in testresult:
        pairs = list(testresult["header"].items()) + list(testresult["answer"].items())
    else:
        pairs = list(testresult["header"].items())
    return tuple(sorted(pairs))


input_data = list()
with open("../signatures/signatures_all.json", "r") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue
        # Load the old entry (one JSON document per line)
        entry = json.loads(line)
        # Store the processed entry: software name -> testcase -> frozen result
        entry_processed = collections.defaultdict(dict)
        for software in entry:
            # Keep the software name, excluding the version
            software_only = _canonical_software_name(software)
            # Test results are dictionaries, which are not hashable
            # Instead, we convert them into sorted tuples
            for testcase in entry[software]:
                entry_processed[software_only][testcase] = _freeze_testresult(entry[software][testcase])

        input_data.append(dict(entry_processed))

In [38]:
# When we cannot uniquely identify a piece of software, one signature can correspond to multiple labels
# In this case, the classifier will not work properly
# So, we need to combine those labels with a pipe ("|")

# Map every signature (a hashable, sorted tuple of (testcase, result) pairs)
# to the list of software labels it was observed with (duplicates kept)
signature_labels = collections.defaultdict(list)
for entry in input_data:
    for software in entry:
        signature = tuple((sorted(list(entry[software].items()))))
        signature_labels[signature].append(software)

input_data_processed = list()
for signature in signature_labels:
    # Sort the distinct labels before joining: the iteration order of a set of
    # strings is not stable, so without sorting the same group of software could
    # be spelled "a|b" or "b|a", producing spurious distinct classes
    labels_merged = "|".join(sorted(set(signature_labels[signature])))
    # Turn the signature tuple back into a testcase -> result dictionary
    signature_dictionnary = dict(signature)
    # Emit one row per original observation so that sample counts are preserved
    for _ in range(len(signature_labels[signature])):
        input_data_processed.append({labels_merged:signature_dictionnary})

In [39]:
# Flatten the processed entries into rows that can be loaded into a DataFrame

# The feature columns are the testcase names found in the first entry
column_names_features = list()
for first_signature in input_data_processed[0].values():
    column_names_features.extend(first_signature)
column_names_all = ["label", *column_names_features]

# One row per (label, signature) pair: the label first, then each testcase result
input_data_flat = [
    [label if column == "label" else signature[column] for column in column_names_all]
    for record in input_data_processed
    for label, signature in record.items()
]

# Load the rows as a data frame
df = pd.DataFrame(input_data_flat, columns=column_names_all)
# One-hot encode every feature column (the label column is left as is)
df_one_hot = pd.get_dummies(data=df, columns=column_names_features)

In [40]:
# Separate the one-hot encoded features (X) from the target labels (y)
is_feature_column = df_one_hot.columns != 'label'
X = df_one_hot.loc[:, is_feature_column]
y = df_one_hot["label"]
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [41]:
# Create the decision tree classifier (fixed seed) and train it on the training split
clf = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
# Predict the labels of the held-out test split
y_pred = clf.predict(X_test)

In [42]:
# Get some metrics
# Accuracy is the fraction of test samples whose predicted label equals the true label
# (the recorded run of this cell reports 1.0)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [43]:
# The one-hot encoded feature names are too long to read, so we build two shorter variants
features_one_hot = df_one_hot.columns.difference(["label"], sort=False).tolist()

# Substitutions applied, in this exact order, to strip quotes and brackets
# and to join the remaining (key, value) pairs with pipes and dashes
short_name_substitutions = (
    ("'", ""),
    ("), (", "|"),
    (", ", "-"),
    ("((", ""),
    ("))", ""),
    ("),)", ""),
)

# Short but complete names: testcase name followed by the flattened result
features_short = list()
# Testcase names only: everything before the first "(", minus the trailing separator
features_testcases = list()
for feature in features_one_hot:
    shortened = feature
    for old, new in short_name_substitutions:
        shortened = shortened.replace(old, new)
    features_short.append(shortened)
    bracket_position = feature.index("(")
    features_testcases.append(feature[:bracket_position][:-1])

In [44]:
# Render the trained tree as text, labelling each split with the shortened feature name
text_representation = tree.export_text(
    decision_tree=clf,
    feature_names=features_short,
)
print(text_representation)

|--- test_chaos_rd_AA-0|AD-0|ANCOUNT-0|ARCOUNT-0|CD-0|NSCOUNT-0|Opcode-QUERY|QDCOUNT-1|QR-1|RA-0|RCODE-REFUSED|RD-1|TC-0 <= 0.50
|   |--- test_norec_AA-0|AD-0|ANCOUNT-0|ARCOUNT-0|CD-0|NSCOUNT-0|Opcode-QUERY|QDCOUNT-1|QR-1|RA-1|RCODE-NOERROR|RD-0|TC-0 <= 0.50
|   |   |--- test_31_172_AA-1|AD-0|ANCOUNT-0|ARCOUNT-1|CD-0|NSCOUNT-1|Opcode-QUERY|QDCOUNT-1|QR-1|RA-1|RCODE-NOERROR|RD-1|TC-0 <= 0.50
|   |   |   |--- test_norec_error-Timeout <= 0.50
|   |   |   |   |--- test_iquery_AA-0|AD-0|ANCOUNT-3|ARCOUNT-0|CD-0|NSCOUNT-0|Opcode-IQUERY|QDCOUNT-1|QR-1|RA-1|RCODE-NOERROR|RD-1|TC-0 <= 0.50
|   |   |   |   |   |--- test_chaos_rd_error-Timeout <= 0.50
|   |   |   |   |   |   |--- class: technitium
|   |   |   |   |   |--- test_chaos_rd_error-Timeout >  0.50
|   |   |   |   |   |   |--- class: windows-server
|   |   |   |   |--- test_iquery_AA-0|AD-0|ANCOUNT-3|ARCOUNT-0|CD-0|NSCOUNT-0|Opcode-IQUERY|QDCOUNT-1|QR-1|RA-1|RCODE-NOERROR|RD-1|TC-0 >  0.50
|   |   |   |   |   |--- class: knot-resolver
| 

In [45]:
# Render the trained tree as text, labelling each split with the testcase name only
# (several one-hot features share a testcase, so the same name can appear more than once)
text_representation = tree.export_text(
    decision_tree=clf,
    feature_names=features_testcases,
)
print(text_representation)

|--- test_chaos_rd <= 0.50
|   |--- test_norec <= 0.50
|   |   |--- test_31_172 <= 0.50
|   |   |   |--- test_norec <= 0.50
|   |   |   |   |--- test_iquery <= 0.50
|   |   |   |   |   |--- test_chaos_rd <= 0.50
|   |   |   |   |   |   |--- class: technitium
|   |   |   |   |   |--- test_chaos_rd >  0.50
|   |   |   |   |   |   |--- class: windows-server
|   |   |   |   |--- test_iquery >  0.50
|   |   |   |   |   |--- class: knot-resolver
|   |   |   |--- test_norec >  0.50
|   |   |   |   |--- class: maradns
|   |   |--- test_31_172 >  0.50
|   |   |   |--- class: unbound
|   |--- test_norec >  0.50
|   |   |--- class: pdns-recursor
|--- test_chaos_rd >  0.50
|   |--- class: bind9



In [46]:
# Analyze the feature importance, i.e. which features were used to build the tree, and which were not

def _testcase_of(feature):
    """Return the testcase part of a one-hot feature name.

    Keeps the text before the first "(" and drops the separator character
    that precedes it. Raises ValueError if the name contains no "(".
    """
    return feature[:feature.index("(")][:-1]


feature_importances = pd.DataFrame(data=clf.feature_importances_,columns=["importance"],index=X_train.columns)

# A feature is "important" when the tree gave it a non-zero importance
features_important = feature_importances[feature_importances['importance'] != 0]
features_not_important = feature_importances[feature_importances['importance'] == 0]

print(f"The total number of features: {feature_importances.shape[0]}")
print(f"  Important features: {features_important.shape[0]}")
print(f"  Not important features: {features_not_important.shape[0]}")

# Now we aggregate by the testcase names
testcases_all = set(_testcase_of(i) for i in feature_importances.index.to_list())
testcases_important = set(_testcase_of(i) for i in features_important.index.to_list())
# Testcases with at least one unused feature (may overlap with the important ones)
testcases_not_important_all = set(_testcase_of(i) for i in features_not_important.index.to_list())
# Testcases for which no feature at all was used by the tree
testcases_not_important_unique = testcases_all - testcases_important

# Sets of strings have no stable iteration order, so sort the names before
# printing to make the output identical from one run to the next
print("---")
print(f"The total number of testcases: {len(testcases_all)}")
print(f"  Important testcases: {len(testcases_important)} ({', '.join(sorted(testcases_important))})")
print(f"  Not important testcases: {len(testcases_not_important_unique)} ({', '.join(sorted(testcases_not_important_unique))})")

The total number of features: 56
  Important features: 6
  Not important features: 50
---
The total number of testcases: 10
  Important testcases: 4 (test_iquery, test_31_172, test_norec, test_chaos_rd)
  Not important testcases: 6 (test_home_arpa, test_zero_ttl, test_is_response, test_baseline, test_tc, test_edns0)
