Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 397 lines (363 sloc) 29.1 KB
#!/usr/bin/python
###########################
# Oedipus Utility imports #
###########################
from Oedipus.utils.misc import *
from Oedipus.utils.graphics import *
from Oedipus.utils.data import *
###########################
# Oedipus Service imports #
###########################
from Oedipus.gadgets import classification
from Oedipus.gadgets import clustering
from Oedipus.gadgets import feature_extraction
from Oedipus.gadgets import data_visualization
from Oedipus.gadgets import program_generation
######################
# OS Utility imports #
######################
#from __future__ import division
import time, sys, os, subprocess
import shutil, glob, argparse, random
import numpy
from Levenshtein import distance
# Module-level scratch list; no live statement visible in this file reads or writes it.
garbage = []
def defineArguments():
    """Build the command-line parser of the Oedipus tool.

    The only mandatory switch is "-m/--mode"; every other option is optional and
    carries the default declared below. The numeric options ("-o", "-k" and "-x")
    are converted to integers by the parser, so a value typed on the command line
    has the same type as the corresponding default.

    Returns:
        argparse.ArgumentParser: the configured parser (arguments are not parsed here).
    """
    parser = argparse.ArgumentParser(prog="oedipus_tool.py", description="Uses the \"Oedipus\" platform to learn and cluster/classify Tigress-generated obfuscated programs according to the transformations they employ.", usage="python oedipus_tool.py [options]")
    # "--mode" is required, so argparse never falls back to a default for it; none is declared.
    parser.add_argument("-m", "--mode", help="The requested mode of operation.", choices=["generate", "classify-exp1", "classify-exp2", "extract", "extract-from-traces", "visualize", "gather-stats", "filter-traces"], required=True)
    parser.add_argument("-g", "--statlogfile", help="The file containing a dump of classification results.", required=False, default="log.txt")
    parser.add_argument("-s", "--sourcedir", help="The path to the directory containing the [un]obfuscated source code files.", required=False, default=".")
    parser.add_argument("-p", "--originalprograms", help="The path to the directory containing the original, unobfuscated programs.", required=False, default=".")
    parser.add_argument("-d", "--tigressdir", help="The path to the directory of \"tigress\".", required=False, default=".")
    parser.add_argument("-f", "--obfuscationfunction", help="The name of the function to obfuscate e.g. \"main\"", required=False, default="main")
    parser.add_argument("-r", "--filterfunction", help="The function to focus on during trace filteration.", required=False, default="main")
    parser.add_argument("-o", "--obfuscationlevel", help="The number of obfuscation combinations to employ.", required=False, default=1, type=int)
    parser.add_argument("-a", "--algorithm", help="The machine learning algorithm to use.", required=False, default="tree", choices=["bayes", "tree"])
    parser.add_argument("-t", "--datatype", help="The format of data to consider e.g. numerical, traces, etc.", required=False, default="tfidf")
    parser.add_argument("-l", "--datalabel", help="The type of the label to load", required=False, default="label", choices=["label", "metadata"])
    parser.add_argument("-k", "--kfold", help="The number of folds to consider for K-fold cross validation.", required=False, default=10, type=int)
    parser.add_argument("-x", "--dimension", help="The dimension to which the data is projected before plotting.", required=False, default=2, type=int)
    parser.add_argument("-z", "--visualalgorithm", help="The algorithm used to project data into 2- or 3-dimensional space.", required=False, default="tsne", choices=["tsne", "pca"])
    parser.add_argument("-v", "--verbose", help="Displays debug messages on the screen.", default="no", choices=["yes", "no"], required=False)
    return parser
def _logFoldPredictions(classificationLog, header, allClasses, groundTruthLabels, predictedLabels):
    """Write, for every fold, the header followed by one "Class/Predicted" line per sample."""
    for foldIndex, foldPredictions in enumerate(predictedLabels):
        classificationLog.write(header)
        for labelIndex, predictedLabel in enumerate(foldPredictions):
            classificationLog.write("Class:%s,Predicted:%s\n" % (allClasses[groundTruthLabels[foldIndex][labelIndex]], allClasses[predictedLabel]))


def _collectObfuscatedVersions(programs, sourceDir, datatype):
    """Return the "<datatype>" files under sourceDir whose names start with the name of one of the given programs."""
    # For the "40programs" dataset, files are named like "anagram_1231231231_12.c";
    # for "obf" programs they are named like "empty-Seed1-Random......-addOpaque16.c".
    separator = "_" if sourceDir.find("40programs") != -1 else "-"
    obfuscatedVersions = []
    for program in programs:
        # Take the bare file name without its extension; stripping the directory with
        # str.replace() would also eat characters of the name itself (e.g. when the directory is ".").
        programName = os.path.splitext(os.path.basename(program))[0]
        obfuscatedVersions += glob.glob("%s/%s%s*.%s" % (sourceDir, programName, separator, datatype))
    return obfuscatedVersions


def main():
    """Parse the command line and run the requested Oedipus mode.

    Supported modes (selected with "-m/--mode"):
      - "generate": obfuscate the "*.c" files found under the source directory.
      - "extract": extract TF-IDF features for the labelled "*.c" files of the source directory.
      - "visualize": project the data samples and plot them.
      - "classify-exp1": K-fold classification over all samples of the source directory,
        using naive Bayes or CART trees; asks interactively for the reduction method
        or splitting criterion.
      - "classify-exp2": holds out a chunk of the original programs in every cycle, trains on
        the obfuscated versions of the remaining programs and tests on those of the held-out ones.
      - "filter-traces" / "extract-from-traces": interactive trace filtering and TF-IDF extraction.

    Any exception raised while running a mode is reported through prettyPrint, after
    which cleanUp() is called. The function always returns None.
    """
    try:
        argumentParser = defineArguments()
        arguments = argumentParser.parse_args()
        prettyPrint("Welcome to \"Oedipus\". Riddle me this!")
        verbose = arguments.verbose == "yes"

        #################################################
        # MODE 1: Generate obfuscated source code files #
        #################################################
        if arguments.mode == "generate":
            if verbose:
                prettyPrint("Generating obfusted programs for programs under \"%s\"" % arguments.sourcedir, "debug")
            # Get programs from source directory [random/pre-existent]
            sourceFiles = sorted(glob.glob("%s/*.c" % arguments.sourcedir))
            if len(sourceFiles) < 1:
                prettyPrint("No files were found in \"%s\". Exiting" % arguments.sourcedir, "error")
                return
            # NOTE(review): the value returned by the generator is not inspected; success is reported regardless.
            program_generation.generateObfuscatedPrograms(sourceFiles, arguments.tigressdir, int(arguments.obfuscationlevel), arguments.obfuscationfunction)
            prettyPrint("Successfully generated obfuscated programs")

        #########################################################
        # MODE 2: Extract features from obfuscated source files #
        #########################################################
        elif arguments.mode == "extract":
            # Load obfuscated files
            if not os.path.exists(arguments.sourcedir):
                prettyPrint("Unable to locate \"%s\". Exiting" % arguments.sourcedir, "error")
                return
            sourceFiles = sorted(glob.glob("%s/*.c" % arguments.sourcedir))
            if len(sourceFiles) < 1:
                prettyPrint("No files were found in \"%s\". Exiting" % arguments.sourcedir, "error")
                return
            # Keep only the source files accompanied by a ".label" file. A new list is built
            # because removing items from a list while iterating over it skips elements.
            labelledFiles = []
            for targetFile in sourceFiles:
                if os.path.exists(os.path.splitext(targetFile)[0] + ".label"):
                    labelledFiles.append(targetFile)
                else:
                    prettyPrint("File \"%s\" does not have a label/metadata file. Removing" % targetFile, "warning")
            sourceFiles = labelledFiles
            # (2.0) Extract TF-IDF features from GDB generated traces of KLEE inputs
            prettyPrint("Extracting TF-IDF from GDB traces")
            if not feature_extraction.extractTFIDF(arguments.sourcedir, sourceFiles):
                prettyPrint("Could not extract features from source files. Exiting", "error")
                return
            prettyPrint("Alright!! Alles in Ordnung.", "info2")
            cleanUp()
            return

        ###########################################################
        # MODE 3: Project data samples into <x>-dimensional space #
        ###########################################################
        elif arguments.mode.find("visualize") != -1:
            if arguments.mode == "visualize":
                prettyPrint("Plotting data into %s-dimensional space with \"%s\" features." % (arguments.dimension, arguments.datatype))
                data_visualization.visualizeData(arguments.sourcedir, arguments.datatype, arguments.dimension, algorithm=arguments.visualalgorithm)
            else:
                # NOTE(review): no "--mode" choice other than "visualize" contains "visualize",
                # so this branch is not reachable through the parser built by defineArguments().
                data_visualization.visualizeOriginal(arguments.sourcedir, arguments.datatype, arguments.dimension, algorithm=arguments.visualalgorithm)

        ##############################################################################
        # MODE 4: Classify obfuscated programs using knowledge-based classification #
        ##############################################################################
        elif arguments.mode == "classify-exp1":
            if arguments.algorithm == "bayes":
                # Classify using Naive Bayes
                if arguments.datatype.find("idf") == -1:
                    # Only a warning: despite its wording, the run continues with the requested data type.
                    prettyPrint("Naive Bayes does not support the data type \"%s\". Exiting" % arguments.datatype, "warning")
                # Load data from source directory
                X, y, allClasses = loadFeaturesFromDir(arguments.sourcedir, arguments.datatype, arguments.datalabel)
                reductionMethod = raw_input("Please choose a dimensionality reduction method (selectkbest/pca): ").lower()
                # Wording that differs between the two supported reduction methods:
                # (adjective of the debug message, x-axis label, plot title template)
                bayesVariants = {
                    "selectkbest": ("selected", "Number of Selected Features", "Classification Accuracy: Selected Features (%s)"),
                    "pca": ("extracted", "Number of Extracted Features", "Classification Accuracy: PCA (%s)"),
                }
                variant = bayesVariants.get(reductionMethod)
                accuracies, timings = [], []
                targetDimensions = [8, 16, 32, 64, 128]
                # A file to log all classification labels; "with" closes it on every path, including errors
                with open("classificationlog_%s_exp1_%s_%s.txt" % (arguments.datatype, reductionMethod, arguments.algorithm), "a") as classificationLog:
                    classificationLog.write("Experiment 1 - Algorithm: %s, Datatype: %s\n" % (arguments.algorithm, arguments.datatype))
                    if variant is not None:
                        for dimension in targetDimensions:
                            if verbose:
                                prettyPrint("Training a naive Bayes classifier with %s %s \"%s\" features" % (dimension, variant[0], arguments.datatype), "debug")
                            accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyNaiveBayesKFold(X, y, kFold=int(arguments.kfold), reduceDim=reductionMethod, targetDim=dimension)
                            prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                            accuracies.append(averageList(accuracyRates))
                            timings.append(averageList(allTimings))
                            # Log classifications
                            _logFoldPredictions(classificationLog, "Target Dimensionality: %s\n" % dimension, allClasses, groundTruthLabels, predictedLabels)
                if variant is not None:
                    # Plot accuracies graph
                    prettyPrint("Plotting accuracies")
                    data_visualization.plotAccuracyGraph(targetDimensions, accuracies, variant[1], "Classification Accuracy", variant[2] % arguments.datatype, "accuracy_%s_exp1_%s_%s.pdf" % (arguments.datatype, arguments.algorithm, reductionMethod))
                    # Timings are only printed, not plotted
                    print(timings)
                else:
                    # Any other answer: classify without dimensionality reduction.
                    # NOTE(review): this call uses a different signature of classifyNaiveBayes
                    # than the one used in "classify-exp2" -- confirm against the classification module.
                    accuracyRates, allProbabilities, allTimings, predictedLabels = classification.classifyNaiveBayes(X, y, kFold=int(arguments.kfold))
                    prettyPrint("Average classification accuracy: %s%%, achieved in an average of %s seconds" % (averageList(accuracyRates)*100.0, averageList(allTimings)), "output")

            ####################
            # Using CART trees #
            ####################
            elif arguments.algorithm == "tree":
                # Classify using CART trees
                if arguments.datatype != "triton":
                    prettyPrint("It is recommended to use \".triton\" features", "warning")
                # Load data from source directory
                X, y, allClasses = loadFeaturesFromDir(arguments.sourcedir, arguments.datatype, arguments.datalabel)
                splittingCriterion = raw_input("Please choose a splitting criterion (gini/entropy): ")
                accuracies, timings, allDepths = [], [], [2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16]
                # A file to log all classification labels
                with open("classificationlog_%s_exp1_%s_%s.txt" % (arguments.datatype, splittingCriterion, arguments.algorithm), "a") as classificationLog:
                    classificationLog.write("Experiment 1 - Algorithm: %s, Datatype: %s\n" % (arguments.algorithm, arguments.datatype))
                    for maxDepth in allDepths:
                        if verbose:
                            prettyPrint("Training a \"CART\" with \"%s\" criterion and maximum depth of %s" % (splittingCriterion, maxDepth), "debug")
                        accuracyRates, allProbabilities, allTimings, groundTruthLabels, predictedLabels = classification.classifyTreeKFold(X, y, int(arguments.kfold), splittingCriterion, int(maxDepth), visualizeTree=False)
                        prettyPrint("Average classification accuracy: %s%%" % (averageList(accuracyRates)*100.0), "output")
                        accuracies.append(averageList(accuracyRates))
                        timings.append(averageList(allTimings))
                        # Log classifications
                        _logFoldPredictions(classificationLog, "Tree Depth: %s\n" % maxDepth, allClasses, groundTruthLabels, predictedLabels)
                # Plot accuracies graph
                prettyPrint("Plotting accuracies for \"%s\" criterion" % splittingCriterion)
                data_visualization.plotAccuracyGraph(allDepths, accuracies, "Maximum Tree Depth", "Classification Accuracy", "Classification Accuracy: %s (%s)" % (splittingCriterion, arguments.datatype), "accuracy_%s_exp1_%s_%s.pdf" % (arguments.datatype, splittingCriterion, arguments.algorithm))
                # Timings are only printed, not plotted
                print(timings)
            return

        ##################################################################
        # MODE 6: Classify obfuscated programs using the 36-4 experiment #
        ##################################################################
        elif arguments.mode == "classify-exp2":
            # Retrieve the list of all programs
            allPrograms = glob.glob("%s/*.c" % arguments.originalprograms)
            allPrograms.sort() # Makes it easier to keep track of current programs in batch
            totalPrograms = len(allPrograms)
            prettyPrint("Successfully retrieved %s original programs" % totalPrograms)
            kFold = int(arguments.kfold)
            chunkSize = totalPrograms // kFold # 4 = 40 / 10 (default)
            if chunkSize < 1:
                # Fewer programs than folds (or none at all): every held-out chunk would be empty
                prettyPrint("At least %s original programs are needed under \"%s\" for %s cycles. Exiting" % (kFold, arguments.originalprograms, kFold), "error")
                return
            if arguments.algorithm == "tree":
                criterion = raw_input("Please choose a splitting criterion (gini/entropy): ")
                allValues = [2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16] # The allowed depths of the tree
            elif arguments.algorithm == "bayes":
                criterion = raw_input("Please choose a dimensionality reduction method (SelectKBest/PCA): ").lower()
                allValues = [8, 16, 32, 64, 128] # The target dimensionalities
            # One row per cycle, one column per depth/dimensionality
            allAccuracyRates, allTimings = numpy.zeros((kFold, len(allValues))), numpy.zeros((kFold, len(allValues)))
            # A file to log all classification labels
            with open("classificationlog_%s_exp2_%s_%s.txt" % (arguments.datatype, criterion, arguments.algorithm), "a") as classificationLog:
                classificationLog.write("Experiment 2 - Algorithm: %s, Datatype: %s\n" % (arguments.algorithm, arguments.datatype))
                # Run exactly as many cycles as there are rows in the result matrices
                for currentCycle in range(kFold):
                    prettyPrint("Cycle #%s out of %s cycles" % (currentCycle+1, kFold))
                    # Specify the indices of the held-out (test) chunk
                    testStartIndex = (totalPrograms + (chunkSize * currentCycle)) % totalPrograms
                    testStopIndex = testStartIndex + chunkSize
                    if verbose:
                        prettyPrint("Retrieving training and test programs for the current cycle", "debug")
                    # Populate the test dataset and train on everything else
                    testPrograms = allPrograms[testStartIndex:testStopIndex]
                    heldOut = set(testPrograms)
                    trainingPrograms = [program for program in allPrograms if program not in heldOut]
                    if verbose:
                        prettyPrint("Original training programs: %s, original test programs: %s" % (len(trainingPrograms), len(testPrograms)), "debug")
                    # Replace the original programs with their obfuscated versions from the source directory
                    trainingPrograms = _collectObfuscatedVersions(trainingPrograms, arguments.sourcedir, arguments.datatype)
                    testPrograms = _collectObfuscatedVersions(testPrograms, arguments.sourcedir, arguments.datatype)
                    if verbose:
                        prettyPrint("Successfully retrieved %s training and %s test programs" % (len(trainingPrograms), len(testPrograms)), "debug")
                    # Generate the TF-IDF features on the fly
                    if verbose:
                        prettyPrint("Generating TF-IDF features for the current training and test traces", "debug")
                    if feature_extraction.extractTFIDFMemoryFriendly(trainingPrograms, arguments.datatype, 128, "%s_tr" % arguments.datatype):
                        prettyPrint("Successfully generated TF-IDF features for the current training batch")
                    else:
                        prettyPrint("Unable to generate TF-IDF features for the current training batch", "warning")
                        continue
                    # Now for the test batch
                    if feature_extraction.extractTFIDFMemoryFriendly(testPrograms, arguments.datatype, 128, "%s_te" % arguments.datatype):
                        prettyPrint("Successfully generated TF-IDF features for the current test batch")
                    else:
                        prettyPrint("Unable to generate TF-IDF features for the current test batch", "warning")
                        continue
                    # Now load the programs of the given datatype
                    prettyPrint("Loading training and test instances")
                    Xtr, ytr, allClassestr = loadFeaturesFromList(trainingPrograms, "%s_tr" % arguments.datatype, arguments.datalabel)
                    Xte, yte, allClasseste = loadFeaturesFromList(testPrograms, "%s_te" % arguments.datatype, arguments.datalabel, allClassestr)
                    # Now apply the classification algorithm
                    for valueIndex, value in enumerate(allValues):
                        if arguments.algorithm == "tree":
                            # CART Trees
                            prettyPrint("Training a \"CART\" with \"%s\" criterion and maximum depth of %s" % (criterion, value), "debug")
                            currentAccuracyRate, currentTiming, currentProbabilities, predictedLabels = classification.classifyTree(Xtr, ytr, Xte, yte, criterion, int(value), visualizeTree=False)
                            logHeader = "Depth: %s\n" % value
                        else:
                            # Multinomial Naive Bayes (the only other algorithm for which "criterion" is defined above)
                            prettyPrint("Training a \"Multinomial Naive Bayes\" with \"%s\" criterion and dimensionality of %s" % (criterion, value), "debug")
                            currentAccuracyRate, currentTiming, currentProbabilities, predictedLabels = classification.classifyNaiveBayes(Xtr, ytr, Xte, yte, criterion, int(value))
                            logHeader = "Dimensionality: %s\n" % value
                        prettyPrint("Classification accuracy with \"%s\" and \"%s\" is: %s%%" % (criterion, value, (currentAccuracyRate*100.0)), "output")
                        allAccuracyRates[currentCycle][valueIndex] = currentAccuracyRate
                        allTimings[currentCycle][valueIndex] = currentTiming
                        # Log the results
                        classificationLog.write(logHeader)
                        for index in range(len(testPrograms)):
                            classificationLog.write("%s: Class: %s, Predicted: %s\n" % (testPrograms[index], allClasseste[yte[index]], allClasseste[predictedLabels[index]]))
                    # Remove all TF-IDF files of the current batch
                    if verbose:
                        prettyPrint("Removing all TF-IDF files of the current batch", "debug")
                    rmCounter = 0
                    # NOTE(review): the pattern matches every "<datatype>_t*" extension, not only the "_tr"/"_te"
                    # files generated above; unrelated feature files with such an extension are deleted as well.
                    for featureFile in glob.glob("%s/*.%s_t*" % (arguments.sourcedir, arguments.datatype)):
                        os.unlink(featureFile)
                        rmCounter += 1
                    prettyPrint("Successfully removed %s files" % rmCounter)
            # Now average the scored results stored in the matrices (one point per column)
            pointsX, pointsYacc, pointsYtime = [], [], []
            for valueIndex, value in enumerate(allValues):
                pointsX.append(value)
                pointsYacc.append(averageList(allAccuracyRates[:, valueIndex]))
                pointsYtime.append(averageList(allTimings[:, valueIndex])) # Collected, but not plotted at the moment
            # Plot accuracies graph
            if arguments.algorithm == "tree":
                xAxisLabel = "Maximum Tree Depth"
            elif arguments.algorithm == "bayes":
                # The prompt answer is lower-cased, so feature selection is spelled "selectkbest"
                xAxisLabel = "Selected Features" if criterion == "selectkbest" else "Extracted Features"
            prettyPrint("Plotting accuracies for \"%s\" criterion" % criterion)
            data_visualization.plotAccuracyGraph(pointsX, pointsYacc, xAxisLabel, "Classification Accuracy", "Classification Accuracy: %s (%s)" % (criterion, arguments.datatype), "accuracy_%s_exp2_%s_%s.pdf" % (arguments.datatype, criterion, arguments.algorithm))

        ####################################
        # MODE X : Filter generated traces #
        ####################################
        elif arguments.mode == "filter-traces":
            # Retrieve the necessary parameters; an empty answer selects the advertised default
            inExtension = raw_input("Input extension (Default: dyndis): ") or "dyndis"
            outExtension = raw_input("Output extension (Default: dyndis_raw): ") or "dyndis_raw"
            filterMode = raw_input("Filteration mode {raw (Default), mem, both}: ") or "raw"
            if filterTraces(arguments.sourcedir, inExtension, filterMode, outExtension, arguments.filterfunction):
                prettyPrint("Successfully filtered \"%s\" traces to \"%s\" traces using the \"%s\" filter" % (inExtension, outExtension, filterMode))
            else:
                prettyPrint("Some error occurred during filteration", "warning")

        ########################################################
        # MODE XI: Generate TF-IDF feature vectors from traces #
        ########################################################
        elif arguments.mode == "extract-from-traces":
            # Retrieve the necessary parameters; an empty answer selects the advertised default
            inExtension = raw_input("Input extension (Default: dyndis): ") or "dyndis"
            outExtension = raw_input("Output extension (Default: tfidf_raw): ") or "tfidf_raw"
            maxFeatures = int(raw_input("Maximum features: "))
            # NOTE(review): a directory is passed here, whereas "classify-exp2" passes a list of files
            # to the same function -- confirm that the extractor accepts both.
            if feature_extraction.extractTFIDFMemoryFriendly(arguments.sourcedir, inExtension, maxFeatures, outExtension):
                prettyPrint("Successfully extracted %s TF-IDF features from traces with \"%s\" extension" % (maxFeatures, inExtension))
            else:
                prettyPrint("Some error occurred during TF-IDF feature extraction", "warning")

    except Exception as e:
        # Top-level boundary of the tool: report the failure (with the line of main() being executed) and clean up
        prettyPrint("Error encountered in \"main\": %s at line %s" % (e, sys.exc_info()[2].tb_lineno), "error")
        cleanUp()
        return
# Run the tool only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
You can’t perform that action at this time.