In [82]:
# アセンブリの記されたテキストファイルからニーモニックのみを抽出しjsonファイルを作成する
import argparse
import json
import os
import subprocess
import sys
import pickle
import csv
import re

In [83]:
# 全体的な処理の流れ
# マルウェア全ファイルの逆アセンブル結果を「assemblyTxt」に保存
# 「assemblyTxt」内のファイルをすべてパースしワードリストを作成する
# ワードリストをもとに各マルウェアの特徴量の抽出を行う

In [84]:
# Return the given mnemonic list cut into windows of n consecutive words.
def getNgram(mnemonicList,n):
    """Split a mnemonic sequence into its consecutive n-grams.

    Args:
        mnemonicList: Sliceable sequence of mnemonics.
        n: Window size; values below 1 produce no n-grams.

    Returns:
        A list holding, in order of start index, every slice of length
        ``n`` of ``mnemonicList``.  The list is empty when the sequence is
        shorter than ``n`` or when ``n`` is less than 1.  Duplicate
        n-grams are kept; getOnlyWords can be applied to drop them.
    """
    if n < 1:
        # A non-positive window only yields empty or misaligned slices.
        return []
    return [mnemonicList[start:start + n]
            for start in range(len(mnemonicList) - n + 1)]

In [85]:
# Remove duplicated elements from the given two-dimensional list.
def getOnlyWords(targetList):
    """Return the elements of ``targetList`` without repetitions.

    The first occurrence of every element is kept and the original order
    is preserved.  Elements are compared with ``==`` (via ``in``), so
    unhashable items such as lists are supported.
    """
    unique = []
    for candidate in targetList:
        if candidate in unique:
            # Already collected earlier; skip the repeat.
            continue
        unique.append(candidate)
    return unique

In [86]:
# Disassemble the file with objdump, parse the result and return it as a dict.
def reverseAssembly(filePath):
    """Disassemble ``filePath`` with objdump and parse the listing.

    Args:
        filePath: Path of the binary to disassemble.

    Returns:
        The value returned by ``getMalJson`` for the objdump listing.

    Raises:
        OSError: If objdump cannot be started (for example when it is not
            installed).  The failure is reported and re-raised so the
            caller can record the file as failed.
    """
    cmd = ['objdump','--disassemble','--no-show-raw-insn',filePath]
    try:
        assembly = subprocess.run(cmd,stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    except OSError:
        print('can\'t reverse assembly ')
        # Without a completed process there is nothing to parse.
        raise
    # The exit status is not inspected here: a failed disassembly simply
    # hands empty or partial output to the parser.
    print('complete  reversing')

    # Bytes that are not valid UTF-8 are replaced instead of aborting the
    # whole file with a UnicodeDecodeError.
    listing = assembly.stdout.decode('utf8', errors='replace')
    return getMalJson(filePath, listing)

In [87]:
def checkFileType(filePath):
    """Return the type description reported by the ``file`` command.

    Args:
        filePath: Path of the file to inspect.

    Returns:
        The description extracted by ``parseFileType`` from the output of
        ``file``, or an empty string when the command cannot be run or its
        output cannot be parsed.
    """
    cmd = ['file',filePath]
    try:
        fileTypeStr = subprocess.run(cmd,stdout = subprocess.PIPE, stderr = subprocess.PIPE)
        # Replace undecodable bytes rather than failing on them.
        fileType = parseFileType(fileTypeStr.stdout.decode('utf8', errors='replace'))
    except (OSError, ValueError, IndexError):
        # OSError: the command could not be launched; ValueError and
        # IndexError: unusable path or unparsable output.  This lookup is
        # best-effort, so fall back to an empty description.
        print('can not check fileType')
        fileType = ''
    return fileType

In [88]:
def parseFileType(parseStr):
    """Extract the description part of a ``<path>: <description>`` line.

    Everything after the first colon is returned with surrounding
    whitespace removed, so descriptions that themselves contain colons are
    kept intact.  An empty string is returned when there is no colon.

    NOTE(review): a path that itself contains a colon cannot be told apart
    from the separator here; the text after the path's first colon would
    be returned in that case.
    """
    _, _, description = parseStr.partition(':')
    return description.strip()

In [89]:
# Create a directory.
def makeDir(dirName):
    """Create ``dirName`` unless something already exists at that path.

    Missing parent directories are created as well.  When the path already
    exists a notice is printed and nothing else happens.

    Args:
        dirName: Path of the directory to create.

    Raises:
        SystemExit: If the directory cannot be created.
    """
    if os.path.exists(dirName):
        print(' {} already exists'.format(dirName))
        return
    try:
        # exist_ok tolerates the directory appearing between the check
        # above and this call.
        os.makedirs(dirName, exist_ok=True)
    except OSError:
        sys.exit('can\'t make directory')

In [90]:
# Save the JSON data to the specified file.
def writeJson(assemblyJson,filePath):
    """Serialise ``assemblyJson`` as indented JSON into ``filePath``.

    The data is serialised before the file is opened, so an object that
    cannot be encoded no longer leaves an empty or truncated file behind.

    Args:
        assemblyJson: JSON-serialisable object to store.
        filePath: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the destination file cannot be opened.
    """
    fileName = os.path.basename(filePath)
    try:
        payload = json.dumps(assemblyJson,indent = 4)
    except (TypeError, ValueError, RecursionError):
        # Unsupported types, circular references or excessive nesting.
        print('can\'t output {}'.format(fileName))
        return

    with open(filePath,'w') as f:
        try:
            f.write(payload)
        except OSError:
            print('can\'t output {}'.format(fileName))
            return
    print('complete  writing ')

In [91]:
# Parse the disassembly listing and return its mnemonics as a JSON-ready dict.
def getMalJson(filePath,assembly):
    """Collect the mnemonics of an objdump listing, grouped by header.

    Args:
        filePath: Path of the disassembled file; used for the file name
            and for the file-type lookup.
        assembly: Text of the disassembly listing.

    Returns:
        A dict with the keys ``'FileName'``, ``'Filetype '`` (the trailing
        space is kept so existing consumers of the output still find it)
        and ``'mnemonics'``.  ``'mnemonics'`` maps ``'<label>_<index>'``
        to the list of mnemonics found under that header line.

    Raises:
        ValueError: If the listing contains no header line at all, which
            is what an empty or failed disassembly looks like.
    """
    # Header line such as "0000000000401000 <.text>:".  Anchoring keeps
    # instructions that merely reference a symbol ("call 401234 <f>")
    # from being mistaken for headers and dropped.
    headerPattern = re.compile(r'^[0-9a-fA-F]+\s+(<.*>):\s*$')
    # Instruction line such as "  401000:\tpush   %ebp"; the token after
    # the address is the mnemonic.
    instructionPattern = re.compile(r'^\s*[0-9a-fA-F]+:\s+(\S+)')

    results = {}
    fileName = os.path.basename(filePath)
    results['FileName'] = fileName
    results['Filetype '] = checkFileType(filePath)

    print('parsing  ')

    section = {}
    mnemonics = []
    currentLabel = None
    sectionNumber = 0

    for rawLine in assembly.split('\n'):
        if not rawLine:
            continue
        # Drop trailing comments.
        line = rawLine.split('#')[0]

        # Skip the banner lines of the listing.
        if re.search('.*:.*file format',line):
            continue
        if re.search('Disassembly.*:', line):
            continue

        header = headerPattern.match(rawLine.rstrip())
        if header:
            newLabel = header.group(1)
            if mnemonics:
                # Store what was collected under the header it followed;
                # mnemonics seen before any header go to the first one.
                owner = newLabel if currentLabel is None else currentLabel
                section['{}_{}'.format(owner, sectionNumber)] = mnemonics
                sectionNumber += 1
                mnemonics = []
            currentLabel = newLabel
            continue

        instruction = instructionPattern.match(line)
        if instruction:
            mnemonics.append(instruction.group(1))

    if currentLabel is None:
        raise ValueError('no disassembly found for {}'.format(fileName))

    # The last header's mnemonics are still pending after the loop.
    section['{}_{}'.format(currentLabel, sectionNumber)] = mnemonics

    results['mnemonics']=section

    return results


#     print(json.dumps(results,indent = 4))

In [92]:
def writePickle(obj,filePath):
    """Pickle ``obj`` into ``filePath`` and report the outcome on stdout.

    Failures to open the file or to pickle the object are reported with a
    message instead of being raised.  KeyboardInterrupt and SystemExit are
    no longer swallowed.

    Args:
        obj: Object to serialise.
        filePath: Destination path; an existing file is overwritten.
    """
    fileName = os.path.basename(filePath)
    try:
        with open(filePath,'wb') as f :
            pickle.dump(obj,f)
    except Exception:
        # Best-effort write: report and carry on.
        print('failed writing {}'.format(fileName))
    else:
        print('writing {} success'.format(fileName))

In [93]:
def main(malDir=None):
    """Disassemble every file under a directory and store the results.

    For each file found by walking ``malDir`` the disassembly is parsed
    with ``reverseAssembly`` and written to
    ``results/assemblyTxt/<name>.json``.  Names of files that could not
    be processed are printed and saved to ``results/errorList.csv``.

    Args:
        malDir: Directory holding the samples.  When omitted, the first
            command-line argument is used if it names an existing
            directory; otherwise the built-in default path is used.
    """
    # Fallback location of the samples.
    defaultMalDir = '/media/sf_VirtualBox_share/malware_samples/samples/samples/ransomware/'
    outPutDir = 'results/'

    makeDir(outPutDir)
    errorHashList = []

    # Command-line usage: '~$assemblyToJson malwareDir'
    if malDir is None:
        # The argument is only trusted when it is a real directory, so
        # unrelated launcher arguments do not end up being walked.
        if len(sys.argv) > 1 and os.path.isdir(sys.argv[1]):
            malDir = sys.argv[1]
        else:
            malDir = defaultMalDir

    jsonDir = os.path.join(outPutDir,'assemblyTxt')
    makeDir(jsonDir)

    for dirpath,dirnames,filenames in os.walk(malDir):
        fileNamesLen = len(filenames)
        for idx,filename in enumerate(filenames, 1):
            print('{0}/{1}'.format(idx,fileNamesLen))
            try:
                # Join with dirpath so files in sub-directories resolve.
                assembly = reverseAssembly(os.path.join(dirpath , filename))
                writeJson(assembly , os.path.join(jsonDir,os.path.splitext(filename)[0])+ '.json')
            except Exception:
                # One bad sample must not stop the batch; remember it.
                print('can not complete process')
                errorHashList.append(filename)

    print(errorHashList)

    # newline='' lets the csv module control line endings itself.
    with open(os.path.join(outPutDir,'errorList.csv') , 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(errorHashList)


if __name__ == '__main__':
    main()

 results/ already exists
 results/assemblyTxt already exists
1/2000
complete  reversing
parsing  
can not complete process
2/2000
complete  reversing
parsing  
complete  writing 
3/2000
complete  reversing
parsing  
complete  writing 
4/2000
complete  reversing
parsing  
complete  writing 
5/2000
complete  reversing
parsing  
complete  writing 
6/2000
complete  reversing
parsing  
complete  writing 
7/2000
complete  reversing
parsing  
complete  writing 
8/2000
complete  reversing
parsing  
complete  writing 
9/2000
complete  reversing
parsing  
complete  writing 
10/2000
complete  reversing
parsing  
complete  writing 
11/2000
complete  reversing
parsing  
complete  writing 
12/2000
complete  reversing
parsing  
complete  writing 
13/2000
complete  reversing
parsing  
complete  writing 
14/2000
complete  reversing
parsing  
complete  writing 
15/2000
complete  reversing
parsing  
complete  writing 
16/2000
complete  reversing
parsing  
complete  writing 
17/2000
complete  reversing
pa

complete  reversing
parsing  
complete  writing 
144/2000
complete  reversing
parsing  
complete  writing 
145/2000
complete  reversing
parsing  
complete  writing 
146/2000
complete  reversing
parsing  
complete  writing 
147/2000
complete  reversing
parsing  
complete  writing 
148/2000
complete  reversing
parsing  
complete  writing 
149/2000
complete  reversing
parsing  
complete  writing 
150/2000
complete  reversing
parsing  
complete  writing 
151/2000
complete  reversing
parsing  
complete  writing 
152/2000
complete  reversing
parsing  
complete  writing 
153/2000
complete  reversing
parsing  
complete  writing 
154/2000
complete  reversing
parsing  
can not complete process
155/2000
complete  reversing
parsing  
complete  writing 
156/2000
complete  reversing
parsing  
complete  writing 
157/2000
complete  reversing
parsing  
complete  writing 
158/2000
complete  reversing
parsing  
complete  writing 
159/2000
complete  reversing
parsing  
complete  writing 
160/2000
complete

complete  writing 
284/2000
complete  reversing
parsing  
complete  writing 
285/2000
complete  reversing
parsing  
complete  writing 
286/2000
complete  reversing
parsing  
complete  writing 
287/2000
complete  reversing
parsing  
can not complete process
288/2000
complete  reversing
parsing  
complete  writing 
289/2000
complete  reversing
parsing  
complete  writing 
290/2000
complete  reversing
parsing  
complete  writing 
291/2000
complete  reversing
parsing  
complete  writing 
292/2000
complete  reversing
parsing  
complete  writing 
293/2000
complete  reversing
parsing  
complete  writing 
294/2000
complete  reversing
parsing  
complete  writing 
295/2000
complete  reversing
parsing  
complete  writing 
296/2000
complete  reversing
parsing  
complete  writing 
297/2000
complete  reversing
parsing  
complete  writing 
298/2000
complete  reversing
parsing  
complete  writing 
299/2000
complete  reversing
parsing  
complete  writing 
300/2000
complete  reversing
parsing  
complete

complete  writing 
424/2000
complete  reversing
parsing  
complete  writing 
425/2000
complete  reversing
parsing  
complete  writing 
426/2000
complete  reversing
parsing  
complete  writing 
427/2000
complete  reversing
parsing  
complete  writing 
428/2000
complete  reversing
parsing  
complete  writing 
429/2000
complete  reversing
parsing  
complete  writing 
430/2000
complete  reversing
parsing  
complete  writing 
431/2000
complete  reversing
parsing  
complete  writing 
432/2000
complete  reversing
parsing  
complete  writing 
433/2000
complete  reversing
parsing  
complete  writing 
434/2000
complete  reversing
parsing  
complete  writing 
435/2000
complete  reversing
parsing  
complete  writing 
436/2000
complete  reversing
parsing  
complete  writing 
437/2000
complete  reversing
parsing  
complete  writing 
438/2000
complete  reversing
parsing  
complete  writing 
439/2000
complete  reversing
parsing  
complete  writing 
440/2000
complete  reversing
parsing  
can not comple

complete  writing 
565/2000
complete  reversing
parsing  
complete  writing 
566/2000
complete  reversing
parsing  
complete  writing 
567/2000
complete  reversing
parsing  
complete  writing 
568/2000
complete  reversing
parsing  
complete  writing 
569/2000
complete  reversing
parsing  
complete  writing 
570/2000
complete  reversing
parsing  
complete  writing 
571/2000
complete  reversing
parsing  
complete  writing 
572/2000
complete  reversing
parsing  
complete  writing 
573/2000
complete  reversing
parsing  
complete  writing 
574/2000
complete  reversing
parsing  
complete  writing 
575/2000
complete  reversing
parsing  
complete  writing 
576/2000
complete  reversing
parsing  
complete  writing 
577/2000
complete  reversing
parsing  
complete  writing 
578/2000
complete  reversing
parsing  
complete  writing 
579/2000
complete  reversing
parsing  
complete  writing 
580/2000
complete  reversing
parsing  
complete  writing 
581/2000
complete  reversing
parsing  
complete  writ

complete  writing 
706/2000
complete  reversing
parsing  
can not complete process
707/2000
complete  reversing
parsing  
complete  writing 
708/2000
complete  reversing
parsing  
complete  writing 
709/2000
complete  reversing
parsing  
can not complete process
710/2000
complete  reversing
parsing  
complete  writing 
711/2000
complete  reversing
parsing  
complete  writing 
712/2000
complete  reversing
parsing  
complete  writing 
713/2000
complete  reversing
parsing  
complete  writing 
714/2000
complete  reversing
parsing  
complete  writing 
715/2000
complete  reversing
parsing  
complete  writing 
716/2000
complete  reversing
parsing  
complete  writing 
717/2000
complete  reversing
parsing  
complete  writing 
718/2000
complete  reversing
parsing  
complete  writing 
719/2000
complete  reversing
parsing  
can not complete process
720/2000
complete  reversing
parsing  
complete  writing 
721/2000
complete  reversing
parsing  
complete  writing 
722/2000
complete  reversing
parsin

complete  writing 
848/2000
complete  reversing
parsing  
complete  writing 
849/2000
complete  reversing
parsing  
complete  writing 
850/2000
complete  reversing
parsing  
complete  writing 
851/2000
complete  reversing
parsing  
complete  writing 
852/2000
complete  reversing
parsing  
complete  writing 
853/2000
complete  reversing
parsing  
complete  writing 
854/2000
complete  reversing
parsing  
can not complete process
855/2000
complete  reversing
parsing  
complete  writing 
856/2000
complete  reversing
parsing  
complete  writing 
857/2000
complete  reversing
parsing  
complete  writing 
858/2000
complete  reversing
parsing  
complete  writing 
859/2000
complete  reversing
parsing  
complete  writing 
860/2000
complete  reversing
parsing  
complete  writing 
861/2000
complete  reversing
parsing  
complete  writing 
862/2000
complete  reversing
parsing  
complete  writing 
863/2000
complete  reversing
parsing  
complete  writing 
864/2000
complete  reversing
parsing  
complete

complete  writing 
990/2000
complete  reversing
parsing  
complete  writing 
991/2000
complete  reversing
parsing  
complete  writing 
992/2000
complete  reversing
parsing  
complete  writing 
993/2000
complete  reversing
parsing  
complete  writing 
994/2000
complete  reversing
parsing  
complete  writing 
995/2000
complete  reversing
parsing  
complete  writing 
996/2000
complete  reversing
parsing  
complete  writing 
997/2000
complete  reversing
parsing  
complete  writing 
998/2000
complete  reversing
parsing  
can not complete process
999/2000
complete  reversing
parsing  
complete  writing 
1000/2000
complete  reversing
parsing  
complete  writing 
1001/2000
complete  reversing
parsing  
complete  writing 
1002/2000
complete  reversing
parsing  
complete  writing 
1003/2000
complete  reversing
parsing  
complete  writing 
1004/2000
complete  reversing
parsing  
complete  writing 
1005/2000
complete  reversing
parsing  
complete  writing 
1006/2000
complete  reversing
parsing  
c

complete  writing 
1128/2000
complete  reversing
parsing  
complete  writing 
1129/2000
complete  reversing
parsing  
complete  writing 
1130/2000
complete  reversing
parsing  
complete  writing 
1131/2000
complete  reversing
parsing  
complete  writing 
1132/2000
complete  reversing
parsing  
complete  writing 
1133/2000
complete  reversing
parsing  
complete  writing 
1134/2000
complete  reversing
parsing  
complete  writing 
1135/2000
complete  reversing
parsing  
complete  writing 
1136/2000
complete  reversing
parsing  
complete  writing 
1137/2000
complete  reversing
parsing  
complete  writing 
1138/2000
complete  reversing
parsing  
complete  writing 
1139/2000
complete  reversing
parsing  
complete  writing 
1140/2000
complete  reversing
parsing  
can not complete process
1141/2000
complete  reversing
parsing  
complete  writing 
1142/2000
complete  reversing
parsing  
complete  writing 
1143/2000
complete  reversing
parsing  
complete  writing 
1144/2000
complete  reversing
p

complete  writing 
1266/2000
complete  reversing
parsing  
complete  writing 
1267/2000
complete  reversing
parsing  
complete  writing 
1268/2000
complete  reversing
parsing  
complete  writing 
1269/2000
complete  reversing
parsing  
complete  writing 
1270/2000
complete  reversing
parsing  
complete  writing 
1271/2000
complete  reversing
parsing  
complete  writing 
1272/2000
complete  reversing
parsing  
complete  writing 
1273/2000
complete  reversing
parsing  
complete  writing 
1274/2000
complete  reversing
parsing  
can not complete process
1275/2000
complete  reversing
parsing  
complete  writing 
1276/2000
complete  reversing
parsing  
can not complete process
1277/2000
complete  reversing
parsing  
complete  writing 
1278/2000
complete  reversing
parsing  
complete  writing 
1279/2000
complete  reversing
parsing  
can not complete process
1280/2000
complete  reversing
parsing  
complete  writing 
1281/2000
complete  reversing
parsing  
complete  writing 
1282/2000
complete 

complete  writing 
1404/2000
complete  reversing
parsing  
complete  writing 
1405/2000
complete  reversing
parsing  
complete  writing 
1406/2000
complete  reversing
parsing  
complete  writing 
1407/2000
complete  reversing
parsing  
complete  writing 
1408/2000
complete  reversing
parsing  
complete  writing 
1409/2000
complete  reversing
parsing  
complete  writing 
1410/2000
complete  reversing
parsing  
complete  writing 
1411/2000
complete  reversing
parsing  
complete  writing 
1412/2000
complete  reversing
parsing  
complete  writing 
1413/2000
complete  reversing
parsing  
complete  writing 
1414/2000
complete  reversing
parsing  
can not complete process
1415/2000
complete  reversing
parsing  
complete  writing 
1416/2000
complete  reversing
parsing  
complete  writing 
1417/2000
complete  reversing
parsing  
complete  writing 
1418/2000
complete  reversing
parsing  
complete  writing 
1419/2000
complete  reversing
parsing  
complete  writing 
1420/2000
complete  reversing
p

complete  writing 
1543/2000
complete  reversing
parsing  
can not complete process
1544/2000
complete  reversing
parsing  
complete  writing 
1545/2000
complete  reversing
parsing  
complete  writing 
1546/2000
complete  reversing
parsing  
can not complete process
1547/2000
complete  reversing
parsing  
complete  writing 
1548/2000
complete  reversing
parsing  
complete  writing 
1549/2000
complete  reversing
parsing  
complete  writing 
1550/2000
complete  reversing
parsing  
complete  writing 
1551/2000
complete  reversing
parsing  
can not complete process
1552/2000
complete  reversing
parsing  
complete  writing 
1553/2000
complete  reversing
parsing  
complete  writing 
1554/2000
complete  reversing
parsing  
complete  writing 
1555/2000
complete  reversing
parsing  
complete  writing 
1556/2000
complete  reversing
parsing  
complete  writing 
1557/2000
complete  reversing
parsing  
complete  writing 
1558/2000
complete  reversing
parsing  
complete  writing 
1559/2000
complete 

complete  writing 
1680/2000
complete  reversing
parsing  
complete  writing 
1681/2000
complete  reversing
parsing  
complete  writing 
1682/2000
complete  reversing
parsing  
complete  writing 
1683/2000
complete  reversing
parsing  
complete  writing 
1684/2000
complete  reversing
parsing  
can not complete process
1685/2000
complete  reversing
parsing  
complete  writing 
1686/2000
complete  reversing
parsing  
complete  writing 
1687/2000
complete  reversing
parsing  
complete  writing 
1688/2000
complete  reversing
parsing  
can not complete process
1689/2000
complete  reversing
parsing  
complete  writing 
1690/2000
complete  reversing
parsing  
complete  writing 
1691/2000
complete  reversing
parsing  
complete  writing 
1692/2000
complete  reversing
parsing  
complete  writing 
1693/2000
complete  reversing
parsing  
complete  writing 
1694/2000
complete  reversing
parsing  
complete  writing 
1695/2000
complete  reversing
parsing  
complete  writing 
1696/2000
complete  rever

complete  writing 
1818/2000
complete  reversing
parsing  
can not complete process
1819/2000
complete  reversing
parsing  
complete  writing 
1820/2000
complete  reversing
parsing  
complete  writing 
1821/2000
complete  reversing
parsing  
complete  writing 
1822/2000
complete  reversing
parsing  
complete  writing 
1823/2000
complete  reversing
parsing  
complete  writing 
1824/2000
complete  reversing
parsing  
complete  writing 
1825/2000
complete  reversing
parsing  
complete  writing 
1826/2000
complete  reversing
parsing  
complete  writing 
1827/2000
complete  reversing
parsing  
complete  writing 
1828/2000
complete  reversing
parsing  
complete  writing 
1829/2000
complete  reversing
parsing  
complete  writing 
1830/2000
complete  reversing
parsing  
complete  writing 
1831/2000
complete  reversing
parsing  
complete  writing 
1832/2000
complete  reversing
parsing  
complete  writing 
1833/2000
complete  reversing
parsing  
complete  writing 
1834/2000
complete  reversing
p

parsing  
complete  writing 
1957/2000
complete  reversing
parsing  
complete  writing 
1958/2000
complete  reversing
parsing  
complete  writing 
1959/2000
complete  reversing
parsing  
complete  writing 
1960/2000
complete  reversing
parsing  
complete  writing 
1961/2000
complete  reversing
parsing  
complete  writing 
1962/2000
complete  reversing
parsing  
complete  writing 
1963/2000
complete  reversing
parsing  
complete  writing 
1964/2000
complete  reversing
parsing  
complete  writing 
1965/2000
complete  reversing
parsing  
complete  writing 
1966/2000
complete  reversing
parsing  
complete  writing 
1967/2000
complete  reversing
parsing  
complete  writing 
1968/2000
complete  reversing
parsing  
complete  writing 
1969/2000
complete  reversing
parsing  
complete  writing 
1970/2000
complete  reversing
parsing  
complete  writing 
1971/2000
complete  reversing
parsing  
complete  writing 
1972/2000
complete  reversing
parsing  
complete  writing 
1973/2000
complete  reversi