## Parse w32_mal_report and hooklogs
* w32_mal_report is from Virustotal (total 357 malware/binaries)
* hooklogs are from dynamic analysis (total 420 files/processes)

In [15]:
import os
import json

# Input locations: the VirusTotal reports and the hooklog set selected by `tag`.
w32_mal_report = 'data/w32_mal_report/'
tag = '484'
hooklogs = 'data/' + tag + 'hooklogs/'

# MIKE: if text_mining is True, the result strings will be stripped.
text_mining = True

# The text-mining variant of the output gets an extra 'tm_' in its file name.
if text_mining:
    csv_file = 'output/w32_mal_report_with_hooklog_tm_' + tag + '.csv'
else:
    csv_file = 'output/w32_mal_report_with_hooklog_' + tag + '.csv'

In [16]:
import string
def _isHex(s):
    if len(s) == 0:
        return False
    for c in s:
        if c not in string.hexdigits:
            return False
    return True

### w32_mal_report
First run, get malware list

In [17]:
malware_file_list = next(os.walk(w32_mal_report))[2]
mal_in_rep_list = list()
for r in malware_file_list:
    mal_in_rep_list.append(r[:-len('.txt')])

print 'mal_in_rep_list', len(mal_in_rep_list) # number of malware in the w32_mal_report

#=======

# But not all the malware in the report is Win32, so ...
for f in malware_file_list:
    h = open(w32_mal_report + f, 'r')
    json_report = h.read()
    if h: h.close()
    malware = f[:-len('.txt')]
    
    d = json.loads(json_report)
    if(d['type'] != 'Win32 EXE'):
        print malware, 'is not win32. Remove from mal_in_rep_list.'
        mal_in_rep_list.remove(malware)

print 'mal_in_rep_list', len(mal_in_rep_list) # number of malware in the w32_mal_report

mal_in_rep_list 357
d0660e9ed67a7d1e4002e28de5e487ff is not win32. Remove from mal_in_rep_list.
d7f7aa54c3f2cdd3f53537519eeb48d9 is not win32. Remove from mal_in_rep_list.
d46a7a29ecfaa4f070ca41bda03c68c1 is not win32. Remove from mal_in_rep_list.
32edf2a4f065d16a1a1df79586aad70a is not win32. Remove from mal_in_rep_list.
010ac3a6f8c7dba927a2ca5d91fe224e is not win32. Remove from mal_in_rep_list.
2acff8031efafc4f434dfbd4e818fb92 is not win32. Remove from mal_in_rep_list.
mal_in_rep_list 351


### hooklogs
First pass: get the process list and the malware list. Note that some hooklogs have shortened names.

In [18]:
process_file_list = next(os.walk(hooklogs))[2]
proc_in_log_list = list()
mal_in_log_set = set()
for p in process_file_list:
    m_p = p[:-len('.trace.hooklog')]
    proc_in_log_list.append(m_p)
    mal_in_log_set.add(m_p.split('_')[0])

print 'mal_in_log_set', len(mal_in_log_set) # number of malware in the hooklog
print 'proc_in_log_list', len(proc_in_log_list) # number of processes in the hooklog

mal_in_log_set 306
proc_in_log_list 484


### Intersection
It is not clear which of the two lists is authoritative, so take their intersection to make sure the malware/process list is correct.

In [19]:
#MIKE: if hooklog's name is not shorten, the line below is good enough. However, ...
#Find correct malware, than correct process
#inter_list = set(mal_in_rep_list).intersection(mal_in_log_set)
#print len(inter_list)

# Make a dict, d[sort name] = long_name; Make a true process list
proc_list = proc_in_log_list
m_set = mal_in_log_set
mal_dict = dict()

#os.path.getsize(hooklogs + process + '.trace.hooklog') < 2000:

for m in list(m_set): # could be short name
    _done = False
    for n in mal_in_rep_list: # long name    #==============not in rep 不用for
        if m == n:
            mal_dict[m] = n
            _done = True
            break
        elif m in n:
            mal_dict[m] = n
            _done = True
            break
    if not _done: # cannot find corresponding item
        print 'no such', m, 'in report.'
        m_set.remove(m)

for m_p in proc_in_log_list:
    m = m_p.split('_')[0]
    if m not in mal_dict.keys():
        proc_list.remove(m_p)

print 'mal_dict', len(mal_dict) # MIKE: the malware we want to deal with
print 'proc_list', len(proc_list) # MIKE: the corresponding processes

mal_dict 306
proc_list 484


In [20]:
#=======
# 1) check again, using long_name
# 2) do text_mining ds at the same time

import re
tag_dict = dict()
delimiter = '\,|!|\(|\)|\[|\]|@|:|/|\.| '

av_set = set()
for malware in mal_dict.values():
    h = open(w32_mal_report + malware + '.txt', 'r')
    json_report = h.read()
    if h: h.close()
    
    d = json.loads(json_report)
    for c, s in enumerate(d['scans'].keys()):
        scan_result = d['scans'].get(s)
        if scan_result.get("detected") == True:
            engine = s.encode('ascii', 'ignore')
            result = scan_result.get("result").encode('ascii', 'ignore')
            result = result.replace(',', '') # special replacement for csv
            av_set.add(engine)
            
            if text_mining:
                tokens = re.split(delimiter, result.lower())
                for k in tokens:
                    if len(k) == 0:
                        continue
                    if k in tag_dict:
                        tag_dict[k] += 1
                    else:
                        tag_dict[k] = 1

av_list = list(av_set)
print 'av_list', len(av_list), av_list

av_list 56 ['Bkav', 'TotalDefense', 'MicroWorld-eScan', 'nProtect', 'CMC', 'CAT-QuickHeal', 'McAfee', 'Malwarebytes', 'Zillya', 'AegisLab', 'K7AntiVirus', 'K7GW', 'TheHacker', 'Agnitum', 'F-Prot', 'Symantec', 'Norman', 'ESET-NOD32', 'TrendMicro-HouseCall', 'Avast', 'ClamAV', 'Kaspersky', 'BitDefender', 'NANO-Antivirus', 'ViRobot', 'ByteHero', 'Tencent', 'Ad-Aware', 'Emsisoft', 'Comodo', 'F-Secure', 'DrWeb', 'VIPRE', 'AntiVir', 'TrendMicro', 'McAfee-GW-Edition', 'Sophos', 'Cyren', 'Jiangmin', 'Avira', 'Antiy-AVL', 'Kingsoft', 'Microsoft', 'SUPERAntiSpyware', 'GData', 'Commtouch', 'AhnLab-V3', 'VBA32', 'AVware', 'Panda', 'Rising', 'Ikarus', 'Fortinet', 'AVG', 'Baidu-International', 'Qihoo-360']


In [21]:
# Special handling for tag_dict

# Tokens listed here are dropped from tag_dict by the filtering cell below.
general_string = [
    'win32', 'trojan', 'adware', 'generic', 'application', 'variant',
    'downloader', 'not-a-virus', 'downware', 'unwanted-program', 'heur', 'troj',
    'bundler', 'antifw', 'riskware', 'optional', 'malware', 'behaveslike',
    'kcloud', 'agent', 'trojandownloader', 'appl', 'trojware', 'installer',
    'trojan-downloader', 'virus', 'backdoor', 'injector', 'malware-cryptor', 'dropper',
    'cryptor', 'bundleapp', 'suspicious', 'antifwk', 'adinstaller', 'crypt',
    'bundleinstaller', 'xpack', 'hacktool', 'patcher', 'troj_gen', 'grayware',
    'software', 'install', 'click', 'heuristic', 'packed', 'unknown',
    'applicunwnt', 'dropped', 'trojan-clicker', 'net-worm', 'monitoringtool', 'worm',
    'tool', 'toolbar', 'eldorado', 'autorun', 'hw32', 'trojan-dropper',
]

# Tokens listed here are always kept by the filtering cell below.
special_string = ['kdz', 'ipz', 'lmn']

# Keys of tag_dict at this point (a list under Python 2).
tag_key_list = tag_dict.keys()

In [22]:
# Remove tags that have only 1 count, or ...

for k in tag_key_list:
    if  k in special_string:
        continue
    elif _isHex(k):
        del tag_dict[k]
    elif tag_dict[k] <= 1 or len(k) <= 3:
        del tag_dict[k]
    elif k in general_string:
        del tag_dict[k]
     
print 'tag_dict', len(tag_dict), tag_dict.keys()

tag_dict 426 ['nirsoft', 'pay-per', 'undef', 'patched', 'ibryte', 'r047c0pkb13', 'adownloader', 'wasamalax', 'spambot', 'dcpsef', 'dgtr', 'generic-s', 'spysweep', 'kryptik', 'gmunpackerinstaller', 'trojan2', 'installerex-bi', 'spyware-actualspy', 'bund', 'delf', 'qvm11', 'czjhac', 'qvm18', 'qvm19', 'xpack-hie', 'asxpzpcc', 'bfoddr', 'better', 'adw_dealply', 'adload', 'sisproc', 'dropper-nnb', 'cukhmp', 'aidb', 'firseria-a', 'firseria-c', 'generic7', 'qvm20', 'installc', 'shopper', 'cvtdtw', 'crmvtp', 'a-2991a3af', 'cvyseb', 'suspicious-pkr', 'sdbot', 'hlux', 'sgeneric', 'bscope', 'r047b01ju14', 'yicb-7003', 'autoit', 'xiaoho', 'generic35', 'generic34', 'malcrypt', 'downloadware', 'domalq', 'trojan-psw', 'cwhyud', 'optimum', 'r02sc0cc414', 'chromepass', 'delphi', 'suspicious-bay', 'mloader', 'webpi', 'somoto-o', 'cxkjch', 'infostealer', 'sefnit', 'nrbc', 'cgbf', 'agent-743705', 'kazy', 'bitcoinminer', 'dldr', 'mazel', 'pup-ffe', 'pup-ffd', 'firseria', 'pup-fft', 'core', '08gn14', 'civuz

### Make csv and pickle files

In [23]:
o = open(csv_file, 'w')
# header
o.write("%s,%s,%s,%s,%s" % ('Hash', 'Type', 'First_Seen', 'Scan', 'Tag'))
map(lambda s: o.write(',%s' % s), av_list)
o.write('\n')
import pickle
malwareFami_dict={}    #==========

#each malware
for malware in mal_dict.values():
    h = open(w32_mal_report + malware + '.txt', 'r')
    json_report = h.read()
    if h: h.close()
    
    d = json.loads(json_report)
    # meta for each malware
    o.write("%s,%s,%s,%d/%d" % (malware, d['type'], d['first_seen'], d['positives'], len(d['scans'])))
    
    # meta Tag, no ','
    tags = ''
    this_tag_set = set()
    
    # arrange result
    candidate_dict = dict()
    for c, s in enumerate(d['scans'].keys()):
        scan_result = d['scans'].get(s)
        if scan_result.get("detected") == True:
            engine = s.encode('ascii', 'ignore')
            result = scan_result.get("result").encode('ascii', 'ignore')
            candidate_dict[engine] = result
            
    # output av result for this malware
    out = ''
    for av in av_list:
        if av in candidate_dict:
            result = candidate_dict[av].replace(',', '') # special replacement for csv
            if text_mining:
                result_tag = ''
                tokens = re.split(delimiter, result.lower())
                for k in tokens:
                    if k in tag_dict:
                        if k not in this_tag_set:
                            this_tag_set.add(k)
                            tags += (k+';')
                        result_tag += (k+'.')
                if result_tag:
                    result_tag = result_tag[:-1] # remove last .
                out += (',' + result_tag)
            else:
                out += (',' + result)
        else:
            out += ','
    if tags:
        tags = tags[:-1] # remove last ;
    
    print malware, tags.split(';')[0]  #==========
    malwareFami_dict[malware] = tags.split(';')[0]  #==========
    o.write(',' + tags + out + '\n')

if o: o.close()

with open('pickle/av_list-mal_dict-family_dist_'+tag+'.pickle', 'wb') as o:
    pickle.dump(malwareFami_dict, o)
if o: o.close()

b12e66515990aba2ca11fd4e72959f42 loadmoney
5375fd472c9c4dbb143a89796571cc9e firseria
b67c317f6397fde0790290046ab3ff1b firseria
81f727efa7f0977b86ed3581e159b219 optimuminstaller
26914ea7c65538ef9954b779ab7dc280 artemis
c181c5ad7bee8a1f0e87bf31701c8a43 hype
23ed4c1867cf18a06fed1b8fa7ec523e installcore
5116f8d8c31a4e458c4a633c04082ebd graftor
84dffc6d32371bbbfa701551e363af30 artemis
35a94ca04d07c9427c78d1befc431ea6 firseria
27f348177560314182f161d9ee90b999 appsinstaller
94d0442c0563b5bfddc6d683bd0d9087 firseria
528360b3ff12866c8de7fb90369cf6a4 downloader-frk
d8762c8cea86083ce9a3aec9aedb109f sality
3002038e55383b4e1d14abc7182a1e78 graftor
4037115559746cfd5c865a9f713c2625 famvt
c097e3603b5903d6f7c65038d69b7285 loadmoney
28e17f1e110f3508ceb611839986a4f4 fiseria
811c47bd9f7347318794f59a8230c180 downloadadmin
35a2de57da22cd4d9674b6bfd282a9fc artemis
33a38040289cb673ebcce042471d35ff artemis
b435648fcba6306bccc01c50c0785651 dealply
a41cc0879fc07afd76ada851d99667e4 firseria
109384695f396badeedd8e

In [24]:
import pickle

# Bundle the three results of this notebook into one pickle file.
data = [av_list, mal_dict, proc_list]

pickle_path = 'pickle/av_list-mal_dict-proc_list_' + tag + '.pickle'
with open(pickle_path, 'wb') as o:
    pickle.dump(data, o)