### parseVTRep2CSV Example

Given a directory containing VirusTotal JSON reports,
output a CSV table of detection names (columns: anti-virus engines; rows: malware samples).

In [1]:
import os
import json
import pandas as pd
from numpy import nan as NaN

In [2]:
# ---- inputs ----
# folder holding the VirusTotal reports (read as <hash>.txt)
in_directory = "C:/test/VTReport/"
# folder holding the hooklogs; set to None to list in_directory instead
in_hooklog_directory = "C:/Users/hsiao/Downloads/GitHub/MotifAnalysis/hooklogs/somoto_woj/"
# label used in the output file names and in the per-year folder names
in_tag = "somoto_woj"
# True: add a "first_seen" column to the table
in_first_seen = True
# True: copy the hooklogs into sub-directories chosen by first-seen year
in_save_to_subdir_firstseen = True

# ---- outputs ----
out_tag = in_tag

# full table of detection names
out_csvfile = 'output/VTRepo_%s.csv' % out_tag
# winnowed table of detection tokens
out_wn_csvfile = 'output/VTRepo_wn_%s.csv' % out_tag

In [3]:
# MIKE: 20170822, hack for TXT (VT report) or hooklog
# List the hooklog directory when one is configured, otherwise the VT report directory.
run_directory = in_hooklog_directory if in_hooklog_directory is not None else in_directory

# Fail with a readable message instead of the bare StopIteration that
# next(os.walk(...)) raises for a missing directory.
if not os.path.isdir(run_directory):
    raise FileNotFoundError("input directory does not exist: %s" % run_directory)

# iter the directory (top level only; files inside sub-directories are not listed)
file_list = next(os.walk(run_directory))[2]
if not file_list:
    # file_list[0] below would otherwise raise an unexplained IndexError
    raise ValueError("no files found in %s" % run_directory)

# sample hash = file name up to the first '.', then up to the first '_'
hash_set = set(t.split('.')[0].split("_")[0] for t in file_list)
# the extension of the first listed file is taken as the type of the whole directory
ext = file_list[0].split('.')[-1].lower()

print("%d files" % len(file_list))
print("%d hashes" % len(hash_set))
print(ext)
print("save to", out_csvfile)
print("save to", out_wn_csvfile)

59 files
23 hashes
hooklog
save to output/VTRepo_somoto_woj.csv
save to output/VTRepo_wn_somoto_woj.csv


In [4]:
# Collect, for every sample hash, the detection name given by each anti-virus engine

av_set = set()     # every engine that detected at least one sample
csv_dict = dict()  # {hash: {engine: "detection_name", ...}}

for h in hash_set:
    # each report is a JSON document stored as <hash>.txt
    report_path = os.path.join(in_directory, h + '.txt')
    with open(report_path) as txt_file:
        json_report = json.load(txt_file)

    # keep only the engines whose scan entry is flagged as detected
    detections = dict()
    for engine, scan_result in json_report['scans'].items():
        if scan_result.get("detected") != True:
            continue
        # drop non-ASCII characters first, then commas (special replacement for csv)
        ascii_name = scan_result.get("result").encode('ascii', 'ignore').decode("ascii")
        detections[engine] = ascii_name.replace(',', '')
        av_set.add(engine)

    # if you don't need first_seen, set in_first_seen to False
    if in_first_seen:
        detections["first_seen"] = json_report['first_seen']

    # attach this dictionary to csv_dict = {hash: detections}
    csv_dict[h] = detections

In [5]:
# csv_dict is {hash: {engine: name}}; transposing puts hashes on the rows and engines on the columns
df = pd.DataFrame(csv_dict).transpose()

In [6]:
# Preview the first rows of the table: one row per sample hash, one column per engine
df.head()
# To look at a single engine instead, index its column, e.g. df['AVG']

Unnamed: 0,AVG,AVware,Ad-Aware,AegisLab,Agnitum,AhnLab-V3,AntiVir,Antiy-AVL,Arcabit,Avast,...,Symantec,Tencent,TrendMicro,TrendMicro-HouseCall,VBA32,VIPRE,ViRobot,Zillya,first_seen,nProtect
25fbb984c2ebcd7fb69404a06ac4838a3a0e64293bb3a339f16fe6bfa2347a2e,AdInstaller.Somoto,Trojan.Win32.Generic!BT,Application.Bundler.Somoto.I,,PUA.Somoto!,Win-PUP/Somoto,,,,Win32:PUP-gen [PUP],...,WS.Reputation.1,,,,Signed-AdWare.BetterInternet.SomotoLtd,Trojan.Win32.Generic!BT,,Adware.BetterInternet.Win32.1234,2013-08-16 05:54:57,
31b164b4dc1d2744bf11183604572648,Downloader.ATW,Trojan.Win32.Generic!BT,Application.Bundler.Somoto.I,,,Win-AppCare/Somoto.236936.B,,Trojan/Win32.CF.gen,Application.Bundler.Somoto.I,,...,PUA.BetterInstaller,,,,,Trojan.Win32.Generic!BT,,Downloader.MazelCRTD.Win32.1058,2014-03-23 03:01:57,
38fe6f891182ca08e3f6de18ea2153a6,AdLoad.S,Trojan.Win32.Generic!BT,Application.Bundler.Somoto.AG,Troj.Downloader.W32!c,,PUP/Win32.Somoto,,Trojan/Generic.ASMalwNS.2749,Application.Bundler.Somoto.AG,,...,SAPE.Heur.A6182,Win32.Downloader.Bp-somato.Rkbk,ADW_TOMOS.SMN,ADW_TOMOS.SMN,,Trojan.Win32.Generic!BT,,,2016-01-21 16:52:45,
4f2483f23c8eae37a77fdbcf9133d38ba51f8edac4b8fb7f8a9c4f991134cafe,Downloader.ATW,Trojan.Win32.Generic!BT,Application.Bundler.Somoto.I,Troj.Agent.mt2x,,Unwanted/Win32.Somoto.N1167800769,,Trojan/Win32.CF.gen,Application.Bundler.Somoto.I,Win32:Somoto-J [PUP],...,PUA.BetterInstaller,,ADW_TOMOS.SMN,ADW_TOMOS.SMN,Signed-AdWare.BetterInternet.SomotoLtd,Trojan.Win32.Generic!BT,,Adware.SomotoCRTD.Win32.1117,2014-04-28 20:06:52,
5169113d8207a5605ff57604a00c030a,Downloader.ATW,Trojan.Win32.Generic!BT,Application.Bundler.Somoto.I,,,Win-AppCare/Somoto.236952.E,,Trojan/Win32.CF.gen,Application.Bundler.Somoto.I,Win32:Somoto-J [PUP],...,Trojan.ADH.2,,ADW_TOMOS.SMN,ADW_TOMOS.SMN,Signed-AdWare.BetterInternet.SomotoLtd,Trojan.Win32.Generic!BT,,,2014-03-27 06:10:06,


In [7]:
# output
# to_csv does not create missing folders, so make sure the target one exists first
os.makedirs(os.path.dirname(out_csvfile) or '.', exist_ok=True)
df.to_csv(out_csvfile)

#### Winnowing

In [8]:
# MIKE: 20170731 some hacks for winnowing

In [9]:
# delimiter is the regular expression used for splitting a detection name into tokens.
# It is a raw string so that the backslash escapes reach the regex engine untouched
# (in a plain string literal, sequences such as '\,' or '\(' are invalid escapes).
import re
delimiter = r'\,|!|\(|\)|\[|\]|@|:|/|\.| '

# general_string to remove: generic tokens that are dropped from detection names
general_string = ['win32','trojan','adware','generic','application','variant','downloader','not-a-virus','downware',
                 'unwanted-program','heur','troj','bundler','antifw','riskware','optional','malware','behaveslike',
                 'kcloud','agent','trojandownloader','appl','trojware','installer','trojan-downloader','virus',
                 'backdoor','injector','malware-cryptor','dropper','cryptor','bundleapp','suspicious','antifwk',
                 'adinstaller','crypt','bundleinstaller','xpack', 'hacktool','patcher','troj_gen','grayware',
                 'software','install','click','heuristic','packed','unknown','applicunwnt','dropped','trojan-clicker',
                 'net-worm','monitoringtool','worm','tool','toolbar','eldorado','autorun','hw32', 'trojan-dropper']

# short family strings that should be kept even though they have three characters or fewer
short_family_string = ['kdz', 'ipz', 'lmn']

import string
def is_hex(s):
    """Return True when every character of s is a hexadecimal digit (also True for an empty s)."""
    for ch in s:
        if ch not in string.hexdigits:
            return False
    return True

def tk_winnow(t):
    """Return the token t if it is worth keeping, otherwise None.

    A token is discarded when it has three characters or fewer (unless it is
    listed in short_family_string), when it is listed in general_string, or
    when it consists only of hexadecimal digits.
    """
    too_short = len(t) <= 3 and t not in short_family_string
    if too_short or t in general_string or is_hex(t):
        return None
    return t

def VT_winnow(s):
    """Reduce one detection name to the list of its informative tokens.

    The name is lower-cased, split on `delimiter`, and each token is passed
    through tk_winnow; tokens for which tk_winnow returns None are dropped.

    Returns the list of kept tokens, or NaN when s is not a string (a missing
    cell) or when no token is kept.
    """
    # Missing cells are not strings. Checking the type, rather than identity
    # with one particular NaN object, also covers None and any other float NaN.
    if not isinstance(s, str):
        return NaN

    tokens = re.split(delimiter, s.lower())
    kept = [t for t in (tk_winnow(tok) for tok in tokens) if t]
    return kept if kept else NaN

In [10]:
# Apply VT_winnow to every cell; the result is a new frame and df is left untouched.
# pandas 2.1 introduced DataFrame.map as the element-wise method and deprecated
# DataFrame.applymap, so use whichever the installed version provides.
if hasattr(pd.DataFrame, "map"):
    df_nw = df.map(VT_winnow)
else:
    df_nw = df.applymap(VT_winnow)

In [11]:
# Preview the first rows of the winnowed table (cells hold token lists or NaN)
df_nw.head()

Unnamed: 0,AVG,AVware,Ad-Aware,AegisLab,Agnitum,AhnLab-V3,AntiVir,Antiy-AVL,Arcabit,Avast,...,Symantec,Tencent,TrendMicro,TrendMicro-HouseCall,VBA32,VIPRE,ViRobot,Zillya,first_seen,nProtect
25fbb984c2ebcd7fb69404a06ac4838a3a0e64293bb3a339f16fe6bfa2347a2e,[somoto],,[somoto],,[somoto],"[win-pup, somoto]",,,,[pup-gen],...,[reputation],,,,"[signed-adware, betterinternet, somotoltd]",,,[betterinternet],[2013-08-16],
31b164b4dc1d2744bf11183604572648,,,[somoto],,,"[win-appcare, somoto]",,,[somoto],,...,[betterinstaller],,,,,,,[mazelcrtd],[2014-03-23],
38fe6f891182ca08e3f6de18ea2153a6,[adload],,[somoto],,,[somoto],,[asmalwns],[somoto],,...,[sape],"[bp-somato, rkbk]",[adw_tomos],[adw_tomos],,,,,[2016-01-21],
4f2483f23c8eae37a77fdbcf9133d38ba51f8edac4b8fb7f8a9c4f991134cafe,,,[somoto],[mt2x],,"[unwanted, somoto, n1167800769]",,,[somoto],[somoto-j],...,[betterinstaller],,[adw_tomos],[adw_tomos],"[signed-adware, betterinternet, somotoltd]",,,[somotocrtd],[2014-04-28],
5169113d8207a5605ff57604a00c030a,,,[somoto],,,"[win-appcare, somoto]",,,[somoto],[somoto-j],...,,,[adw_tomos],[adw_tomos],"[signed-adware, betterinternet, somotoltd]",,,,[2014-03-27],


In [12]:
# output
# to_csv does not create missing folders, so make sure the target one exists first
os.makedirs(os.path.dirname(out_wn_csvfile) or '.', exist_ok=True)
df_nw.to_csv(out_wn_csvfile)

## Save files to new directory based on first-seen year

In [13]:
import shutil

# MIKE: year intervals as (first year, last year) pairs, both bounds inclusive
in_years = [(0, 2000), (2001, 2010), (2011, 2014), (2015, 2017)]

def max_year(y):
    """Return the last year of the in_years interval containing y, or -1 if no interval does."""
    matching_upper_bounds = (upper for lower, upper in in_years if lower <= y <= upper)
    return next(matching_upper_bounds, -1)

In [15]:
# Copy every hooklog into a folder named after the year interval of its first-seen date.
# Runs only when the option is on, a hooklog directory is configured and the
# scanned files are hooklogs.
if in_save_to_subdir_firstseen and in_hooklog_directory and ext == "hooklog":
    # first_seen looks like "YYYY-MM-DD hh:mm:ss"; the part before the first '-' is the year
    first_seen_year_series = df['first_seen'].apply(lambda f: int(f.split('-')[0]))
    save_year_series = first_seen_year_series.apply(max_year)
    save_year_series.name = "save_year"
    
    for f in file_list:
        
        # Extract the hash exactly as it was done for hash_set (and therefore for
        # the index of df), so names without an underscore are looked up correctly.
        name = f.split('.')[0].split('_')[0]
        save_year = save_year_series[name]
        
        new_dir = "hooklogs/" + in_tag + "_year" + str(save_year) +"/"
        # exist_ok avoids the separate isdir check
        os.makedirs(new_dir, exist_ok=True)
            
        shutil.copy(os.path.join(in_hooklog_directory, f), os.path.join(new_dir, f))
        
    print("new files with year is saved to", "hooklogs/" + in_tag + "_year")

new files with year is saved to hooklogs/somoto_woj_year
