In [1]:
import os
import pickle
import ast
from collections import defaultdict
from tqdm import tqdm

# Root directory holding the per-CVE evidence files.
cve_evi_path = os.path.join('/data/zhaohan/adv-reasoning/data/cyberkg-raw', 'cve-evidence')

# Every file found anywhere below the evidence root.
load_paths = [
    os.path.join(root, filename)
    for root, _, filenames in os.walk(cve_evi_path)
    for filename in filenames
]

# evidence string -> {'type2subtype': type -> {subtypes}, 'subType': {subtypes}, 'cve-ids': {ids}}
evi_info = defaultdict(dict)
# reltype2fq = {}
for f in tqdm(load_paths, desc='parsing CVE-related evidence'):
    # The CVE id is the file name up to its first dot.
    cve_id = f.split('/')[-1].split('.')[0]
    with open(f, 'r') as file:
        raw_text = file.read()
    # Each file is parsed as one Python literal after its newlines are removed.
    data = ast.literal_eval(raw_text.replace('\n', ''))
    for ent in data['entities']:
        evi = ent['canonicalForm']
        # Drop evidence made up solely of digits/version punctuation, or solely of
        # the characters of a CVE identifier (this also covers the empty string).
        if not evi.strip('1234567890.,v-; ') or not evi.strip('CVE1234567890-, '):
            continue
        if evi not in evi_info:
            evi_info[evi] = {
                'type2subtype': defaultdict(set),
                'subType': set(),
                'cve-ids': set(),
            }
        record = evi_info[evi]
        record['type2subtype'][ent['type']].add(ent['subType'])
        record['subType'].add(ent['subType'])
        record['cve-ids'].add(cve_id)
            
# Frequency of an evidence string = number of distinct CVEs it was seen with.
evi2fq = {evi: len(info['cve-ids']) for evi, info in evi_info.items()}

# Indexes restricted to evidence that occurs with at least 5 CVEs.
type2evi = defaultdict(set)
subtype2evi = defaultdict(set)
type2subtype = defaultdict(set)
for evi, fq in evi2fq.items():
    if fq < 5:
        continue
    for t, subtypes in evi_info[evi]['type2subtype'].items():
        type2evi[t].add(evi)
        for subt in subtypes:
            subtype2evi[subt].add(evi)
            type2subtype[t].add(subt)


parsing CVE-related evidence: 100%|██████████| 82726/82726 [01:42<00:00, 810.64it/s] 


In [8]:
# One line per entity type: the type name and how many evidence strings it kept.
for ent_type, evidences in type2evi.items():
    print(ent_type, len(evidences))

ThreatActor 58
MiscEntity 703
TTP 91
Identity 471
DomainName 37
Vulnerability 329
Product 435
MalwareFamily 43
Campaign 90
Person 26
FileName 187
AvSignature 14
FilePath 64
Protocol 13
Location 20
BugId 13
IpAddress 33
Hash 1
Endpoint 1
ASN 1


In [1]:
import os
import pickle

# Load the training queries/answers and the entity-id -> category map.
# Files are opened with `with` so the handles are closed right after loading
# (the bare `pickle.load(open(...))` form leaves them open).
with open(os.path.join('./basic', 'train_queries.pkl'), 'rb') as pklfile:
    train_q = pickle.load(pklfile)
with open(os.path.join('./basic', 'train_answers.pkl'), 'rb') as pklfile:
    train_a = pickle.load(pklfile)
with open(os.path.join('./basic', 'entid2cate.pkl'), 'rb') as pklfile:
    entid2cate = pickle.load(pklfile)

# Collect the categories of all answers to queries whose structure is a
# 2-tuple ending in ('r',); the size of each such structure is printed too.
cates = set()
for q_struc, qs in train_q.items():
    if len(q_struc) == 2 and q_struc[-1] == ('r',):
        print(q_struc, len(qs))
        for q in qs:
            for a in train_a[q]:
                cates.add(entid2cate[a])
cates

((('e', ('r',)), ('e', ('r',))), ('r',)) 45953
((('e', ('r',)), ('e', ('r',)), ('e', ('r',))), ('r',)) 63998
((('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))), ('r',)) 72165


{'mitigation'}

### BRON technique-centric evidence

In [7]:
import os
import json
import pickle

# Entity sets of our KG keyed by category name (only 'cve-id' is used below).
with open(os.path.join('./basic', 'entset.pkl'), 'rb') as pklfile:
    str_entset = pickle.load(pklfile)

bron_path = '/data/zhaohan/BRON/full_data/full_output_data'
with open(os.path.join(bron_path, 'BRON.json'), 'rb') as json_file:
    bron_graph = json.load(json_file)

# Original CVE id -> BRON id, plus the inverse mapping.
with open(os.path.join(bron_path,
                       'BRON/original_id_to_bron_id',
                       'cve_id_bron_id.json'), 'rb') as json_file:
    cve2bron_idmap = json.load(json_file)
bron2cve_idmap = {v: k for k, v in cve2bron_idmap.items()}

# BRON ids (node name without its prefix) of every CVE node that appears on
# an edge; the head is checked first and the tail only when the head is not a CVE.
cve_bronid = set()
for h, t, _ in bron_graph['edges']:
    if h.startswith('cve_'):
        cve_bronid.add(h.split('_')[-1])
    elif t.startswith('cve_'):
        cve_bronid.add(t.split('_')[-1])

# Translate BRON ids back to original CVE ids.
cve_ids_in_BRON = {bron2cve_idmap[bron_id] for bron_id in cve_bronid}

# CVEs in our KG, CVEs in BRON, and their overlap.
print(len(str_entset['cve-id']), len(cve_ids_in_BRON), len(cve_ids_in_BRON & str_entset['cve-id']))

25095 146857 24512


In [4]:
from collections import defaultdict

# check BRON graph contains all reverse edge
# we only need to consider one direction later
bron_facts = {(h, t) for h, t, _ in bron_graph['edges']}

for head, tail in bron_facts:
    assert (tail, head) in bron_facts

# Number of edges per (head prefix, tail prefix) pair, where the prefix is the
# part of the node name before its first underscore.
edge_types = defaultdict(int)
for h, t, _ in bron_graph['edges']:
    edge_types[(h.split('_')[0], t.split('_')[0])] += 1
edge_types

defaultdict(int,
            {('technique', 'tactic'): 733,
             ('tactic', 'technique'): 733,
             ('technique', 'capec'): 157,
             ('capec', 'technique'): 157,
             ('capec', 'cwe'): 1153,
             ('cwe', 'capec'): 1153,
             ('cwe', 'cve'): 109872,
             ('cve', 'cpe'): 1248927,
             ('cve', 'cwe'): 109872,
             ('cpe', 'cve'): 1248927})

In [6]:
import pandas as pd


def _expand_kept_ids(edges, src_prefix, dst_prefix, kept_src_ids):
    """Follow edges between two node kinds, starting from a kept set of ids.

    An edge is considered when one endpoint's name starts with `src_prefix`
    and the other's starts with `dst_prefix`, in either orientation. The id
    of a node is the part of its name after the last underscore.

    Returns a pair `(kept_dst_ids, n_fact)`: the ids on the `dst_prefix` side
    of every matching edge whose `src_prefix` side is in `kept_src_ids`, and
    the number of such edges.

    NOTE(review): an earlier cell asserts that BRON stores every edge in both
    directions, and both orientations are matched here, so `n_fact`
    presumably counts each undirected fact twice -- confirm this is intended.
    """
    kept_dst_ids, n_fact = set(), 0
    for h, t, _ in edges:
        if h.startswith(src_prefix) and t.startswith(dst_prefix):
            src_node, dst_node = h, t
        elif h.startswith(dst_prefix) and t.startswith(src_prefix):
            src_node, dst_node = t, h
        else:
            continue
        if src_node.split('_')[-1] in kept_src_ids:
            kept_dst_ids.add(dst_node.split('_')[-1])  # no prefix in BRON
            n_fact += 1
    return kept_dst_ids, n_fact


def _load_bron_idmap(filename):
    """Load one original-id -> BRON-id JSON map, closing the file afterwards."""
    with open(os.path.join(bron_path,
                           'BRON/original_id_to_bron_id',
                           filename), 'rb') as json_file:
        return json.load(json_file)


# CVEs present both in BRON and in our KG, as original ids and as BRON ids.
kept_cveid = cve_ids_in_BRON & str_entset['cve-id']
kept_cve_bronid = {cve2bron_idmap[cve_id] for cve_id in kept_cveid}  # no prefix in BRON

# from cve to cwe
kept_cwe_bronid, n_fact = _expand_kept_ids(bron_graph['edges'], 'cve', 'cwe', kept_cve_bronid)
cwe2bron_idmap = _load_bron_idmap('cwe_id_to_bron_id.json')
print('keep CWE id %d/%d, facts %d' % (len(kept_cwe_bronid), len(cwe2bron_idmap), n_fact))

# add more cwe
cve_paths = [os.path.join('/data/zhaohan/adv-reasoning/data/cyberkg-raw/', 'cve', str(y))
             for y in range(2000, 2021+1)]
load_paths = []
for path in cve_paths:
    for root, _, filenames in os.walk(path):
        load_paths.extend([os.path.join(root, filename) for filename in filenames])

kept_cwe_bronid_ours = set()
for path in load_paths:
    data = pd.read_csv(path, delimiter='|',  index_col=0, header=0)
    # Walk the two needed columns directly instead of materialising a Series
    # per row with iterrows().
    for cve_id, cwe_id in zip(data['cve-id'], data['cwe-id']):
        if cve_id in kept_cveid and cwe_id in cwe2bron_idmap:
            kept_cwe_bronid_ours.add(cwe2bron_idmap[cwe_id])

kept_cwe_bronid |= kept_cwe_bronid_ours
# n_fact is not recomputed here: it still holds the BRON-only count printed above.
print('keep CWE id %d/%d, facts %d' % (len(kept_cwe_bronid), len(cwe2bron_idmap), n_fact))

# from cwe to attack pattern
kept_capec_bronid, n_fact = _expand_kept_ids(bron_graph['edges'], 'cwe', 'capec', kept_cwe_bronid)
capec2bron_idmap = _load_bron_idmap('capec_id_to_bron_id.json')
print('keep CAPEC id %d/%d, facts %d' % (len(kept_capec_bronid), len(capec2bron_idmap), n_fact))

# from attack pattern to technique
kept_tech_bronid, n_fact = _expand_kept_ids(bron_graph['edges'], 'capec', 'technique', kept_capec_bronid)
tech2bron_idmap = _load_bron_idmap('technique_id_to_bron_id.json')
print('keep Technique id %d/%d, facts %d' % (len(kept_tech_bronid), len(tech2bron_idmap), n_fact))

# from technique to tactic
kept_tac_bronid, n_fact = _expand_kept_ids(bron_graph['edges'], 'technique', 'tac', kept_tech_bronid)
tac2bron_idmap = _load_bron_idmap('tactic_name_to_bron_id.json')
print('keep Tactic id %d/%d, facts %d' % (len(kept_tac_bronid), len(tac2bron_idmap), n_fact))

# NOTE: in KG, also use kept CWE in our data source

keep CWE id 150/436, facts 40636
keep CWE id 151/436, facts 40636
keep CAPEC id 315/597, facts 1132
keep Technique id 96/693, facts 216
keep Tactic id 10/14, facts 240


In [8]:
# Hand-entered tallies that are summed below. Presumably `e` holds entity
# counts and `f` fact counts: several values match totals printed by earlier
# cells (24512, 151, 315, 96, 10 and 40636, 1132, 216, 240) -- confirm the rest.
e = [
    24512, 588, 2994, 66492, 10,
    96, 315, 151, 46, 13226,
]
f = [
    3109, 66492, 31978, 46560, 306356, 40939,
    240, 216, 1132, 40636, 7690, 102589,
]
(sum(e), sum(f))

(108430, 647937)

In [1]:
import os, sys
sys.path.append(os.path.abspath('../..'))

import re
import random
import pickle
from collections import defaultdict
import genkg.cyberkg_utils as cyberkg
from helper.qa_util import query_name_dict, name_query_dict

kg_path = './basic'

# get_pd_centric_evi returns three pairs, unpacked here into the product (pd)
# and version (ver) variants of each structure.
(cve_pd_dict, cve_ver_dict), \
    (pd_cve_dict, ver_cve_dict), \
    (pd_cve_rpath, ver_cve_rpath) = cyberkg.get_pd_centric_evi(kg_path)

# get_tech_centric_evi returns three triples: campaign (cam), attack pattern
# (ap) and technique (tech) variants of each structure.
(cve_cam_dict, cve_ap_dict, cve_tech_dict), \
    (cam_cve_dict, ap_cve_dict, tech_cve_dict), \
    (cam_cve_rpath, ap_cve_rpath, tech_cve_rpath) = cyberkg.get_tech_centric_evi(kg_path)

# verification: each cve in cve_evi_dict must have non-empty evi
for cve_evi_dict in (cve_pd_dict, cve_ver_dict, cve_cam_dict, cve_ap_dict, cve_tech_dict):
    assert all(len(v) > 0 for v in cve_evi_dict.values())

# CVEs that are keys of all five cve -> evidence dicts.
common_cves = (
    set(cve_pd_dict.keys())
    & set(cve_ver_dict.keys())
    & set(cve_cam_dict.keys())
    & set(cve_ap_dict.keys())
    & set(cve_tech_dict.keys())
)
print('Number of CVEs that have all kinds of evidence %d' % len(common_cves))

print('Genenrating queries/answers for basic use case -- query threat codes(CVE-IDs)')
# The same five evidence kinds key every lookup table handed to the generators.
evi_kinds = ('pd', 'ver', 'cam', 'ap', 'tech')
cve_evi_dicts = dict(zip(
    evi_kinds, (cve_pd_dict, cve_ver_dict, cve_cam_dict, cve_ap_dict, cve_tech_dict)))
evi_cve_dicts = dict(zip(
    evi_kinds, (pd_cve_dict, ver_cve_dict, cam_cve_dict, ap_cve_dict, tech_cve_dict)))
evi_cve_rpaths = dict(zip(
    evi_kinds, (pd_cve_rpath, ver_cve_rpath, cam_cve_rpath, ap_cve_rpath, tech_cve_rpath)))

# basic use case only: [structure name, number of queries to generate]
reqs = [
    [struc_name, 500]
    for struc_name in (
        '3i', '5i', '7i',
        '1pp.3i', '2pp.5i', '3pp.7i',
        '1ppp.3i', '1ppp.1pp.5i', '1ppp.2pp.7i',
    )
]

# Generate up to `q_num` distinct queries for every requested structure.
#
# Fixes over the previous version of this cell:
#   * `random.shuffle(list(common_cves))` shuffled a temporary list that was
#     thrown away, so CVEs were always visited in set order. The list is now
#     bound to a name, shuffled in place, and iterated.
#   * Structure names are matched with `re.fullmatch` and an escaped dot, so a
#     name can only match the single pattern that describes it entirely.
#   * The four copy-pasted generation loops are a single loop driven by the
#     generator function and extra arguments chosen per structure.
# NOTE(review): no random seed is set in this cell, so the sampled CVEs differ
# between runs -- seed `random` beforehand if reproducibility is required.
test_q = defaultdict(set)
for req in reqs:
    struc_name, q_num = req[0], req[1]
    # The trailing "<x>i" component is passed to the generators as evi_num.
    evi_num = int(struc_name.strip('i').split('.')[-1])

    if re.fullmatch(r'\d+i', struc_name):                     # xi
        gen_fn, extra_args = cyberkg.gen_1_xi_q, ()
    elif re.fullmatch(r'\d+pp\.\d+i', struc_name):            # npp.xi
        pp_num = int(struc_name.split('.')[0].strip('p'))
        gen_fn, extra_args = cyberkg.gen_1_npp_xi_q, (pp_num,)
    elif re.fullmatch(r'\d+ppp\.\d+i', struc_name):           # nppp.xi
        ppp_num = int(struc_name.split('.')[0].strip('p'))
        gen_fn, extra_args = cyberkg.gen_1_nppp_mpp_xi_q, (ppp_num, 0)
    elif re.fullmatch(r'\d+ppp\.\d+pp\.\d+i', struc_name):    # nppp.mpp.xi
        ppp_num = int(struc_name.split('.')[0].strip('p'))
        pp_num = int(struc_name.split('.')[1].strip('p'))
        gen_fn, extra_args = cyberkg.gen_1_nppp_mpp_xi_q, (ppp_num, pp_num)
    else:
        raise NotImplementedError('Not implement query generation of %s structure' % struc_name)

    q_struc = name_query_dict[struc_name]
    # random.shuffle works in place and returns None, so keep a reference to
    # the list that is actually shuffled.
    candidate_cves = list(common_cves)
    random.shuffle(candidate_cves)
    for cve_id in candidate_cves:
        logics = gen_fn(kg_path, cve_evi_dicts, evi_cve_rpaths, cve_id, evi_num, *extra_args)
        if logics is not None:
            test_q[q_struc].add(logics)
            if len(test_q[q_struc]) >= q_num:
                break
print('Done')

Number of CVEs that have all kinds of evidence 9584
Genenrating queries/answers for basic use case -- query threat codes(CVE-IDs)
Done


In [3]:
# Print the number of generated queries for each structure.
for q_struc in test_q:
    qs = test_q[q_struc]
    print(q_struc, len(qs))
# Display the query set of the last structure visited by the loop above.
qs

(('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 500
(('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 500
(('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 500
(('e', ('r', 'r')), ('e', ('r',)), ('e', ('r',))) 500
(('e', ('r', 'r')), ('e', ('r', 'r')), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 500
(('e', ('r', 'r')), ('e', ('r', 'r')), ('e', ('r', 'r')), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 500
(('e', ('r', 'r', 'r')), ('e', ('r',)), ('e', ('r',))) 500
(('e', ('r', 'r', 'r')), ('e', ('r', 'r')), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 500
(('e', ('r', 'r', 'r')), ('e', ('r', 'r')), ('e', ('r', 'r')), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 500


{((380824, (18,)),
  (222521, (17,)),
  (199763, (15,)),
  (179135, (15,)),
  (381100, (8, 0)),
  (381047, (8, 0)),
  (381283, (9, 8, 0))),
 ((380824, (18,)),
  (222521, (17,)),
  (221238, (15,)),
  (202416, (15,)),
  (381171, (8, 0)),
  (381059, (8, 0)),
  (381208, (9, 8, 0))),
 ((380824, (18,)),
  (222591, (17,)),
  (204094, (15,)),
  (299352, (17,)),
  (381100, (8, 0)),
  (381118, (8, 0)),
  (381208, (9, 8, 0))),
 ((380824, (18,)),
  (224068, (17,)),
  (197186, (15,)),
  (263758, (17,)),
  (381195, (8, 0)),
  (380864, (8, 0)),
  (381283, (9, 8, 0))),
 ((380824, (18,)),
  (224071, (17,)),
  (188389, (15,)),
  (204014, (15,)),
  (381051, (8, 0)),
  (380920, (8, 0)),
  (381211, (9, 8, 0))),
 ((380824, (18,)),
  (225197, (17,)),
  (216767, (15,)),
  (373992, (17,)),
  (380868, (8, 0)),
  (380872, (8, 0)),
  (381211, (9, 8, 0))),
 ((380824, (18,)),
  (229086, (17,)),
  (193729, (15,)),
  (192676, (15,)),
  (380852, (8, 0)),
  (381023, (8, 0)),
  (381272, (9, 8, 0))),
 ((380824, (18,)),
 

In [10]:
import os, pickle
# Load the CVE "basic" queries/answers and the entity-id -> category map.
# Note that the variables are named test_* but the files read are train_*.
# Files are opened with `with` so the handles are closed after loading.
with open(os.path.join('./basic', 'train_queries_cve_basic.pkl'), 'rb') as pklfile:
    test_q = pickle.load(pklfile)
with open(os.path.join('./basic', 'train_answers_cve_basic.pkl'), 'rb') as pklfile:
    test_a = pickle.load(pklfile)
with open(os.path.join('./basic', 'entid2cate.pkl'), 'rb') as pklfile:
    entid2cate = pickle.load(pklfile)

# Print the size of each query structure and gather all queries.
all_qs = set()
for q_struc, qs in test_q.items():
    print(q_struc, len(qs))
    all_qs |= qs

# Queries and answer keys must coincide. This used to be a bare expression in
# the middle of the cell whose value was neither displayed nor checked; on
# failure the message shows (queries without answers, answers without queries).
answered_qs = set(test_a.keys())
assert all_qs == answered_qs, (all_qs - answered_qs, answered_qs - all_qs)

# Every answer must be an entity of category 'cve-id'.
for ans in test_a.values():
    for a in ans:
        assert entid2cate[a] == 'cve-id'

(('e', ('r',)), ('e', ('r',))) 491
(('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 490
(('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 252
(('e', ('r', 'r')), ('e', ('r',))) 195
(('e', ('r', 'r')), ('e', ('r',)), ('e', ('r',))) 498
(('e', ('r', 'r')), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 291
(('e', ('r', 'r')), ('e', ('r', 'r')), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 471
(('e', ('r', 'r', 'r')), ('e', ('r',))) 102
(('e', ('r', 'r', 'r')), ('e', ('r',)), ('e', ('r',))) 494
(('e', ('r', 'r', 'r')), ('e', ('r',)), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 347
(('e', ('r', 'r', 'r')), ('e', ('r', 'r')), ('e', ('r',)), ('e', ('r',)), ('e', ('r',))) 499


In [3]:
import os, pickle

# Name of both the data directory and the file-name suffix used below.
use_case = 'cyberkg'

# q_org = pickle.load(open(os.path.join('./%s' % use_case, 'test_queries_cve_%s.pkl' % use_case), 'rb'))
# a_org = pickle.load(open(os.path.join('./%s' % use_case, 'test_answers_cve_%s.pkl' % use_case), 'rb'))
# Load the training queries and answers; `with` closes each file once loaded.
with open(os.path.join('./%s' % use_case, 'train_queries_%s.pkl' % use_case), 'rb') as pklfile:
    q_org = pickle.load(pklfile)
with open(os.path.join('./%s' % use_case, 'train_answers_%s.pkl' % use_case), 'rb') as pklfile:
    a_org = pickle.load(pklfile)

In [4]:
def flatten(l):
    """Flatten arbitrarily nested tuples into a flat list of their leaves.

    Only `tuple` instances are descended into; any other value (including a
    list) is treated as a single leaf. A non-tuple argument therefore yields
    a one-element list, and an empty tuple yields an empty list.

    Leaves are appended with `list.extend`, which avoids the repeated list
    copying of the former `sum(map(flatten, l), [])` formulation.
    """
    if not isinstance(l, tuple):
        return [l]
    flat = []
    for item in l:
        flat.extend(flatten(item))
    return flat

# Load the relation and entity id maps; `with` closes each file once loaded.
with open(os.path.join('./%s' % use_case, 'id2rel.pkl'), 'rb') as pklfile:
    id2rel = pickle.load(pklfile)
with open(os.path.join('./%s' % use_case, 'id2ent.pkl'), 'rb') as pklfile:
    id2ent = pickle.load(pklfile)

# Validate every query against its structure: same top-level length, same
# number of flattened leaves, an entity id wherever the structure has 'e' and
# a relation id wherever it has 'r'. Every answer must be a plain int.
for q_struc, qs in q_org.items():
    flat_q_struc = flatten(q_struc)
    for q in qs:
        assert len(q_struc) == len(q)
        flat_q = flatten(q)
        assert len(flat_q) == len(flat_q_struc), (flat_q, flat_q_struc)
        for slot, value in zip(flat_q_struc, flat_q):
            if slot == 'e':
                assert value in id2ent, (flat_q, flat_q_struc)
            elif slot == 'r':
                assert value in id2rel, (flat_q, flat_q_struc)
        for a in a_org[q]:
            assert type(a) == int

In [30]:
# import os, sys
# sys.path.append(os.path.abspath('../..'))

# from collections import defaultdict
# from helper.qa_util import query_name_dict, name_query_dict

# train_q, train_a = defaultdict(set), defaultdict(set)

# for q_struc, qs in train_q_org.items():
#     struc_name = query_name_dict[q_struc]
# #     if not struc_name.endswith('ip'):
# #         print(struc_name)
#     if len(struc_name.split('.')) > 1:
#         continue
#     print(struc_name)
#     for q in qs:
#         train_q[q_struc].add(q)
#         train_a[q] |= train_a_org[q]
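# NOTE(review): the disabled loop above builds the filtered `train_q` / `train_a`,
# but the dumps below write `train_q_org` / `train_a_org` -- confirm which pair
# was intended before re-enabling this cell.
# with open(os.path.join('./basic', 'train_queries_basic.pkl' ), 'wb') as pklfile:
#     pickle.dump(train_q_org, pklfile, protocol=pickle.HIGHEST_PROTOCOL)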
        
# with open(os.path.join('./basic', 'train_answers_basic.pkl'), 'wb') as pklfile:
#     pickle.dump(train_a_org, pklfile, protocol=pickle.HIGHEST_PROTOCOL)


2i
3i
5i
2ip
3ip
5ip


In [8]:
# Print, for each entity category, the smallest and largest entity id it holds.
# Files are opened with `with` so the handles are closed after loading.
with open(os.path.join('./basic', 'id_entset.pkl'), 'rb') as pklfile:
    entset = pickle.load(pklfile)
for k, v in entset.items():
    print(k, min(v), max(v))
print()
# Print every relation as "<name> <id>" (id2rel maps id -> name).
with open(os.path.join('./basic', 'id2rel.pkl'), 'rb') as pklfile:
    id2rel = pickle.load(pklfile)
for k, v in id2rel.items():
    print(v, k)

cve-id 0 156024
weakness 156025 156300
vendor 156301 178491
product 178492 222395
version 222396 380823
campaign 380824 380836
attack-pattern 380837 381196
technique 381197 381296
tactic 381297 381307
mitigation 381308 754497

CWE:includes:CVE 0
VD:has:PD 1
CVE:affects:VD 2
CVE:affects:PD 3
PD:has:VER 4
CVE:affects:VER 5
CVE:has:propose:CAMP 6
CVE:is:related:to:CVE 7
AP:is:related:to:CWE 8
TECH:leverages:AP 9
TA:includes:TECH 10
MITI:mitigates:CVE 11
reverse:CWE:includes:CVE 12
reverse:VD:has:PD 13
reverse:CVE:affects:VD 14
reverse:CVE:affects:PD 15
reverse:PD:has:VER 16
reverse:CVE:affects:VER 17
reverse:CVE:has:propose:CAMP 18
reverse:AP:is:related:to:CWE 19
reverse:TECH:leverages:AP 20
reverse:TA:includes:TECH 21
reverse:MITI:mitigates:CVE 22
