In [1]:
import os
import json
import pickle


# check how many CVEs in BRON and our KG
str_entset = pickle.load(open(os.path.join('./cyberkg_L', 'entset.pkl'), 'rb'))

bron_path = '/data/zhaohan/BRON/full_data/full_output_data'
bron_graph = json.load(open(os.path.join(bron_path, 'BRON.json'), 'rb'))


cve2bron_idmap = json.load(open(os.path.join(bron_path, 
                                             'BRON/original_id_to_bron_id', 
                                             'cve_id_bron_id.json'), 'rb'))
bron2cve_idmap = {v:k for k, v in cve2bron_idmap.items()}
    

cve_bronid = set()
for edge in bron_graph['edges']:
    h, t, _ = edge
    if h.startswith('cve_'):
        cve_bronid.add(h.split('_')[-1])
    elif t.startswith('cve_'):
        cve_bronid.add(t.split('_')[-1])

cve_ids_in_BRON = set()
for bron_id in cve_bronid:
    cve_ids_in_BRON.add(bron2cve_idmap[bron_id])
    
print(len(str_entset['cve-id']), len(cve_ids_in_BRON), len(cve_ids_in_BRON & str_entset['cve-id']))

156025 146857 144938


In [2]:
from collections import defaultdict

# check BRON graph contains all reverse edge
# we only need to consider one direction later
bron_facts = set()
for edge in bron_graph['edges']:
    h, t, _ = edge
    bron_facts.add((h, t))
    
for f in bron_facts:
    assert (f[1], f[0]) in bron_facts
    
edge_types = defaultdict(int)
for edge in bron_graph['edges']:
    h, t, r = edge
    src_prefix = h.split('_')[0]
    dst_prefix = t.split('_')[0]
    edge_types[(src_prefix, dst_prefix)] += 1
edge_types

defaultdict(int,
            {('technique', 'tactic'): 733,
             ('tactic', 'technique'): 733,
             ('technique', 'capec'): 157,
             ('capec', 'technique'): 157,
             ('capec', 'cwe'): 1153,
             ('cwe', 'capec'): 1153,
             ('cwe', 'cve'): 109872,
             ('cve', 'cpe'): 1248927,
             ('cve', 'cwe'): 109872,
             ('cpe', 'cve'): 1248927})

In [3]:
# 1. count CVE num under each tactic
# 2. lifecycle: count CVE num among multiple tactics

from collections import defaultdict

# all are bron id with prefix, e.g., 'tactic_'
tac2tech = defaultdict(set)
tech2capec = defaultdict(set)
capec2cwe = defaultdict(set)
cwe2cve = defaultdict(set)

for edge in bron_graph['edges']:
    h, t, _ = edge
    if h.startswith('tactic') and t.startswith('technique'):
        tac2tech[h].add(t)
    if h.startswith('technique') and t.startswith('capec'):
        tech2capec[h].add(t)
    if h.startswith('capec') and t.startswith('cwe'):
        capec2cwe[h].add(t)
    if h.startswith('cwe') and t.startswith('cve'):
        cwe2cve[h].add(t)
# print(len(tac2tech), len(tech2capec), len(capec2cwe), len(cwe2cve))

# 1. count CVE num under each tactic

tac2capec = defaultdict(set)
tac2cwe = defaultdict(set)
tac2cve = defaultdict(set)

for tac in tac2tech:
    for tech in tac2tech[tac]:
        tac2capec[tac] |= tech2capec[tech]
        for capec in tech2capec[tech]:
            tac2cwe[tac] |= capec2cwe[capec]
            for cwe in capec2cwe[capec]:
                tac2cve[tac] |= cwe2cve[cwe]

    
# 2. lifecycle: count CVE num among multiple tactics
tacname2code = {
    "collection": "TA0009",
    "command-and-control": "TA0011",
    "credential-access": "TA0006",
    "defense-evasion": "TA0005",
    "discovery": "TA0007",
    "execution": "TA0002",
    "exfiltration": "TA0010",
    "impact": "TA0040",
    "initial-access": "TA0001",
    "lateral-movement": "TA0008",
    "persistence": "TA0003",
    "privilege-escalation": "TA0004",
    "reconnaissance": "TA0043",
    "resource-development": "TA0042"
}
tacname2bronid = {
    "command-and-control": "00003", 
    "credential-access": "00009", 
    "collection": "00020", 
    "defense-evasion": "00022", 
    "discovery": "00024", 
    "exfiltration": "00029", 
    "lateral-movement": "00043", 
    "privilege-escalation": "00068", 
    "persistence": "00069", 
    "execution": "00094", 
    "initial-access": "00175", 
    "impact": "00395", 
    "resource-development": "00608", 
    "reconnaissance": "00641"
}
taccode2bronid = { code: 'tactic_'+tacname2bronid[name] for name, code in tacname2code.items()}
tacbronid2code = {v:k for k, v in taccode2bronid.items()}

lifecycle2cve = defaultdict(set) # {[TAs]: set(cves)} TA is org code, cves are bron id with 'cve_'
lifecycle_full = ['TA0043', 'TA0042', 'TA0001', 'TA0002', 'TA0003', 'TA0004', 'TA0005',
                  'TA0006', 'TA0007', 'TA0008', 'TA0009', 'TA0011', 'TA0010', 'TA0040',]
print('Tactic\tTech\tCAPEC\tCWE\tCVE')

for tac in lifecycle_full:
    print(tac, '\t',
          len(tac2tech[taccode2bronid[tac]]),  '\t',
          len(tac2capec[taccode2bronid[tac]]), '\t',
          len(tac2cwe[taccode2bronid[tac]]),   '\t',
          len(tac2cve[taccode2bronid[tac]]))


Tactic	Tech	CAPEC	CWE	CVE
TA0043 	 41 	 0 	 0 	 0
TA0042 	 38 	 1 	 0 	 0
TA0001 	 19 	 9 	 12 	 1369
TA0002 	 38 	 2 	 1 	 44
TA0003 	 108 	 23 	 31 	 21760
TA0004 	 96 	 20 	 30 	 21715
TA0005 	 158 	 29 	 48 	 22152
TA0006 	 55 	 12 	 19 	 3305
TA0007 	 39 	 18 	 10 	 7945
TA0008 	 23 	 9 	 20 	 11512
TA0009 	 35 	 8 	 7 	 2397
TA0011 	 40 	 1 	 0 	 0
TA0010 	 17 	 1 	 1 	 292
TA0040 	 26 	 11 	 4 	 701


In [4]:
# cve instances {range: tactics}
cve2tac = defaultdict(set)
for tac, cves in tac2cve.items():
    for cve in cves:
        cve2tac[cve].add(tac)
freq = [len(v) for v in cve2tac.values()]

from collections import Counter
print(len(cve2tac), Counter(freq))

lifecycle2cve = defaultdict(set) # {['TA0001', 'TA0002',..]: {CVE-xxxx}} org code
for cve, lifecycle in cve2tac.items():
    lifecycle = [tacbronid2code[tac] for tac in lifecycle]  # org code
    lifecycle = sorted([lifecycle_full.index(tac) for tac in lifecycle]) # sorted idx
    lifecycle = [lifecycle_full[idx] for idx in lifecycle]  # sorted code
    lifecycle2cve[','.join(lifecycle)].add(cve)
    


23872 Counter({3: 9910, 5: 7350, 6: 2813, 4: 1704, 1: 1487, 2: 545, 7: 63})


In [5]:
for k in sorted(lifecycle2cve.keys()):
    print(k, len(lifecycle2cve[k]))

TA0001,TA0003,TA0004,TA0005 596
TA0001,TA0003,TA0004,TA0005,TA0006 105
TA0001,TA0003,TA0004,TA0005,TA0006,TA0007,TA0008 20
TA0001,TA0003,TA0004,TA0005,TA0006,TA0008 575
TA0001,TA0003,TA0004,TA0005,TA0006,TA0008,TA0009 5
TA0001,TA0003,TA0004,TA0005,TA0006,TA0008,TA0010 22
TA0001,TA0003,TA0004,TA0005,TA0007,TA0008 1
TA0001,TA0003,TA0005,TA0006,TA0008,TA0009 45
TA0002,TA0008 43
TA0002,TA0008,TA0010 1
TA0003,TA0004,TA0005 9908
TA0003,TA0004,TA0005,TA0006,TA0007,TA0008 5
TA0003,TA0004,TA0005,TA0006,TA0007,TA0008,TA0009 11
TA0003,TA0004,TA0005,TA0006,TA0008,TA0009 2184
TA0003,TA0004,TA0005,TA0006,TA0008,TA0009,TA0010 4
TA0003,TA0004,TA0005,TA0006,TA0008,TA0009,TA0040 1
TA0003,TA0004,TA0005,TA0007 36
TA0003,TA0004,TA0005,TA0007,TA0008 7245
TA0003,TA0004,TA0005,TA0007,TA0008,TA0010 1
TA0003,TA0004,TA0005,TA0007,TA0008,TA0040 2
TA0003,TA0004,TA0005,TA0008 986
TA0003,TA0004,TA0005,TA0040 8
TA0005 31
TA0005,TA0006,TA0008,TA0009 78
TA0005,TA0008 283
TA0006 30
TA0006,TA0007 150
TA0006,TA0007,TA0010

In [12]:
l1 = [bron2cve_idmap[cve.split('_')[-1]] for cve in lifecycle2cve['TA0003,TA0004,TA0005']]
l2 = [bron2cve_idmap[cve.split('_')[-1]] for cve in lifecycle2cve['TA0006,TA0007,TA0010']]

l1 = sorted(l1, reverse=True)
l2 = sorted(l2, reverse=True)
print(l1[:3], l2[0])

['CVE-2020-9977', 'CVE-2020-9931', 'CVE-2020-9914'] CVE-2019-18800
