## Select the attack target (CVE-ID)

In [1]:
import os
import pickle
from collections import defaultdict
from tqdm import tqdm
import pandas as pd

# Root directory of the crawled CVE dump; files are looked up under <raw_path>/cve/<year>.
raw_path = '/data/zhaohan/adv-reasoning/data/cyberkg-raw'
start_year, end_year = 2000, 2021

# One directory per year (end_year inclusive), then every file found beneath them.
cve_paths = [os.path.join(raw_path, 'cve', str(year))
             for year in range(start_year, end_year + 1)]
load_paths = [
    os.path.join(dirpath, fname)
    for year_dir in cve_paths
    for dirpath, _, fnames in os.walk(year_dir)
    for fname in fnames
]

cve2score = {}  # cve-id -> score (checked: each collected cve only has one score)
pd2cve = defaultdict(set)  # product name -> set of cve-ids mentioning that product
for csv_path in tqdm(load_paths, desc='parsing crawled CVE data'):
    table = pd.read_csv(csv_path, delimiter='|', index_col=0, header=0)
    for _, record in table.iterrows():
        cve_id = record['cve-id']
        cve2score[cve_id] = float(record['score'])

        # 'pd-info' holds ';'-separated entries, each made of four ','-separated fields.
        for entry in set(record['pd-info'].split(';')):
            if 'None' not in entry:
                # Only the third field (product) is used; the unpack still
                # enforces that every kept entry has exactly four fields.
                _, _vendor, product, _version = entry.split(',')
                pd2cve[product].add(cve_id)

parsing crawled CVE data: 100%|█████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:08<00:00,  5.08it/s]


In [2]:
# build attack-pattern & technique to CVE dict

import os, sys
sys.path.append(os.path.abspath('../..'))

import pickle
from collections import defaultdict
import genkg.cyberkg_utils as cyber

rel_dict = cyber.rel_dict
rev_rel_prefix = cyber.rev_rel_prefix

kg_path = '/data/zhaohan/adv-reasoning/save/data/cyberkg'
# NOTE(review): pickle.load can execute arbitrary code; only load trusted KG dumps.
# Context managers make sure the file handles are closed after loading.
with open(os.path.join(kg_path, 'entset.pkl'), 'rb') as fin:
    str_entset = pickle.load(fin)
with open(os.path.join(kg_path, 'factset.pkl'), 'rb') as fin:
    str_factset = pickle.load(fin)

ap_ents, tech_ents = str_entset['attack-pattern'], str_entset['technique']

print('all we have for AP %d and Tech %d ' % (len(ap_ents), len(tech_ents)))

# technique -> attack patterns linked to it
tech_ap_map = defaultdict(set)
for h, r, t in str_factset[rel_dict['technique:attack-pattern']]:
    assert h in str_entset['technique']
    assert t in str_entset['attack-pattern']
    tech_ap_map[h].add(t)

# attack pattern -> weaknesses linked to it
ap_cwe_map = defaultdict(set)
for h, r, t in str_factset[rel_dict['attack-pattern:weakness']]:
    assert h in str_entset['attack-pattern']
    assert t in str_entset['weakness']
    ap_cwe_map[h].add(t)

# weakness -> CVEs linked to it
cwe_cve_map = defaultdict(set)
for h, r, t in str_factset[rel_dict['weakness:cve-id']]:
    assert h in str_entset['weakness']
    assert t in str_entset['cve-id']
    cwe_cve_map[h].add(t)

# CVE -> attack patterns reachable via attack-pattern -> weakness -> CVE
cve_ap_map = defaultdict(set)
for ap_name in ap_ents:
    for cwe_name in ap_cwe_map[ap_name]:
        for cve in cwe_cve_map[cwe_name]:
            cve_ap_map[cve].add(ap_name)

# CVE -> techniques reachable via technique -> attack-pattern -> weakness -> CVE
cve_tech_map = defaultdict(set)
for tech_name in tech_ents:
    for ap_name in tech_ap_map[tech_name]:
        for cwe_name in ap_cwe_map[ap_name]:
            for cve in cwe_cve_map[cwe_name]:
                cve_tech_map[cve].add(tech_name)

all we have for AP 424 and Tech 596 


In [5]:
import os, sys
sys.path.append(os.path.abspath('../..'))
import pickle
from genkg.cyberkg_utils import gen_cve_miti_dict

tar_evi = 'Chrome'  # product name used as the lookup key into pd2cve

# for k in pd2cve.keys():
#     if tar_evi in k:
#         print(k)

# step1: collect the CVEs in the KG that appear in the CVE -> mitigation dict
kg_path = '/data/zhaohan/adv-reasoning/save/data/cyberkg'
# NOTE(review): pickle.load can execute arbitrary code; only load trusted KG dumps.
# Context managers make sure every file handle is closed after loading.
with open(os.path.join(kg_path, 'factset.pkl'), 'rb') as fin:
    factset = pickle.load(fin)
with open(os.path.join(kg_path, 'id_factset.pkl'), 'rb') as fin:
    id_factset = pickle.load(fin)
with open(os.path.join(kg_path, 'id2ent.pkl'), 'rb') as fin:
    id2ent = pickle.load(fin)
with open(os.path.join(kg_path, 'ent2id.pkl'), 'rb') as fin:
    ent2id = pickle.load(fin)
cve_miti_dict = gen_cve_miti_dict(kg_path)
# cve_miti_dict is keyed by entity id; translate the keys back to CVE names
inkg_cve = [id2ent[cve_id] for cve_id in cve_miti_dict.keys()]  # str cve name

# step2: intersect with tar_evi related cve
conj_cve = set(inkg_cve) & set(pd2cve[tar_evi])   # str cve name

# step3: attach the score of every remaining CVE so low-score ones can be picked
conj_cve2score = {cve_name: cve2score[cve_name] for cve_name in conj_cve}

    
def factnum(ent_id, facts_by_rel=None):
    """Count the facts in which ``ent_id`` appears as head or tail.

    Args:
        ent_id: entity id to look for.
        facts_by_rel: optional mapping whose values are iterables of
            ``(head, relation, tail)`` triples. When omitted, the
            notebook-level ``id_factset`` is used, as before.

    Returns:
        Number of matching triples. A triple whose head and tail both equal
        ``ent_id`` is counted once.
    """
    if facts_by_rel is None:
        # Backward-compatible default: fall back to the global loaded by the notebook.
        facts_by_rel = id_factset
    return sum(
        1
        for facts in facts_by_rel.values()
        for h, _, t in facts
        if h == ent_id or t == ent_id
    )

# print(sorted(conj_cve2score.values()))
score_thre = 5  # report only CVEs whose score does not exceed this value
report_fmt = '%s\tscore %.1f\tmitigation num %d\trelated factnum %d\thave AP %d\thave TECH %d'
for cve_name, score in conj_cve2score.items():
    # Both lookups happen up front, before any filtering.
    ap_num, tech_num = len(cve_ap_map[cve_name]), len(cve_tech_map[cve_name])
    wanted = score <= score_thre and ap_num > 0 and tech_num > 0
    if not wanted:
        continue
    cve_id = ent2id[cve_name]
    miti_num = len(cve_miti_dict[cve_id])
    related = factnum(cve_id)
    print(report_fmt % (cve_name, score, miti_num, related, ap_num, tech_num))

CVE-2016-1692	score 4.3	mitigation num 10	related factnum 74	have AP 17	have TECH 23
CVE-2020-6472	score 4.3	mitigation num 8	related factnum 50	have AP 59	have TECH 19
CVE-2012-2815	score 5.0	mitigation num 6	related factnum 104	have AP 59	have TECH 19
CVE-2015-1244	score 5.0	mitigation num 8	related factnum 40	have AP 59	have TECH 19
CVE-2018-18351	score 4.3	mitigation num 5	related factnum 38	have AP 51	have TECH 5
CVE-2018-6082	score 4.3	mitigation num 4	related factnum 36	have AP 59	have TECH 19
CVE-2020-6473	score 4.3	mitigation num 8	related factnum 52	have AP 60	have TECH 19
CVE-2018-6109	score 4.3	mitigation num 5	related factnum 40	have AP 59	have TECH 19
CVE-2020-6482	score 4.3	mitigation num 8	related factnum 50	have AP 3	have TECH 1
CVE-2018-6099	score 4.3	mitigation num 5	related factnum 40	have AP 59	have TECH 19
CVE-2012-2820	score 5.0	mitigation num 2	related factnum 96	have AP 51	have TECH 5
CVE-2020-15983	score 4.4	mitigation num 6	related factnum 42	have AP 51	have 

In [13]:
# check all ents related to tar_evi

# Reload the entity sets; the context manager closes the file handle after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted KG dumps.
with open(os.path.join(kg_path, 'entset.pkl'), 'rb') as fin:
    str_entset = pickle.load(fin)

# Print every product entity whose name contains tar_evi, together with its id.
for e_name in str_entset['product']:
    if tar_evi in e_name:
        print(e_name, ent2id[e_name])
        

PD:Chrome 110883
PD:Chrome Os 112227


In [1]:
# check surrounding neighbors
import os, sys
sys.path.append(os.path.abspath('../..'))
import genkg.cyberkg_utils as cyber

kg_path = '/data/zhaohan/adv-reasoning/save/data/cyberkg'

# presumably entity id -> {relation -> set of neighbour entity ids}; verify in cyberkg_utils
fact_dict = cyber.gen_factdict(kg_path)

# Entity whose neighbourhood is inspected (110883 is printed as 'PD:Chrome'
# by the product-name check in this notebook).
evi_id = 110883

# 1-hop: union of all neighbour sets stored for evi_id
sur_ents = set().union(*fact_dict[evi_id].values())
print(len(sur_ents))

# 2-hop: union of the neighbour sets of every 1-hop neighbour
sur_sur_ents = set()
for neighbor in sur_ents:
    sur_sur_ents.update(*fact_dict[neighbor].values())
print(len(sur_sur_ents))

665
2550


In [15]:
# check 1-hop neighbors of tar_evi

import genkg.cyberkg_utils as cyber

fact_dict = cyber.gen_factdict(kg_path)

# 1-hop neighbours of entity 112227 (printed as 'PD:Chrome Os' by the
# product-name check in this notebook): merge the neighbour sets of all its entries.
one_hop_neighbor = set()  # 1-hop neighbors from the tar_path anchors
for neighbor_ids in fact_dict[112227].values():
    one_hop_neighbor.update(neighbor_ids)

print(len(one_hop_neighbor))
# Show the readable name of every neighbour.
for neighbor_id in one_hop_neighbor:
    print(id2ent[neighbor_id])

50
VER:Chrome Os:ver:33.0.1750.16
VER:Chrome Os:ver:0.10.142.3
VER:Chrome Os:ver:0.12.433.28
VER:Chrome Os:ver:0.9.126.0
VER:Chrome Os:ver:0.10.156.4
VER:Chrome Os:ver:0.10.156.50
VER:Chrome Os:ver:8.0.552.343
CVE-2014-1710
CVE-2011-1042
VER:Chrome Os:ver:0.10.146.1
CVE-2011-0482
VER:Chrome Os:ver:33.0.1750.112
VER:Chrome Os:ver:0.11.257.32
VER:Chrome Os:ver:0.12.433.9
VER:Chrome Os:ver:0.11.227.0
VER:Chrome Os:ver:0.10.156.34
VER:Chrome Os:ver:0.10.156.30
VER:Chrome Os:ver:33.0.1750.124
VER:Chrome Os:ver:0.12.397.0
VER:Chrome Os:ver:33.0.1750.93
CVE-2011-2169
VER:Chrome Os:ver:33.0.1750.29
CVE-2010-4577
VER:Chrome Os:ver:33.0.1750.58
VER:Chrome Os:ver:0.12.433.14
VER:Chrome Os:ver:0.10.156.54
CVE-2011-0480
VER:Chrome Os:ver:0.11.257.14
VER:Chrome Os:ver:8.0.552.344
CVE-2014-3188
VER:Chrome Os:ver:0.10.156.20
VER:Chrome Os:ver:0.11.257.91
VER:Chrome Os:ver:33.0.1750.2
VER:Chrome Os:ver:0.12.362.2
VER:Chrome Os:ver:0.10.156.18
VER:Chrome Os:ver:0.11.257.3
VER:Chrome Os:ver:33.0.1750.51


In [6]:
# given a targeted cve, check each related ap has how many cves and has how many surrounding entities
cve = 'CVE-2021-21189'
print(len(cve_ap_map[cve]))

# The context manager closes the file handle after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted KG dumps.
with open(os.path.join(kg_path, 'ent2id.pkl'), 'rb') as fin:
    ent2id = pickle.load(fin)
fact_dict = cyber.gen_factdict(kg_path)

# attack pattern -> de-duplicated set of CVEs reachable through its weaknesses
ap_cve_map = defaultdict(set)
for ap_name in cve_ap_map[cve]:
    # n_cve sums the per-weakness CVE counts, so a CVE attached to several of
    # these weaknesses is counted once per weakness.
    n_cve = 0
    for cwe_name in ap_cwe_map[ap_name]:
        n_cve += len(cwe_cve_map[cwe_name])
        ap_cve_map[ap_name] |= cwe_cve_map[cwe_name]

    # Entities directly connected to this attack pattern in fact_dict.
    ap_id = ent2id[ap_name]
    sur_ents = set()
    for ents in fact_dict[ap_id].values():
        sur_ents |= ents

    print(ap_name, '\t', n_cve, '\t', len(sur_ents))

10
AP:593 	 2222 	 2
AP:115 	 2222 	 1
AP:151 	 2222 	 1
AP:57 	 2259 	 3
AP:22 	 17199 	 5
AP:650 	 2222 	 4
AP:633 	 2222 	 3
AP:94 	 2348 	 6
AP:114 	 2222 	 2
AP:194 	 2222 	 1


In [7]:
# Given a cve, check each tech is related to how many cves and has how many surrounding facts

cve = 'CVE-2021-21189'
print(len(cve_tech_map[cve]))

# The context manager closes the file handle after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted KG dumps.
with open(os.path.join(kg_path, 'ent2id.pkl'), 'rb') as fin:
    ent2id = pickle.load(fin)
fact_dict = cyber.gen_factdict(kg_path)

# technique -> de-duplicated set of CVEs reachable via attack patterns and weaknesses
tech_cve_map = defaultdict(set)
for tech_name in cve_tech_map[cve]:
    # n_cve sums the per-weakness CVE counts along every attack pattern, so a
    # CVE reachable through several paths is counted once per path.
    n_cve = 0
    for ap_name in tech_ap_map[tech_name]:
        for cwe_name in ap_cwe_map[ap_name]:
            n_cve += len(cwe_cve_map[cwe_name])
            tech_cve_map[tech_name] |= cwe_cve_map[cwe_name]

    # Entities directly connected to this technique in fact_dict.
    tech_id = ent2id[tech_name]
    sur_ents = set()
    for ents in fact_dict[tech_id].values():
        sur_ents |= ents

    print(tech_name, '\t', n_cve, '\t', len(sur_ents))



5
TECH:T1505.003 	 2222 	 2
TECH:T1550.001 	 2222 	 3
TECH:T1100 	 2222 	 1
TECH:T1134 	 2222 	 3
TECH:T1557 	 2348 	 3
