In [208]:
# Imports
import json
from pprint import pprint
from collections import OrderedDict, defaultdict
from rdflib import Graph, Namespace
from rdflib.term import Literal, URIRef#, _castPythonToLiteral
from rdflib.namespace import RDF, RDFS, OWL, XSD
import uuid
from urllib.parse import quote, unquote
import logging
import re

In [280]:
### Constants
incident_dir = "/Users/v685573/Documents/Development/VCDB/data/json/validated/"
schema_filename = "/Users/v685573/Documents/Development/vzrisk/flow/verisc-owl.json"
attack_flow_namespace = "https://vz-risk.github.io/flow/attack-flow#"
veris_namespace = "https://veriscommunity.net/attack-flow#"

incident_filenames = ["61E18813-E1B8-424E-BD4A-BA4D665A4A6F.json", # no event chain
"9a244892-42ca-45ff-9307-4ac6aced0c4d.json", # 1 step
"f247f550-6f38-46f4-9a32-a45cf89d47e3.json", # duplicate steps
"13247e27-dc13-4c6b-bcac-c0973cba2c50.json", # duplicate but only 1 asset
"bf7a414b-2d56-4709-8d2d-911af426664e.json"  # no duplicates
]




In [268]:
### Functions
def recurse_instances(d, lbl, owl=owl, exclusions=[]):
    
    for k, v in d.items():
#        if len(lbl) > 0 and lbl[0] == "asset":
#            print("label: {0}, key: {1}, value: {2}".format(lbl, k, v))
        try:
            if type(v) in [OrderedDict, dict]:
                #keys = keys.union(recurse_keys(v, (lbl + (k,)), keys))
                recurse_instances(v, (lbl + (k,)), owl, exclusions=exclusions)
            elif type(v) is list: 
                for item in v:
                    if type(item) == dict:
                        #print("label: {0}, key: {1}, item: {2}".format(lbl, k, item))
                        recurse_instances(item, (lbl + (k,)), owl, exclusions=exclusions)
                    elif k == "variety":
                        # convert it to a class instance of the parent class
                        # add it to the incident
                        enum_iterator[item] += 1
                        instance_name = quote(item + "_" + str(enum_iterator[item]))
                        
                        # define instance as an instance and an instance of something
                        owl.add((i_ns[instance_name], RDF.type, OWL.NamedIndividual))
                        owl.add((i_ns[instance_name], RDF.type, anchor_map.get(".".join(lbl), veris_ns[quote(".".join(lbl + (k,item)))])))
                        
                        # Connect instance to flow
                        owl.add((i_ns[instance_name], af_ns['flow'], flowURI))

                        # if action:
                        if lbl[0] == "action":
                            # (type) = 'action'
                            # name = instance_name
                            # description
                            owl.add((i_ns[instance_name], af_ns["description.action"], Literal(incident["action"][lbl[1]].get("notes", "no decription"))))
                            # logic_operator = ""
                            owl.add((i_ns[instance_name], af_ns['logic_operator'], Literal("OR")))
            elif k == "variety":
                # convert it to a class instance of the parent class
                # add it to the incident
                instance_name = re.sub("[^0-9a-zA-Z_.\-~]+", "_", v)
                if lbl[0] == "asset" and v not in ["Unknown", "Other"]:
                    instance_name = instance_name[4:]
                enum_iterator[instance_name] += 1
                instance_name = quote(instance_name + "_" + str(enum_iterator[instance_name]))

                # define instance as an instance and an instance of something
                owl.add((i_ns[instance_name], RDF.type, OWL.NamedIndividual))
                owl.add((i_ns[instance_name], RDF.type, anchor_map.get(".".join(lbl), veris_ns[quote(".".join(lbl + (k,v)))])))

                # Connect instance to flow
                owl.add((i_ns[instance_name], af_ns['flow'], flowURI))
        except:
            print("label: {0}, key: {1}, value: {2}".format(lbl, k, v))
            raise
                      
    return owl
                        
                        
def recurse_properties(d, lbl, owl, exclusions=[]):
    for k, v in d.items():
        try:
            if type(v) in [OrderedDict, dict]:
                owl = recurse_properties(v, (lbl + (k,)), owl, exclusions=exclusions)
                
            elif k == "variety":
                pass # varieties are all instances and should already be handled
            
            elif (type(v) is list):
                for item in v:
                    if type(item) == dict:
                        recurse_properties(item, (lbl + (k,)), owl, exclusions=exclusions)
                    else:
                        # define it's flow
                        owl.add((veris_ns[quote(".".join(lbl + (k, item)))], af_ns['flow'], flowURI))
                        
                        # if we know what instance it goes to, connect it.
                        if str(veris_ns[quote(".".join(lbl))]) in instances.keys() and len(instances[str(veris_ns[quote(".".join(lbl))])]) == 1:
                            owl.add((instances[str(veris_ns[quote(".".join(lbl))])][0], veris_ns[quote(".".join(lbl + (k, )))], veris_ns[quote(".".join(lbl + (k, item)))]))
            elif (".".join((lbl + (k,str(v)))) in exclusions):
                pass
            
            else:
                if quote(".".join(lbl + (k,))) in obj_props:
                    if str(veris_ns[quote(".".join(lbl))]) in instances.keys() and len(instances[str(veris_ns[quote(".".join(lbl))])]) == 1:
                        owl.add((instances[str(veris_ns[quote(".".join(lbl))])][0], veris_ns[quote(".".join(lbl + (k, )))], veris_ns[quote(".".join(lbl + (k, v)))]))
                    else:
                        owl.add((veris_ns[quote(".".join(lbl[:-1]))], af_ns['flow'], flowURI))
                        owl.add((veris_ns[quote(".".join(lbl[:-1]))], veris_ns[quote(".".join(lbl + (k, )))], veris_ns[quote(".".join(lbl + (k, v)))]))
                elif quote(".".join(lbl + (k,))) in data_props:
                    if str(veris_ns[quote(".".join(lbl))]) in instances.keys() and len(instances[str(veris_ns[quote(".".join(lbl))])]) == 1:
                        owl.add((instances[str(veris_ns[quote(".".join(lbl))])][0], veris_ns[quote(".".join(lbl + (k, )))], Literal(v)))
                    else:
                        owl.add((veris_ns[quote(".".join(lbl))], af_ns['flow'], flowURI))
                        owl.add((veris_ns[quote(".".join(lbl))], veris_ns[quote(".".join(lbl + (k, )))], Literal(v)))
                else:
                    logging.warning("{0} is not in the object property or datatype property lists.".format(".".join(lbl + (k,))))
                              
        except:
            print("label: {0}, key: {1}, value: {2}".format(lbl, k, v))
            raise
               
    return owl
                              
                              

In [308]:
def guess_temporal_relationships(incident, owl):
    event_chain_lookup = {
        "ext": "external", "int": "internal", "prt": "partner", "unk": "Unknown",
        "env": "environmental", "err": "Error", "hak": "hacking", "soc": "Social", 
        "mal": "malware", "mis": "misuse", "phy": "Physical",
        "au": "availability", "cp": "confidentiality", "ia": "integrity",
        "emb": "embedded", "med": "media", "net": "network", "ppl": "people",
        "srv": "server", "ter": "public%20terminal", "usr": "User%device"
    }
    
    
    # collect action information but try event_chain first
    query = ("""SELECT DISTINCT  ?inst ?thing
    WHERE { 
      ?inst rdf:type owl:NamedIndividual .
      ?inst rdf:type ?thing .
       FILTER (?thing != owl:NamedIndividual)
    }""")
    qres = oincident.query(query)
    res = list(qres)
    actions = [item[0] for item in res if item[1].split("#")[1].startswith("action")]
    assets = [item[0] for item in res if item[1].split("#")[1].startswith("asset")]
    attributes = [item[0] for item in res if item[1].split("#")[1].startswith("attribute")]

    # If there is a event chain, 
    if "event_chain" in incident['plus']
        occurrence_counts = {
            "incident": {
                "action": defaultdict(int),
                "asset": defaultdict(int),
                "attribute": defaultdict(int)
            },
            "oincident": {
                "action": defaultdict(int),
                "asset": defaultdict(int),
                "attribute": defaultdict(int)
            }
        }

        for action in actions:
            occurrence_counts['oincident']['action'][action.split(".")[1]] += 1
        for asset in assets:
            occurrence_counts['oincident']['asset'][asset.split(".")[3].split("%20-%20")[0]] += 1
        for step in incident['plus']['event_chain']:
            occurrence_counts['incident']['action'][step.get("action", "Unknown")] += 1
            occurrence_counts['incident']['asset'][step.get("asset", "Unknown")] += 1
            occurrence_counts['incident']['attribute'][step.get("attribute", "Unknown")] += 1

        if len(incident['plus']['event_chain']) == 1:
            print("can parse because only 1 step")

        elif (all([v == 1 for k,v in occurrence_counts['incident']['action'].items()]) and 
              all([v == 1 for k,v in occurrence_counts['incident']['asset'].items()]) and
              all([v == 1 for k,v in occurrence_counts['incident']['attribute'].items()])):
            print("can parse because each item only occurs once")

        elif True: # TODO
            print("can parse because even though duplicate items, they link to only one instance")

    
    # if only one action & asset, you can assume the sequence
    elif len(assets) == 1 and len(actions) == 1:
        for attribute in attributes:
            owl.add((URIRef(attribute), RDFS.subPropertyOf, af_ns['state_change']))
            owl.add((URIRef(actions[0]), URIRef(attribute), URIRef(assets[0])))
            
    else:
        logging.info("No available logic to sequence actions and assets.")
    
    return(owl)

SyntaxError: invalid syntax (<ipython-input-308-6d0674c19485>, line 22)

In [274]:
def incident_to_owl(incident):
    global obj_props, data_props, enum_iterator, anchor_map, flowURI, instances, asset_map
    
    exclusions = ["incident_id", "plus.master_id", "plus.created", "plus.analyst", "summary"]
    incident_filename = "/Users/v685573/Documents/Development/VCDB/data/json/validated/61E18813-E1B8-424E-BD4A-BA4D665A4A6F.json"
    schema_filename = "/Users/v685573/Documents/Development/vzrisk/flow/verisc-owl.jsonLD"
    attack_flow_namespace = "https://vz-risk.github.io/flow/attack-flow#"
    veris_namespace = "https://veriscommunity.net/attack-flow#"
    
    # create namespace from victim_id
    i_ns = incident['victim'].get('victim_id', uuid.uuid4()).lower()
    i_ns = re.sub("[^0-9a-zA-Z_.\-~]+", "_", i_ns)
    i_ns = Namespace("urn:absolute:" + quote(i_ns) + "#")
    af_ns = Namespace(attack_flow_namespace)
    veris_ns = Namespace(veris_namespace)
    
    # open veris schema
    veris = Graph()
    veris.parse(schema_filename)
    # Get object and data properties so we know which are which when parsing them out of the incident
    query = ("""SELECT DISTINCT  ?p 
    WHERE { 
      ?p rdf:type owl:ObjectProperty .
    }""")
    qres = veris.query(query)
    obj_props = list(qres)
    obj_props = [item[0].split("#")[1] for item in obj_props]
    query = ("""SELECT DISTINCT  ?p 
    WHERE { 
      ?p rdf:type owl:DatatypeProperty .
    }""")
    qres = veris.query(query)
    data_props = list(qres)
    data_props = [item[0].split("#")[1] for item in data_props]   
    # all we needed were the property lists
    del(veris)
    
    # to number instances
    enum_iterator = defaultdict(int)

    # to map from veris_ns to attack flow ns
    anchor_map = {
        "action": af_ns["action"],
        "asset": af_ns["asset"],
        "extra": af_ns["property"]
    }
    
    # start the incident's graph
    owl = Graph()
    
    ### create any manditory fields in AF
    # Create flow instance, flow id
    flowURI = i_ns[incident['plus']['master_id']] # to object
    owl.add((flowURI, RDF.type, OWL.NamedIndividual))
    owl.add((flowURI, RDF.type, af_ns['attack-flow']))
    # flow name literal
    owl.add((flowURI, af_ns['name.attack-flow'], Literal(incident['incident_id'])))
    # flow created literal
    owl.add((flowURI, af_ns['created'], Literal(incident['plus'].get("created", "1970-01-01T01:00:00Z"))))
    # flow author literal
    owl.add((flowURI, af_ns['author'], Literal(incident['plus'].get("analyst", "Unknown"))))
    # flow description literal
    owl.add((flowURI, af_ns['description.attack-flow'], Literal(incident['summary'])))
    
    recurse_instances(incident, (), owl, exclusions=exclusions)
    
    query = ("""SELECT DISTINCT  ?inst ?thing
    WHERE { 
      ?inst rdf:type owl:NamedIndividual .
      ?inst rdf:type ?thing .
       FILTER (?thing != owl:NamedIndividual)
    }""")
    qres = oincident.query(query)
    instances = defaultdict(set)
    for inst,thing in qres:
        instances[str(thing)].add(str(inst))
    instances = dict()
    instances = {".".join(k.split(".")[:2]):list(v) for k,v in instances.items()}
    
    recurse_properties(incident, (), owl, exclusions=exclusions)    
    
    # Determine causal linkages between actions if possible (use value.chain and or single-action)
    owl = guess_temporal_relationships(incident, owl)
    
    return(owl)

In [306]:
with open(incident_dir + incident_filenames[2], 'r') as filehandle:
    incident = json.load(filehandle)

In [276]:
oincident = incident_to_owl(incident)



In [178]:
with open("/Users/v685573/Documents/Development/vzrisk/flow/61E18813-E1B8-424E-BD4A-BA4D665A4A6F.owl", 'w') as filehandle:
    filehandle.write(oincident.serialize(format="xml"))

In [24]:
schema = Graph()
schema.parse(schema_filename)

<Graph identifier=N745bdef112eb4a909071cbf235a1de1d (<class 'rdflib.graph.Graph'>)>

In [285]:
incident

{'action': {'error': {'variety': ['Misconfiguration'],
   'vector': ['Carelessness']}},
 'actor': {'internal': {'motive': ['NA'], 'variety': ['System admin']}},
 'asset': {'assets': [{'amount': 1, 'variety': 'S - Database'}],
  'cloud': ['External Cloud Asset(s)'],
  'country': ['US'],
  'role': ['IT'],
  'total_amount': 1},
 'attribute': {'confidentiality': {'data': [{'amount': 260000,
     'variety': 'Personal'}],
   'data_disclosure': 'Yes',
   'data_total': 260000,
   'data_victim': ['Customer'],
   'state': ['Stored unencrypted']}},
 'confidence': 'High',
 'discovery_method': {'external': {'variety': ['Security researcher']}},
 'impact': {'overall_rating': 'Insignificant'},
 'incident_id': 'd7168500-0a54-11eb-a175-e162319ad19d',
 'plus': {'analysis_status': 'Reviewed',
  'analyst': 'gbassett',
  'asset_os': ['Other'],
  'attribute': {'confidentiality': {'data_abuse': 'No'}},
  'created': '2020-10-09T22:39:34.600Z',
  'dbir_year': 2021,
  'event_chain': [{'action': 'err',
    'acto

In [277]:
print(oincident.serialize(format="json-ld"))

[
  {
    "@id": "https://veriscommunity.net/attack-flow#victim",
    "https://veriscommunity.net/attack-flow#victim.industry": [
      {
        "@value": "000"
      }
    ],
    "https://veriscommunity.net/attack-flow#victim.victim_id": [
      {
        "@value": "Mass Event"
      }
    ],
    "https://vz-risk.github.io/flow/attack-flow#flow": [
      {
        "@id": "urn:absolute:mass_event#61E18813-E1B8-424E-BD4A-BA4D665A4A6F"
      }
    ]
  },
  {
    "@id": "urn:absolute:mass%20event#Defacement_1",
    "@type": [
      "http://www.w3.org/2002/07/owl#NamedIndividual",
      "https://veriscommunity.net/attack-flow#attribute.integrity.variety.Defacement"
    ],
    "http://www.w3.org/2000/01/rdf-schema#subPropertyOf": [
      {
        "@id": "https://vz-risk.github.io/flow/attack-flow#state_change"
      }
    ],
    "https://vz-risk.github.io/flow/attack-flow#flow": [
      {
        "@id": "urn:absolute:mass_event#61E18813-E1B8-424E-BD4A-BA4D665A4A6F"
      }
    ]
  },
  {
