# CAPEC Data Analysis
__Ivan Ulloa - 1/29/2021__

- This nootebok imports an xml file containing CAPEC information.<br>
- The 4 fields used are ID, Description, Relations to other attacks and Relations to CWE.<br>
- The resulting descriptions are stored in CAPEC.txt to further process using Autophrase.

In [1]:
import xml.etree.ElementTree as ET
import xmltodict
import json
import os
import pandas as pd
import spacy

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_lg")
import pprint
pp = pprint.PrettyPrinter(indent=4)

##  Load CAPEC dataset and save to JSON

In [2]:
tree = ET.parse('data/1000.xml')
xml_data = tree.getroot()
# Change the encoding type to be able to set it to the one you need
xmlstr = ET.tostring(xml_data, encoding='utf-8', method='xml')

In [3]:
capec = xmltodict.parse(xmlstr)
with open('data/capec_data.json', 'w') as f:
    f.write(json.dumps(capec))

In [4]:
data_dict = dict(xmltodict.parse(xmlstr))

In [5]:
## Extract ID, Descriptions, CAPEC relationships, and CWE Relation

In [6]:
ID = []
Desc = []
Rel_CAPEC = []
Rel_CWE = []
for i in range(len(capec['ns0:Attack_Pattern_Catalog']['ns0:Attack_Patterns']['ns0:Attack_Pattern'])):
    ID.append(capec['ns0:Attack_Pattern_Catalog']['ns0:Attack_Patterns']['ns0:Attack_Pattern'][i]['@ID'])
    Desc.append(capec['ns0:Attack_Pattern_Catalog']['ns0:Attack_Patterns']['ns0:Attack_Pattern'][i]['ns0:Description'])
    try:
        Rel_CAPEC.append(capec['ns0:Attack_Pattern_Catalog']['ns0:Attack_Patterns']['ns0:Attack_Pattern'][i]['ns0:Related_Attack_Patterns']['ns0:Related_Attack_Pattern'])
    except:
        Rel_CAPEC.append('None')
    try:
        Rel_CWE.append(capec['ns0:Attack_Pattern_Catalog']['ns0:Attack_Patterns']['ns0:Attack_Pattern'][i]['ns0:Related_Weaknesses']['ns0:Related_Weakness'])
    except:
        Rel_CWE.append('None')
dict = {'ID': ID, 'Description': Desc, 'rel_CAPEC':Rel_CAPEC, 'rel_CWE':Rel_CWE}

In [7]:
CAPEC_df = pd.DataFrame(dict)
CAPEC_df

Unnamed: 0,ID,Description,rel_CAPEC,rel_CWE
0,1,"In applications, particularly web applications...","[{'@CAPEC_ID': '122', '@Nature': 'ChildOf'}, {...","[{'@CWE_ID': '276'}, {'@CWE_ID': '285'}, {'@CW..."
1,10,This attack pattern involves causing a buffer ...,"{'@CAPEC_ID': '100', '@Nature': 'ChildOf'}","[{'@CWE_ID': '120'}, {'@CWE_ID': '302'}, {'@CW..."
2,100,Buffer Overflow attacks target improper or mis...,"{'@CAPEC_ID': '123', '@Nature': 'ChildOf'}","[{'@CWE_ID': '120'}, {'@CWE_ID': '119'}, {'@CW..."
3,101,An attacker can use Server Side Include (SSI) ...,"{'@CAPEC_ID': '253', '@Nature': 'ChildOf'}","[{'@CWE_ID': '97'}, {'@CWE_ID': '74'}, {'@CWE_..."
4,102,Session sidejacking takes advantage of an unen...,"{'@CAPEC_ID': '593', '@Nature': 'ChildOf'}","[{'@CWE_ID': '294'}, {'@CWE_ID': '522'}, {'@CW..."
...,...,...,...,...
522,94,This type of attack targets the communication ...,,"[{'@CWE_ID': '300'}, {'@CWE_ID': '290'}, {'@CW..."
523,95,This attack targets the WSDL interface made av...,"{'@CAPEC_ID': '54', '@Nature': 'ChildOf'}",{'@CWE_ID': '538'}
524,96,An application typically makes calls to functi...,"{'@CAPEC_ID': '603', '@Nature': 'ChildOf', 'ns...","[{'@CWE_ID': '589'}, {'@CWE_ID': '227'}]"
525,97,Cryptanalysis is a process of finding weakness...,"[{'@CAPEC_ID': '192', '@Nature': 'ChildOf'}, {...","[{'@CWE_ID': '327'}, {'@CWE_ID': '1240'}, {'@C..."


In [8]:
# Create corpus from CAPEC descriptions
corpus = ''
counter = 0

for desc in Desc:
    try:
        corpus += desc +'\n'
    except:
        if desc:
            corpus += desc['html:p'][0] +'\n'
    counter += 1

In [9]:
doc = nlp(corpus)

In [10]:
dash = '-' * 80
print(dash)
print("{:<50}{:<10}{:<10}".format("TEXT", "POS", "DEP"))
print(dash)
for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    #print(f"{token_text:<50}{token_pos:<10}{token_dep:<10}")

--------------------------------------------------------------------------------
TEXT                                              POS       DEP       
--------------------------------------------------------------------------------


In [11]:
dash = '-' * 80
print(dash)
print("{:<50}{:<10}".format("TEXT", "ENTITY"))
print(dash)    
#for ent in doc.ents:
#    # Print the entity text and its label
#    print(f"{ent.text:<50}{ent.label_:<10}")

--------------------------------------------------------------------------------
TEXT                                              ENTITY    
--------------------------------------------------------------------------------


In [12]:
# Analyze syntax
print("Noun phrases:")
#pp.pprint(set([chunk.text for chunk in doc.noun_chunks]))

Noun phrases:


In [13]:
print("Verbs:")
#pp.pprint(set([token.lemma_ for token in doc if token.pos_ == "VERB"]))

Verbs:


In [14]:
text_file = open("Data/CAPEC_descriptions.txt", "w")
text_file.write(corpus)
text_file.close()

## Explore Corpus

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  

## Remove stopwords, punctuation, and make lower case

In [16]:
stop_words = set(stopwords.words('english'))  

word_tokens = word_tokenize(corpus.lower())
  
filtered_corpus = [w for w in word_tokens if not w in stop_words]  
  
filtered_corpus = []  
  
for w in word_tokens:  
    if w not in stop_words and w.isalpha():  
        filtered_corpus.append(w)  
        
#print(filtered_corpus) 

## Make co-occurrence matrix

In [17]:
import numpy as np
import nltk
from nltk import bigrams
import itertools
import pandas as pd
 
def generate_co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}
 
    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))
 
    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))
 
    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
 
    # Loop through the bigrams taking the current and previous word,
    # and the number of occurrences of the bigram.
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]
        pos_current = vocab_index[current]
        pos_previous = vocab_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)
 
    # return the matrix and the index
    return co_occurrence_matrix, vocab_index

In [18]:
matrix, vocab_index = generate_co_occurrence_matrix(filtered_corpus)

In [19]:
data_matrix = pd.DataFrame(matrix, index=vocab_index,
                             columns=vocab_index)
print(data_matrix)

                happens  fill  administrative  filters  symlinks  reluctance  \
happens             0.0   0.0             0.0      0.0       0.0         0.0   
fill                0.0   0.0             0.0      0.0       0.0         0.0   
administrative      0.0   0.0             0.0      0.0       0.0         0.0   
filters             0.0   0.0             0.0      3.0       0.0         0.0   
symlinks            0.0   0.0             0.0      0.0       0.0         0.0   
...                 ...   ...             ...      ...       ...         ...   
inherent            0.0   0.0             0.0      0.0       0.0         0.0   
subvert             0.0   0.0             0.0      0.0       0.0         0.0   
engineering         0.0   0.0             0.0      0.0       0.0         0.0   
intelligent         0.0   0.0             0.0      0.0       0.0         0.0   
circumvented        0.0   0.0             0.0      0.0       0.0         0.0   

                ram  able  redirecting 

## Save co-occurrence matrix to csv

In [20]:
data_matrix.to_csv('Data/CAPEC_coocurrence_matrix.csv')