## CVE Explore

In [54]:
import json
#import pandas as pd
#import urllib.request
#import xml.etree.ElementTree as ET
#import xmltodict
#import spacy
import numpy as np
import nltk
#import itertools
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

## Load CVE Data

In [55]:
with open('Data/all_CVE_data.json') as f:
    cve_dict = json.load(f)

## Get CVE Descriptions

In [56]:
cve_desc_list = []

# About 202,000 Items
file_name = 'Data/all_CVE_descriptions.txt'
with open(file_name, 'w') as f:
    for i in range(len(cve_dict['cve']['item'])):
        cve_desc_list.append(cve_dict['cve']['item'][i]['desc'])
        f.write(cve_dict['cve']['item'][i]['desc'] +'\n')
f.close()

## Generate corpus

In [57]:
# Create corpus from CVE descriptions
corpus = ''

for desc in cve_desc_list:
    try:
        corpus += desc + ' '
    except:
        if desc:
            corpus += desc['html:p'][0]

## Remove stopwords, punctuation, and make lower case

In [58]:
stop_words = set(stopwords.words('english'))  

word_tokens = word_tokenize(corpus.lower())
  
filtered_corpus = [w for w in word_tokens if not w in stop_words]  
  
filtered_corpus = []  
  
for w in word_tokens:  
    if w not in stop_words and w.isalpha():  
        filtered_corpus.append(w)  

## Function to make co-occurrence matrix

In [86]:
def generate_co_occurrence_matrix(corpus,window):
    vocab = set(corpus)
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}
 
    # Create bigrams from all words in corpus
    #bi_grams = list(bigrams(corpus))
    #print(bi_grams[0:10])
    
    # Ivan - Modification - Window N
    print('[START] Create word pairs')
    N_grams = []
    for i in range(len(corpus)):
        for j in range(1,N+1):
            if i+j<len(corpus):
                N_grams.append((corpus[i],corpus[i+j]))
            if i-j>=0:
                N_grams.append((corpus[i],corpus[i-j]))
    print('[COMPLETE] Create word pairs')
    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    print('[START] Calculate number of occurrences')
    ngram_freq = nltk.FreqDist(N_grams).most_common(len(N_grams))
    print('[COMPLETE] Calculate number of occurrences')
    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
 
    # Loop through the bigrams taking the current and previous word,
    # and the number of occurrences of the bigram.
    print('[START] Generate Co-occurrence Matrix')
    for ngram in ngram_freq:
        current = ngram[0][1]
        previous = ngram[0][0]
        count = ngram[1]
        pos_current = vocab_index[current]
        pos_previous = vocab_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)
    print('[COMPLETE] Generate Co-occurrence Matrix')
    
    # return the matrix and the index
    return co_occurrence_matrix, vocab_index

## Generate co-occurrence matrix

In [87]:
matrix, vocab_index = generate_co_occurrence_matrix(filtered_corpus,5)

[START] Create word pairs
[COMPLETE] Create word pairs
4321275
43212720
[START] Calculate number of occurrences
[COMPLETE] Calculate number of occurrences
[START] Generate Co-occurrence Matrix
[COMPLETE] Generate Co-occurrence Matrix


In [88]:
data_matrix = pd.DataFrame(matrix, index=vocab_index,
                             columns=vocab_index)
print(data_matrix)

                       bevywise  team  zaptel  joomlapraise  fledrcms  cuve  \
bevywise                    0.0   0.0     0.0           0.0       0.0   0.0   
team                        0.0   4.0     0.0           0.0       0.0   0.0   
zaptel                      0.0   0.0     0.0           0.0       0.0   0.0   
joomlapraise                0.0   0.0     0.0           0.0       0.0   0.0   
fledrcms                    0.0   0.0     0.0           0.0       0.0   0.0   
...                         ...   ...     ...           ...       ...   ...   
processors                  0.0   0.0     0.0           0.0       0.0   0.0   
pami                        0.0   0.0     0.0           0.0       0.0   0.0   
importgrayquantumtype       0.0   0.0     0.0           0.0       0.0   0.0   
ragged                      0.0   0.0     0.0           0.0       0.0   0.0   
benefit                     0.0   0.0     0.0           0.0       0.0   0.0   

                       module  hasxdmxauth  simulta

In [89]:
## Save matrix 

In [90]:
data_matrix.to_csv('Data/CVE_coocurrence_matrix.csv')

KeyboardInterrupt: 