# Classification of Security-Relevant Configuration Settings Using Topic Modeling and Latent Dirichlet Allocation

This notebook is part of the paper *Automated Identification of Security-Relevant Configuration Settings Using NLP* submitted to the [**37<sup>th</sup> IEEE/ACM International Conference on Automated Software Engineering (ASE)**](https://conf.researchr.org/track/ase-2022/ase-2022-industry-showcase).

The other notebooks can be found here:

- [Sentiment Analysis](https://www.kaggle.com/tumin4/sentiment-analysis/)
- [Transformer-based Machine Learning](https://www.kaggle.com/tumin4/transformer-based-machine-learning)

and on [GitHub](https://github.com/tum-i4/Automated-Identification-of-Security-Relevant-Configuration-Settings-Using-NLP/).

## Contact

If you have any questions, please contact [Patrick Stöckle](mailto:patrick.stoeckle@tum.de?subject=Kaggle%20Notebook%20%22Topic%20Modeling%20and%20Latent%20Dirichlet%20Allocation%22).


1. Acknowledgements
2. Import libraries
3. Load Data
4. Prepare Data
5. Generating security stop words
6. Coherence evaluation
7. Latent Dirichlet Allocation - Topic model generation
8. Classification
9. Evaluation
10. Visualization

### Acknowledgements

Create topic clusters from security settings with Latent Dirichlet Allocation (LDA).

References:
* https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
* https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24#:~:text=Latent%20Dirichlet%20Allocation%20(LDA)%20is,and%20split%20them%20into%20topics.

### Import libraries

In [None]:
from json import load
import matplotlib.pyplot as plt
from gensim.corpora import Dictionary
from pandas import DataFrame
import numpy as np
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import Text, word_tokenize, FreqDist
from random import shuffle
from gensim import corpora, models
import gensim
import pyLDAvis.gensim
import pickle
from gensim import models
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score

# Alias for gensim's built-in English stop-word set.
# NOTE(review): no later use of this alias is visible in this notebook; the cells
# below reference gensim.parsing.preprocessing.STOPWORDS directly.
stop_w = gensim.parsing.preprocessing.STOPWORDS
# Seeds NumPy's global random generator only. Python's `random` module (the
# source of `shuffle` imported above) is a separate generator and is not seeded here.
np.random.seed(2018)

### Load Data

In [None]:
def load_prepare_data(path, name):
    """
    Load the setting descriptions of one system variant from the ase2022 dataset.

    1. Load the labeled configuration settings from the JSON file at ``path``
    2. Shuffle them in place
    3. Remove Internet Explorer settings
    4. Print how many settings remain

    Args:
    path: path to a JSON file containing a list of documents; every document
          must provide a "text" entry
    name: name of system variant (only used in the printed summary)

    Returns:
    A list with the "text" of every remaining setting (empty if nothing remains)

    """
    with open(path, encoding="utf-8") as f_read:
        docs_all = load(f_read)
    shuffle(docs_all)
    # Only the texts are returned, so no intermediate DataFrame/label column is
    # built; this also avoids the positional-axis `DataFrame.drop(label, 1)` call
    # that newer pandas versions reject, and keeps empty inputs from failing.
    ie_marker = "windows components. internet explorer."
    texts = [doc["text"] for doc in docs_all if ie_marker not in doc["text"].lower()]
    print(f"{name} settings: {len(texts)}")
    return texts

In [None]:
# Texts of every system variant, keyed "<variant>_sec" / "<variant>_non_sec".
# Each variant is described once as (key prefix, dataset sub-directory, printed label);
# the security-relevant file of a variant is loaded before its non-security file.
docs: dict = {
    f"{variant}_{kind}": load_prepare_data(
        f"../input/ase2022/docs/{subdir}/{kind}_docs.json",
        f"{tag} {label}",
    )
    for variant, subdir, label in (
        ("CIS_18", "cis/1803", "CIS 1803"),
        ("CIS_19", "cis/1909", "CIS 1909"),
        ("CIS_Server", "cis/server2016", "CIS Server2016"),
        ("SIE_19", "siemens/1909", "Siemens 1909"),
        ("SIE_Server", "siemens/server2016", "Siemens Server2016"),
    )
    for kind, tag in (("sec", "SEC"), ("non_sec", "NON-SEC"))
}

### Data preprocessing

In [None]:
def lemmatize_stemming(text):
    """
    Reduce a single token to its stem.

    1. lemmatize the token with WordNetLemmatizer
    2. stem the resulting lemma with the English SnowballStemmer
    """
    stemmer = SnowballStemmer("english")
    lemma = WordNetLemmatizer().lemmatize(text)
    return stemmer.stem(lemma)

In [None]:
def preprocess(text):
    """
    Turn a document into a list of stemmed tokens.

    1. tokenize with gensim's simple_preprocess
    2. drop gensim stop words
    3. keep only tokens w with 2 < len(w) < 16
    4. lemmatize and stem the remaining tokens
    5. drop stems listed in security_stop_words
    """
    general_stop_words = gensim.parsing.preprocessing.STOPWORDS
    stems = []
    for token in gensim.utils.simple_preprocess(text):
        if token in general_stop_words:
            continue
        if not 2 < len(token) < 16:
            continue
        stem = lemmatize_stemming(token)
        # The security stop words are stems, so they are matched after stemming.
        if stem not in security_stop_words:
            stems.append(stem)
    return stems

### Generating security stop words:


##### Most common words in security vocabulary

In [None]:
# Stem every alphabetic, non-stop-word token of the CIS 1909 security documents
# and list the 300 most frequent stems as candidates for the security stop words.
sec_lemma_tokens = Text(
    [
        lemmatize_stemming(token)
        for document in docs['CIS_19_sec']
        for token in word_tokenize(document.lower())
        if token.isalpha() and token not in gensim.parsing.preprocessing.STOPWORDS
    ]
)
f_dist_sec_lemma_tokens = FreqDist(sec_lemma_tokens)
most_common = f_dist_sec_lemma_tokens.most_common(300)
most_common_words = [word for word, _count in most_common]
print(most_common_words)

##### Manual extraction of security stop words: frequent, but not security-identifying words.

In [None]:
# Manually selected stems that are frequent in the security vocabulary but do not
# identify security relevance; `preprocess` removes them after stemming.
security_stop_words = {
    'abl', 'accept', 'activ', 'add', 'addit', 'affect', 'algorithm', 'allow',
    'alreadi', 'also', 'alway', 'appear', 'avail',
    'back', 'bar', 'behavior', 'box',
    'charact', 'check', 'choos', 'client', 'com', 'command', 'comput',
    'configur', 'control', 'creat',
    'data', 'decid', 'default', 'desktop', 'determin', 'devic', 'disabl', 'domain',
    'edg', 'effect', 'employe', 'enabl', 'exampl',
    'featur', 'field', 'fix', 'folder', 'follow',
    'host', 'howev',
    'includ', 'input', 'instead',
    'kilobyt',
    'let', 'local', 'longer',
    'make', 'manag', 'may', 'megabyt', 'method', 'microsoft', 'ms', 'must',
    'new',
    'offer', 'one', 'oper', 'option', 'order',
    'password', 'polici', 'powershel', 'prevent', 'process', 'prompt',
    'properti', 'public',
    'rdp', 'reach', 'remot', 'remov', 'requir', 'resum',
    'select', 'server', 'set', 'specifi', 'state', 'still', 'system',
    'take', 'task', 'temporari', 'terabyt', 'tool', 'tpm', 'turn', 'type',
    'usb', 'use', 'user',
    'valu', 'via', 'vista',
    'want', 'whether', 'window', 'winrm', 'without',
}
print(security_stop_words)

### Coherence evaluation

In [None]:
# https://www.tutorialspoint.com/gensim/gensim_documents_and_lda_model.htm
# https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0#:~:text=Topic%20Coherence%20measures%20score%20a,are%20artifacts%20of%20statistical%20inference.
def coherence_values_computation(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LDA model per candidate number of topics and score its coherence.

    The candidate numbers of topics are range(start, limit, step).

    Returns:
    (model_list, coherence_values): the trained models and their coherence
    scores, both in the order of the candidate numbers of topics
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        lda = gensim.models.LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=4,
            alpha='auto',
            per_word_topics=True,
        )
        model_list.append(lda)
        scorer = gensim.models.coherencemodel.CoherenceModel(
            model=lda,
            texts=texts,
            dictionary=dictionary,
        )
        coherence_values.append(scorer.get_coherence())
    return model_list, coherence_values

In [None]:
def coherence_evaluation(docs_input):
    """
    evaluate and plot coherence values

    Trains LDA models for 1, 4, 7, ..., 28 topics on the TF-IDF weighted corpus
    stored in the ase2022 dataset, scores each model against the preprocessed
    ``docs_input``, plots the scores, saves the plot to 'CoherenceEval.png'
    and prints one line per number of topics.

    Args:
    docs_input: list of raw document strings used as reference texts for the
                coherence score

    Returns:
    The list of trained LDA models, ordered by number of topics
    """
    # One definition of the evaluated range, shared by training and plotting.
    start, limit, step = 1, 30, 3

    text_data = [preprocess(doc.lower()) for doc in docs_input]
    dictionary = gensim.corpora.Dictionary.load('../input/ase2022/dictionary.gensim')
    # NOTE(review): pickle executes code on load; only use with the trusted dataset file.
    with open('../input/ase2022/corpus.pkl', 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    model_list, coherence_values = coherence_values_computation(
        dictionary=dictionary,
        corpus=corpus_tfidf,
        texts=text_data,
        start=start,
        limit=limit,
        step=step,
    )
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    # The labels must be a sequence of strings: a bare string would be split
    # into characters and the legend entry would read just "c".
    plt.legend(["coherence_values"], loc='best')
    plt.savefig('CoherenceEval.png', bbox_inches='tight')
    plt.show()
    for m, cv in zip(x, coherence_values):
        print("Num Topics =", m, " is having Coherence Value of", round(cv, 4))
    return model_list
# Run the coherence evaluation on the security-relevant CIS 1909 settings.
coherence_evaluation(docs_input=docs['CIS_19_sec'])

### Latent Dirichlet Allocation - Topic model generation

In [None]:
def lda_topic_model_generation(docs_input):
    """
    build lda topic model with 9 topics using TF-IDF model

    Besides training, this writes three files to the working directory:
    'corpusNew.pkl' (bag-of-words corpus), 'dictionaryNew.gensim' (dictionary)
    and 'modelNew.gensim' (trained LDA model). The ten top words of every topic
    are printed.

    Args:
    docs_input: list of raw document strings to train on

    Returns:
    The trained LDA model
    """
    text_data = [preprocess(doc.lower()) for doc in docs_input]
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    # Close the file deterministically so the pickled corpus is fully flushed.
    with open('corpusNew.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save('dictionaryNew.gensim')

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    n_topics = 9
    ldamodel = gensim.models.LdaModel(
        corpus_tfidf,
        num_topics=n_topics,
        id2word=dictionary,
        passes=4,
        alpha='auto',
    )
    for topic in ldamodel.print_topics(num_words=10):
        print(topic)
    ldamodel.save("modelNew.gensim")

    return ldamodel

# Uncomment to train a new topic model
# model = lda_topic_model_generation(docs['CIS_19_sec'])

### Classification

In [None]:
def classification(docs_input: list, model, threshold, trained_dict, tfidf):
    """
    classify input documents: if topic probability for one of the topics is > threshold -> doc is sec-relevant

    Args:
    docs_input: list of raw document strings
    model: topic model queried via get_document_topics
    threshold: a document is security-relevant if any topic probability is
               strictly greater than this value
    trained_dict: dictionary used to convert a preprocessed document to bag-of-words
    tfidf: TF-IDF model applied to the bag-of-words before querying the topic model

    Returns:
    A list with one entry per input document, in input order:
    1 (security-relevant) or 0 (not security-relevant)
    """
    sec_counter = []
    for doc in docs_input:
        new_doc = preprocess(doc.lower())
        new_doc_bow = trained_dict.doc2bow(new_doc)
        doc_tfidf = tfidf[new_doc_bow]
        topic_probs = model.get_document_topics(doc_tfidf)
        sec_relevant = any(prob > threshold for _topic, prob in topic_probs)
        sec_counter.append(1 if sec_relevant else 0)
    return sec_counter

### Evaluation

In [None]:
def evaluation(sec_docs: list, non_sec_docs: list, model, threshold, trained_dict):
    """
    evaluate the model on the input data

    Classifies all documents and prints precision, recall, F-score and
    balanced accuracy, using label 1 for ``sec_docs`` and 0 for ``non_sec_docs``.

    Args:
    sec_docs: list of raw security-relevant document strings
    non_sec_docs: list of raw non-security-relevant document strings
    model: trained topic model
    threshold: topic-probability threshold handed to ``classification``
    trained_dict: dictionary used to build the bag-of-words representation

    NOTE(review): the TF-IDF model is fitted on the documents being evaluated,
    not on the corpus the topic model was trained with - confirm this is intended.
    """
    all_docs = sec_docs + non_sec_docs
    text_data = [preprocess(doc.lower()) for doc in all_docs]
    corpus = [trained_dict.doc2bow(text) for text in text_data]
    tfidf = models.TfidfModel(corpus)

    test_y = [1] * len(sec_docs) + [0] * len(non_sec_docs)
    # Documents are classified independently of each other, so a single pass
    # over all_docs yields the predictions in the same order as test_y.
    y_pred = classification(all_docs, model, threshold, trained_dict, tfidf)

    prec = precision_score(test_y, y_pred, zero_division=0)
    rec = recall_score(test_y, y_pred, zero_division=0)
    f1 = f1_score(test_y, y_pred, zero_division=0)
    bal_acc = balanced_accuracy_score(test_y, y_pred)

    print('Precision: {:4.2f}'.format(prec))
    print('Recall: {:4.2f}'.format(rec))
    print('F-Score: {:4.2f}'.format(f1))
    print('Balanced accuracy: {:4.2f}'.format(bal_acc))

In [None]:
def classify_and_evaluate(eval_sec, eval_non_sec, name, threshold=0.7):
    """
    classify input docs with the given threshold (default 0.7) and evaluate

    Loads the dictionary and the LDA model shipped with the ase2022 dataset,
    prints the threshold and ``name`` as a header and then the evaluation metrics.

    Args:
    eval_sec: list of raw security-relevant document strings
    eval_non_sec: list of raw non-security-relevant document strings
    name: label printed above the metrics
    threshold: topic-probability threshold above which a document counts as
               security-relevant
    """
    dictionary = gensim.corpora.Dictionary.load('../input/ase2022/dictionary.gensim')
    ldamodel = gensim.models.ldamodel.LdaModel.load('../input/ase2022/model.gensim')
    print(f"\nThreshold {threshold}:")
    print(name)
    evaluation(eval_sec, eval_non_sec, ldamodel, threshold, dictionary)

In [None]:
# Evaluate the shipped topic model on every system variant.
classify_and_evaluate(
    eval_sec=docs['CIS_18_sec'],
    eval_non_sec=docs['CIS_18_non_sec'],
    name="CIS 1803: ",
)
classify_and_evaluate(
    eval_sec=docs['CIS_19_sec'],
    eval_non_sec=docs['CIS_19_non_sec'],
    name="CIS 1909: ",
)
classify_and_evaluate(
    eval_sec=docs['SIE_19_sec'],
    eval_non_sec=docs['SIE_19_non_sec'],
    name="Siemens 1909: ",
)
classify_and_evaluate(
    eval_sec=docs['CIS_Server_sec'],
    eval_non_sec=docs['CIS_Server_non_sec'],
    name="CIS Server 2016: ",
)
classify_and_evaluate(
    eval_sec=docs['SIE_Server_sec'],
    eval_non_sec=docs['SIE_Server_non_sec'],
    name="Siemens Server 2016: ",
)

### Topic Visualization

In [None]:
# Load the dictionary, corpus and LDA model shipped with the ase2022 dataset
# and render the interactive pyLDAvis topic view.
dct = gensim.corpora.Dictionary.load('../input/ase2022/dictionary.gensim')
# NOTE(review): pickle executes code on load; only use with the trusted dataset file.
with open('../input/ase2022/corpus.pkl', 'rb') as corpus_file:
    corp = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('../input/ase2022/model.gensim')
# sort_topics=False is passed so that pyLDAvis does not reorder the topics.
lda_display = pyLDAvis.gensim.prepare(lda, corp, dct, sort_topics=False)
pyLDAvis.display(lda_display)