In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem import WordNetLemmatizer, porter
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import precision_recall_fscore_support

import os

In [3]:
# Load the technique table; the preview printed below shows an `id` column and a `desc` column.
df = pd.read_csv('Final techniques.csv')
print(df.head())

      id                                               desc
0  T1595  Adversaries may execute active reconnaissance ...
1  T1592  Adversaries may gather information about the v...
2  T1589  Adversaries may gather information about the v...
3  T1590  Adversaries may gather information about the v...
4  T1591  Adversaries may gather information about the v...


In [4]:
import spacy
# spaCy pipeline whose sentence boundaries are used by split_into_sentences.
nlp = spacy.load("en_core_web_sm")


def split_into_sentences(text):
    """Return the text of each sentence spaCy finds in ``text``, in document order."""
    sentence_texts = []
    for span in nlp(text).sents:
        sentence_texts.append(span.text)
    return sentence_texts

# One row per sentence: split every description, explode the resulting lists,
# renumber the rows, and keep only the technique id and the sentence text.
df = (
    df.assign(sentences=df['desc'].apply(split_into_sentences))
    .explode('sentences')
    .reset_index(drop=True)
)[['id', 'sentences']]
print(df)

In [42]:
# Persist the sentence-level table; index=False keeps the row index out of the CSV.
df.to_csv('Final.csv', index=False)

In [5]:
import pandas as pd
# One extra sentence for technique T1008, appended as an additional row.
new_row = {
    'id': 'T1008',
    'sentences': 'Adversaries may use fallback or alternate communication channels if the primary channel is compromised or inaccessible in order to maintain reliable command and control and to avoid data transfer thresholds.',
}
df_T = pd.DataFrame([new_row])

# ignore_index=True renumbers the rows 0..n-1 as part of the concatenation.
df = pd.concat([df, df_T], ignore_index=True)

In [6]:
# Display the sentence table; the cell's last expression is rendered as its output.
df

Unnamed: 0,id,sentences
0,T1595,Adversaries may execute active reconnaissance ...
1,T1595,Active scans are those where the adversary pro...
2,T1595,Adversaries may perform different forms of act...
3,T1595,These scans can also be performed in various w...
4,T1592,Adversaries may gather information about the v...
...,...,...
5739,T1498,Adversaries may be able to generate an increas...
5740,T1498,The extent of this increase will depending upo...
5741,T1498,Two prominent protocols that have enabled Refl...
5742,T1498,"In particular, the memcache protocol showed it..."


In [45]:
# Write the table to Final.csv again, this time including the T1008 row appended in an earlier cell.
df.to_csv('Final.csv', index=False)

In [7]:
def lemmatize_set(dataset):
    """Lemmatize every token of every sentence in ``dataset``.

    Each sentence is tokenized with ``word_tokenize``, each token is passed
    through a ``WordNetLemmatizer``, and the tokens are joined back together
    with single spaces.

    Returns a list holding one processed string per input sentence, in the
    same order as ``dataset``.
    """
    wnl = WordNetLemmatizer()

    def _lemmatize_sentence(sentence):
        # Tokenize, lemmatize token by token, then re-join with spaces.
        return ' '.join(map(wnl.lemmatize, word_tokenize(sentence)))

    return [_lemmatize_sentence(sentence) for sentence in dataset]

def stemmatize_set(dataset):
    """Stem every token of every sentence in ``dataset`` with the Porter stemmer.

    Each sentence is tokenized with ``word_tokenize``, each token is stemmed,
    and the stems are joined back together with single spaces.

    Returns a list holding one processed string per input sentence, in the
    same order as ``dataset``.
    """
    stemmer = porter.PorterStemmer()
    return [
        ' '.join(stemmer.stem(token) for token in word_tokenize(sentence))
        for sentence in dataset
    ]

In [5]:
import nltk
# Download the NLTK packages this notebook asks for, in the original order.
for nltk_package in ('wordnet', 'omw-1.4', 'punkt'):
    nltk.download(nltk_package)

# Work on an independent deep copy of the sentence table.
data_df = df.copy(deep=True)

# Number of distinct technique ids present in the table.
num_classes = data_df['id'].nunique()
print(num_classes)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amellouk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Amellouk\AppData\Roaming\nltk_data...


KeyboardInterrupt: 

In [9]:
# TF-IDF over word uni- and bi-grams with English stop words removed, capped at 10,000 features.
vectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english', max_features=10000, ngram_range=(1,2))
# Every sentence is stemmed first; the stemmed text is then passed through the lemmatizer.
stemmatized_set = stemmatize_set(data_df.sentences)
lemmatized_set = lemmatize_set(stemmatized_set)
# NOTE(review): this fit uses every sentence in data_df. A later cell calls
# fit_transform again on the training split only and reassigns x_train_vectors.
x_train_vectors = vectorizer.fit_transform(lemmatized_set)
# Feature names as learned by the fit on the line above.
bow_vocab = vectorizer.get_feature_names_out()

In [10]:
# Binarize the technique ids: learn the label set and encode the column in one call.
encoder = LabelBinarizer()
y_enc = encoder.fit_transform(data_df.id)
# The labels the encoder learned from the id column.
text_label = encoder.classes_

In [11]:
# Hold out 20% of the sentences for evaluation. The split is stratified on the
# technique id and uses a fixed seed so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_df.sentences,
    data_df.id,
    test_size=0.2,
    random_state=4,
    stratify=data_df.id,
)

# Training split: stem, lemmatize, then (re)fit the TF-IDF vectorizer on it.
stemmatized_set = stemmatize_set(X_train)
lemmatized_set = lemmatize_set(stemmatized_set)
x_train_vectors = vectorizer.fit_transform(lemmatized_set)
# fit_transform relearns the vocabulary, so refresh bow_vocab here; otherwise it
# would keep the feature names from the earlier fit on the full corpus and no
# longer line up with the columns of x_train_vectors / x_test_vectors.
bow_vocab = vectorizer.get_feature_names_out()

# Test split: same preprocessing, but only transform with the fitted vectorizer.
stemmatized_set = stemmatize_set(X_test)
lemmatized_set = lemmatize_set(stemmatized_set)
x_test_vectors = vectorizer.transform(lemmatized_set)

In [6]:
import joblib
from sklearn.metrics import accuracy_score

def fit_model(classifier, name, X_train, y_train, X_test, y_test):
    """Train ``classifier``, print evaluation metrics, and save it to disk.

    Parameters
    ----------
    classifier : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    name : str
        Label used in the printed messages and as the pickle's base file name.
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features and labels the classifier is evaluated on.

    Side effects
    ------------
    Prints the shapes of ``y_train`` and of the predictions, the accuracy and
    the weighted precision / recall / F-score. Creates the ``ml_models``
    directory if needed and writes ``ml_models/<name>.pkl`` with ``joblib``.
    """
    classifier.fit(X_train, y_train)
    print("Model "+name+ " has been trained!")
    print(y_train.shape)

    # Evaluate on the X_test argument. The previous version read the notebook
    # global x_test_vectors here, silently ignoring what the caller passed in.
    predicted = classifier.predict(X_test)
    print(predicted.shape)

    # With average='weighted' the fourth return value (support) is not needed.
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, predicted, average='weighted')

    print("Results for: " + name + "\n")
    accuracy = accuracy_score(y_test, predicted)
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print("Precision: " + str(precision) + " Recall: " + str(recall) + " F-Score: " + str(fscore) + "\n")

    path = 'ml_models'
    # exist_ok=True makes an already-existing directory a no-op, while any other
    # OSError (e.g. permissions) now surfaces instead of being printed and ignored.
    os.makedirs(path, exist_ok=True)

    filename = path + '/' + name + '.pkl'
    joblib.dump(classifier, filename)

In [13]:
# Candidate classifiers compared in this notebook.
cnb_clf = ComplementNB()
# multi_class is left at its default: the argument is deprecated in recent
# scikit-learn releases, and for a multiclass target with the default lbfgs
# solver the default already fits a multinomial model.
logisticRegr = LogisticRegression(class_weight='balanced')
knn_clf = KNeighborsClassifier()
# random_state pins the weight initialisation and the early-stopping validation
# split so repeated runs give the same model (same seed as train_test_split).
nn_clf = MLPClassifier(max_iter=1000, early_stopping=True, random_state=4)
# With probability=True the probability estimates rely on randomly shuffled
# data; random_state makes them repeatable.
clf_svm = svm.SVC(kernel='linear', probability=True, class_weight='balanced', random_state=4)

In [14]:
# Train, evaluate and persist each classifier in turn on the same split.
# nn_clf (the MLPClassifier) is constructed in an earlier cell but is not trained here.
for clf, clf_name in (
    (cnb_clf, 'ComplementNB'),
    (logisticRegr, 'LogisticRegression'),
    (knn_clf, 'KNeighborsClassifier'),
    (clf_svm, 'SVC'),
):
    fit_model(clf, clf_name, x_train_vectors, y_train, x_test_vectors, y_test)

Model ComplementNB has been trained!
(4595,)
(1149,)
Results for: ComplementNB

Accuracy: 66.84%
Precision: 0.6190087232631735 Recall: 0.6684073107049608 F-Score: 0.6244596119827144



  _warn_prf(average, modifier, msg_start, len(result))


Model LogisticRegression has been trained!
(4595,)
(1149,)
Results for: LogisticRegression

Accuracy: 57.09%
Precision: 0.7067172658039654 Recall: 0.5709312445604874 F-Score: 0.5826329432973698

[Errno 17] File exists: 'ml_models'
Model KNeighborsClassifier has been trained!
(4595,)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(1149,)
Results for: KNeighborsClassifier

Accuracy: 29.16%
Precision: 0.6254007298641932 Recall: 0.2915578764142733 F-Score: 0.3490396196719964

[Errno 17] File exists: 'ml_models'


  _warn_prf(average, modifier, msg_start, len(result))


Model SVC has been trained!
(4595,)
(1149,)
Results for: SVC

Accuracy: 69.36%
Precision: 0.6922434398188154 Recall: 0.6936466492602262 F-Score: 0.6753632054701753

[Errno 17] File exists: 'ml_models'


  _warn_prf(average, modifier, msg_start, len(result))
