### Data Pre-Processing

In [5]:
import pandas as pd

def load_bio_file(file_path):
    sentences = []
    sentence = []

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line == '':
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            parts = line.split()
            if len(parts) == 2:
                token, tag = parts
                sentence.append((token, tag))

    if sentence:
        sentences.append(sentence)

    return sentences

def convert_to_dataframe(sentences):
    data = {
        "sentence_id": [],
        "word": [],
        "tag": []
    }
    for i, sentence in enumerate(sentences):
        for token, tag in sentence:
            data["sentence_id"].append(i)
            data["word"].append(token)
            data["tag"].append(tag)
    return pd.DataFrame(data)

# 📌 Replace this with your actual file path
file_path = "APTNERtrain.txt"

# 🔧 Load and convert the BIO data
sentences = load_bio_file(file_path)
df = convert_to_dataframe(sentences)

# ✅ Save to CSV for future use
df.to_csv("ner_data_train.csv", index=False)

# ✅ Show entity tag types
print("Unique Tags:", sorted(df['tag'].unique()))
print("Sample:")
print(df.head())

Unique Tags: ['B-ACT', 'B-APT', 'B-EMAIL', 'B-ENCR', 'B-FILE', 'B-IDTY', 'B-IP', 'B-LOC', 'B-MAL', 'B-OS', 'B-PROT', 'B-SECTEAM', 'B-SHA2', 'B-TIME', 'B-TOOL', 'B-URL', 'B-VULID', 'B-VULNAME', 'E-ACT', 'E-APT', 'E-EMAIL', 'E-ENCR', 'E-FILE', 'E-IDTY', 'E-IP', 'E-LOC', 'E-MAL', 'E-OS', 'E-PROT', 'E-S-SECTEAM', 'E-SECTEAM', 'E-SHA2', 'E-TIME', 'E-TOOL', 'E-URL', 'E-VULNAME', 'I-ACT', 'I-APT', 'I-FILE', 'I-IDTY', 'I-LOC', 'I-MAL', 'I-OS', 'I-PROT', 'I-SECTEAM', 'I-TIME', 'I-TOOL', 'I-URL', 'I-VULNAME', 'O', 'PROT', 'S-ACT', 'S-APT', 'S-DOM', 'S-EMAIL', 'S-ENCR', 'S-FILE', 'S-IDTY', 'S-IP', 'S-LOC', 'S-MAL', 'S-MD5', 'S-OS', 'S-PROT', 'S-S-SECTEAM', 'S-SECTEAM', 'S-SHA1', 'S-SHA2', 'S-TIME', 'S-TOOL', 'S-URL', 'S-VULID', 'S-VULNAME']
Sample:
   sentence_id   word     tag
0            0   From       O
1            0  April  B-TIME
2            0  19-24  I-TIME
3            0      ,  I-TIME
4            0   2017  E-TIME


In [6]:
file_path = "APTNERtest.txt"

# 🔧 Load and convert the BIO data
sentences = load_bio_file(file_path)
df = convert_to_dataframe(sentences)

# ✅ Save to CSV for future use
df.to_csv("ner_data_test.csv", index=False)

# ✅ Show entity tag types
print("Unique Tags:", sorted(df['tag'].unique()))
print("Sample:")
print(df.head())

Unique Tags: ['B-ACT', 'B-APT', 'B-FILE', 'B-IDTY', 'B-LOC', 'B-MAL', 'B-OS', 'B-PROT', 'B-SECTEAM', 'B-TIME', 'B-TOOL', 'B-VULID', 'B-VULNAME', 'E-ACT', 'E-APT', 'E-FILE', 'E-IDTY', 'E-LOC', 'E-MAL', 'E-OS', 'E-PROT', 'E-SECTEAM', 'E-TIME', 'E-TOOL', 'E-VULID', 'E-VULNAME', 'I-ACT', 'I-APT', 'I-FILE', 'I-IDTY', 'I-LOC', 'I-MAL', 'I-OS', 'I-PROT', 'I-SECTEAM', 'I-TIME', 'I-TOOL', 'O', 'S-ACT', 'S-APT', 'S-DOM', 'S-EMAIL', 'S-ENCR', 'S-FILE', 'S-IDTY', 'S-IP', 'S-LOC', 'S-MAL', 'S-MD5', 'S-OS', 'S-PROT', 'S-SECTEAM', 'S-SHA2', 'S-TIME', 'S-TOOL', 'S-URL', 'S-VULID', 'S-VULNAME']
Sample:
   sentence_id         word tag
0            0          One   O
1            0  certificate   O
2            0          was   O
3            0    generated   O
4            0      locally   O


### MEMM

In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn_crfsuite import CRF, metrics as crf_metrics
import nltk
from nltk.tag import hmm
import warnings
warnings.filterwarnings("ignore")

In [18]:
train_df = pd.read_csv("ner_data_train.csv")
test_df = pd.read_csv("ner_data_test.csv")

In [19]:
train_df.dropna(subset=["word", "tag"], inplace=True)
test_df.dropna(subset=["word", "tag"], inplace=True)

In [20]:
def word2features(sent, i):
    word = str(sent[i][0]) if sent[i][0] is not None else ""

    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit()
    }

    # Previous word
    if i > 0:
        prev_word = str(sent[i - 1][0]) if sent[i - 1][0] is not None else ""
        features.update({
            '-1:word': prev_word,
            '-1:is_title': prev_word.istitle()
        })
    else:
        features['BOS'] = True

    # Next word
    if i < len(sent) - 1:
        next_word = str(sent[i + 1][0]) if sent[i + 1][0] is not None else ""
        features.update({
            '+1:word': next_word,
            '+1:is_title': next_word.istitle()
        })
    else:
        features['EOS'] = True

    return features

def prepare_data(df):
    grouped = df.groupby("sentence_id").apply(lambda x: list(zip(x["word"], x["tag"])))
    return list(grouped)

def extract_features_labels(sentences):
    X, y = [], []
    for sent in sentences:
        X.append([word2features(sent, i) for i in range(len(sent))])
        y.append([tag for _, tag in sent])
    return X, y

In [21]:
train_sents = prepare_data(train_df)
test_sents = prepare_data(test_df)
X_train_feats, y_train = extract_features_labels(train_sents)
X_test_feats, y_test = extract_features_labels(test_sents)

In [22]:
X_flat = [f for seq in X_train_feats for f in seq]
y_flat = [t for seq in y_train for t in seq]
X_test_flat = [f for seq in X_test_feats for f in seq]
y_test_flat = [t for seq in y_test for t in seq]

vec = DictVectorizer()
X_train_vec = vec.fit_transform(X_flat)
X_test_vec = vec.transform(X_test_flat)

all_labels = y_flat + y_test_flat
le = LabelEncoder()
le.fit(all_labels)

y_train_enc = le.transform(y_flat)
y_test_enc = le.transform(y_test_flat)

clf = LogisticRegression(max_iter=200)
clf.fit(X_train_vec, y_train_enc)
y_pred_enc = clf.predict(X_test_vec)
y_pred = le.inverse_transform(y_pred_enc)

print("📌 MEMM Results:\n")
print(classification_report(y_test_flat, y_pred))

📌 MEMM Results:

              precision    recall  f1-score   support

       B-ACT       0.10      0.80      0.18        10
       B-APT       0.15      0.67      0.24        12
      B-FILE       0.00      0.00      0.00         6
      B-IDTY       0.03      0.08      0.04        25
       B-LOC       0.54      0.73      0.62        26
       B-MAL       0.06      0.27      0.10        15
        B-OS       0.00      0.00      0.00         4
      B-PROT       0.00      0.00      0.00         2
   B-SECTEAM       0.72      0.54      0.62        24
      B-TIME       0.82      0.49      0.61       177
      B-TOOL       0.50      0.09      0.15       135
     B-VULID       0.00      0.00      0.00         3
   B-VULNAME       0.00      0.00      0.00         1
       E-ACT       0.05      0.25      0.08        12
       E-APT       0.15      0.67      0.24        12
      E-FILE       0.00      0.00      0.00         6
      E-IDTY       0.04      0.12      0.05        25
       E-L

### CRF Model

In [23]:
crf = CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)
crf.fit(X_train_feats, y_train)
y_pred_crf = crf.predict(X_test_feats)

print("📌 CRF Results:\n")
print(crf_metrics.flat_classification_report(y_test, y_pred_crf))

📌 CRF Results:

              precision    recall  f1-score   support

       B-ACT       0.12      0.80      0.21        10
       B-APT       0.15      0.83      0.26        12
      B-FILE       0.00      0.00      0.00         6
      B-IDTY       0.03      0.12      0.05        25
       B-LOC       0.55      0.69      0.61        26
       B-MAL       0.04      0.20      0.07        15
        B-OS       1.00      0.25      0.40         4
      B-PROT       0.00      0.00      0.00         2
   B-SECTEAM       0.74      0.58      0.65        24
      B-TIME       0.80      0.58      0.68       177
      B-TOOL       0.48      0.12      0.19       135
     B-VULID       0.00      0.00      0.00         3
   B-VULNAME       0.00      0.00      0.00         1
       E-ACT       0.09      0.50      0.15        12
       E-APT       0.12      0.67      0.21        12
      E-FILE       0.00      0.00      0.00         6
      E-IDTY       0.03      0.12      0.05        25
       E-LO

In [24]:
train_data_hmm = [[(w, t) for w, t in sent] for sent in train_sents]
test_data_hmm = [[w for w, t in sent] for sent in test_sents]
test_tags_hmm = [[t for w, t in sent] for sent in test_sents]

hmm_trainer = hmm.HiddenMarkovModelTrainer()
hmm_model = hmm_trainer.train_supervised(train_data_hmm)

hmm_pred = [hmm_model.tag(sent) for sent in test_data_hmm]
y_pred_hmm = [[tag for _, tag in sent] for sent in hmm_pred]

y_true = [t for seq in test_tags_hmm for t in seq]
y_pred = [t for seq in y_pred_hmm for t in seq]

print("📌 HMM Results:\n")
print(classification_report(y_true, y_pred))

📌 HMM Results:

              precision    recall  f1-score   support

       B-ACT       0.05      0.50      0.10        10
       B-APT       0.10      0.75      0.18        12
     B-EMAIL       0.00      0.00      0.00         0
      B-FILE       0.00      0.00      0.00         6
      B-IDTY       0.04      0.16      0.06        25
       B-LOC       0.39      0.58      0.47        26
       B-MAL       0.00      0.00      0.00        15
        B-OS       1.00      0.50      0.67         4
      B-PROT       0.00      0.00      0.00         2
   B-SECTEAM       0.63      0.50      0.56        24
      B-TIME       0.78      0.38      0.52       177
      B-TOOL       0.20      0.07      0.11       135
     B-VULID       0.00      0.00      0.00         3
   B-VULNAME       0.00      0.00      0.00         1
       E-ACT       0.03      0.25      0.06        12
       E-APT       0.10      0.75      0.18        12
     E-EMAIL       0.00      0.00      0.00         0
      E-FIL

In [25]:
import joblib

# Save the MEMM model, vectorizer, and label encoder
joblib.dump(clf, "memm_model.joblib")
joblib.dump(vec, "memm_vectorizer.joblib")
joblib.dump(le, "memm_label_encoder.joblib")

['memm_label_encoder.joblib']

In [26]:
import pickle

with open("crf_model.pkl", "wb") as f:
    pickle.dump(crf, f)

In [30]:
import dill

with open("hmm_model.dill", "wb") as f:
    dill.dump(hmm_model, f)

In [32]:
os_tokens = [word for word, tag in X_flat if "OS" in tag]
print(set(os_tokens))

ValueError: too many values to unpack (expected 2)

In [33]:
# If using HMM format like [[(word, tag), ...], ...]
os_tokens = [word for sent in train_data_hmm for word, tag in sent if "OS" in tag]
print("OS-related tokens in training data:", set(os_tokens))

OS-related tokens in training data: {'Win64', 'X', 'Linux-based', 'Creators', 'Xp/2003', 'Unix-based', 'Update', 'OSX', 'OS', 'operating', 'XP', 'Windows-based', 'macOS', '%ALLUSERSPROFILE%\\Windows', 'MS', 'Unix', 'Apple', 'MAC', 'Microsoft\\Windows', 'Mac', 'windows', 'The', 'HKLM\\SOFTWARE\\Microsoft\\Windows', 'SysWoW64', 'systems', 'Android', '’s', 'Win32', 'the', 'UNIX', 'MacOS', 'Unix-', 'Win', 'Linux', '10', 'Windows', 'Linux-'}
