# Basic ML only

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load dataset
data = pd.read_csv('combined_data_fix.csv')

# Extract the Snort rules and techniques
snort_rules = data['Rule']
techniques = data['MITRE Technique ID']

# Convert the techniques to categorical labels
technique_labels = techniques.astype('category').cat.codes

# Use TF-IDF to vectorize the Snort rules
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(snort_rules)
y = technique_labels

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the model
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Print the best parameters
print(f"Best parameters found: {best_params}")

# Train the final model with the best parameters
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 216 candidates, totalling 1080 fits




Best parameters found: {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       1.00      0.33      0.50         3
           3       0.00      0.00      0.00         2
           4       1.00      0.67      0.80         6
           5       0.00      0.00      0.00         2
           7       0.82      0.90      0.86        10
           8       1.00      0.50      0.67         4
           9       0.81      1.00      0.90        13
          11       1.00      0.50      0.67         2
          12       0.00      0.00      0.00         2
          14       1.00      0.67      0.80         3
          15       1.00      1.00      1.00         1
          17       0.90      0.93      0.92        60
          18       0.88      0.78      0.82         9
          20       1.00      0.67      0.80         3
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# ML - ChatGPT

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from ast import literal_eval

# Load the data
file_path = '/content/combined_data_fix.csv'
data = pd.read_csv(file_path)
data['MITRE Technique ID'] = data['MITRE Technique ID'].apply(literal_eval)
expanded_data = data.explode('MITRE Technique ID')

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(expanded_data['Rule'])
y = expanded_data['MITRE Technique ID']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize classifiers
logistic_regression = LogisticRegression(max_iter=1000)
random_forest = RandomForestClassifier(random_state=42)
svm = SVC()

# Cross-validate models
scores_lr = cross_val_score(logistic_regression, X_train, y_train, cv=5, scoring='f1_weighted')
scores_rf = cross_val_score(random_forest, X_train, y_train, cv=5, scoring='f1_weighted')
scores_svm = cross_val_score(svm, X_train, y_train, cv=5, scoring='f1_weighted')

# Print cross-validation results
print("Logistic Regression F1-Score:", scores_lr.mean())
print("Random Forest F1-Score:", scores_rf.mean())
print("SVM F1-Score:", scores_svm.mean())

# Train the best performing model (Random Forest in this case)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# Evaluate the model
performance_report = classification_report(y_test, y_pred, zero_division=0)
print(performance_report)




Logistic Regression F1-Score: 0.5069940449693489
Random Forest F1-Score: 0.6992202365441476
SVM F1-Score: 0.5965679709574713
              precision    recall  f1-score   support

       T1001       0.00      0.00      0.00         0
       T1003       1.00      0.50      0.67         2
       T1005       0.00      0.00      0.00         1
       T1014       1.00      0.67      0.80         3
       T1018       0.00      0.00      0.00         0
       T1020       0.69      0.90      0.78        10
       T1021       1.00      0.75      0.86         4
       T1027       0.61      1.00      0.76        11
       T1041       1.00      1.00      1.00         1
       T1043       0.33      0.67      0.44         3
       T1047       1.00      1.00      1.00         2
       T1048       0.00      0.00      0.00         1
       T1056       0.90      1.00      0.95        38
       T1059       1.00      0.50      0.67         6
       T1070       0.00      0.00      0.00         2
       T10

In [17]:
from sklearn.model_selection import GridSearchCV

# Increase features in TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
X = tfidf_vectorizer.fit_transform(expanded_data['Rule'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Random Forest classifier with hyperparameter tuning
rf_classifier = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at each leaf node
}

# Grid search with cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, cv=3, scoring='f1_weighted', verbose=1)
grid_search.fit(X_train, y_train)

# Predict on the test set using the best model found
y_pred = grid_search.best_estimator_.predict(X_test)

# Calculate and display the classification report
performance_report = classification_report(y_test, y_pred, zero_division=0)
performance_report


Fitting 3 folds for each of 18 candidates, totalling 54 fits




'              precision    recall  f1-score   support\n\n       T1001       0.00      0.00      0.00         0\n       T1003       1.00      0.50      0.67         2\n       T1005       0.00      0.00      0.00         1\n       T1014       1.00      0.67      0.80         3\n       T1018       0.00      0.00      0.00         0\n       T1020       0.90      0.90      0.90        10\n       T1021       0.75      0.75      0.75         4\n       T1027       0.56      0.91      0.69        11\n       T1041       1.00      1.00      1.00         1\n       T1043       0.40      0.67      0.50         3\n       T1047       1.00      1.00      1.00         2\n       T1048       1.00      1.00      1.00         1\n       T1056       0.90      1.00      0.95        38\n       T1059       1.00      0.50      0.67         6\n       T1070       0.00      0.00      0.00         2\n       T1071       0.75      0.55      0.63        11\n       T1072       0.00      0.00      0.00         1\n       

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import precision_score, recall_score
import numpy as np

# Load your data
data = pd.read_csv('/content/combined_data_fix.csv')

# Assuming the MITRE Technique IDs need to be processed
data['MITRE Technique ID'] = data['MITRE Technique ID'].apply(lambda x: x.strip("[]").replace("'", "").split(','))
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['MITRE Technique ID'])

# Tokenize text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Rule'])
sequences = tokenizer.texts_to_sequences(data['Rule'])
X_seq = pad_sequences(sequences, maxlen=100)

# Split the data
X_train_seq, X_test_seq, y_train, y_test = train_test_split(X_seq, y, test_size=0.2, random_state=42)

# Define the model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=100),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(y_train.shape[1], activation='sigmoid')
])

class MacroPR(Callback):
    def __init__(self, validation_data):
        super(MacroPR, self).__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_precision = precision_score(val_targ, val_predict, average='weighted', zero_division=0)
        _val_recall = recall_score(val_targ, val_predict, average='weighted', zero_division=0)
        print(f'\nEpoch {epoch+1}: validation weighted precision: {_val_precision:.4f}, validation weighted recall: {_val_recall:.4f}')

# Instantiate the callback with validation data
macro_pr_callback = MacroPR(validation_data=(X_test_seq, y_test))


# Compile the model with precision and recall metrics
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', Precision(), Recall()])

# Train the model with the MacroPR callback
model.fit(X_train_seq, y_train, epochs=150, batch_size=32, validation_data=(X_test_seq, y_test), callbacks=[macro_pr_callback])

# Evaluate the model
loss, accuracy, precision, recall = model.evaluate(X_test_seq, y_test)
print("Test Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


Epoch 1/150

Epoch 1: validation weighted precision: 0.0000, validation weighted recall: 0.0000
Epoch 2/150

Epoch 2: validation weighted precision: 0.0000, validation weighted recall: 0.0000
Epoch 3/150

Epoch 3: validation weighted precision: 0.2727, validation weighted recall: 0.0505
Epoch 4/150

Epoch 4: validation weighted precision: 0.2727, validation weighted recall: 0.2121
Epoch 5/150

Epoch 5: validation weighted precision: 0.4545, validation weighted recall: 0.2727
Epoch 6/150

Epoch 6: validation weighted precision: 0.4545, validation weighted recall: 0.2828
Epoch 7/150

Epoch 7: validation weighted precision: 0.5478, validation weighted recall: 0.3586
Epoch 8/150

Epoch 8: validation weighted precision: 0.5301, validation weighted recall: 0.3788
Epoch 9/150

Epoch 9: validation weighted precision: 0.5146, validation weighted recall: 0.3838
Epoch 10/150

Epoch 10: validation weighted precision: 0.5228, validation weighted recall: 0.3889
Epoch 11/150

Epoch 11: validation wei

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# Load training data
train_file_path = '/content/train_data_80_20.csv'
train_data = pd.read_csv(train_file_path)
train_data['technique ids'] = train_data['technique ids'].apply(eval)

# Load test data
test_file_path = '/content/test_data_80_20.csv'
test_data = pd.read_csv(test_file_path)
test_data['technique ids'] = test_data['technique ids'].apply(eval)

# Initialize MultiLabelBinarizer and TF-IDF Vectorizer
mlb = MultiLabelBinarizer()
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Process training labels and features
train_labels = mlb.fit_transform(train_data['technique ids'])
train_features = tfidf_vectorizer.fit_transform(train_data['Rule'])

# Process test labels and features
test_labels = mlb.transform(test_data['technique ids'])
test_features = tfidf_vectorizer.transform(test_data['Rule'])

# Split the training data (if needed, otherwise use full train set for training)
X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)

# Setup the Decision Tree model for multi-label classification
decision_tree_classifier = DecisionTreeClassifier(random_state=42)
multi_label_classifier = MultiOutputClassifier(decision_tree_classifier)
classifier_pipeline = Pipeline([('clf', multi_label_classifier)])

# Train the model
classifier_pipeline.fit(X_train, y_train)

# Predict on the test data
test_predictions = classifier_pipeline.predict(test_features)

# Evaluate the model with micro and macro averaging
precision_micro = precision_score(test_labels, test_predictions, average='micro')
recall_micro = recall_score(test_labels, test_predictions, average='micro')
f1_micro = f1_score(test_labels, test_predictions, average='micro')

precision_macro = precision_score(test_labels, test_predictions, average='macro')
recall_macro = recall_score(test_labels, test_predictions, average='macro')
f1_macro = f1_score(test_labels, test_predictions, average='macro')

precision_weighted = precision_score(test_labels, test_predictions, average='weighted')
recall_weighted = recall_score(test_labels, test_predictions, average='weighted')
f1_weighted = f1_score(test_labels, test_predictions, average='weighted')

# Print results
print("Micro Average Precision:", precision_micro)
print("Micro Average Recall:", recall_micro)
print("Micro Average F1-Score:", f1_micro)
print()
print("Macro Average Precision:", precision_macro)
print("Macro Average Recall:", recall_macro)
print("Macro Average F1-Score:", f1_macro)
print()
print("weighted Average Precision:", precision_weighted)
print("weighted Average Recall:", recall_weighted)
print("weighted Average F1-Score:", f1_weighted)
print()



Micro Average Precision: 0.5932203389830508
Micro Average Recall: 0.6730769230769231
Micro Average F1-Score: 0.6306306306306306

Macro Average Precision: 0.5615153104749188
Macro Average Recall: 0.5859819121447029
Macro Average F1-Score: 0.5449661197133391

weighted Average Precision: 0.6552711542843123
weighted Average Recall: 0.6730769230769231
weighted Average F1-Score: 0.6305977219856819



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [20]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Load the dataset
file_path = 'combined_data_fix.csv'  # Update with your file path
data = pd.read_csv(file_path)

# Clean and transform the MITRE Technique IDs
data['MITRE Technique ID'] = data['MITRE Technique ID'].apply(eval)

# Extract features from the SNORT rules using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(data['Rule'])

# Convert MITRE Technique IDs to binary format for multi-label classification
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['MITRE Technique ID'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define Binary Relevance classifiers
binary_relevance_classifiers = {
    'Logistic Regression': OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    'Random Forest': OneVsRestClassifier(RandomForestClassifier(n_estimators=100)),
    'Support Vector Machine': OneVsRestClassifier(SVC(kernel='linear', probability=True))
}

# Train and evaluate using Binary Relevance
results_binary_relevance = {}
for name, clf in binary_relevance_classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=1, output_dict=True)
    results_binary_relevance[name] = {
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-score': report['weighted avg']['f1-score']
    }

results_binary_relevance_df = pd.DataFrame(results_binary_relevance).T

# Display results for Binary Relevance
print("Results using Binary Relevance:")
print(results_binary_relevance_df)

# Evaluate using Classifier Chains
from sklearn.multioutput import ClassifierChain

# Define Classifier Chains classifiers
classifier_chains = {
    'Logistic Regression': ClassifierChain(LogisticRegression(max_iter=1000)),
    'Random Forest': ClassifierChain(RandomForestClassifier(n_estimators=100)),
    'Support Vector Machine': ClassifierChain(SVC(kernel='linear', probability=True))
}

# Train and evaluate using Classifier Chains
results_classifier_chains = {}
for name, clf in classifier_chains.items():
    try:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=1, output_dict=True)
        results_classifier_chains[name] = {
            'Precision': report['weighted avg']['precision'],
            'Recall': report['weighted avg']['recall'],
            'F1-score': report['weighted avg']['f1-score']
        }
    except Exception as e:
        print(f"Error with {name}: {e}")
        results_classifier_chains[name] = {
            'Precision': None,
            'Recall': None,
            'F1-score': None
        }

results_classifier_chains_df = pd.DataFrame(results_classifier_chains).T

# Display results for Classifier Chains
print("\nResults using Classifier Chains:")
print(results_classifier_chains_df)




Results using Binary Relevance:
                        Precision    Recall  F1-score
Logistic Regression      1.000000  0.378788  0.440086
Random Forest            0.983165  0.626263  0.701673
Support Vector Machine   0.983165  0.737374  0.782271
Error with Logistic Regression: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
Error with Support Vector Machine: The number of classes has to be greater than one; got 1 class

Results using Classifier Chains:
                       Precision    Recall  F1-score
Logistic Regression         None      None      None
Random Forest           0.979197  0.626263  0.701406
Support Vector Machine      None      None      None


# CLAUDE

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score

# Load and preprocess the training data
train_data = pd.read_csv('/content/train_data_80_20.csv')
train_rules = train_data['Rule'].tolist()
train_techniques = train_data['technique ids'].tolist()

# Convert technique IDs to list format
train_techniques = [eval(technique) for technique in train_techniques]

# Extract features from the training rules using TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_rules)

# Convert technique IDs to binary labels
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_techniques)

# Train the classifier
classifier = OneVsRestClassifier(LinearSVC())
classifier.fit(X_train, y_train)

# Load and preprocess the test data
test_data = pd.read_csv('/content/test_data_80_20.csv')
test_rules = test_data['Rule'].tolist()
test_techniques = test_data['technique ids'].tolist()

# Convert technique IDs to list format
test_techniques = [eval(technique) for technique in test_techniques]

# Extract features from the test rules using the same vectorizer
X_test = vectorizer.transform(test_rules)

# Convert test technique IDs to binary labels
y_test = mlb.transform(test_techniques)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1-score: {:.4f}".format(f1))

Precision: 0.6711
Recall: 0.4872
F1-score: 0.5180


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Read the data from the CSV file
data = pd.read_csv('combined_data_fix.csv')

# Split the technique IDs into separate rows
data = data.assign(technique=data['MITRE Technique ID'].str.strip('[]').str.split(',')).explode('technique')
data['technique'] = data['technique'].str.strip("'")

# Prepare the features and target
X = data['Rule']
y = data['technique']

# Remove classes with insufficient members
class_counts = y.value_counts()
valid_classes = class_counts[class_counts >= 2].index
X = X[y.isin(valid_classes)]
y = y[y.isin(valid_classes)]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Extract features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# Define the models to evaluate
models = [
    SVC(kernel='linear'),
    RandomForestClassifier(),
    LogisticRegression()
]

# Evaluate the models using cross-validation
for model in models:
    scores = cross_val_score(model, X_train_features, y_train, cv=5, scoring='f1_weighted')
    print(f"{model.__class__.__name__}: Cross-validation F1-score: {scores.mean():.4f} (+/- {scores.std():.4f})")

# Train the best model on the entire training set
best_model = LogisticRegression()
best_model.fit(X_train_features, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test_features)

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nBest Model: {best_model.__class__.__name__}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")



SVC: Cross-validation F1-score: 0.6641 (+/- 0.0117)




RandomForestClassifier: Cross-validation F1-score: 0.7046 (+/- 0.0257)




LogisticRegression: Cross-validation F1-score: 0.4871 (+/- 0.0216)

Best Model: LogisticRegression
Precision: 0.4816
Recall: 0.6250
F1-score: 0.5232


  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import RandomOverSampler

# Read the data from the CSV file
data = pd.read_csv('combined_data_fix.csv')

# Split the technique IDs into separate rows
data = data.assign(technique=data['MITRE Technique ID'].str.strip('[]').str.split(',')).explode('technique')
data['technique'] = data['technique'].str.strip("'")

# Prepare the features and target
X = data['Rule']
y = data['technique']

# Remove classes with insufficient members
class_counts = y.value_counts()
valid_classes = class_counts[class_counts >= 2].index
X = X[y.isin(valid_classes)]
y = y[y.isin(valid_classes)]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Extract features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# Feature selection using chi-square test
selector = SelectKBest(chi2, k=1000)
X_train_features = selector.fit_transform(X_train_features, y_train)
X_test_features = selector.transform(X_test_features)

# Handle class imbalance using random oversampling
oversampler = RandomOverSampler(random_state=42)
X_train_features, y_train = oversampler.fit_resample(X_train_features, y_train)

# Define the models and their hyperparameters for tuning
models = [
    (SVC(kernel='linear', probability=True), {'C': [0.1, 1, 10]}),  # Set probability=True for SVM
    (RandomForestClassifier(), {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, None]}),
    (LogisticRegression(), {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']})
]

# Create an ensemble model using voting
ensemble = VotingClassifier(estimators=[(str(model), model) for model, _ in models], voting='soft')

# Perform grid search for hyperparameter tuning
best_model = None
best_score = 0
for model, params in models:
    grid_search = GridSearchCV(model, params, cv=5, scoring='f1_weighted')
    grid_search.fit(X_train_features, y_train)
    if grid_search.best_score_ > best_score:
        best_model = grid_search.best_estimator_
        best_score = grid_search.best_score_

# Train the ensemble model
ensemble.fit(X_train_features, y_train)

# Make predictions on the test set
y_pred = ensemble.predict(X_test_features)

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nBest Individual Model: {best_model.__class__.__name__}")
print(f"Best Individual Model F1-score: {best_score:.4f}")
print(f"\nEnsemble Model:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")




Best Individual Model: RandomForestClassifier
Best Individual Model F1-score: 0.9987

Ensemble Model:
Precision: 0.7648
Recall: 0.7969
F1-score: 0.7629


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# GEMINI

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the data
df = pd.read_csv("combined_data_fix.csv")  # Replace with your file name

# Preprocessing
# Assuming your data has a 'rule' column for SNORT rules and 'techniques' for a list of MITRE IDs
df['MITRE Technique ID'] = df['MITRE Technique ID'].str.split(',')  # Convert string of IDs to list

# Feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Rule'])

# Target variable transformation
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['MITRE Technique ID'])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model selection and hyperparameter tuning
models = {
    'Logistic Regression': {
        'model': OneVsRestClassifier(LogisticRegression()),
        'params': {
            'estimator__C': [0.1, 1, 10]
        }
    },
    'Random Forest': {
        'model': OneVsRestClassifier(RandomForestClassifier()),
        'params': {
            'estimator__n_estimators': [50, 100, 200],
            'estimator__max_depth': [None, 5, 10]
        }
    },
    'Linear SVC': {
        'model': OneVsRestClassifier(LinearSVC()),
        'params': {
            'estimator__C': [0.1, 1, 10]
        }
    }
}

best_model = None
best_f1 = 0

for model_name, model_data in models.items():
    print(f"Training {model_name}...")
    clf = GridSearchCV(model_data['model'], model_data['params'], cv=5, scoring='f1_weighted')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"{model_name} F1-score: {f1}")

    if f1 > best_f1:
        best_f1 = f1
        best_model = clf

# Evaluation on the test set
y_pred = best_model.predict(X_test)

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nBest Model:", best_model.best_estimator_)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Training Logistic Regression...


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

Logistic Regression F1-score: 0.6801713393818657
Training Random Forest...


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

Random Forest F1-score: 0.6142584300160058
Training Linear SVC...


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

Linear SVC F1-score: 0.8180032627745046

Best Model: OneVsRestClassifier(estimator=LinearSVC(C=10))
Precision: 0.8933381433381433
Recall: 0.7777777777777778
F1-score: 0.8180032627745046


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support
import ast
import numpy as np


# Read the CSV file into a DataFrame
df = pd.read_csv('combined_data_fix.csv')

# Convert the `MITRE Technique ID` column from a string representation of a list to an actual list of strings
df['MITRE Technique ID'] = df['MITRE Technique ID'].apply(lambda x: ast.literal_eval(x))

# Create a new dataframe where each row is a rule and a single technique
df_split = df.explode('MITRE Technique ID')

# Reset the index of `df_split` to ensure unique indices for each row
df_split = df_split.reset_index(drop=True)

# Split the data into training and test sets based on unique pairs of `Rule` and `MITRE Technique ID`, ensuring each pair appears in only one set
unique_pairs = df_split.groupby(['Rule', 'MITRE Technique ID']).size().reset_index().drop(0, axis=1)
train_pairs, test_pairs = train_test_split(unique_pairs, test_size=0.2, random_state=42)

train_df = df_split.merge(train_pairs, on=['Rule', 'MITRE Technique ID'], how='inner')
test_df = df_split.merge(test_pairs, on=['Rule', 'MITRE Technique ID'], how='inner')

# Create a TF-IDF vectorizer to convert the text data in the `Rule` column into numerical features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Transform the training and test data using the TF-IDF vectorizer
X_train = tfidf_vectorizer.fit_transform(train_df['Rule'])
X_test = tfidf_vectorizer.transform(test_df['Rule'])
y_train = train_df['MITRE Technique ID']
y_test = test_df['MITRE Technique ID']

# Initialize a dictionary to store the results of different models
results = {}

# Define a list of models to evaluate, including OneVsRestClassifier with Logistic Regression, LinearSVC, and RandomForestClassifier as base estimators
models = [
    ('OVR Logistic Regression', OneVsRestClassifier(LogisticRegression(max_iter=1000))),
    ('OVR Linear SVC', OneVsRestClassifier(LinearSVC(random_state=42))),
    ('OVR Random Forest', OneVsRestClassifier(RandomForestClassifier(random_state=42)))
]

# Iterate through the models
for name, model in models:
    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Perform cross-validation and store the mean F1-score
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted')
    results[name] = {
        'CV F1-Score': np.mean(cv_scores),
        'Precision': 0,
        'Recall': 0,
        'F1-Score': 0
    }

    # Predict on the test set and calculate precision, recall, and F1-score (weighted)
    y_pred = model.predict(X_test)
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

    # Store the results in the `results` dictionary
    results[name]['Precision'] = precision
    results[name]['Recall'] = recall
    results[name]['F1-Score'] = f1_score

# Create a DataFrame from the `results` dictionary and sort it by F1-score in descending order
results_df = pd.DataFrame(results).transpose().sort_values(by='F1-Score', ascending=False)

# Print the sorted results
print(results_df.to_markdown(numalign="left", stralign="left"))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


|                         | CV F1-Score   | Precision   | Recall   | F1-Score   |
|:------------------------|:--------------|:------------|:---------|:-----------|
| OVR Linear SVC          | 0.756431      | 0.76698     | 0.80203  | 0.769834   |
| OVR Random Forest       | 0.716203      | 0.72711     | 0.756345 | 0.721052   |
| OVR Logistic Regression | 0.442241      | 0.477405    | 0.573604 | 0.468798   |


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
