In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load dataset
data = pd.read_csv('combined_data_fix.csv')

# Extract the Snort rules and techniques
snort_rules = data['Rule']
techniques = data['MITRE Technique ID']

# Convert the techniques to categorical labels
technique_labels = techniques.astype('category').cat.codes

# Use TF-IDF to vectorize the Snort rules
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(snort_rules)
y = technique_labels

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the model
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Print the best parameters
print(f"Best parameters found: {best_params}")

# Train the final model with the best parameters
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 216 candidates, totalling 1080 fits




Best parameters found: {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       1.00      0.33      0.50         3
           3       0.00      0.00      0.00         2
           4       1.00      0.67      0.80         6
           5       0.00      0.00      0.00         2
           7       0.82      0.90      0.86        10
           8       1.00      0.50      0.67         4
           9       0.81      1.00      0.90        13
          11       1.00      0.50      0.67         2
          12       0.00      0.00      0.00         2
          14       1.00      0.67      0.80         3
          15       1.00      1.00      1.00         1
          17       0.90      0.93      0.92        60
          18       0.88      0.78      0.82         9
          20       1.00      0.67      0.80         3
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
נ