# Train the model: full code

In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
import joblib

# Read the preprocessed train/test splits from disk.
# Each target CSV is loaded as a DataFrame first and then flattened to a
# 1-D array with .values.ravel(); the feature tables stay as DataFrames.
X_train = pd.read_csv('../data/processed_data/X_train.csv')
y_train_frame = pd.read_csv('../data/processed_data/y_train.csv')
y_train = y_train_frame.values.ravel()

X_test = pd.read_csv('../data/processed_data/X_test.csv')
y_test_frame = pd.read_csv('../data/processed_data/y_test.csv')
y_test = y_test_frame.values.ravel()

# Define models to evaluate.
# Every estimator receives the same fixed seed so that repeated runs of this
# script are reproducible: Random Forest and Gradient Boosting are randomized
# learners, and Logistic Regression shuffles the data for some solvers.
# (SVC only consults random_state when probability=True; it is passed here
# purely for uniformity.)
RANDOM_STATE = 42
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000, random_state=RANDOM_STATE
    ),
    'Support Vector Machine': SVC(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Hyperparameter search spaces for GridSearchCV.
# Each grid is written out on its own and then collected into `param_grid`,
# keyed by the same display names used in `models`. Every hyperparameter key
# carries the 'model__' prefix so it is routed to the pipeline step named
# 'model'.
_logistic_regression_grid = {
    'model__C': [0.01, 0.1, 1, 10, 100],  # regularization parameter
    'model__solver': ['liblinear', 'lbfgs'],
}

_svm_grid = {
    'model__C': [0.1, 1, 10, 100],
    'model__kernel': ['linear', 'rbf'],
}

_random_forest_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
}

_gradient_boosting_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
}

param_grid = {
    'Logistic Regression': _logistic_regression_grid,
    'Support Vector Machine': _svm_grid,
    'Random Forest': _random_forest_grid,
    'Gradient Boosting': _gradient_boosting_grid,
}

# Model-selection state.
#   best_model    -- fitted pipeline of the winning candidate so far
#   best_accuracy -- test-set accuracy of that winner (reported, never used
#                    to pick it)
#   best_cv_score -- mean cross-validated accuracy that drives the selection
best_model = None
best_accuracy = 0
best_cv_score = float('-inf')

# Perform model selection and hyperparameter tuning
for name, model in models.items():
    print(f"Training and tuning {name}...")

    # Wrap the estimator in a single-step pipeline; the step name 'model' is
    # what the 'model__*' keys in param_grid refer to.
    pipeline = Pipeline([
        ('model', model)
    ])

    # Exhaustive 5-fold grid search. With the default refit=True,
    # GridSearchCV retrains the best configuration on all of X_train, so
    # best_estimator_ is already fitted and needs no further .fit() call.
    grid_search = GridSearchCV(pipeline, param_grid[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best model from grid search
    tuned_model = grid_search.best_estimator_

    # best_score_ is the mean cross-validated accuracy of the winning
    # configuration; reuse it instead of running cross-validation again.
    cv_accuracy = grid_search.best_score_
    print(f"{name} Cross-Validation Accuracy: {cv_accuracy * 100:.2f}%")

    # Evaluate on the held-out test set. This is for reporting only: using
    # it to choose between models would leak test data into the selection
    # and make the reported score optimistic.
    y_pred = tuned_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Test Accuracy: {accuracy * 100:.2f}%")
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")

    # Keep (and persist) the candidate with the best cross-validated score.
    # Strict '>' means the earliest model wins ties. Saving inside the loop
    # leaves a usable artifact on disk even if a later, slower model fails.
    if cv_accuracy > best_cv_score:
        best_cv_score = cv_accuracy
        best_accuracy = accuracy
        best_model = tuned_model
        joblib.dump(best_model, 'best_cybersecurity_model.pkl')
        print(f"Updated Best Model: {name}")

# Only claim a saved model when one was actually selected.
if best_model is None:
    print("Model training complete, but no model was selected or saved.")
else:
    print("Model training complete. Best model saved to 'best_cybersecurity_model.pkl'")

Training and tuning Logistic Regression...
Logistic Regression Cross-Validation Accuracy: 100.00%
Logistic Regression Test Accuracy: 100.00%
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     66860
           1       1.00      1.00      1.00    133140

    accuracy                           1.00    200000
   macro avg       1.00      1.00      1.00    200000
weighted avg       1.00      1.00      1.00    200000

Updated Best Model: Logistic Regression
Training and tuning Support Vector Machine...
Support Vector Machine Cross-Validation Accuracy: 100.00%
Support Vector Machine Test Accuracy: 100.00%
Support Vector Machine Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     66860
           1       1.00      1.00      1.00    133140

    accuracy                           1.00    200000
   macro avg       1.00      1.00  