In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import (accuracy_score, classification_report, recall_score, 
                             precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
# Đọc và tiền xử lý dữ liệu

def load_data(train_path, test_path):
    """Read the training and test CSV files into DataFrames.

    Training rows whose ``IncidentGrade`` value is missing are discarded.
    The test frame is returned exactly as it was read.

    Args:
        train_path: Location of the training CSV, passed to ``pd.read_csv``.
        test_path: Location of the test CSV, passed to ``pd.read_csv``.

    Returns:
        tuple: ``(train_data, test_data)`` as pandas DataFrames.
    """
    # Read both files first (train, then test) before any filtering.
    train_frame, test_frame = [pd.read_csv(path) for path in (train_path, test_path)]
    labelled_train = train_frame.dropna(subset=['IncidentGrade'])
    return labelled_train, test_frame

def preprocess_data(df, le_cat_columns):
    """Normalise column dtypes ahead of encoding.

    The ``Timestamp`` column is parsed with ``pd.to_datetime`` and every
    column named in ``le_cat_columns`` is cast to ``object`` dtype.

    Note:
        ``df`` is modified in place; the same object is returned for
        convenience.

    Args:
        df: DataFrame containing a ``Timestamp`` column and the listed columns.
        le_cat_columns: Iterable of column names to cast to ``object``.

    Returns:
        The same DataFrame instance that was passed in.
    """
    parsed_timestamps = pd.to_datetime(df['Timestamp'])
    df['Timestamp'] = parsed_timestamps

    for column_name in le_cat_columns:
        as_object = df[column_name].astype('object')
        df[column_name] = as_object

    return df


In [3]:
# Mã hóa dữ liệu

def _label_encode_columns(train_data, test_data, columns):
    """Integer-encode ``columns``; return (train, test) CSR blocks with one column per input column."""
    # Pre-size the code arrays from the row counts so that an empty `columns`
    # list still yields (n_rows, 0) blocks that hstack can combine, rather
    # than a 0x0 matrix with the wrong number of rows.
    train_codes = np.empty((len(train_data), len(columns)), dtype=np.int64)
    test_codes = np.empty((len(test_data), len(columns)), dtype=np.int64)

    for position, column in enumerate(columns):
        # A fresh encoder per column; it is fitted on train + test together so
        # that categories appearing only in the test split can be transformed.
        encoder = LabelEncoder()
        encoder.fit(pd.concat([train_data[column], test_data[column]]))
        train_codes[:, position] = encoder.transform(train_data[column])
        test_codes[:, position] = encoder.transform(test_data[column])

    return csr_matrix(train_codes), csr_matrix(test_codes)


def encode_data(train_data, test_data, ohe_cat_columns, le_cat_columns, numerical_columns):
    """Turn the raw train/test frames into sparse feature matrices and encoded targets.

    Three feature groups are built and stacked horizontally, in this order:

    1. ``ohe_cat_columns`` - one-hot encoded; the encoder is fitted on the
       training frame only and unknown test categories are ignored.
    2. ``le_cat_columns`` - label (integer) encoded, one encoder per column,
       fitted on the concatenation of the train and test values.
    3. ``numerical_columns`` - used as-is with missing values replaced by -1.

    The target column ``IncidentGrade`` is label-encoded with an encoder
    fitted on the training frame.

    Args:
        train_data: Training DataFrame; must contain every listed column and
            ``IncidentGrade``.
        test_data: Test DataFrame with the same columns.
        ohe_cat_columns: Column names to one-hot encode.
        le_cat_columns: Column names to label-encode (may be empty).
        numerical_columns: Column names to pass through numerically.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` where the ``X`` values
        are SciPy sparse matrices and the ``y`` values are the arrays returned
        by ``LabelEncoder.transform``.

    Note:
        ``LabelEncoder.transform`` rejects labels it was not fitted on, so a
        test ``IncidentGrade`` value absent from the training frame (including
        a missing value) will make the final transform fail.
    """
    # --- one-hot group (fitted on train only) ---
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(train_data[ohe_cat_columns])
    train_data_ohe = csr_matrix(ohe.transform(train_data[ohe_cat_columns]))
    test_data_ohe = csr_matrix(ohe.transform(test_data[ohe_cat_columns]))

    # --- numerical group: -1 stands in for missing values ---
    train_data_numerical = csr_matrix(train_data[numerical_columns].fillna(-1).values)
    test_data_numerical = csr_matrix(test_data[numerical_columns].fillna(-1).values)

    # --- label-encoded group ---
    train_data_le, test_data_le = _label_encode_columns(train_data, test_data, le_cat_columns)

    # Column order here must match the order callers use for feature names.
    X_train = hstack([train_data_ohe, train_data_le, train_data_numerical])
    X_test = hstack([test_data_ohe, test_data_le, test_data_numerical])

    # --- target ---
    target_le = LabelEncoder()
    target_le.fit(train_data['IncidentGrade'])
    y_train = target_le.transform(train_data['IncidentGrade'])
    y_test = target_le.transform(test_data['IncidentGrade'])

    return X_train, y_train, X_test, y_test


In [4]:
# Huấn luyện và đánh giá mô hình

def train_model(model, X_train, y_train):
    """Fit ``model`` on the given training data and hand it back.

    Args:
        model: Any object exposing ``fit(X, y)``.
        X_train: Training feature matrix.
        y_train: Training targets.

    Returns:
        The very same ``model`` object that was passed in (not the return
        value of ``fit``).
    """
    fitted = model
    fitted.fit(X_train, y_train)
    return fitted

def evaluate_model(model, X_test, y_test,
                   class_names=('BenignPositive', 'FalsePositive', 'TruePositive')):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True encoded labels for ``X_test``.
        class_names: Display names for the classes, in encoded-label order.
            Assumes the targets are encoded as ``0 .. len(class_names) - 1``
            in this order; confirm against the encoder used upstream if the
            label set changes.

    Returns:
        The predictions ``model.predict(X_test)``.
    """
    # The unused predict_proba pass from the earlier version is gone: its
    # result was never read, and it cost a second full scoring of X_test.
    y_pred = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")

    class_names = list(class_names)
    # Pin the label set so the matrix always has one row/column per named
    # class, even when a class is absent from both y_test and y_pred.
    # Otherwise the matrix would be smaller than the list of display labels.
    cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                        display_labels=class_names)
    cm_display.plot()
    plt.show()

    return y_pred


In [5]:
# Trực quan hóa các đặc trưng quan trọng

def plot_feature_importances(model, feature_columns):
    """Draw a bar chart of per-feature importance, most important first.

    Tree-style models are read through ``feature_importances_``. Models
    without that attribute are read through ``coef_``, summarised as the mean
    absolute coefficient over all rows of ``coef_``.

    Args:
        model: Fitted estimator exposing ``feature_importances_`` or ``coef_``.
        feature_columns: Feature names, one per model input column, in the
            same order as the columns of the training matrix. A list or a
            NumPy array is accepted.
    """
    if hasattr(model, 'feature_importances_'):
        importances = np.asarray(model.feature_importances_)
    else:
        # coef_ has one row per class for a multi-class linear model. Using
        # only row 0 would describe a single class, and ranking signed values
        # would put strongly negative weights last. Mean |coef| across rows
        # gives one magnitude per feature. atleast_2d also covers 1-D coef_.
        importances = np.abs(np.atleast_2d(model.coef_)).mean(axis=0)

    # Fancy indexing below needs an array; a plain list would raise.
    feature_columns = np.asarray(feature_columns)
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(12, 6))
    plt.title(f"Feature Importances ({model.__class__.__name__})")
    plt.bar(range(len(importances)), importances[indices], align="center")
    plt.xticks(range(len(importances)), feature_columns[indices], rotation=90)
    plt.xlim([-1, len(importances)])
    plt.ylabel("Importance")
    plt.show()


In [6]:
# Xong

if __name__ == "__main__":
    # Data location. Set the GUIDE_DATA_DIR environment variable to run this
    # on another machine; the original author's directory is kept as the
    # fallback so existing runs behave as before.
    data_dir = os.environ.get(
        'GUIDE_DATA_DIR',
        '/Users/tentoilatai/Library/CloudStorage/OneDrive-Personal/Tran Viet Tai/Hoc tap/Ky VI/Hoc may va khai pha du lieu/BTL',
    )
    train_path = os.path.join(data_dir, 'GUIDE_Train.csv')
    test_path = os.path.join(data_dir, 'GUIDE_Test.csv')

    # Columns that are label-encoded (integer codes per category).
    le_cat_columns = ['Category', 'EntityType', 'EvidenceRole', 'SuspicionLevel', 'LastVerdict',
                      'ResourceType', 'Roles', 'AntispamDirection', 'ThreatFamily','CountryCode',
                      'OSFamily', 'OSVersion','State', 'City', 'RegistryValueName', 'RegistryValueData', 
                      'ResourceIdName', 'RegistryKey', 'OAuthApplicationId', 'ApplicationId', 'ApplicationName']
    # Columns passed through as numbers (missing values become -1 in encode_data).
    numerical_columns = ['DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId',
                         'AccountName', 'DeviceName', 'NetworkMessageId', 'EmailClusterId', 'FileName', 'FolderPath']
    # No one-hot columns are used in this run.
    ohe_cat_columns = []

    train_data, test_data = load_data(train_path, test_path)
    train_data = preprocess_data(train_data, le_cat_columns)
    test_data = preprocess_data(test_data, le_cat_columns)

    X_train, y_train, X_test, y_test = encode_data(train_data, test_data, ohe_cat_columns, le_cat_columns, numerical_columns)

    # Feature names in the same order encode_data stacks the groups. This only
    # lines up one-to-one with the matrix columns while ohe_cat_columns is
    # empty, because one-hot encoding expands a column into several.
    feature_names = np.array(ohe_cat_columns + le_cat_columns + numerical_columns)

    # Train and evaluate Logistic Regression
    log_reg_model = LogisticRegression(max_iter=1000, random_state=0)
    log_reg_model = train_model(log_reg_model, X_train, y_train)
    evaluate_model(log_reg_model, X_test, y_test)
    plot_feature_importances(log_reg_model, feature_names)

    # Train and evaluate Random Forest
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
    rf_model = train_model(rf_model, X_train, y_train)
    evaluate_model(rf_model, X_test, y_test)
    plot_feature_importances(rf_model, feature_names)


  test_data = pd.read_csv(test_path)


KeyboardInterrupt: 