In [3]:
import os
import platform
import socket
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
from scipy.sparse import hstack, csr_matrix, issparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, recall_score, 
                             precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay)
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [4]:
# Load and preprocess the data

def load_data(train_path, test_path):
    """Read the training and test CSV files into DataFrames.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training CSV. It must contain an 'IncidentGrade' column.
    test_path : str or path-like
        Location of the test CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)``. Training rows whose 'IncidentGrade' is
        missing are removed; the test frame is returned exactly as read.
    """
    # Honour the arguments: the paths used to be hard-coded here, which made
    # both parameters dead and tied the function to one machine.
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    # Rows without a target label cannot be used for supervised training.
    train_data = train_data.dropna(subset=['IncidentGrade'])
    return train_data, test_data

def preprocess_data(df, le_cat_columns):
    """Normalise column dtypes ahead of encoding.

    The 'Timestamp' column is parsed into datetimes and every column named in
    ``le_cat_columns`` is cast to ``object`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Timestamp' column and all of ``le_cat_columns``.
        It is modified in place.
    le_cat_columns : iterable of str
        Names of the columns that will later be label-encoded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    parsed_timestamps = pd.to_datetime(df['Timestamp'])
    df['Timestamp'] = parsed_timestamps

    for column_name in le_cat_columns:
        as_object = df[column_name].astype('object')
        df[column_name] = as_object

    return df


In [5]:
# Encode the data

def encode_data(train_data, test_data, ohe_cat_columns, le_cat_columns, numerical_columns):
    """Turn the train and test frames into sparse feature matrices and encoded targets.

    Three feature groups are built for each frame and stacked horizontally in
    this order: one-hot columns, label-encoded columns, numerical columns.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames holding every listed column plus the 'IncidentGrade' target.
    ohe_cat_columns : list of str
        Columns to one-hot encode (the encoder is fitted on the training frame).
    le_cat_columns : list of str
        Columns to label-encode (each encoder fit sees train and test values together).
    numerical_columns : list of str
        Columns used as-is, with missing values replaced by -1.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, target_le)`` where ``target_le`` is the
        LabelEncoder fitted on the training 'IncidentGrade' values.
    """
    frames = (train_data, test_data)

    # One-hot group: categories are learned from the training frame only and
    # the encoder is configured with handle_unknown='ignore'.
    one_hot = OneHotEncoder(handle_unknown='ignore')
    one_hot.fit(train_data[ohe_cat_columns])
    train_ohe_block, test_ohe_block = [
        csr_matrix(one_hot.transform(frame[ohe_cat_columns])) for frame in frames
    ]

    # Numerical group: missing values are filled with the sentinel -1.
    train_num_block, test_num_block = [
        csr_matrix(frame[numerical_columns].fillna(-1).values) for frame in frames
    ]

    # Label-encoded group: a single encoder object is re-fitted for every
    # column on the concatenation of the train and test values.
    column_encoder = LabelEncoder()
    train_codes = pd.DataFrame()
    test_codes = pd.DataFrame()
    for column_name in le_cat_columns:
        all_values = pd.concat([train_data[column_name], test_data[column_name]])
        column_encoder.fit(all_values)
        train_codes[column_name] = column_encoder.transform(train_data[column_name])
        test_codes[column_name] = column_encoder.transform(test_data[column_name])
    train_le_block = csr_matrix(train_codes)
    test_le_block = csr_matrix(test_codes)

    X_train = hstack([train_ohe_block, train_le_block, train_num_block])
    X_test = hstack([test_ohe_block, test_le_block, test_num_block])

    # Target: the label vocabulary comes from the training frame only.
    target_le = LabelEncoder()
    target_le.fit(train_data['IncidentGrade'])
    y_train = target_le.transform(train_data['IncidentGrade'])
    y_test = target_le.transform(test_data['IncidentGrade'])

    return X_train, y_train, X_test, y_test, target_le


In [6]:
# Train and evaluate the model

def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data and hand the same object back.

    Parameters
    ----------
    model : object
        Any object exposing ``fit(X, y)``.
    X_train, y_train
        Training features and targets, passed to ``fit`` unchanged.

    Returns
    -------
    object
        The ``model`` instance that was passed in (whatever ``fit`` returns is ignored).
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

def evaluate_model(model, X_test, y_test, display_labels=None):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_test, y_test
        Held-out features and their true labels.
    display_labels : sequence of str, optional
        Tick labels for the confusion matrix. Defaults to
        ``['BenignPositive', 'FalsePositive', 'TruePositive']``.

    Returns
    -------
    array-like
        The predictions returned by ``model.predict(X_test)``.
    """
    if display_labels is None:
        display_labels = ['BenignPositive', 'FalsePositive', 'TruePositive']

    # Only class predictions are needed here; the previous version also ran a
    # full predict_proba pass whose result was never used.
    y_pred = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")

    cm = confusion_matrix(y_test, y_pred)
    if cm.shape[0] != len(display_labels):
        # The matrix only has rows for labels that actually occur, so a fixed
        # label list can be the wrong length; fall back to the default tick
        # labels instead of letting the plot raise on the mismatch.
        display_labels = None
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                        display_labels=display_labels)
    cm_display.plot()
    plt.show()

    return y_pred


In [7]:
def predict_from_file(model, X_test, target_le):
    """Predict the label of one randomly chosen row of ``X_test``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict`` and, optionally, ``predict_proba``
        and ``classes_``.
    X_test : scipy sparse matrix or array-like, shape (n_samples, n_features)
        Feature matrix to sample a row from.
    target_le : object
        Fitted label encoder exposing ``inverse_transform``.

    Returns
    -------
    tuple
        ``(label, prob)``. ``label`` is the decoded predicted class. ``prob`` is
        the probability the model assigns to that predicted class, or the raw
        encoded prediction when the model has no ``predict_proba``.
    """
    index = np.random.choice(X_test.shape[0])

    # Always hand the estimator a 2-D (1, n_features) input. Sparse input is
    # converted to CSR first because COO matrices do not support row indexing,
    # and the row is NOT wrapped in a list (that produced a malformed input).
    if issparse(X_test):
        sample = X_test.tocsr()[[index]]
    else:
        sample = np.asarray(X_test)[index].reshape(1, -1)

    y_pred = model.predict(sample)

    if hasattr(model, 'predict_proba'):
        proba_row = np.asarray(model.predict_proba(sample))[0]
        # Report the probability of the class that was actually predicted;
        # a fixed column index is only meaningful for binary problems.
        classes = np.asarray(getattr(model, 'classes_', np.arange(proba_row.shape[0])))
        hits = np.flatnonzero(classes == y_pred[0])
        prob = proba_row[hits[0]] if hits.size else proba_row.max()
    else:
        prob = y_pred[0]

    return target_le.inverse_transform(y_pred)[0], prob

In [8]:
# Visualize the most important features

def plot_feature_importances(model, feature_columns):
    """Draw a bar chart of per-feature importances, largest first.

    Parameters
    ----------
    model : object
        Fitted estimator. ``feature_importances_`` is used when present;
        otherwise the first row of ``coef_`` is used.
    feature_columns : sequence of str
        Feature names, in the same order as the model's features. Any
        sequence (list, tuple, ndarray) is accepted.

    Notes
    -----
    NOTE(review): for a multi-class linear model ``coef_[0]`` holds the signed
    coefficients of the first class only, so the ranking reflects that class
    alone — confirm this is the intended view.
    """
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    else:
        importances = model.coef_[0]
    importances = np.asarray(importances)

    # Fancy indexing below needs an ndarray; converting here lets callers
    # pass a plain list of names as well.
    feature_columns = np.asarray(feature_columns)

    order = np.argsort(importances)[::-1]
    positions = range(len(importances))

    plt.figure(figsize=(12, 6))
    plt.title(f"Feature Importances ({model.__class__.__name__})")
    plt.bar(positions, importances[order], align="center")
    plt.xticks(positions, feature_columns[order], rotation=90)
    plt.xlim([-1, len(importances)])
    plt.show()


In [9]:
class ScanDevices:
    """Collect a small record of device attributes for the local or a remote host.

    Each scan returns (and stores in ``self.device_info``) a dict with a fixed
    set of keys; values a scan cannot determine are set to the string 'N/A'.
    """

    # Keys of every scan result, in the order they appear in the record.
    _FIELDS = (
        'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
        'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
        'EmailClusterId', 'FileName', 'FolderPath',
    )

    def __init__(self):
        # Result of the most recent successful scan; empty until one has run.
        self.device_info = {}

    @classmethod
    def _blank_record(cls):
        """Return a new record with every field set to 'N/A'."""
        return dict.fromkeys(cls._FIELDS, 'N/A')

    def scan_local_device(self):
        """Describe the machine this code is running on.

        Returns
        -------
        dict
            The record, which is also stored in ``self.device_info``.
        """
        hostname = socket.gethostname()
        record = self._blank_record()
        record.update({
            # psutil has no cpu_info(); the stdlib reports the processor name.
            'DeviceId': platform.processor(),
            # NOTE(review): this stores a CPU count under the 'Sha256' key,
            # not a hash — confirm this placeholder is intended.
            'Sha256': psutil.cpu_count(),
            'IpAddress': socket.gethostbyname(hostname),
            'DeviceName': hostname,
        })
        self.device_info = record
        return self.device_info

    def scan_remote_device(self, ipv4_address):
        """Describe a remote host identified by its IPv4 address.

        Parameters
        ----------
        ipv4_address : str
            Address to look up with ``socket.gethostbyaddr``.

        Returns
        -------
        dict
            The record, which is also stored in ``self.device_info``.

        Raises
        ------
        ValueError
            If the lookup fails with a socket error. ``self.device_info`` is
            left unchanged in that case.
        """
        try:
            hostname = socket.gethostbyaddr(ipv4_address)[0]
        except socket.error as exc:
            # Keep the original cause attached for debugging.
            raise ValueError("IPv4 address does not exist") from exc

        record = self._blank_record()
        record.update({
            'IpAddress': ipv4_address,
            'DeviceName': hostname,
        })
        self.device_info = record
        return self.device_info


In [10]:
def create_ui(model, X_test, target_le):
    """Build and run the Tk window for interactive incident predictions.

    The window offers three actions: predict a random row of ``X_test``, scan
    the local machine, and scan a remote machine by IPv4 address. The call
    blocks in ``root.mainloop()`` until the window is closed.

    Parameters
    ----------
    model : object
        Fitted estimator passed on to ``predict_from_file``.
    X_test : sparse matrix or array-like
        Feature matrix sampled by the "Predict Random" button.
    target_le : object
        Fitted label encoder used to decode predictions.

    Notes
    -----
    The scan actions read the module-level ``le_cat_columns``,
    ``ohe_cat_columns`` and ``numerical_columns`` lists.

    NOTE(review): a scan record has none of the 'Timestamp', 'IncidentGrade'
    or label-encoded columns that ``preprocess_data`` / ``encode_data`` index,
    and the encoders would be re-fitted on that single row. Until the fitted
    encoders are threaded through, the scan buttons are expected to end in the
    error dialog.
    """
    def show_prediction(make_prediction):
        # Shared button handler: run the prediction and report either the
        # result or the failure. Without the dialog an exception in a Tk
        # callback only reaches stderr and the window appears to do nothing.
        try:
            label, prob = make_prediction()
        except Exception as exc:
            messagebox.showerror("Error", str(exc))
            return
        result_label.config(text=f"Prediction: {label}, Probability: {prob:.2f}")

    def predict_from_info(device_info):
        df_device = pd.DataFrame([device_info])
        df_device = preprocess_data(df_device, le_cat_columns)
        # encode_data returns five values; only the first feature matrix is
        # needed (unpacking into four names raised ValueError).
        X_device = encode_data(df_device, df_device, ohe_cat_columns,
                               le_cat_columns, numerical_columns)[0]
        return predict_from_file(model, X_device, target_le)

    def on_predict_random():
        show_prediction(lambda: predict_from_file(model, X_test, target_le))

    def on_scan_local():
        show_prediction(lambda: predict_from_info(ScanDevices().scan_local_device()))

    def on_scan_remote():
        ipv4 = ipv4_entry.get()
        show_prediction(lambda: predict_from_info(ScanDevices().scan_remote_device(ipv4)))

    root = tk.Tk()
    root.title("Security Incident Prediction")

    menubar = tk.Menu(root)
    root.config(menu=menubar)

    file_menu = tk.Menu(menubar, tearoff=0)
    menubar.add_cascade(label="File", menu=file_menu)
    # The chosen file name is currently discarded.
    file_menu.add_command(label="Open", command=lambda: filedialog.askopenfilename())
    file_menu.add_separator()
    file_menu.add_command(label="Exit", command=root.quit)

    # "Show Predictions" and "Preferences" are placeholders with no action yet.
    view_menu = tk.Menu(menubar, tearoff=0)
    menubar.add_cascade(label="View", menu=view_menu)
    view_menu.add_command(label="Show Predictions", command=lambda: None)

    edit_menu = tk.Menu(menubar, tearoff=0)
    menubar.add_cascade(label="Edit", menu=edit_menu)
    edit_menu.add_command(label="Preferences", command=lambda: None)

    help_menu = tk.Menu(menubar, tearoff=0)
    menubar.add_cascade(label="Help", menu=help_menu)
    help_menu.add_command(label="About", command=lambda: messagebox.showinfo("About", "Security Incident Prediction App v1.0"))

    main_frame = ttk.Frame(root, padding="10")
    main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

    ttk.Button(main_frame, text="Predict Random", command=on_predict_random).grid(row=0, column=0, pady=5)
    ttk.Button(main_frame, text="Scan Local Device", command=on_scan_local).grid(row=1, column=0, pady=5)
    ttk.Label(main_frame, text="Enter IPv4 to Scan:").grid(row=2, column=0, pady=5)
    ipv4_entry = ttk.Entry(main_frame)
    ipv4_entry.grid(row=3, column=0, pady=5)
    ttk.Button(main_frame, text="Scan Remote Device", command=on_scan_remote).grid(row=4, column=0, pady=5)

    result_label = ttk.Label(main_frame, text="Prediction: ", font=("Helvetica", 12))
    result_label.grid(row=5, column=0, pady=10)

    root.mainloop()

In [11]:
# Entry point: load the data, train and evaluate both models, then launch the UI

if __name__ == "__main__":
    # Dataset locations. The GUIDE_TRAIN_CSV / GUIDE_TEST_CSV environment
    # variables override the defaults so the notebook can run on another
    # machine without editing these lines.
    train_path = os.environ.get(
        'GUIDE_TRAIN_CSV',
        'C:/Users/tento/OneDrive/Tran Viet Tai/Hoc Tap/Ky VI/Hoc may va khai pha du lieu/BTL/GUIDE_Train.csv')
    test_path = os.environ.get(
        'GUIDE_TEST_CSV',
        'C:/Users/tento/OneDrive/Tran Viet Tai/Hoc Tap/Ky VI/Hoc may va khai pha du lieu/BTL/GUIDE_Test.csv')

    # Columns that are label-encoded.
    le_cat_columns = ['Category', 'EntityType', 'EvidenceRole', 'SuspicionLevel', 'LastVerdict',
                      'ResourceType', 'Roles', 'AntispamDirection', 'ThreatFamily','CountryCode',
                      'OSFamily', 'OSVersion','State', 'City', 'RegistryValueName', 'RegistryValueData', 
                      'ResourceIdName', 'RegistryKey', 'OAuthApplicationId', 'ApplicationId', 'ApplicationName']
    # Columns passed through as numbers (missing values become -1 in encode_data).
    numerical_columns = ['DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId',
                         'AccountName', 'DeviceName', 'NetworkMessageId', 'EmailClusterId', 'FileName', 'FolderPath']
    # No columns are one-hot encoded at the moment.
    ohe_cat_columns = []

    train_data, test_data = load_data(train_path, test_path)
    train_data = preprocess_data(train_data, le_cat_columns)
    test_data = preprocess_data(test_data, le_cat_columns)

    X_train, y_train, X_test, y_test, target_le = encode_data(train_data, test_data, ohe_cat_columns, le_cat_columns, numerical_columns)

    # Feature names in the order encode_data stacks the feature groups.
    feature_names = np.array(ohe_cat_columns + le_cat_columns + numerical_columns)

    # Train and evaluate Logistic Regression
    log_reg_model = LogisticRegression(max_iter=1000, random_state=0)
    log_reg_model = train_model(log_reg_model, X_train, y_train)
    evaluate_model(log_reg_model, X_test, y_test)
    plot_feature_importances(log_reg_model, feature_names)

    # Train and evaluate Random Forest
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
    rf_model = train_model(rf_model, X_train, y_train)
    evaluate_model(rf_model, X_test, y_test)
    plot_feature_importances(rf_model, feature_names)

    # The UI uses the Logistic Regression model fitted above; it was
    # previously re-created and re-trained here with identical settings and
    # data, which only repeated the training cost.
    create_ui(log_reg_model, X_test, target_le)

In [None]:
def create_ui(model, X_test, target_le):
    """Build and run a Tk form that predicts an incident grade from typed-in values.

    NOTE: this redefines the ``create_ui`` declared earlier in the notebook;
    once this cell has run, the earlier menu/scan-based window is shadowed.

    Parameters
    ----------
    model : object
        Fitted estimator passed on to ``predict_from_file``.
    X_test : sparse matrix or array-like
        Not used by this form; kept so the signature matches the other UI.
    target_le : object
        Fitted label encoder used to decode predictions.

    Notes
    -----
    The handler reads the module-level ``le_cat_columns``, ``ohe_cat_columns``
    and ``numerical_columns`` lists.

    NOTE(review): the form collects only four fields, while
    ``preprocess_data`` / ``encode_data`` index further columns
    ('Timestamp', 'IncidentGrade', the label-encoded columns), so a
    prediction is expected to end in the error dialog until the remaining
    fields are added.
    """
    def on_predict_from_input():
        input_data = {
            'DeviceId': device_id_entry.get(),
            'Sha256': sha256_entry.get(),
            'IpAddress': ip_address_entry.get(),
            'Url': url_entry.get(),
            # Add the other fields that correspond to your data columns
        }

        try:
            df_input = pd.DataFrame([input_data])
            df_input = preprocess_data(df_input, le_cat_columns)
            # encode_data returns five values; only the first feature matrix
            # is needed (unpacking into four names raised ValueError).
            X_input = encode_data(df_input, df_input, ohe_cat_columns,
                                  le_cat_columns, numerical_columns)[0]
            label, prob = predict_from_file(model, X_input, target_le)
            result_label.config(text=f"Prediction: {label}, Probability: {prob:.2f}")
        except Exception as e:
            messagebox.showerror("Error", str(e))

    root = tk.Tk()
    root.title("Security Incident Prediction")

    main_frame = ttk.Frame(root, padding="10")
    main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

    # Input form
    ttk.Label(main_frame, text="Device ID:").grid(row=0, column=0, pady=5)
    device_id_entry = ttk.Entry(main_frame)
    device_id_entry.grid(row=0, column=1, pady=5)

    ttk.Label(main_frame, text="Sha256:").grid(row=1, column=0, pady=5)
    sha256_entry = ttk.Entry(main_frame)
    sha256_entry.grid(row=1, column=1, pady=5)

    ttk.Label(main_frame, text="IP Address:").grid(row=2, column=0, pady=5)
    ip_address_entry = ttk.Entry(main_frame)
    ip_address_entry.grid(row=2, column=1, pady=5)

    ttk.Label(main_frame, text="URL:").grid(row=3, column=0, pady=5)
    url_entry = ttk.Entry(main_frame)
    url_entry.grid(row=3, column=1, pady=5)

    # Add further input fields here

    ttk.Button(main_frame, text="Predict", command=on_predict_from_input).grid(row=4, column=0, columnspan=2, pady=10)

    result_label = ttk.Label(main_frame, text="Prediction: ", font=("Helvetica", 12))
    result_label.grid(row=5, column=0, columnspan=2, pady=10)

    root.mainloop()
