In [None]:
import os
import pandas as pd

from scipy.sparse import hstack, csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory holding the GUIDE CSV files. Set the GUIDE_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the
# original author's local folder.
DATA_DIR = os.environ.get(
    'GUIDE_DATA_DIR',
    '/Users/tentoilatai/Library/CloudStorage/OneDrive-Personal/Tran Viet Tai/Hoc tap/Ky VI/Hoc may va khai pha du lieu/BTL',
)

# Read only the first 10,000 rows so the exploratory cells below stay fast.
train_data = pd.read_csv(os.path.join(DATA_DIR, 'GUIDE_Train.csv'), nrows=10000)

In [None]:
# Distinct values found in the 'Category' column of the sample.
train_data.loc[:, 'Category'].unique()

In [None]:
# Distinct values found in the 'EvidenceRole' column of the sample.
train_data.loc[:, 'EvidenceRole'].unique()

In [None]:
# Distinct values found in the 'CountryCode' column of the sample.
train_data.loc[:, 'CountryCode'].unique()

In [None]:
# Preview the first rows, restricted to the first 15 columns.
train_data.head().iloc[:, :15]

In [None]:
# Preview columns 15-29, keeping columns 0 and 9 alongside for reference.
train_data.head().iloc[:, [0, 9, *range(15, 30)]]

In [None]:
# Preview columns 30-44, keeping columns 0 and 9 alongside for reference.
train_data.head().iloc[:, [0, 9, *range(30, 45)]]

In [None]:
# Number of rows per target class (missing grades are not counted).
train_data.loc[:, 'IncidentGrade'].value_counts()

In [None]:
# Share of each target class as a percentage of ALL rows; the denominator
# includes rows whose grade is missing.
grade_counts = train_data['IncidentGrade'].value_counts()
grade_counts.mul(100).div(len(train_data['IncidentGrade']))

In [None]:
# Count of missing values in every column.
train_data.isna().sum()

In [None]:
# Fields that are categorical by nature.
_categorical_fields = [
    'Category', 'EntityType', 'EvidenceRole', 'SuspicionLevel', 'LastVerdict',
    'ResourceType', 'Roles', 'AntispamDirection', 'ThreatFamily', 'CountryCode',
    'OSFamily', 'OSVersion', 'State', 'City', 'RegistryValueName', 'RegistryValueData',
    'ResourceIdName', 'RegistryKey', 'OAuthApplicationId', 'ApplicationId', 'ApplicationName',
]

# Fields originally listed as "numerical"; they are label-encoded together
# with the categorical fields instead of being fed to the models as raw numbers.
_identifier_fields = [
    'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId',
    'AccountName', 'DeviceName', 'NetworkMessageId', 'EmailClusterId', 'FileName', 'FolderPath',
]

# Every feature ends up in the label-encoded group; the other two groups are
# intentionally left empty.
le_cat_columns = [*_categorical_fields, *_identifier_fields]
numerical_columns = []
ohe_cat_columns = []

In [None]:
# Cardinality of every label-encoded column, largest first.
le_cardinality = train_data[le_cat_columns].nunique()
le_cardinality.sort_values(ascending=False)

In [None]:
# Cardinality of every numerical column, largest first.
numerical_cardinality = train_data[numerical_columns].nunique()
numerical_cardinality.sort_values(ascending=False)

In [None]:
# List the distinct values of every column that has fewer than 10 of them.
unique_counts = train_data.nunique()
for col in train_data.columns:
    if unique_counts[col] >= 10:
        continue
    print(col, train_data[col].unique())

In [None]:
## Data Exploration

In [None]:
def preprocess_data(df, le_cat_columns):
    """
        Normalize column dtypes ahead of feature encoding.

        The frame is modified in place: 'Timestamp' is parsed with
        pd.to_datetime and every column named in le_cat_columns is cast
        to 'object' dtype. The same (mutated) frame is returned.
    """
    parsed_timestamps = pd.to_datetime(df['Timestamp'])
    df['Timestamp'] = parsed_timestamps

    for column_name in le_cat_columns:
        as_object = df[column_name].astype('object')
        df[column_name] = as_object

    return df

In [None]:
# Apply the dtype normalization to the exploratory sample.
train_data = preprocess_data(df=train_data, le_cat_columns=le_cat_columns)

In [None]:
# Print per-column cardinality for each feature group in turn:
# label-encoded, one-hot encoded, numerical.
for column_group in (le_cat_columns, ohe_cat_columns, numerical_columns):
    print(train_data[column_group].nunique())

In [None]:
from sklearn.feature_selection import f_classif

cat_columns = ohe_cat_columns + le_cat_columns

# The encoded target is the same for every feature, so encode it once here
# instead of refitting a LabelEncoder on each loop iteration.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_data['IncidentGrade'])

# For each categorical feature, one-hot encode it on its own and run an ANOVA
# F-test of every resulting indicator column against the target.
for cat in cat_columns:
    onehot_encoder = OneHotEncoder(sparse_output=False)
    X_encoded = onehot_encoder.fit_transform(train_data[[cat]])

    # f_classif returns one F-statistic and one p-value per indicator column.
    f_statistic, p_value = f_classif(X_encoded, y)

    print("*" * 20)
    print(f"Feature: {cat}")
    print(f"ANOVA F-Statistic: {f_statistic}")
    print(f"p-Value: {p_value}")

In [None]:
# Make sure 'Timestamp' holds datetimes, then summarize dtypes and
# non-null counts for the whole frame.
train_data['Timestamp'] = train_data['Timestamp'].pipe(pd.to_datetime)

train_data.info()

In [None]:
def process_data(train_path=None, test_path=None):
    """
        Load the full GUIDE train/test CSVs and build model-ready matrices.

        Steps: drop rows without an 'IncidentGrade', normalize dtypes with
        preprocess_data, de-duplicate training rows on the feature columns,
        drop the 'Usage' column from the test set, encode the features and
        label-encode the target.

        Relies on the notebook-level lists ohe_cat_columns, le_cat_columns
        and numerical_columns. The output columns are laid out as
        [one-hot block, label-encoded block, numerical block].

        Parameters
        ----------
        train_path, test_path : str or None
            CSV locations. When None, 'GUIDE_Train.csv' / 'GUIDE_Test.csv'
            are looked up in the directory named by the GUIDE_DATA_DIR
            environment variable, falling back to the original hard-coded
            directory.

        Returns
        -------
        X_train, y_train, X_test, y_test
            Sparse CSR feature matrices and integer-encoded target arrays.
    """
    data_dir = os.environ.get(
        'GUIDE_DATA_DIR',
        '/Users/tentoilatai/Library/CloudStorage/OneDrive-Personal/Tran Viet Tai/Hoc tap/Ky VI/Hoc may va khai pha du lieu/BTL',
    )
    if train_path is None:
        train_path = os.path.join(data_dir, 'GUIDE_Train.csv')
    if test_path is None:
        test_path = os.path.join(data_dir, 'GUIDE_Test.csv')

    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    print(train_data.shape)

    # Rows without a target can be neither trained on nor scored. Dropping
    # them from the test set as well keeps the target encoder (fitted on the
    # training grades only) from failing on a missing value.
    train_data.dropna(subset=['IncidentGrade'], inplace=True)
    test_data.dropna(subset=['IncidentGrade'], inplace=True)

    train_data = preprocess_data(train_data, le_cat_columns)
    test_data = preprocess_data(test_data, le_cat_columns)

    group_columns = ohe_cat_columns + numerical_columns + le_cat_columns

    # De-duplicate on the feature columns only; the first occurrence is kept.
    train_data = train_data.drop_duplicates(subset=group_columns)

    test_data.drop(['Usage'], axis=1, inplace=True)

    print(train_data.shape)
    print(test_data.shape)

    # One-hot block: categories unseen during fit are encoded as all zeros.
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(train_data[ohe_cat_columns])

    train_data_ohe = csr_matrix(ohe.transform(train_data[ohe_cat_columns]))
    test_data_ohe = csr_matrix(ohe.transform(test_data[ohe_cat_columns]))

    # Numerical block: missing values are replaced by the sentinel -1.
    train_data_numerical = csr_matrix(train_data[numerical_columns].fillna(-1).values)
    test_data_numerical = csr_matrix(test_data[numerical_columns].fillna(-1).values)

    # Label-encoded block. Each encoder is fitted on train and test values
    # together so that categories appearing only in the test set still
    # receive a code.
    train_codes = {}
    test_codes = {}

    for le_col in le_cat_columns:
        feature_le = LabelEncoder()
        feature_le.fit(pd.concat([train_data[le_col], test_data[le_col]]))
        train_codes[le_col] = feature_le.transform(train_data[le_col])
        test_codes[le_col] = feature_le.transform(test_data[le_col])

    train_data_le = csr_matrix(pd.DataFrame(train_codes).values)
    test_data_le = csr_matrix(pd.DataFrame(test_codes).values)

    # Ask for CSR explicitly; hstack would otherwise return a COO matrix,
    # which does not support row indexing or slicing.
    X_train = hstack([train_data_ohe, train_data_le, train_data_numerical], format='csr')
    X_test = hstack([test_data_ohe, test_data_le, test_data_numerical], format='csr')

    target_le = LabelEncoder()

    target_le.fit(train_data['IncidentGrade'])
    y_train = target_le.transform(train_data['IncidentGrade'])
    y_test = target_le.transform(test_data['IncidentGrade'])

    # LabelEncoder assigns codes in sorted class order, which gives:
    #   0: 'BenignPositive'
    #   1: 'FalsePositive'
    #   2: 'TruePositive'
    print(f"Target Classes: {target_le.classes_}")

    return X_train, y_train, X_test, y_test


X_train, y_train, X_test, y_test = process_data()

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def predict(model, X_test, y_test):
    """
        Score a fitted classifier on the test set and report the results.

        Prints the accuracy and the classification report, then plots the
        confusion matrix with the three incident-grade names as labels.

        Parameters
        ----------
        model : fitted estimator exposing predict()
        X_test : feature matrix accepted by model.predict
        y_test : integer-encoded true labels (0, 1, 2)

        Returns
        -------
        1-D array of predicted labels.
    """
    class_names = ['BenignPositive', 'FalsePositive', 'TruePositive']

    # Flatten the predictions: an estimator may return a column vector of
    # shape (n, 1) rather than a flat array; ravel is a no-op for 1-D output.
    y_pred = np.ravel(model.predict(X_test))

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")

    # Pin the label set so the matrix is always 3x3 and lines up with
    # class_names even if a class is absent from both y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=list(range(len(class_names))))

    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                        display_labels=class_names)

    cm_display.plot()
    plt.show()

    return y_pred

In [None]:
def train_random_forest_classifier(X_train, y_train):
    """
        Fit a random forest (100 trees, depth 5, seed 0) and plot its
        feature importances in descending order.

        Bar labels come from the notebook-level column lists in the order
        one-hot, label-encoded, numerical. If that list does not have
        exactly one name per column of X_train (one-hot encoding expands a
        single column into several), generic 'feature_<i>' labels are used
        instead of failing on an out-of-range index.

        Returns the fitted model.
    """
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)

    model.fit(X_train, y_train)

    importances = model.feature_importances_
    n_features = X_train.shape[1]

    feature_columns = np.array(ohe_cat_columns + le_cat_columns + numerical_columns)
    if len(feature_columns) != n_features:
        feature_columns = np.array([f"feature_{i}" for i in range(n_features)])

    # Positions of the features sorted from most to least important.
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(12, 6))
    plt.title("Feature Importances (Random Forest Classifier)")
    plt.bar(range(n_features), importances[indices], align="center")
    plt.xticks(range(n_features), feature_columns[indices], rotation=90)
    plt.xlim([-1, n_features])
    plt.show()

    return model

In [None]:
# Fit the random forest on the full training matrices.
rfc_model = train_random_forest_classifier(X_train=X_train, y_train=y_train)

In [None]:
# Score the random forest on the test set, then report macro-averaged metrics.
y_pred = predict(rfc_model, X_test, y_test)

macro = {'average': 'macro'}
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, **macro)
precision = precision_score(y_test, y_pred, **macro)
f1 = f1_score(y_test, y_pred, **macro)

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Macro-Precision', precision),
    ('Macro-Recall', recall),
    ('Macro-F1 Score', f1),
):
    print('{}: {}'.format(metric_name, metric_value))

In [None]:
from xgboost import XGBClassifier

def train_xgboost_classifier(X_train, y_train):
    """
        Fit an XGBoost classifier (100 rounds, depth 5, seed 0, mlogloss
        evaluation metric) and plot its feature importances in descending
        order.

        Bar labels come from the notebook-level column lists in the order
        one-hot, label-encoded, numerical. If that list does not have
        exactly one name per column of X_train (one-hot encoding expands a
        single column into several), generic 'feature_<i>' labels are used
        instead of failing on an out-of-range index.

        Returns the fitted model.
    """
    model = XGBClassifier(n_estimators=100, max_depth=5, random_state=0, use_label_encoder=False, eval_metric='mlogloss')

    model.fit(X_train, y_train)

    importances = model.feature_importances_
    n_features = X_train.shape[1]

    feature_columns = np.array(ohe_cat_columns + le_cat_columns + numerical_columns)
    if len(feature_columns) != n_features:
        feature_columns = np.array([f"feature_{i}" for i in range(n_features)])

    # Positions of the features sorted from most to least important.
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(12, 6))
    plt.title("Feature Importances (XGBoost Classifier)")
    plt.bar(range(n_features), importances[indices], align="center")
    plt.xticks(range(n_features), feature_columns[indices], rotation=90)
    plt.xlim([-1, n_features])
    plt.show()

    return model

In [None]:
# Fit the XGBoost model on the full training matrices.
xgb_model = train_xgboost_classifier(X_train=X_train, y_train=y_train)

In [None]:
# Score the XGBoost model on the test set, then report macro-averaged metrics.
y_pred = predict(xgb_model, X_test, y_test)

macro = {'average': 'macro'}
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, **macro)
precision = precision_score(y_test, y_pred, **macro)
f1 = f1_score(y_test, y_pred, **macro)

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Macro-Precision', precision),
    ('Macro-Recall', recall),
    ('Macro-F1 Score', f1),
):
    print('{}: {}'.format(metric_name, metric_value))

In [None]:
from catboost import CatBoostClassifier

def train_catboost_classifier(X_train, y_train):
    """
        Fit a CatBoost classifier (100 iterations, depth 5, seed 0, silent)
        and plot its feature importances in descending order.

        Bar labels come from the notebook-level column lists in the order
        one-hot, label-encoded, numerical. If that list does not have
        exactly one name per column of X_train (one-hot encoding expands a
        single column into several), generic 'feature_<i>' labels are used
        instead of failing on an out-of-range index.

        Returns the fitted model.
    """
    model = CatBoostClassifier(iterations=100, depth=5, random_seed=0, verbose=0)

    model.fit(X_train, y_train)

    # np.asarray guarantees the fancy indexing below works whatever sequence
    # type get_feature_importance() hands back.
    importances = np.asarray(model.get_feature_importance())
    n_features = X_train.shape[1]

    feature_columns = np.array(ohe_cat_columns + le_cat_columns + numerical_columns)
    if len(feature_columns) != n_features:
        feature_columns = np.array([f"feature_{i}" for i in range(n_features)])

    # Positions of the features sorted from most to least important.
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(12, 6))
    plt.title("Feature Importances (CatBoost Classifier)")
    plt.bar(range(n_features), importances[indices], align="center")
    plt.xticks(range(n_features), feature_columns[indices], rotation=90)
    plt.xlim([-1, n_features])
    plt.show()

    return model

In [None]:
# Fit the CatBoost model on the full training matrices.
cat_model = train_catboost_classifier(X_train=X_train, y_train=y_train)

In [None]:
# Score the CatBoost model on the test set, then report macro-averaged metrics.
y_pred = predict(cat_model, X_test, y_test)

macro = {'average': 'macro'}
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, **macro)
precision = precision_score(y_test, y_pred, **macro)
f1 = f1_score(y_test, y_pred, **macro)

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Macro-Precision', precision),
    ('Macro-Recall', recall),
    ('Macro-F1 Score', f1),
):
    print('{}: {}'.format(metric_name, metric_value))