In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier  # For classification
from sklearn.ensemble import RandomForestRegressor   # For regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier

In [2]:
# Load the first 100,000 rows of the training file.
df = pd.read_csv("GUIDE_train.csv", nrows=100000)

# Drop every column that is more than 50% missing in this sample.
null_pct = df.isna().sum() / len(df) * 100
df.drop(columns=null_pct[null_pct > 50].index, inplace=True)

# Rows without a label cannot be used for supervised training.
df.dropna(subset=['IncidentGrade'], inplace=True)

# Balance the three grades by downsampling each one to the size of the
# rarest class.
benign_positive = df[df['IncidentGrade'] == 'BenignPositive']
true_positive = df[df['IncidentGrade'] == 'TruePositive']
false_positive = df[df['IncidentGrade'] == 'FalsePositive']

# Use the smallest class, whichever it happens to be: sampling without
# replacement raises ValueError when n_samples exceeds the rows available,
# so assuming that FalsePositive is always the rarest grade is fragile.
min_size = min(len(benign_positive), len(true_positive), len(false_positive))
if min_size == 0:
    raise ValueError(
        "At least one IncidentGrade class has no rows in the loaded sample; "
        "cannot build a balanced training set."
    )

# Same seed and same concatenation order for every class keeps the result
# reproducible from run to run.
df = pd.concat([
    resample(grade_rows, replace=False, n_samples=min_size, random_state=42)
    for grade_rows in (benign_positive, true_positive, false_positive)
])

# Shuffle so the classes are interleaved, then rebuild a clean 0..n-1 index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Expand the timestamp into numeric calendar/clock components, appended in
# this order, then drop the raw timestamp and the row identifier.
timestamps = pd.to_datetime(df['Timestamp'])
for part in ("year", "month", "day", "hour", "minute", "second"):
    df[part] = getattr(timestamps.dt, part)
df.drop(columns=["Timestamp", "Id"], inplace=True)

# Integer-encode every remaining text column, the target included.
# One fitted encoder is kept per column so the same mapping can be applied to
# other data with `transform`, or reversed with `inverse_transform`, instead
# of being lost when a single encoder is refit for the next column.
label_encoders = {}
for col in df.select_dtypes(include="object").columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

def split(df, target="IncidentGrade", test_size=0.2, random_state=42):
    """Separate features from the target and make a stratified train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns together with the ``target`` column.
    target : str, default "IncidentGrade"
        Name of the label column.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(x, y, X_train, X_test, y_train, y_test)`` where ``x`` / ``y`` are
        the complete feature frame and label series before splitting.
    """
    x = df.drop(columns=[target])
    y = df[target]
    # Stratify on the label so both partitions keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=y,
    )
    return x, y, X_train, X_test, y_train, y_test

In [3]:
# Restrict the model to a hand-picked subset of columns plus the target.
n_df = df[["OrgId", "IncidentId", "AlertId", "AlertTitle", "day", "Category", "DetectorId", "IncidentGrade"]]

x, y, x_train, x_test, y_train, y_test = split(n_df)

# `use_label_encoder` is deliberately not passed: the recorded run reports it
# as an unused parameter, so it only produced a warning.
xgb_clf = XGBClassifier(eval_metric='mlogloss')

# Every list holds a single value, so the search evaluates exactly one
# configuration; GridSearchCV is used here for its 5-fold cross-validation
# and the final refit on the whole training split.
param_grid = {
    'n_estimators': [300],
    'max_depth': [7],
    'learning_rate': [0.2],
    'subsample': [0.8],
    'colsample_bytree': [1.0]
}

grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

print("Best Parameters: ", grid_search.best_params_)

# Estimator refit on all of x_train with the best parameters; later cells
# use it for prediction.
best_model = grid_search.best_estimator_



Fitting 5 folds for each of 1 candidates, totalling 5 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters:  {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}


In [4]:
# Score the refit model on the test partition produced by split().
y_pred = best_model.predict(x_test)

# Per-class precision/recall/F1 first, then the raw confusion matrix.
for summary in (classification_report(y_test, y_pred),
                confusion_matrix(y_test, y_pred)):
    print(summary)

              precision    recall  f1-score   support

           0       0.91      0.92      0.92      4313
           1       0.91      0.94      0.92      4313
           2       0.95      0.91      0.93      4313

    accuracy                           0.92     12939
   macro avg       0.92      0.92      0.92     12939
weighted avg       0.92      0.92      0.92     12939

[[3967  221  125]
 [ 187 4040   86]
 [ 196  180 3937]]


In [5]:
# Score the already-fitted model on the first 10,000 rows of the separate
# test file.
df1 = pd.read_csv("GUIDE_test.csv", nrows=10000)

# Drop every column that is more than 50% missing in this sample, plus the
# "Usage" column.
null_pct = df1.isna().sum() / len(df1) * 100
df1 = df1.drop(columns=null_pct[null_pct > 50].index)
df1 = df1.drop(columns="Usage")

# Remove unlabeled rows. dropna() returns a new frame, so the result has to
# be assigned; calling it without assignment leaves df1 unchanged.
df1 = df1.dropna(subset=["IncidentGrade"])

# Expand the timestamp into numeric calendar/clock components, appended in
# this order, then drop the raw timestamp and the row identifier.
timestamps = pd.to_datetime(df1['Timestamp'])
for part in ("year", "month", "day", "hour", "minute", "second"):
    df1[part] = getattr(timestamps.dt, part)
df1 = df1.drop(columns=["Timestamp", "Id"])

# NOTE(review): the encoder is refit on this file. LabelEncoder assigns codes
# from the sorted unique values it is fit on, so a category can receive a
# different integer here than it did in the data the model was trained on
# whenever the two sets of values differ. The scores below are only reliable
# to the extent the mappings agree. The robust fix is to keep the encoders
# fitted on the training data and call `transform` here (deciding how to
# handle categories unseen in training, for which `transform` raises).
Labelencoder = LabelEncoder()
for col in df1.select_dtypes(include="object").columns:
    df1[col] = Labelencoder.fit_transform(df1[col])

# Same columns the model was trained on.
t_df = df1[["OrgId", "IncidentId", "AlertId", "AlertTitle", "day", "Category", "DetectorId", "IncidentGrade"]]

# Caveat: this rebinds x_test / y_test / y_pred, which the earlier evaluation
# cell also uses; re-running that cell after this one scores this file instead.
x_test = t_df.drop(columns="IncidentGrade")
y_test = t_df["IncidentGrade"]
y_pred = best_model.predict(x_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      4192
           1       0.79      0.88      0.84      2167
           2       0.92      0.90      0.91      3641

    accuracy                           0.89     10000
   macro avg       0.87      0.89      0.88     10000
weighted avg       0.89      0.89      0.89     10000

[[3656  336  200]
 [ 158 1917   92]
 [ 201  160 3280]]
