In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.linear_model import LogisticRegressionCV ,SGDClassifier
from sklearn.metrics import accuracy_score ,classification_report,confusion_matrix,precision_score
from sklearn.model_selection import RandomizedSearchCV ,StratifiedKFold,StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from datetime import datetime
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv('dataset/process/scalar_train.csv')
test_data = pd.read_csv('dataset/process/scalar_test.csv')

In [3]:
train_data.value_counts("IncidentGrade")

IncidentGrade
2    634706
1     67320
0     58601
Name: count, dtype: int64

In [4]:
train_data_master = train_data.copy()

In [5]:
train_data, val_data = train_test_split(train_data_master, test_size=0.2, random_state=42)

In [6]:
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)

In [7]:
train_data.head()

Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,...,ResourceIdName,OSFamily,OSVersion,CountryCode,State,City,Year,Month,DayOfWeek,Hour
0,1678,1,301,309248,0,0.0,5,2,0,1,...,3586,5,66,242,1445,10630,1.0,0.454545,0.166667,0.782609
1,1923,342,120513,850402,0,0.0,5,1,1,1,...,3586,5,66,242,1445,10630,1.0,0.454545,0.5,0.608696
2,2495,5,286,163160,0,0.0,5,2,0,1,...,3586,5,66,242,1445,10630,1.0,0.454545,0.333333,0.173913
3,1809,0,303,69029,0,0.0,5,2,0,1,...,3586,5,66,242,1445,10630,1.0,0.454545,0.166667,0.826087
4,3360,35,139,1204921,0,0.0,5,2,0,1,...,3586,5,66,242,1445,10630,1.0,0.454545,1.0,0.826087


In [8]:
y_train = train_data['IncidentGrade']
x_train = train_data.drop(columns=['IncidentGrade'],axis=1)

In [9]:
X_sample = val_data.drop(columns=['IncidentGrade'],axis=1)
y_sample = val_data['IncidentGrade']

In [10]:
y_test = test_data['IncidentGrade']
x_test = test_data.drop(columns=['IncidentGrade'],axis=1)

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# Initialize logistic regression
logreg = LogisticRegression(C = 100, penalty = 'l1', solver='liblinear')

logreg = logreg.fit(x_train,y_train)

In [13]:
y_pred = logreg.predict(X_sample)
print("Classification Report:")
print(classification_report(y_sample, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_sample, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.66      0.64     11737
           1       0.81      0.57      0.67     13525
           2       0.94      0.97      0.96    126864

    accuracy                           0.91    152126
   macro avg       0.80      0.73      0.76    152126
weighted avg       0.91      0.91      0.91    152126

Confusion Matrix:
[[  7707   1140   2890]
 [  1399   7660   4466]
 [  3071    625 123168]]


## Naive bayers

In [14]:
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import RandomOverSampler

In [15]:
nb = GaussianNB()

In [16]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_sample, y_sample)

In [17]:
nb.fit(X_resampled, y_resampled)

In [18]:
val=nb.predict(X_sample)
ac = accuracy_score(y_sample, val)
print(ac)

0.8536147667065459


## Decision Tree classifier

In [19]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [20]:
dt_classifier = DecisionTreeClassifier(random_state=42)

param_dist = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [10, 20, 50],
    'min_samples_leaf': [5, 10, 20]
}

random_search = RandomizedSearchCV(estimator=dt_classifier, param_distributions=param_dist,
                                   n_iter=50, scoring='f1', cv=5, n_jobs=-1, random_state=42)

random_search.fit(X_resampled, y_resampled)

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", l

In [21]:
best_params = random_search.best_params_
print("Best hyperparameters found: ", best_params)

Best hyperparameters found:  {'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 5}


In [22]:
best_dt_classifier = random_search.best_estimator_
y_pred = best_dt_classifier.predict(X_sample)
accuracy = accuracy_score(y_sample, y_pred)
print("Accuracy on val set: ", accuracy)

Accuracy on val set:  0.931267501939182


In [23]:
clf = classification_report(y_sample, y_pred)
print(clf)

              precision    recall  f1-score   support

           0       0.54      0.95      0.69     11737
           1       0.92      0.81      0.86     13525
           2       1.00      0.94      0.97    126864

    accuracy                           0.93    152126
   macro avg       0.82      0.90      0.84    152126
weighted avg       0.96      0.93      0.94    152126



## XGBoost

In [24]:
smote = SMOTE(random_state=42)
train_X_resampled, train_y_resampled = smote.fit_resample(x_train, y_train)

In [25]:
from xgboost import XGBClassifier

In [26]:
xgb = XGBClassifier(objective='multi:softmax', eval_metric='mlogloss',
                    use_label_encoder=False, random_state=42,
                    tree_method='hist')



In [27]:
xgb.fit(X_resampled, y_resampled)

In [28]:
y_val_pred_best = xgb.predict(X_sample)
print("Validation Set Classification Report :")
print(classification_report(y_val_pred_best, y_sample))

Validation Set Classification Report :
              precision    recall  f1-score   support

           0       0.97      0.92      0.94     12304
           1       0.96      0.95      0.96     13684
           2       0.99      1.00      1.00    126138

    accuracy                           0.99    152126
   macro avg       0.97      0.96      0.97    152126
weighted avg       0.99      0.99      0.99    152126



In [29]:
# Define hyperparameter grid
param_distributions = {
    'max_depth': [7],
    'n_estimators': [200],
    'learning_rate': [0.2],
    'subsample': [0.8],
    'colsample_bytree': [1.0],
}

random_search = RandomizedSearchCV(
    estimator= XGBClassifier(objective='multi:softmax', eval_metric='mlogloss',
                    use_label_encoder=False, random_state=42,
                    tree_method='hist'),
    param_distributions=param_distributions,
    scoring='f1_macro',
    cv=3,
    n_iter=20,
    verbose=1,
    n_jobs=-1
)



In [30]:
random_search.fit(train_X_resampled, train_y_resampled)

Fitting 3 folds for each of 1 candidates, totalling 3 fits




In [31]:
best_xgb = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Evaluate on validation set
y_val_pred_best = best_xgb.predict(X_sample)
print("Validation Set Classification Report :")
print(classification_report(y_val_pred_best, y_sample))

Best Parameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2, 'colsample_bytree': 1.0}
Validation Set Classification Report :
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     11969
           1       0.96      0.95      0.96     13695
           2       0.99      1.00      1.00    126462

    accuracy                           0.99    152126
   macro avg       0.97      0.96      0.97    152126
weighted avg       0.99      0.99      0.99    152126



In [32]:
import joblib
joblib.dump(best_xgb,'model.pkl')

['model.pkl']