In [9]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot  as plt
import os


In [10]:
# Show the installed XGBoost version.
print(xgb.__version__)

1.7.3


In [11]:
# --- Column configuration ---------------------------------------------------
# Timestamp columns parsed as datetimes when the CSV is read.
# (Currently disabled: response_datetime, resolved_datetime.)
dateTimeCols = ['open_datetime', 'close_datetime']

# Categorical predictors.
cateFeatures = [
    'product_type',
    'brand',
    'incident_type',
    'service_type',
    'is_failure_type',
    'sla',
]

# Numeric predictors.
# (Currently disabled: response_to_resolved_hour, open_to_response_hour,
#  resolved_to_close_hour.)
numericFeatures = ['count_detail', 'open_to_close_hour']

# Target column, plus the label reference columns.
colLabel = 'severity_label'
calLabelRefInfo = [colLabel, 'severity_name']

# All predictors, and the full column selection with the label first.
colFeatures = [*cateFeatures, *numericFeatures]
cols = [colLabel, *colFeatures]

print(cols)

# Lower bound on open_datetime used when filtering incidents.
from_date = '2023-01-10'

['severity_label', 'product_type', 'brand', 'incident_type', 'service_type', 'is_failure_type', 'sla', 'count_detail', 'open_to_close_hour']


# Load and Prepare Data

In [12]:
# Read the incident export; the columns listed in dateTimeCols are parsed as datetimes.
incidents_raw = pd.read_csv('incident_data.csv', parse_dates=dateTimeCols)
print(f"Incident From {incidents_raw['open_datetime'].min()} To {incidents_raw['open_datetime'].max()}")

# Keep incidents opened on or after from_date, restricted to the label + feature
# columns. .copy() makes the result an independent frame rather than a slice of
# the raw data.
df_incident = incidents_raw.query("open_datetime>=@from_date")[cols].copy()

# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df_incident.info()
df_incident.tail()

Incident From 2020-03-15 13:36:00 To 2023-02-01 06:40:00
<class 'pandas.core.frame.DataFrame'>
Int64Index: 97 entries, 0 to 96
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   severity_label      97 non-null     int64  
 1   product_type        97 non-null     object 
 2   brand               97 non-null     object 
 3   incident_type       97 non-null     object 
 4   service_type        97 non-null     object 
 5   is_failure_type     97 non-null     bool   
 6   sla                 97 non-null     object 
 7   count_detail        97 non-null     int64  
 8   open_to_close_hour  97 non-null     float64
dtypes: bool(1), float64(1), int64(2), object(5)
memory usage: 6.9+ KB
None


Unnamed: 0,severity_label,product_type,brand,incident_type,service_type,is_failure_type,sla,count_detail,open_to_close_hour
92,2,Hardware,Oracle,Hard Disk Drive Failure,Incident,False,24x7 4Hrs Resolution Time,1,1.983333
93,3,Software,VMWare,Software,Incident,False,24x7 4Hrs Response Time,2,2.35
94,3,Server,Oracle,CPU Failure,Incident,False,24x7 4Hrs Resolution Time,3,3.0
95,2,Server,HPE,Hard Disk Drive Failure,Incident,False,24x7 4Hrs Response Time,2,8.583333
96,3,Software,VMWare,Software,Incident,False,24x7 4Hrs Response Time,2,19.983333


# Process features and target class

In [13]:
print(f"{cateFeatures} and {colLabel}")

# Build the feature matrix as a new frame. astype() with a per-column mapping
# returns a fresh DataFrame, so nothing is assigned back into a frame that was
# sliced out of df_incident.
X = df_incident[colFeatures].astype({col: "category" for col in cateFeatures})

# NOTE(review): the category codes are derived from the values present in this
# data; presumably they need to match the encoding used when the model was
# trained - confirm.

# info() prints its own summary and returns None; no print() wrapper is needed.
X.info()
X.sample(10)


['product_type', 'brand', 'incident_type', 'service_type', 'is_failure_type', 'sla'] and severity_label
<class 'pandas.core.frame.DataFrame'>
Int64Index: 97 entries, 0 to 96
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   product_type        97 non-null     category
 1   brand               97 non-null     category
 2   incident_type       97 non-null     category
 3   service_type        97 non-null     category
 4   is_failure_type     97 non-null     category
 5   sla                 97 non-null     category
 6   count_detail        97 non-null     int64   
 7   open_to_close_hour  97 non-null     float64 
dtypes: category(6), float64(1), int64(1)
memory usage: 4.4 KB
None


Unnamed: 0,product_type,brand,incident_type,service_type,is_failure_type,sla,count_detail,open_to_close_hour
27,Hardware,Oracle,Backup Failure,Request,False,24x7 4Hrs Resolution Time,1,31.433333
12,Server,HPE,General Incident,Incident,False,24x7 4Hrs Resolution Time,2,8.45
5,Server,HPE,General Incident,Incident,False,24x7 4Hrs Resolution Time,5,92.866667
18,Switch,Cisco,Network Adapter Failure,Incident,True,8x5 4Hrs Response Time,8,56.916667
50,Storage,NetApp,General Incident,Incident,False,24x7 6Hrs Resolution Time,2,17.866667
84,Software,Trend Micro,Configuration Change,Request,False,24x7 4Hrs Response Time,1,4.066667
22,Firewall,Palo Alto,Software,Incident,False,24x7 4Hrs Resolution Time,1,0.366667
92,Hardware,Oracle,Hard Disk Drive Failure,Incident,False,24x7 4Hrs Resolution Time,1,1.983333
71,Server,HPE,Memory Failure,Incident,False,24x7 4Hrs Resolution Time,3,36.5
34,Storage,NetApp,Controller/Node Failure,Incident,False,24x7 4Hrs Response Time,5,20.433333


In [14]:
# Target vector: the label column taken from df_incident, so it keeps that frame's row index.
y = df_incident[colLabel]

# Series.info() prints its own summary and returns None; no print() wrapper is needed.
y.info()
y.sample(10)

<class 'pandas.core.series.Series'>
Int64Index: 97 entries, 0 to 96
Series name: severity_label
Non-Null Count  Dtype
--------------  -----
97 non-null     int64
dtypes: int64(1)
memory usage: 1.5 KB
None


89    2
37    2
69    2
59    3
56    2
86    2
91    3
67    2
33    3
30    3
Name: severity_label, dtype: int64

In [15]:
# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(y)

# print(label_encoder.classes_)

# print(y[-5:])

# Load Model to Predict

In [23]:
# Load the saved booster from 'model.bst' so it can be used for prediction below.
model_xg = xgb.Booster()  # create an empty Booster
model_xg.load_model('model.bst')  # populate it from the saved model file (this loads the model, not data)

In [31]:
# Wrap the features (and labels) in a DMatrix; enable_categorical is set
# because X holds category-typed columns.
dtest = xgb.DMatrix(data=X, label=y, enable_categorical=True)

# Booster outputs, each rounded to the nearest integer.
y_pred_values = model_xg.predict(dtest)
yPredictionsList = list(map(round, y_pred_values))

# Fraction of rows whose rounded prediction equals the true label, shown as a percentage.
accuracy_xg = accuracy_score(y, yPredictionsList)
accuracy_pct = accuracy_xg * 100.0
print("XGBoost Accuracy: %.2f%%" % accuracy_pct)

XGBoost Accuracy: 61.86%


In [33]:
# Build the prediction column on the same row index as y. pd.concat(axis=1)
# aligns rows by index label, so a default 0..n-1 index would pair predictions
# with the wrong rows (or create NaN rows) whenever the filtered data is not
# itself labelled 0..n-1.
yPrediction = pd.Series(
    yPredictionsList, index=y.index, name="severity_prediction"
).to_frame()

In [34]:
# Side-by-side view: prediction, true label, then the feature columns,
# joined column-wise on the row index.
frames = [yPrediction, y, X]
dfAll = pd.concat(frames, axis=1)
dfAll.info()
dfAll

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   severity_prediction  97 non-null     int64   
 1   severity_label       97 non-null     int64   
 2   product_type         97 non-null     category
 3   brand                97 non-null     category
 4   incident_type        97 non-null     category
 5   service_type         97 non-null     category
 6   is_failure_type      97 non-null     category
 7   sla                  97 non-null     category
 8   count_detail         97 non-null     int64   
 9   open_to_close_hour   97 non-null     float64 
dtypes: category(6), float64(1), int64(3)
memory usage: 5.3 KB


Unnamed: 0,severity_prediction,severity_label,product_type,brand,incident_type,service_type,is_failure_type,sla,count_detail,open_to_close_hour
0,2,2,Server,HPE,Hard Disk Drive Failure,Incident,False,24x7 4Hrs Response Time,2,5.333333
1,3,2,Storage,EMC,Hard Disk Drive Failure,Incident,False,8x5 4Hrs Response Time,2,30.450000
2,2,2,Server,HPE,Memory Failure,Incident,False,24x7 4Hrs Response Time,2,6.833333
3,3,3,Server,HPE,General Incident,Incident,False,24x7 4Hrs Resolution Time,8,48.700000
4,2,2,Storage,NetApp,Hard Disk Drive Failure,Incident,False,24x7 4Hrs Response Time,1,22.650000
...,...,...,...,...,...,...,...,...,...,...
92,2,2,Hardware,Oracle,Hard Disk Drive Failure,Incident,False,24x7 4Hrs Resolution Time,1,1.983333
93,2,3,Software,VMWare,Software,Incident,False,24x7 4Hrs Response Time,2,2.350000
94,2,3,Server,Oracle,CPU Failure,Incident,False,24x7 4Hrs Resolution Time,3,3.000000
95,2,2,Server,HPE,Hard Disk Drive Failure,Incident,False,24x7 4Hrs Response Time,2,8.583333
