In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import time
from datetime import datetime

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV


# Best: 0.766234 using {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 50} with automatic categorical encoding by XGBoost


In [None]:
# --- Column groups -----------------------------------------------------------
# Timestamp columns that are parsed as datetimes when the CSV is loaded.
dateTimeCols = [
    'open_datetime',
    'response_datetime',
    'resolved_datetime',
    'close_datetime',
]

# Categorical predictors (these are the columns that get encoded later on).
cateFeatures = [
    'product_type',
    'brand',
    'incident_type',
    'service_type',
    'is_failure_type',
    'sla',
]

# Numeric predictors, used as-is.
numericFeatures = [
    'count_detail',
    'open_to_close_hour',
    'response_to_resolved_hour',
    'open_to_response_hour',
    'resolved_to_close_hour',
]

# Target column.
colLabel = 'severity_label'

# Full column selection: the label first, followed by every feature.
colFeatures = cateFeatures + numericFeatures
cols = [colLabel] + colFeatures
print(cols)

# --- Encoding switches -------------------------------------------------------
# True  -> one-hot encode the categorical columns through a ColumnTransformer.
# False -> fall back to the branch selected by isManualEncoding.
isAutomaticOneHotByColumnTransformer = True
# Only consulted when the switch above is False:
# True -> LabelEncoder per column, False -> pandas "category" dtype.
isManualEncoding = True

# --- Data window and tuning metric -------------------------------------------
to_date = '2023-12-31'

xscore = 'accuracy'

# Load and Prepare Data

In [None]:
# Read the incident export; the columns listed in dateTimeCols are parsed as datetimes.
df_incident = pd.read_csv('incident_data.csv', parse_dates=dateTimeCols)
print(f"Incident From {df_incident['open_datetime'].min()} To {df_incident['open_datetime'].max()}")

# Keep incidents opened on or before `to_date` and only the modelling columns.
# `to_date` is a date without a time component, so comparing the raw timestamps
# against it would cut off at midnight and drop everything opened later on that
# day. Comparing the calendar day instead makes the end date inclusive.
opened_on = df_incident['open_datetime'].dt.normalize()
df_incident = df_incident.loc[opened_on <= to_date, cols]

# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit a stray "None").
df_incident.info()
df_incident.tail()



In [None]:
# Bar chart of the number of incidents per target class, to check class balance.
label_counts = df_incident.groupby([colLabel]).size()
label_counts.plot(kind='bar')
plt.show()

# Process features and class

In [None]:
# cate_features=[ col for col in features if  col not in numericFeatures ]
# Build the feature matrix X using one of three encodings of the categorical columns:
#   * ColumnTransformer one-hot encoding (isAutomaticOneHotByColumnTransformer is True)
#   * LabelEncoder per column            (automatic off, isManualEncoding is True)
#   * pandas "category" dtype            (automatic off, isManualEncoding is False)
if not isAutomaticOneHotByColumnTransformer:
    if not isManualEncoding:
        print("Let XGBoost encode itself")
    else:
        print("Encode by  LabelEncoder")

    features = [col for col in df_incident.columns if col != colLabel]
    # Work on an explicit copy so the column assignments below never write
    # through to (or warn about) a slice of df_incident.
    X = df_incident.loc[:, features].copy()
    for cate_col in cateFeatures:
        if not isManualEncoding:
            X[cate_col] = X[cate_col].astype("category")
        else:
            cateFeatEncoder = LabelEncoder()
            X[cate_col] = cateFeatEncoder.fit_transform(X[cate_col])
else:
    categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    ct = ColumnTransformer(
        transformers=[("cat", categorical_transformer, cateFeatures)],
        remainder='passthrough',
    )
    ct.set_output(transform='pandas')
    # df_incident still contains the label column; it is passed through
    # untouched and split off from X in the next cell.
    X = ct.fit_transform(df_incident)

    # Passed-through columns come back with a "remainder__" prefix, so the label
    # name has to follow. The guard keeps the cell idempotent: without it every
    # re-run would stack another prefix onto colLabel and the label lookup in
    # the next cell would fail.
    remainder_prefix = "remainder__"
    if not colLabel.startswith(remainder_prefix):
        colLabel = f"{remainder_prefix}{colLabel}"
    print(f"{colLabel} :OneHot Encoding to Fetures by ColumnTransformer")
         

In [None]:
# Extract the target as a pandas Series in both branches, so the label encoder
# downstream always receives a 1-D input (a one-column DataFrame triggers a
# column-vector conversion warning in scikit-learn).
if not isAutomaticOneHotByColumnTransformer:
    Y = df_incident[colLabel].astype("category")
else:
    # The label travelled through the ColumnTransformer as a passthrough
    # column; pop() removes it from X and returns it.
    # NOTE: pop() mutates X, so this cell must run exactly once after the
    # encoding cell. Re-run the encoding cell before re-running this one.
    Y = X.pop(colLabel)

# info() prints its own report and returns None, so it is not wrapped in print().
Y.info()
Y[:5]

In [None]:
# Inspect the final feature matrix. DataFrame.info() prints its report itself
# and returns None, so wrapping it in print() would only add a stray "None".
X.info()
X.tail()

# Encode string class values as integers

In [None]:
# Map the class labels to consecutive integers; the fitted encoder is kept so
# the integers can be translated back to the original labels later.
label_encoder = LabelEncoder()
ylabelEncoded = label_encoder.fit_transform(Y)

# Show the learned classes (position in this array == encoded integer) ...
print(label_encoder.classes_)

# ... and a sample of the encoded target from the end of the data.
print(ylabelEncoded[-5:])

# XGBoost

In [None]:
# # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
# if isManualEncoding==True or isAutomaticOneHotByColumnTransformer==True:
#     print("XGBClassifier()")
#     model = XGBClassifier()
# else:
#    print("XGBClassifier(enable_categorical=True,tree_method=hist)")
#    model = XGBClassifier(enable_categorical=True,tree_method="hist")

# print(model)

# n_estimators = list(range(10, 30,10))
# learning_rate=[0.001,0.01]
# n_estimators = list(range(50,500 ,50))
# learning_rate =[0.001,0.005,0.01,0.05,0.1,0.5]
# max_depth = [6, 10, 15, 20]

# print('the number of trees: ',n_estimators)
# print('learning rate: ',learning_rate)
# print('max_depth: ',max_depth)

# print(f"Tune combination : {len(n_estimators)*len(learning_rate)*len(max_depth)}")
# param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators,max_depth=max_depth)

# Random Forest

In [None]:
# Seed shared by the estimator and the CV splitter so tuning runs are repeatable.
random_x = 7

# Seed the forest itself: without random_state every run grows different trees,
# so the grid-search scores would not be reproducible.
model = RandomForestClassifier(random_state=random_x)

# Hyper-parameter grid.
n_estimators = list(range(50, 500, 50))
# 'auto' is intentionally not listed: for classifiers it was only an alias of
# 'sqrt' (a duplicate grid point), it was deprecated in scikit-learn 1.1 and
# removed in 1.3, where every fit using it fails.
max_features = ['sqrt', 'log2']
max_depth = [6, 10, 15, 20]

print('the number of trees: ', n_estimators)
print('max_features: ', max_features)
print('max_depth: ', max_depth)

print(f"Tune combination : {len(n_estimators)*len(max_features)*len(max_depth)}")
param_grid = dict(max_features=max_features, n_estimators=n_estimators, max_depth=max_depth)


In [None]:
# Cross-validation scheme used for tuning: shuffled, stratified k-fold seeded
# with random_x so the fold assignment is the same on every run.
xsplits = 10
kfold = StratifiedKFold(
    n_splits=xsplits,
    shuffle=True,
    random_state=random_x,
)

In [None]:
# Start the wall-clock timer for the grid search and log when tuning began.
t_Start = time.time()

tuning_started_at = datetime.now()
print(f"Start tuning at {tuning_started_at}")

In [None]:
# Exhaustive grid search over param_grid, scored with `xscore` on the
# stratified folds and using all available CPU cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=xscore,
    n_jobs=-1,
    cv=kfold,
)
grid_result = grid_search.fit(X, ylabelEncoded)

In [None]:
# Stop the wall-clock timer and report how long tuning took.
t_End = time.time()
elapsed_seconds = t_End - t_Start
t_elapsed = elapsed_seconds / 60 / 60  # seconds -> minutes -> hours
print('Total execution : ', round(t_elapsed, 2))
print(datetime.now())

In [None]:
# Report the winning configuration, then every evaluated combination with the
# mean and standard deviation of its cross-validated score.
cv_results = grid_result.cv_results_
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


In [None]:
# plot results
# Look each mean CV score up by its parameter combination instead of reshaping
# the flat result array. The grid search enumerates combinations with the
# parameter names sorted alphabetically (max_depth, max_features, n_estimators),
# which differs from the order param_grid was declared in, so a reshape by
# declaration order pairs scores with the wrong parameters.
score_by_params = {
    (p['max_features'], p['n_estimators'], p['max_depth']): m
    for p, m in zip(params, means)
}

# scores[i, j, k] -> max_features[i], n_estimators[j], max_depth[k]
scores = np.array([
    [[score_by_params[(feat, n_trees, depth)] for depth in max_depth]
     for n_trees in n_estimators]
    for feat in max_features
])
print(scores)

# One panel per max_depth and one line per max_features value, so every curve
# is a single 1-D series over n_estimators with its own, correct legend entry.
fig, axes = plt.subplots(
    1, len(max_depth),
    sharey=True,
    figsize=(5 * len(max_depth), 4),
    squeeze=False,
)
for k, depth in enumerate(max_depth):
    ax = axes[0, k]
    for i, value in enumerate(max_features):
        ax.plot(n_estimators, scores[i, :, k], label='max_features: ' + str(value))
    ax.set_title(f'max_depth = {depth}')
    ax.set_xlabel('n_estimators')

axes[0, 0].set_ylabel(xscore.title())
axes[0, 0].legend()
fig.tight_layout()
#plt.savefig('n_estimators-vs-max_features.png')
plt.show()

In [None]:
# # plot results
# scores = np.array(means).reshape(len(learning_rate), len(n_estimators),len(max_depth))
#print(scores)

# for i, value in enumerate(learning_rate):
#     plt.plot(n_estimators, scores[i], label='lr: ' + str(value))

# plt.legend()
# plt.xlabel('n_estimators')
# plt.ylabel(xscore.title())
# #plt.savefig('n_estimators-vs-learning_rate.png')
# plt.show()