In [None]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot  as plt

#http://localhost:8888/lab/workspaces/auto-O/tree/MyQuantFinProject/LearnDataSC/Prasertcbs_ML/sklearn_feature_importances_with_treeclassifier.ipynb
#http://localhost:8888/lab/workspaces/auto-C/tree/MyQuantFinProject/LearnDataSC/Prasertcbs_ML/sklearn_classification_iris2.ipynb

In [None]:
# Timestamp columns handed to the CSV reader for datetime parsing.
dateTimeCols = [
    'open_datetime',
    'response_datetime',
    'resolved_datetime',
    'close_datetime',
]

# Model inputs, grouped by how they have to be treated before training.
cateFeatures = [
    'product_type',
    'brand',
    'incident_type',
    'service_type',
    'is_failure_type',
    'sla',
]
numericFeatures = [
    'count_detail',
    'open_to_close_hour',
    'response_to_resolved_hour',
    'open_to_response_hour',
    'resolved_to_close_hour',
]

# Target column.
colLabel = 'severity_label'

# Column selections: label first, then categorical and numeric features.
colFeatures = [*cateFeatures, *numericFeatures]
cols = [colLabel, *colFeatures]
print(cols)

# Encoding switches read by the feature-processing and model cells.
isAutomaticOneHotByColumnTransformer = False
isManualEncoding = True

# Cut-off applied to open_datetime when the incidents are filtered.
to_date = '2023-12-31'

# Load and Prepare Data

In [None]:
# Read the incident export; the columns listed in dateTimeCols are parsed as datetimes.
incident_raw = pd.read_csv('incident_data.csv', parse_dates=dateTimeCols)
print(f"Incident From {incident_raw['open_datetime'].min()} To {incident_raw['open_datetime'].max()}")

# Keep incidents opened up to to_date, restricted to the label + feature columns.
# The explicit copy gives later cells an independent frame to derive X and Y from.
# NOTE(review): to_date is a bare date string; if open_datetime carries a time of day,
# the comparison presumably cuts off at midnight of that date - confirm this is intended.
df_incident = incident_raw.query("open_datetime<=@to_date")[cols].copy()

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_incident.info()
df_incident.tail()



In [None]:
# Row count per label value, most frequent first, shown as a bar chart and a table.
label_sizes = df_incident.groupby([colLabel]).size()
dfLabel = label_sizes.to_frame('count').sort_values(by='count', ascending=False)
dfLabel.plot.bar()
plt.show()
dfLabel

# Process features and class

In [None]:
# Build the feature matrix X according to the encoding switches:
#   * isAutomaticOneHotByColumnTransformer -> one-hot encode cateFeatures with a ColumnTransformer
#   * otherwise isManualEncoding           -> integer codes from LabelEncoder
#   * otherwise                            -> pandas "category" dtype
if not isAutomaticOneHotByColumnTransformer:
    if isManualEncoding:
        print("Encode by  LabelEncoder")
    else:
        print("Let XGBoost encode itself")

    features = [col for col in df_incident.columns if col != colLabel]
    # Explicit copy: the column assignments below must never write through to df_incident.
    X = df_incident.loc[:, features].copy()

    # One fitted encoder is kept per column so the integer codes can be mapped back
    # to the original values later (inverse_transform); cateFeatEncoder still ends up
    # bound to the encoder of the last column, as before.
    cateFeatEncoders = {}
    for cate_col in cateFeatures:
        if isManualEncoding:
            cateFeatEncoder = LabelEncoder()
            X[cate_col] = cateFeatEncoder.fit_transform(X[cate_col])
            cateFeatEncoders[cate_col] = cateFeatEncoder
        else:
            X[cate_col] = X[cate_col].astype("category")
else:
    categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    ct = ColumnTransformer(
        transformers=[("cat", categorical_transformer, cateFeatures)],
        remainder='passthrough',
    )
    ct.set_output(transform='pandas')
    # df_incident still contains the label column here; it travels through the
    # transformer as a passthrough column and is split off in a later cell.
    X = ct.fit_transform(df_incident)
    # colLabel is rebound to the "remainder__"-prefixed name used to pop the label out of X.
    # The guard keeps a re-run of this cell from stacking the prefix a second time.
    if not colLabel.startswith("remainder__"):
        colLabel = f"remainder__{colLabel}"
    print(f"{colLabel} :OneHot Encoding to Fetures by ColumnTransformer")
        


In [None]:
# Schema and last rows of the prepared feature matrix.
# info() writes its report itself and returns None, so it is not wrapped in print()
# (that would add a stray "None" line to the output).
X.info()
X.tail()

In [None]:
# Separate the target Y from the prepared data.
if not isAutomaticOneHotByColumnTransformer:
    # One-column frame holding the label, stored with pandas "category" dtype.
    # Explicit copy so the assignment below cannot write through to df_incident.
    Y = df_incident.loc[:, [colLabel]].copy()
    Y[colLabel] = Y[colLabel].astype("category")
else:
    # pop() removes the label column from X in place. The guard lets this cell be
    # re-run after the column has already been moved (otherwise pop raises KeyError);
    # in that case the previously extracted Y is kept.
    if colLabel in X.columns:
        Y = X.pop(colLabel)

    # info() prints its own report and returns None, so it is called directly.
    Y.info()
Y[:5]

# Encode string class values as integers

In [None]:
# Y holds object/category class values, which must be turned into integer class ids;
# an already-numeric target would not need this step.
label_encoder = LabelEncoder()

# LabelEncoder expects a 1-D target. Depending on the encoding path, Y is either a
# one-column DataFrame or a Series, so it is flattened first; passing the
# column-shaped frame directly triggers a DataConversionWarning.
ylabelEncoded = label_encoder.fit_transform(Y.to_numpy().ravel())

# Class names in the order of their integer ids, then the last few encoded values.
print(label_encoder.classes_)

print(ylabelEncoded[-5:])

# Split Train and Test Data

In [None]:
# Hold out a fixed share of the rows for evaluation; the seed pins the shuffle.
seed = 7
test_size = 0.15
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabelEncoded,
    test_size=test_size,
    random_state=seed,
)

# Shapes of the feature partitions, then of the target partitions.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

print("---------------------------------------------------------------------------------------")
# Peek at the first encoded targets and the matching feature rows.
print(y_train[:5])
X_train[:5]



# RandomForest

In [None]:
# Fit a random forest on the training split and score it on the held-out rows.
# random_state reuses the seed of the train/test split so that the accuracy and the
# feature importances are reproducible from run to run.
# NOTE(review): when both encoding switches are False, X carries pandas "category"
# columns; scikit-learn forests presumably cannot fit those - confirm before using
# that configuration with this cell.
model_rf = RandomForestClassifier(random_state=seed)

model_rf.fit(X_train, y_train)

y_pred_rf = model_rf.predict(X_test)
# predict() already yields class ids taken from y_train, so no rounding is needed;
# predictions_rf is kept as a plain list for any later use.
predictions_rf = y_pred_rf.tolist()

accuracy_rf = accuracy_score(y_test, predictions_rf)
print("RandomForest Accuracy: %.2f%%" % (accuracy_rf * 100.0))

# XGBoost 

In [None]:
# Pick the XGBoost configuration that matches how X was encoded: integer-coded or
# one-hot features use the defaults, pandas "category" columns need native
# categorical support switched on.
# Supported tree methods are `gpu_hist`, `approx`, and `hist`.
if isManualEncoding or isAutomaticOneHotByColumnTransformer:
    print("XGBClassifier()")
    model_xg = XGBClassifier()
else:
    print("XGBClassifier(enable_categorical=True,tree_method=hist)")
    model_xg = XGBClassifier(enable_categorical=True, tree_method="hist")

model_xg.fit(X_train, y_train)

y_pred_xg = model_xg.predict(X_test)
# The classifier's predict() returns class ids, so rounding each value is redundant;
# predictions_xg is kept as a plain list for any later use.
predictions_xg = y_pred_xg.tolist()

accuracy_xg = accuracy_score(y_test, predictions_xg)
print("XGBoost Accuracy: %.2f%%" % (accuracy_xg * 100.0))

# Feature Importance

In [None]:
# XGBoost importances labelled with the columns of X, highest score first,
# printed and drawn as a horizontal bar chart.
xg_scores = model_xg.feature_importances_
feature_series = pd.Series(xg_scores, index=X.columns).sort_values(ascending=False)
print(feature_series)
feature_series.plot.barh()
plt.show()

In [None]:
# Random-forest importances labelled with the columns of X, highest score first,
# printed and drawn as a horizontal bar chart.
rf_scores = model_rf.feature_importances_
feature_series = pd.Series(rf_scores, index=X.columns).sort_values(ascending=False)
print(feature_series)
feature_series.plot.barh()
plt.show()