In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot  as plt
import os



In [None]:
# Record the library versions this notebook was executed with (xgboost, numpy, pandas).
for lib in (xgb, np, pd):
    print(lib.__version__)

In [None]:
# --- Column configuration -------------------------------------------------

# Columns parsed as datetimes when the CSV is loaded.
# Wider alternative kept for reference:
# dateTimeCols=['open_datetime','response_datetime','resolved_datetime','close_datetime']
dateTimeCols = [
    'open_datetime',
    'close_datetime',
]

# Categorical input features.
cateFeatures = [
    'product_type',
    'brand',
    'incident_type',
    'service_type',
    'is_failure_type',
    'sla',
]

# Numeric input features.
# Wider alternative kept for reference:
# numericFeatures=['count_detail','open_to_close_hour','response_to_resolved_hour','open_to_response_hour','resolved_to_close_hour']
numericFeatures = [
    'count_detail',
    'open_to_close_hour',
]

# Target column, plus the columns used to describe each label value.
colLabel = 'severity_label'
calLabelRefInfo = [colLabel, 'severity_name']

# All model inputs, and the full selection (label first, then the features).
colFeatures = [*cateFeatures, *numericFeatures]
cols = [colLabel, *colFeatures]

print(cols)

# Upper bound applied to `open_datetime` when the incidents are filtered.
to_date = '2023-12-31'

# Load and Prepare Data

In [None]:
# Load the raw incident export; the columns listed in `dateTimeCols` are parsed as datetimes.
df_incident = pd.read_csv('incident_data.csv',parse_dates=dateTimeCols)

# Show the time span covered by the data, based on when incidents were opened.
openedAt = df_incident['open_datetime']
print(f"Incident From {openedAt.min()} To {openedAt.max()}")

# `info()` writes its report itself and returns None, so it is called directly
# instead of being wrapped in print() (which would emit an extra "None" line).
df_incident.info()


In [None]:
# Keep only incidents opened on or before the cut-off in `to_date`.
# NOTE(review): `to_date` is a date-only string, so incidents opened later during
# that same day are presumably excluded — confirm this is the intended boundary.
df_incident=df_incident.query("open_datetime<=@to_date")

# Take explicit copies of the column subsets: a later cell adds a column to
# `dfLabelRefInfo`, and assigning into a slice of another frame triggers
# pandas' SettingWithCopyWarning.
dfLabelRefInfo=df_incident[calLabelRefInfo].copy()

# From here on `df_incident` holds only the label and feature columns, so this
# cell cannot be re-run without re-running the load cell first.
df_incident=df_incident[cols].copy()

In [None]:
# `info()` prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(df.info()) produces.
df_incident.info()
df_incident.tail()

In [None]:
# Build a combined "<severity_label>-<severity_name>" key for each incident.
# The strings are concatenated column-wise instead of with a row-by-row apply(),
# and `assign` returns a new frame, so nothing is written into a slice of
# another DataFrame (no SettingWithCopyWarning).
dfLabelRefInfo=dfLabelRefInfo.assign(
    severity=dfLabelRefInfo['severity_label'].astype(str)+"-"+dfLabelRefInfo['severity_name'].astype(str)
)

# `info()` prints its own report and returns None, so it is not wrapped in print().
dfLabelRefInfo.info()
dfLabelRefInfo.tail()

In [None]:
# Number of incidents per combined severity key, most frequent first.
dfLabelSummary=(
    dfLabelRefInfo
    .groupby(['severity'])
    .size()
    .to_frame('count')
    .sort_values(by='count',ascending=False)
)

# Bar chart of the counts, followed by the table itself as the cell output.
dfLabelSummary.plot(kind='bar')
plt.show()
dfLabelSummary

In [None]:
# For every categorical feature: print its name, then draw a bar chart of how
# many incidents fall into each category (largest category first).
for cateCol in cateFeatures:
    print(cateCol)
    dfGroupCate=(
        df_incident
        .groupby([cateCol])
        .size()
        .to_frame('count')
        .sort_values(by='count',ascending=False)
    )
    dfGroupCate.plot(kind='bar')
    plt.show()


In [None]:
# df_incident[numericFeatures].plot(kind='hist', subplots=True, layout=(len(numericFeatures),1), sharex=False)
# One box plot per numeric feature.
for col in numericFeatures:
    # Pass the column explicitly as `x`: the first positional parameter of
    # sns.boxplot is not the same across seaborn releases (`x` in older ones,
    # `data` in newer ones), so a positional call can change the plot layout
    # depending on the installed version.
    sns.boxplot(x=df_incident[col])
    # Title each figure so it can be identified without reading the code.
    plt.title(col)
    plt.show()

# Process features and target class

In [None]:
print(f"{cateFeatures} and {colLabel}")

# Feature matrix: select the feature columns and convert the categorical ones to
# pandas' "category" dtype in a single step. `astype` with a column->dtype
# mapping returns a new frame, so no column is assigned in place on a selection
# taken from `df_incident`.
X = df_incident.loc[:,colFeatures].astype({cateCol: "category" for cateCol in cateFeatures})

# `info()` prints its own report and returns None, so it is not wrapped in print().
X.info()
X.sample(10)


In [None]:
# Target vector: the label column of the prepared incident frame.
y=df_incident.loc[:,colLabel]

# `info()` prints its own report and returns None, so it is not wrapped in print().
y.info()
y.sample(10)
# y=y.astype("category")

In [None]:
# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(y)

# print(label_encoder.classes_)

# print(y[-5:])

# Model Parameter Setting

In [None]:
# Booster hyper-parameters shared by the training cells:
#   maxDept     -> passed to XGBoost as 'max_depth'
#   lerningRate -> passed to XGBoost as 'learning_rate'
#   nEstimators -> number of boosting rounds
# The identifier spellings are kept as-is because other cells reference them.
maxDept, lerningRate, nEstimators = 10, 0.1, 100

# Final Model

In [None]:
print("Use DMatrix & Train Object")

# Train on the complete dataset (no hold-out) with the native XGBoost API.
# `enable_categorical=True` lets the DMatrix accept the pandas "category" columns in X.
dAllTrain = xgb.DMatrix(X, label=y, enable_categorical=True)

# NOTE(review): only depth and learning rate are configured; no 'objective' is
# given, so XGBoost uses its default objective (presumably squared-error
# regression on `severity_label`) — confirm this is intended rather than a
# multi-class objective.
xg_param = dict(max_depth=maxDept, learning_rate=lerningRate)
model_xg = xgb.train(params=xg_param, dtrain=dAllTrain, num_boost_round=nEstimators)

In [None]:
# Persist the trained booster next to the notebook; the local path is simply
# the artifact file name.
local_path = artifact_filename = 'model.bst'
model_xg.save_model(local_path)

# file_scaler_pred=f'{model_path}/scaler_pred_{modelName}.gz'
# joblib.dump(scalerFinalTrain,file_scaler)

In [None]:
# from google.cloud import storage

# BUCKET_NAME='gs://smart-ai-model-pongthorn'
# MODEL_DIR = BUCKET_NAME + "/model"
# print(MODEL_DIR)


# storage_path = os.path.join(MODEL_DIR, artifact_filename)
# blob = storage.blob.Blob.from_string(storage_path, client=storage.Client())
# blob.upload_from_filename(local_path)

# storage_client = storage.Client()
# blob = storage.blob.Blob.from_string(MODEL_DIR, client=storage.Client())
# blob.upload_from_filename(local_path)

In [None]:
# def upload_blob(bucket_name, source_file_name, destination_blob_name):
#     """Uploads a file to the bucket."""

#     storage_client = storage.Client()
#     bucket = storage_client.bucket(bucket_name)
#     blob = bucket.blob(destination_blob_name)

#     blob.upload_from_filename(source_file_name)

#     destination_file_name = os.path.join("gs://", bucket_name, destination_blob_name)

#     return destination_file_name

# Train Data 

# Split Train and Test Data

In [None]:
# Hold out 15% of the rows for evaluation; the fixed seed makes the split reproducible.
seed, test_size = 7, 0.15
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

# Sizes of the feature and label partitions (train first, then test).
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

print("-----------------------------------------------------------------------------------------------------")
# Peek at the first five training labels and the matching feature rows.
print(y_train.head(5))
X_train.head(5)



# XGBoost 

In [None]:
print("Use DMatrix & Train Object")

# Wrap both partitions as DMatrix objects; `enable_categorical=True` lets them
# accept the pandas "category" feature columns.
xg_param = dict(max_depth=maxDept, learning_rate=lerningRate)
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Fit on the training partition only.
# NOTE(review): this rebinds `model_xg`, so any later cell that uses that name
# sees this split-trained booster, not the one trained on all rows.
model_xg = xgb.train(params=xg_param, dtrain=dtrain, num_boost_round=nEstimators)
y_pred_xg = model_xg.predict(dtest)

# Predictions are rounded to the nearest integer before being compared with the labels.
# NOTE(review): no 'objective' is set in xg_param, so the booster presumably
# outputs continuous values — confirm that rounding them is the intended way to
# obtain class labels.
predictions_xg = list(map(round, y_pred_xg))

accuracy_xg = accuracy_score(y_test, predictions_xg)
print("XGBoost Accuracy: %.2f%%" % (accuracy_xg * 100.0))


# Feature Importance

In [None]:
# Plot the importance scores of the features for the booster currently bound to `model_xg`.
xgb.plot_importance(booster=model_xg)
plt.show()