In [4]:
from google.cloud import bigquery

import pandas as pd
import numpy as np
from datetime import date,datetime,timedelta,timezone

import math
import os

import tensorflow as tf
import tensorflow_decision_forests as tfdf  # constantly registered to load model 
print(tf.__version__)
print(tfdf.__version__)

2.12.0
1.4.0


# Load Configuration Data and Constant Variable

In [5]:
all_prediction=True
is_evaluation=True


_model="demo_binary_gbt_tf_model"
model_version=f'{_model}_demo_t150723'

model_gs_path=f"gs://demo-tf-incident-pongthorn/{_model}"


projectId="pongthorn"
dataset_id="SMartML"
data_table="new2_incident"
prediction_table="new2_result_binary_prediction_incident"

unusedCols_unseen=['id','severity_name','imported_at']

# Get today's date
prediction_datetime=datetime.now(timezone.utc)
today_str=prediction_datetime.strftime("%Y-%m-%d")
today=datetime.strptime(today_str,"%Y-%m-%d")
print(f"Prediction at {prediction_datetime} for {today_str} ({today})")
      
print(model_gs_path)
print(f"Data: {data_table} and Prediction: {prediction_table}")    

Prediction at 2023-08-11 15:35:34.677912+00:00 for 2023-08-11 (2023-08-11 00:00:00)
gs://demo-tf-incident-pongthorn/demo_binary_gbt_tf_model
Data: new2_incident and Prediction: new2_result_binary_prediction_incident


# BigQuery Configuration

In [6]:
client = bigquery.Client(project=projectId)
new_data_table_id=f"{projectId}.{dataset_id}.{data_table}"
predictResult_table_id=f"{projectId}.{dataset_id}.{prediction_table}"
print(new_data_table_id)
print(predictResult_table_id)

pongthorn.SMartML.new2_incident
pongthorn.SMartML.new2_result_binary_prediction_incident


In [7]:
try:
    client.get_table(predictResult_table_id)  # Make an API request.
    print("Predict Result Table {} already exists.".format(predictResult_table_id))
    
except Exception as ex:
    schema = [
    bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"),
    bigquery.SchemaField("prediction_item_date", "DATE", mode="REQUIRED"),    
    bigquery.SchemaField("label_binary_severity", "INTEGER", mode="REQUIRED"),
    bigquery.SchemaField("pred_binary_severity", "INTEGER", mode="REQUIRED"),       
    bigquery.SchemaField("prediction_datetime", "DATETIME", mode="REQUIRED") ,
    bigquery.SchemaField("model_version", "STRING", mode="REQUIRED")     
    ]

    table = bigquery.Table(predictResult_table_id,schema=schema)
    table.time_partitioning = bigquery.TimePartitioning(
    type_=bigquery.TimePartitioningType.DAY,field="prediction_item_date")
    
    table = client.create_table(table)  # Make an API request.
    
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )

Created table pongthorn.SMartML.new2_result_binary_prediction_incident


# Load unseen data(new incident) to Make Prediction

In [8]:
if int(all_prediction)==0:
    sql=f"""
    SELECT *  FROM `{new_data_table_id}` 
     WHERE DATE(imported_at) = '{today_str}'
     order by imported_at
    """
else:
    sql=f"""
    SELECT *  FROM `{new_data_table_id}` 
     order by imported_at
    """

print(sql)


query_result=client.query(sql)
df=query_result.to_dataframe()
if df.empty==True:
  print("no data to make prediction")  
  # return "no data to make prediction"
print(df.info())


    SELECT *  FROM `pongthorn.SMartML.new2_incident` 
     order by imported_at
    
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   id                  100 non-null    Int64         
 1   severity_id         100 non-null    Int64         
 2   severity_name       100 non-null    object        
 3   sla                 100 non-null    object        
 4   product_type        100 non-null    object        
 5   brand               100 non-null    object        
 6   service_type        100 non-null    object        
 7   incident_type       100 non-null    object        
 8   open_to_close_hour  100 non-null    float64       
 9   imported_at         100 non-null    datetime64[ns]
dtypes: Int64(2), datetime64[ns](1), float64(1), object(6)
memory usage: 8.1+ KB
None


# Build Unseen data by removing label and others

In [9]:
unseen =df.drop(columns=unusedCols_unseen)
print(unseen.info())
unseen.tail(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   severity_id         100 non-null    Int64  
 1   sla                 100 non-null    object 
 2   product_type        100 non-null    object 
 3   brand               100 non-null    object 
 4   service_type        100 non-null    object 
 5   incident_type       100 non-null    object 
 6   open_to_close_hour  100 non-null    float64
dtypes: Int64(1), float64(1), object(5)
memory usage: 5.7+ KB
None


Unnamed: 0,severity_id,sla,product_type,brand,service_type,incident_type,open_to_close_hour
95,4,24x7 4Hrs Resolution Time,Software,VMWare,Request,Upgrade Software,14.5
96,4,24x7 4Hrs Resolution Time,Software,VMWare,Request,Upgrade Software,2.5
97,4,24x7 4Hrs Resolution Time,Access Point,Cisco,Request,General Incident,34.0
98,4,24x7 4Hrs Resolution Time,Access Point,Cisco,Request,General Incident,27.5
99,4,24x7 4Hrs Resolution Time,Access Point,Cisco,Request,General Incident,4.5


# Convert dataframe to tensorflow dataset

In [10]:
unseen_ds= tfdf.keras.pd_dataframe_to_tf_dataset(unseen.drop(columns=['severity_id']))
print(unseen_ds)

<_PrefetchDataset element_spec={'sla': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'product_type': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'brand': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'service_type': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'incident_type': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'open_to_close_hour': TensorSpec(shape=(None,), dtype=tf.float64, name=None)}>


# Load Model

In [11]:
abc_model = tf.keras.models.load_model(model_gs_path)  
print(abc_model.summary())

[INFO 23-08-11 15:36:38.7237 UTC kernel.cc:1243] Loading model from path gs://demo-tf-incident-pongthorn/demo_binary_gbt_tf_model/assets/ with prefix 7a8644254fd943aa
[INFO 23-08-11 15:36:39.3290 UTC abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 23-08-11 15:36:39.3298 UTC kernel.cc:1075] Use fast generic engine


Model: "gradient_boosted_trees_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1
Trainable params: 0
Non-trainable params: 1
_________________________________________________________________
None


# Make prediction

In [13]:
predResultList=abc_model.predict(unseen_ds)
predServerityIDList=[]
for predResult in predResultList:
    _class= 1 if predResult[0]>=0.5 else 0  
    predServerityIDList.append(_class) #0=normal , 1=critical
    print(f"{predResult} : {_class}")

dfPred=pd.DataFrame(data=predServerityIDList,columns=["pred_binary_severity"])      
dfPred    

[0.9366978] : 1
[0.331638] : 0
[0.46390465] : 0
[0.9705179] : 1
[0.9291451] : 1
[0.9291451] : 1
[0.933475] : 1
[0.7230494] : 1
[0.9291451] : 1
[0.7755305] : 1
[0.90914774] : 1
[0.9291451] : 1
[0.9350321] : 1
[0.8162701] : 1
[0.90195024] : 1
[0.9121128] : 1
[0.9179605] : 1
[0.9322281] : 1
[0.6789574] : 1
[0.92496586] : 1
[0.8420767] : 1
[0.13334112] : 0
[0.39225832] : 0
[0.6607408] : 1
[0.6408862] : 1
[0.87236756] : 1
[0.93689996] : 1
[0.9735334] : 1
[0.8612716] : 1
[0.9717969] : 1
[0.93689996] : 1
[0.9404177] : 1
[0.44124335] : 0
[0.87793475] : 1
[0.80020124] : 1
[0.87793475] : 1
[0.55469894] : 1
[0.8670623] : 1
[0.8142656] : 1
[0.84321916] : 1
[0.7821391] : 1
[0.93177086] : 1
[0.83759546] : 1
[0.80475336] : 1
[0.8034147] : 1
[0.8641785] : 1
[0.92131317] : 1
[0.20433852] : 0
[0.28166428] : 0
[0.690814] : 1
[0.8142656] : 1
[0.45067295] : 0
[0.2380667] : 0
[0.38679695] : 0
[0.4992696] : 0
[0.43371543] : 0
[0.47215948] : 0
[0.28544658] : 0
[0.7854402] : 1
[0.5646044] : 1
[0.15286462] : 0


Unnamed: 0,pred_binary_severity
0,1
1,0
2,0
3,1
4,1
...,...
95,0
96,0
97,0
98,0


# Map severity_id to label for actual value.
# Merge predicted value to main dataframe

In [14]:
def map_4to2_serverity(severity_id):
    if severity_id==1 or severity_id==2:
       return 1
    else:
       return 0 
df['label_binary_severity'] =df['severity_id'].apply(map_4to2_serverity)

dfPred
df=pd.concat([df,dfPred],axis=1)
df

Unnamed: 0,id,severity_id,severity_name,sla,product_type,brand,service_type,incident_type,open_to_close_hour,imported_at,label_binary_severity,pred_binary_severity
0,3212,1,Critical,24x7 4Hrs Response Time,Server,HPE,Incident,General Incident,57.133333,2023-08-08 16:39:37.692706,1,1
1,3263,2,Major,24x7 4Hrs Resolution Time,Server,Oracle,Request,Software,11.033333,2023-08-08 16:39:37.692706,1,0
2,3260,2,Major,24x7 4Hrs Resolution Time,Server,Oracle,Incident,Software,1.750000,2023-08-08 16:39:37.692706,1,0
3,3230,2,Major,24x7 6Hrs Resolution Time,Server,HPE,Incident,OS / Firmware,23.233333,2023-08-08 16:39:37.692706,1,1
4,3233,2,Major,24x7 4Hrs Resolution Time,Server,HPE,Incident,Hard Disk Drive Failure,6.000000,2023-08-08 16:39:37.692706,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,3191,4,Cosmetic,24x7 4Hrs Resolution Time,Software,VMWare,Request,Upgrade Software,14.500000,2023-08-08 16:39:37.692706,0,0
96,3189,4,Cosmetic,24x7 4Hrs Resolution Time,Software,VMWare,Request,Upgrade Software,2.500000,2023-08-08 16:39:37.692706,0,0
97,3216,4,Cosmetic,24x7 4Hrs Resolution Time,Access Point,Cisco,Request,General Incident,34.000000,2023-08-08 16:39:37.692706,0,0
98,3215,4,Cosmetic,24x7 4Hrs Resolution Time,Access Point,Cisco,Request,General Incident,27.500000,2023-08-08 16:39:37.692706,0,0


In [16]:
# Evaluate model and Show Metric Report

In [17]:
if is_evaluation:
    from sklearn.metrics import confusion_matrix,classification_report
    className=list(set().union(list(df['pred_binary_severity'].unique()),list(df['label_binary_severity'].unique())))
    print(className)
    actualClass=[  f'actual-{x}' for x in  className]
    predictedlClass=[  f'pred-{x}' for x in className]
    y_true=list(df['label_binary_severity'])
    y_pred=list(df['pred_binary_severity'])
    cnf_matrix = confusion_matrix(y_true,y_pred)
    cnf_matrix

    # #index=actual , column=prediction
    cm_df = pd.DataFrame(cnf_matrix,
                         index = actualClass, 
                         columns = predictedlClass)
    print(cm_df)
    print(classification_report(y_true, y_pred, labels=className))

[0, 1]
          pred-0  pred-1
actual-0      41       8
actual-1       7      44
              precision    recall  f1-score   support

           0       0.85      0.84      0.85        49
           1       0.85      0.86      0.85        51

    accuracy                           0.85       100
   macro avg       0.85      0.85      0.85       100
weighted avg       0.85      0.85      0.85       100



# Transform data for Writing Prediction Result to BQ

In [18]:
df=df[['id','label_binary_severity','pred_binary_severity']]
df['prediction_item_date']=today
df['prediction_datetime']=datetime.now()
df['model_version']=model_version
print(df.tail())

      id  label_binary_severity  pred_binary_severity prediction_item_date  \
95  3191                      0                     0           2023-08-11   
96  3189                      0                     0           2023-08-11   
97  3216                      0                     0           2023-08-11   
98  3215                      0                     0           2023-08-11   
99  3172                      0                     0           2023-08-11   

          prediction_datetime                          model_version  
95 2023-08-11 16:27:42.602139  demo_binary_gbt_tf_model_demo_t150723  
96 2023-08-11 16:27:42.602139  demo_binary_gbt_tf_model_demo_t150723  
97 2023-08-11 16:27:42.602139  demo_binary_gbt_tf_model_demo_t150723  
98 2023-08-11 16:27:42.602139  demo_binary_gbt_tf_model_demo_t150723  
99 2023-08-11 16:27:42.602139  demo_binary_gbt_tf_model_demo_t150723  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['prediction_item_date']=today
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['prediction_datetime']=datetime.now()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model_version']=model_version


# Load data to BQ

In [19]:
def loadDataFrameToBQ():
    # WRITE_TRUNCATE , WRITE_APPEND
    try:
        if all_prediction==1: 
            job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
        else:
            job_config = bigquery.LoadJobConfig(write_disposition="WRITE_APPEND")

        job = client.load_table_from_dataframe(
            df, predictResult_table_id, job_config=job_config
        )
        job.result()  # Wait for the job to complete.
        print("Total Prediction ML ", len(df), "Imported bigquery successfully")

    except BadRequest as e:
        print("Bigquery Error\n")
        for e in job.errors:
            print('ERROR: {}'.format(e['message']))

try:
    loadDataFrameToBQ()
except Exception as ex:
    raise ex

Total Prediction ML  100 Imported bigquery successfully
