In [None]:
# Guide for Tutorial
#https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/prediction/custom_batch_prediction_feature_filter.ipynb
#https://cloud.google.com/vertex-ai/docs/tutorials/train-tensorflow-bigquery
#https://cloud.google.com/vertex-ai/docs/tutorials/tabular-bq-prediction

#https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/custom/custom-tabular-bq-managed-dataset.ipynb

In [None]:
import argparse
import os
from typing import Tuple, Optional

import pandas as pd
import numpy as np
import tensorflow as tf

from google.cloud import bigquery
from google.cloud import storage

from sklearn.model_selection import train_test_split

In [None]:
# Training loop settings.
EPOCHS = 100
BATCH_SIZE = 32
TRAIN_STRATEGY = "single"

# Local JSON file with the per-column mean / std used for standardisation.
mean_and_std_json_file = "incident_mean_and_std.json"

# Target column, and the columns that are dropped right after loading a CSV.
LABEL_COLUMN = "severity_name"
UNUSED_COLUMNS = [
    "severity_id",
    "label_binary_severity",
    "label_multi_severity",
]

# Directory the trained model is exported to.
model_dir = "model"

In [None]:
# def load_data_bq(sql:str):
#  client_bq = bigquery.Client()
#  query_result=client_bq.query(sql)
#  df=query_result.to_dataframe()
#  return df

# def download_table(bq_table_uri: str):
#     # Remove bq:// prefix if present
#     bqclient= bigquery.Client()
#     prefix = "bq://"
#     if bq_table_uri.startswith(prefix):
#         bq_table_uri = bq_table_uri[len(prefix) :]

#     table = bigquery.TableReference.from_string(bq_table_uri)
#     rows = bqclient.list_rows(
#         table,
#     )
#     return rows.to_dataframe()

# df = download_table("pongthorn.SMartML.TrainEval_Incident_20230316")

def load_ml_data(data_path):
    """Read a CSV file and return it without the columns in ``UNUSED_COLUMNS``.

    Args:
      data_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
      DataFrame holding every column of the file except ``UNUSED_COLUMNS``.
    """
    return pd.read_csv(data_path).drop(columns=UNUSED_COLUMNS)

root_path = '../../data'

# Full incident table, restricted to the label and the feature columns whose
# distinct values are used further down to build the category vocabularies.
dfAll = pd.read_csv(
    f"{root_path}/ML_Incident_20230316.csv",
    usecols=[
        'severity_name',
        'sla',
        'product_type',
        'brand',
        'service_type',
        'incident_type',
    ],
)

# Train / validation / test splits, each loaded without UNUSED_COLUMNS.
df_train, df_validation, df_test = (
    load_ml_data(f"{root_path}/{split}_incident.csv")
    for split in ("train", "validation", "test")
)

# sr_predict=df.iloc[-1,:]
# df=df.iloc[0:len(df)-1,:]


In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the cell output.
dfAll.info()
dfAll.tail()

In [None]:
def _unique_categories(column):
    """Return the distinct non-null values of ``dfAll[column]``.

    Nulls are dropped first because ``CategoricalDtype`` refuses null
    categories; a single missing value in the source table would otherwise
    make the dtype construction below raise ``ValueError``.
    ``unique()`` keeps the values in order of first appearance, and that order
    defines the integer code each category receives later on.
    """
    return dfAll[column].dropna().unique()


list_label = _unique_categories(LABEL_COLUMN)
print(list_label)

cate_sla = _unique_categories('sla')
print(cate_sla)

cate_productType = _unique_categories('product_type')
print(cate_productType)

cate_brand = _unique_categories('brand')
print(cate_brand)

cate_serviceType = _unique_categories('service_type')
print(cate_serviceType)

cate_incidentType = _unique_categories('incident_type')
print(cate_incidentType)


# Column name -> categorical dtype; used to turn string columns into codes.
_CATEGORICAL_TYPES = {
    LABEL_COLUMN: pd.api.types.CategoricalDtype(categories=list_label),
    "sla": pd.api.types.CategoricalDtype(categories=cate_sla),
    "product_type": pd.api.types.CategoricalDtype(categories=cate_productType),
    "brand": pd.api.types.CategoricalDtype(categories=cate_brand),
    "service_type": pd.api.types.CategoricalDtype(categories=cate_serviceType),
    "incident_type": pd.api.types.CategoricalDtype(categories=cate_incidentType),
}
#print(_CATEGORICAL_TYPES)

In [None]:
def download_mean_and_std(mean_and_std_json_file):
    """Load the per-column mean and std from a JSON file.

    Despite the name, nothing is downloaded: the argument is opened as a
    local file path.

    Args:
      mean_and_std_json_file: Path of the JSON file to read.

    Returns:
      The decoded JSON content.
    """
    import json

    with open(mean_and_std_json_file, 'r') as stats_file:
        return json.load(stats_file)

In [None]:
def preprocess(df):
    """Drops incomplete rows and converts the remaining columns to numbers.

    Rows containing NaN are removed, ``int32`` / ``float32`` / ``float64``
    columns are cast to ``float32``, and ``object`` columns are replaced by
    the integer codes of their entry in ``_CATEGORICAL_TYPES``. Columns of any
    other dtype (for example ``int64``) are passed through unchanged.

    Args:
      df: Pandas df with raw data. It is not modified.

    Returns:
      New df with preprocessed data

    Raises:
      KeyError: if an ``object`` column has no entry in ``_CATEGORICAL_TYPES``.
    """
    # Drop rows with NaN's
    df = df.dropna()

    # Cast through astype(), which returns a new frame, instead of assigning
    # into the dropna() result: that assignment can trigger pandas'
    # SettingWithCopyWarning.
    numeric_columns = df.select_dtypes(["int32", "float32", "float64"]).columns
    df = df.astype({column: "float32" for column in numeric_columns})

    # Convert categorical columns to their integer codes. When there are no
    # object columns (e.g. the frame was already preprocessed) both steps are
    # no-ops.
    # NOTE: a value missing from the column's categories is encoded as -1.
    cat_columns = df.select_dtypes(["object"]).columns
    df = df.astype({column: _CATEGORICAL_TYPES[column] for column in cat_columns})
    for column in cat_columns:
        df[column] = df[column].cat.codes
    return df

In [None]:
def standardize(df, mean_and_std):
    """Turns every ``float32`` column of ``df`` into z-scores, in place.

    For each such column the stored mean is subtracted and the result is
    divided by the stored standard deviation. Columns of any other dtype are
    left untouched.

    Args:
      df: Pandas df; it is modified in place.
      mean_and_std: Mapping of column name to a mapping with the keys
        ``"mean"`` and ``"std"``.

    Returns:
      The same df object, with its float32 columns scaled.
    """
    # Snapshot the columns to scale before any of them is rewritten.
    float32_columns = [
        name for name, dtype in df.dtypes.items() if str(dtype) == "float32"
    ]
    for name in float32_columns:
        stats = mean_and_std[name]
        df[name] -= stats["mean"]
        df[name] /= stats["std"]
    return df


In [None]:
def convert_dataframe_to_dataset(
    df_train,
    df_validation,
    mean_and_std
):
    """Builds the training and validation ``tf.data.Dataset`` objects.

    Features are standardized with ``standardize`` and the label column is
    one-hot encoded over ``len(list_label)`` classes. The shapes of the four
    resulting arrays are printed.

    Args:
      df_train: Preprocessed training df that still contains ``LABEL_COLUMN``.
      df_validation: Preprocessed validation df, same layout as ``df_train``.
      mean_and_std: Per-column statistics forwarded to ``standardize``.

    Returns:
      Tuple ``(dataset_train, dataset_validation)`` of unbatched datasets
      yielding ``(features, one_hot_label)`` pairs.
    """
    # Separate labels from features without pop(): pop() would strip the label
    # column from the caller's frames, so calling this function a second time
    # on the same frames would fail with a KeyError.
    df_train_y = df_train[LABEL_COLUMN]
    df_train_x = df_train.drop(columns=[LABEL_COLUMN])
    df_validation_y = df_validation[LABEL_COLUMN]
    df_validation_x = df_validation.drop(columns=[LABEL_COLUMN])

    # Stack both feature frames so the supplied mean/std are applied in a
    # single pass, then separate them again.
    all_x = pd.concat([df_train_x, df_validation_x], keys=["train", "eval"])
    all_x = standardize(all_x, mean_and_std)
    df_train_x, df_validation_x = all_x.xs("train"), all_x.xs("eval")

    # Convert to numpy representation
    x_train = np.asarray(df_train_x)
    x_validation = np.asarray(df_validation_x)

    # Convert the label codes to one-hot representation
    num_classes = len(list_label)
    y_train = tf.keras.utils.to_categorical(
        np.asarray(df_train_y).astype("float32"), num_classes=num_classes
    )
    y_validation = tf.keras.utils.to_categorical(
        np.asarray(df_validation_y).astype("float32"), num_classes=num_classes
    )

    print(x_train.shape, y_train.shape, x_validation.shape, y_validation.shape)

    dataset_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    dataset_validation = tf.data.Dataset.from_tensor_slices(
        (x_validation, y_validation)
    )

    return (dataset_train, dataset_validation)

In [None]:
def create_model(num_features, num_classes):
    """Builds and compiles the classification network.

    The network has two hidden layers of 32 ReLU units followed by a softmax
    output layer. It is compiled with categorical cross-entropy loss, the
    ``adam`` optimizer and an accuracy metric.

    Args:
      num_features: Number of input features per example.
      num_classes: Number of output classes.

    Returns:
      The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    model.add(layers.Dense(32, activation=tf.nn.relu, input_dim=num_features))
    model.add(layers.Dense(32, activation=tf.nn.relu))
    model.add(layers.Dense(num_classes, activation=tf.nn.softmax))

    model.compile(
        optimizer='adam',
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [None]:
# Read the standardisation statistics from the configured JSON file.
mean_and_std = download_mean_and_std(
    mean_and_std_json_file=mean_and_std_json_file
)
print(mean_and_std)

In [None]:
df_train = preprocess(df_train)
df_validation = preprocess(df_validation)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the cell output.
df_train.info()
df_train.head()

In [None]:
dataset_train, dataset_validation = convert_dataframe_to_dataset(
    df_train, df_validation, mean_and_std
)
# A shuffle buffer as large as the training frame.
dataset_train = dataset_train.shuffle(buffer_size=len(df_train))


In [None]:
# Read the feature count from the dataset's public element_spec instead of
# the private `_flat_shapes` attribute. The first component of each element
# is the feature vector; this cell must run before the datasets are batched.
num_features = dataset_train.element_spec[0].shape[0]
model = create_model(num_features=num_features, num_classes=len(list_label))
model.summary()

In [None]:
# NOTE: this rebinds the same names, so running the cell twice batches the
# already-batched datasets again.
dataset_train = dataset_train.batch(batch_size=BATCH_SIZE)
dataset_validation = dataset_validation.batch(batch_size=BATCH_SIZE)

In [None]:
# Stop once val_loss has not improved for 10 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, verbose=1
)

# `batch_size` is deliberately not passed to fit(): both inputs are
# tf.data.Dataset objects that were already batched with BATCH_SIZE, and
# Keras documents that batch_size must not be given for dataset inputs.
history = model.fit(
    dataset_train,
    epochs=EPOCHS,
    validation_data=dataset_validation,
    callbacks=[early_stopping],
)

In [None]:
# Loss and accuracy of the trained model on the validation dataset.
loss, accuracy = model.evaluate(dataset_validation)
print("Average Accuracy on Evaluation", accuracy)

In [None]:
# Export the trained model with tf.saved_model.save into the `model_dir` directory.
tf.saved_model.save(model, model_dir)

In [None]:
df2_test = preprocess(df_test)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the cell output.
df2_test.info()
df2_test.tail()

In [None]:
def convert_dataframe_to_list(df, mean_and_st):
    """Preprocesses and standardizes a df and returns it as plain Python lists.

    Args:
      df: Pandas df that contains ``LABEL_COLUMN``; it is passed through
        ``preprocess`` first.
      mean_and_st: Per-column statistics forwarded to ``standardize``. The
        (misspelled) parameter name is kept for backward compatibility.

    Returns:
      Tuple ``(x, y, df_x)``: ``x`` is the list of standardized feature rows,
      ``y`` is the list of label codes as floats (not one-hot encoded), and
      ``df_x`` is the standardized feature df that ``x`` was built from.
    """
    df_x = preprocess(df)
    df_y = df_x.pop(LABEL_COLUMN)

    # Use the statistics that were passed in. Previously the argument was
    # ignored and a module-level `mean_and_std` was read instead, so calling
    # this function with any other statistics had no effect.
    df_x = standardize(df_x, mean_and_st)

    # Convert to numpy representation
    x = np.asarray(df_x)
    y = np.asarray(df_y).astype("float32")

    return x.tolist(), y.tolist(), df_x


In [None]:
# df2_test has already been through preprocess(); convert_dataframe_to_list
# calls preprocess() on its input once more before standardizing it.
# y_test holds the label codes as floats, not one-hot vectors.
x_test, y_test, df_x = convert_dataframe_to_list(df2_test, mean_and_std)

In [None]:
ID_COLUMN_NAME = "id"

# Work on a copy so df_x itself stays free of the id column.
df_x_with_id = df_x.copy()
df_x_with_id[ID_COLUMN_NAME] = range(len(df_x_with_id))

# Print the columns of the dataframe
print(f"Test dataset columns: {df_x_with_id.columns}")

In [None]:
#https://codelabs.developers.google.com/vertex-xgb-wit#7
#https://codelabs.developers.google.com/vertex-p2p-predictions#3

#https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/model_registry/get_started_with_model_registry.ipynb
#https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/courses/machine_learning/deepdive2/introduction_to_tensorflow/solutions/1_training_at_scale_vertex.ipynb