In [20]:
import json
import os

import numpy as np
import pandas as pd
from google.cloud import aiplatform, bigquery

In [21]:
# Google Cloud project ID; passed to bigquery.Client when the shared client is built.
PROJECT_ID = "pongthorn"

In [22]:
# Module-level BigQuery client bound to PROJECT_ID; download_table reads rows through it.
bqclient = bigquery.Client(project=PROJECT_ID)

def download_table(bq_table_uri: str):
    """Read all rows of a BigQuery table into a DataFrame.

    Args:
        bq_table_uri: Table identifier understood by
            ``bigquery.TableReference.from_string``; a leading ``bq://``
            is accepted and stripped before parsing.

    Returns:
        Whatever ``to_dataframe()`` yields for the rows listed through the
        module-level ``bqclient``.
    """
    scheme = "bq://"
    # Only the bare table id is handed to from_string, so drop the scheme if present.
    table_id = (
        bq_table_uri[len(scheme) :]
        if bq_table_uri.startswith(scheme)
        else bq_table_uri
    )
    table_ref = bigquery.TableReference.from_string(table_id)
    return bqclient.list_rows(table_ref).to_dataframe()

def calculate_mean_and_std(df):
    """Compute the mean and standard deviation of every float column.

    Only columns whose dtype string is exactly ``"float32"`` or ``"float64"``
    are summarized; all other columns (integers, objects, ...) are skipped.

    Args:
        df: pandas DataFrame to summarize.

    Returns:
        dict mapping each selected column name to
        ``{"mean": <float>, "std": <float>}``. The values are built-in Python
        floats so the mapping can be passed straight to ``json.dump``.
    """
    mean_and_std = {}
    for column, dtype in df.dtypes.items():
        if str(dtype) in ("float32", "float64"):
            values = df[column]
            # Series.mean()/std() on a float32 column yield numpy.float32, which
            # is not a `float` subclass and makes json.dump raise TypeError.
            # Casting keeps the value and guarantees a JSON-serializable type.
            mean_and_std[column] = {
                "mean": float(values.mean()),
                "std": float(values.std()),
            }

    return mean_and_std



In [28]:
# Define the BigQuery source dataset
BQ_SOURCE = "bq://pongthorn.SMartML.TrainEval_Incident_20230316"

# Download the table and keep everything except its final row.
# NOTE(review): the reason for discarding the last row is not visible here, and
# download_table requests no particular row order -- confirm which row is meant
# to be excluded.
dataframe = download_table(BQ_SOURCE).iloc[:-1]

# DataFrame.info() prints its own summary and returns None, so call it directly;
# wrapping it in print() appended a stray "None" line to the cell output.
dataframe.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2142 entries, 0 to 2141
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   severity_id                2142 non-null   Int64  
 1   severity_name              2142 non-null   object 
 2   sla                        2142 non-null   object 
 3   product_type               2142 non-null   object 
 4   brand                      2142 non-null   object 
 5   service_type               2142 non-null   object 
 6   incident_type              2142 non-null   object 
 7   open_to_close_hour         2142 non-null   float64
 8   response_to_resolved_hour  2142 non-null   float64
dtypes: Int64(1), float64(2), object(6)
memory usage: 152.8+ KB
None


In [29]:
# Destination file for the per-column statistics.
MEAN_AND_STD_JSON_FILE = "incident_mean_and_std.json"

# Summarize the float columns of the downloaded frame and show the result.
mean_and_std = calculate_mean_and_std(dataframe)
print(f"The mean and stds for each column are: {str(mean_and_std)}")

# Persist the statistics as JSON.
with open(MEAN_AND_STD_JSON_FILE, mode="w") as json_file:
    json.dump(mean_and_std, json_file)


The mean and stds for each column are: {'open_to_close_hour': {'mean': 95.90196856520386, 'std': 262.5754868829863}, 'response_to_resolved_hour': {'mean': 70.41082321817616, 'std': 217.11239323048156}}
