In [None]:
#https://cloud.google.com/bigquery/docs/reference/storage/libraries
#https://github.com/rocketechgroup/bigquery-storage-read-api-example/blob/master/main_simple.py
#https://github.com/googleapis/python-bigquery-storage/blob/main/samples/quickstart/quickstart.py
from google.cloud.bigquery_storage import BigQueryReadClient
from google.cloud.bigquery_storage import types
from google.protobuf.internal.well_known_types import Timestamp
import pandas as pd
# import google.cloud.bigquery_storage
# print('BQ-StorageAPI: ' + google.cloud.bigquery_storage.__version__)
# import fastavro
# print(f"Fast-Avro: {fastavro.__version__}")
# import pyarrow
# print(f"PyArrow: {pyarrow.__version__}")


In [None]:
# Wire format requested from the Storage Read API: "arrow" or "avro".
mode = "arrow"

# Coordinates of the BigQuery table to read.
project_id, dataset_id, table_id = "pongthorn", "SMartDW", "incident_y23"

# Snapshot time in milliseconds; the snapshot cell below only applies it
# when the value is greater than 0.
snapshot_millis = 0

# Client used to create the read session and to read rows. The session is
# created under `project_id`, which may differ from the project that owns
# the table.
client = BigQueryReadClient()

# Fully qualified table resource path expected by the read session.
table = "projects/{}/datasets/{}/tables/{}".format(project_id, dataset_id, table_id)
print(table)

In [None]:
# Describe what to read: the target table and the serialization format in
# which the service should deliver the rows.
requested_session = types.ReadSession()
requested_session.table = table

# Pick the wire format from `mode`. Only the two values handled by the reader
# cells further down are accepted; anything else fails here with a clear
# message instead of silently requesting Avro and leaving `df` undefined.
if mode == "arrow":
    requested_session.data_format = types.DataFormat.ARROW
elif mode == "avro":
    requested_session.data_format = types.DataFormat.AVRO
else:
    raise ValueError(f"Unsupported mode {mode!r}; expected 'arrow' or 'avro'.")

In [None]:
# Row filter applied to the read: keep rows whose severity_id is 2 or 3 and
# whose open_datetime is on or after 2023-09-01.
requested_session.read_options.row_restriction = 'severity_id in (2,3) and open_datetime>="2023-09-01" '

# Column projection: request only this subset of the table's columns.
requested_session.read_options.selected_fields = [
    "id",
    "severity_name",
    "severity_id",
    "service_type",
    "incident_type",
    "imported_at",
]

In [None]:
# Optionally pin the read to a point-in-time snapshot of the table.
# With snapshot_millis = 0 (the value set in the configuration cell) this
# branch is skipped and no snapshot time is set on the session.
if snapshot_millis > 0:
    snapshot_time = types.Timestamp()
    # Fill in the timestamp before attaching it to the session.
    # NOTE(review): presumably the assignment below copies the message, so
    # later changes to snapshot_time would not reach the session — confirm.
    snapshot_time.FromMilliseconds(snapshot_millis)
    requested_session.table_modifiers.snapshot_time = snapshot_time

In [None]:
# Create the read session under `project_id`.
parent = f"projects/{project_id}"
session = client.create_read_session(
    parent=parent,
    read_session=requested_session,
    # A single stream keeps this example simple. To fan out, raise this value
    # and have one reader process each stream in `session.streams`.
    max_stream_count=1,
)

# Guard against a session with no streams so the failure is explicit rather
# than a bare IndexError on `session.streams[0]`.
# NOTE(review): this is expected when the table is empty or the row
# restriction matches nothing — confirm against the API documentation.
if not session.streams:
    raise RuntimeError(
        f"Read session for {table} returned no streams; the table may be empty "
        "or the row restriction may match no rows."
    )

# Only one stream was requested (max_stream_count=1), so read the first one.
reader = client.read_rows(session.streams[0].name)

# Read via Avro and convert to a DataFrame (for huge ingestion)

In [None]:
if mode == 'avro':
    # The read stream contains blocks of Avro-encoded bytes. rows() uses the
    # fastavro library to parse them into an iterable of Python dictionaries,
    # one per row. Install it with:
    #   pip install google-cloud-bigquery-storage[fastavro]
    # The google-cloud-bigquery-storage client reconnects to the API after
    # transient network errors or timeouts.
    rows = reader.rows(session)

    # Build the frame once from the row dictionaries. Compared with creating a
    # pd.Series per row and transposing, this is much faster, lets pandas
    # infer a dtype per column instead of forcing everything to object, and
    # yields an empty DataFrame (rather than raising) when no rows come back.
    df = pd.DataFrame(list(rows))
    # Columns whose dtype pandas cannot infer stay as object and may still
    # need to be converted manually.

# Read via Arrow and convert to a DataFrame (for data analytics)

In [None]:
if mode == "arrow":
    # Convert each page of the stream to its own DataFrame.
    frames = [page.to_dataframe() for page in reader.rows().pages]

    if frames:
        # ignore_index=True renumbers the rows 0..n-1; without it the
        # concatenated frame would carry each page's own index, producing
        # duplicate labels.
        df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list, so fall back to an
        # empty frame when the stream produced no pages.
        df = pd.DataFrame()


In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()

# Show up to 20 random rows. The sample size is capped at the frame length
# because sample() raises ValueError when asked for more rows than exist.
df.sample(min(20, len(df)))