In [None]:
import os
import pandas as pd
from loguru import logger

from google.cloud import bigquery
from google.oauth2 import service_account

# BigQuery API. Working with tables (creating, deleting, schemas)

In [4]:
# BigQuery connection settings.
# Path to the service-account JSON key file (placeholder value).
big_query_key_path = "XXXXXXX.json"

# Project, dataset and table names used by the cells below.
project_id, dataset, table_name = 'my_project', "test1", "table_test"

# Fully-qualified table id in "project.dataset.table" form.
table_id = f"{project_id}.{dataset}.{table_name}"

In [None]:


def get_client_and_table(dataset):
    """Build an authenticated BigQuery client from the service-account key file.

    The key file path is read from the module-level ``big_query_key_path``.

    Args:
        dataset: Not used by the function body; kept in the signature so
            existing callers keep working.

    Returns:
        A ``(client, credentials)`` tuple: the ``bigquery.Client`` created with
        the credentials' own ``project_id``, and the credentials object itself.
        Despite the function name, no table object is returned.
    """
    cloud_platform_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
    sa_credentials = service_account.Credentials.from_service_account_file(
        big_query_key_path,
        scopes=cloud_platform_scopes,
    )
    bq_client = bigquery.Client(
        credentials=sa_credentials,
        project=sa_credentials.project_id,
    )
    return bq_client, sa_credentials


# Create the shared client used by the cells below.
client, credentials = get_client_and_table(dataset=dataset)

In [None]:
# List the tables of the dataset, printing one table id per line.
dataset_path = f'{project_id}.{dataset}'
tables = client.list_tables(dataset_path)
for table_item in tables:
    print(table_item.table_id)

In [None]:
# Report how many rows the table metadata says the table holds.
table = client.get_table(table_id)
row_count_message = '{} rows in {}'.format(table.num_rows, table.table_id)
print(row_count_message)

In [None]:
# Fetch the table schema and show only the fields whose name contains 'count'.
table = client.get_table(table_id)
# The earlier version also printed every field unconditionally, which made the
# filter invisible and printed each matching field twice.
matching_fields = [field for field in table.schema if 'count' in field.name]
for field in matching_fields:
    print(field)

In [None]:
# Create a new, empty table from an explicit schema that includes nested RECORD fields.
# https://medium.com/pipeline-a-data-engineering-resource/how-to-create-nested-schemas-in-python-using-the-google-bigquery-api-8d86b1602cbd
# https://cloud.google.com/bigquery/docs/schemas

# Sub-fields of the nested 'headers' record.
headers_fields = [
    bigquery.SchemaField('tag', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('last_date', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('server', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('length', 'INTEGER', mode='NULLABLE'),
]

# Sub-fields of the nested 'json' record.
json_fields = [
    bigquery.SchemaField('field1', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('field2', 'INTEGER', mode='NULLABLE'),
    bigquery.SchemaField('name', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('id', 'STRING', mode='NULLABLE'),
]

# Top-level columns, in the order they appear in the table.
schema = [
    bigquery.SchemaField('headers', 'RECORD', mode='NULLABLE', fields=headers_fields),
    bigquery.SchemaField('group_id', 'INTEGER', mode='NULLABLE'),
    bigquery.SchemaField('json', 'RECORD', mode='NULLABLE', fields=json_fields),
    bigquery.SchemaField('event', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('date_upload', 'DATETIME', mode='NULLABLE'),
]

# Send the create request; `table` is rebound to the object returned by the API.
table = client.create_table(bigquery.Table(table_id, schema=schema))
created_message = "Created table {}.{}.{}".format(
    table.project, table.dataset_id, table.table_id
)
print(created_message)

In [None]:
# Drop the table; not_found_ok=True ignores the "not found" error if it is already gone.
client.delete_table(table_id, not_found_ok=True)
deleted_message = "Deleted table '{}'.".format(table_id)
print(deleted_message)

In [None]:
# Read the table rows into a DataFrame and preview the first three.
table = client.get_table(table_id)
row_iterator = client.list_rows(table)
df = row_iterator.to_dataframe()
df.head(3)

In [None]:
# Delete ALL rows from the BigQuery table, then report the row count afterwards.
query = f'DELETE FROM {table_id} WHERE true'
query_job = client.query(query)
# client.query() only starts the job. Block until the DELETE has finished so
# the table metadata read below is fetched after the deletion, not while it runs.
query_job.result()
table = client.get_table(table_id)
print('{} rows in {}'.format(table.num_rows, table.table_id))

# BigQuery API. QUERY

In [None]:
def __load_df_from_bq(table_id, query=None):
    """Run a query with the module-level ``client`` and return a DataFrame.

    Args:
        table_id: Table to read. Used to build the default query and in the
            log line.
        query: SQL text to run. When None, ``SELECT * FROM {table_id}`` is used.

    Returns:
        The query result converted with ``to_dataframe()``.
    """
    if query is None:
        query = f"SELECT * FROM {table_id}"
    result = client.query(query).to_dataframe()
    # Log the table this call was asked to load instead of the notebook-global
    # `dataset`, so the message stays correct for any table_id.
    logger.info("Finish load {}, {}".format(table_id, result.shape))
    return result

In [None]:
def get_none_query(table_id):
    """Load ``table_id`` through ``__load_df_from_bq`` without passing a custom query."""
    full_table_df = __load_df_from_bq(table_id=table_id)
    return full_table_df

In [None]:
def get_is_query(table_id):
    """Load up to 1000 recent rows of ``table_id`` using a custom query.

    A row is selected when the day difference between the current date and its
    ``date_upload`` is at most 7.

    Args:
        table_id: Fully-qualified id of the table to query.

    Returns:
        The DataFrame returned by ``__load_df_from_bq`` for the query.
    """
    # Query the table that was passed in; the table used to be hard-coded,
    # which silently ignored the `table_id` argument.
    query = f"""
        SELECT *
        FROM {table_id}
        WHERE date_diff(current_date(),date_upload,DAY) <= 7
        LIMIT 1000
        """
    # The previous call target `__load_df_from_bq_2` is not defined in this
    # notebook; use the existing loader so the call does not raise NameError.
    return __load_df_from_bq(table_id=table_id, query=query)

In [None]:
# Run both loaders: the default query first, then the custom filtered query.
df_standart = get_none_query(table_id)
df_new_query = get_is_query(table_id)

## Sample SQL query: olist dataset

In [7]:
# Sample analytical query over the `olist` dataset of `project_id` (interpolated
# by the f-string). This cell only assigns the text to `sql_text`.
# CTEs, in order:
#   geoavg     - average latitude/longitude per geolocation_zip_code_prefix
#   custgeo    - customers LEFT JOINed to geoavg on the zip code prefix
#   ordcustgeo - orders LEFT JOINed to custgeo on customer_id
#   itemavg    - per-order aggregates of order_items1 (MAX(order_item_id), price/freight sums)
#   orditem1   - ordcustgeo LEFT JOINed to itemavg on order_id
#   ordprod    - row count per (order_id, product_id) in order_items1
#   unprod     - number of ordprod rows per order_id
#   orditem2   - orditem1 LEFT JOINed to unprod on order_id
#   tmp        - latest review_answer_timestamp per order_id in order_reviews2
#   score      - order_reviews2 rows matched to tmp on timestamp and order_id (RIGHT JOIN)
#   avgr       - per-order review aggregates (count, concatenated text, min/max dates)
#   rev        - avgr LEFT JOINed to score on order_id
# The final SELECT LEFT JOINs orditem2 to rev on order_id.
sql_text=f"""
#standardSQL
WITH geoavg AS (SELECT 
  geolocation_zip_code_prefix,
  AVG(geolocation_lat) AS lat,
  AVG(geolocation_lng) AS lng
FROM `{project_id}.olist.geolocation`
GROUP BY
    geolocation_zip_code_prefix),  

custgeo AS (
SELECT 
    c.*, 
    geoavg.lat AS customers_geo_lat,
    geoavg.lng AS customers_geo_lng
FROM `{project_id}.olist.customers` c LEFT JOIN geoavg ON c.customer_zip_code_prefix=geoavg.geolocation_zip_code_prefix),

ordcustgeo AS (
SELECT
    ord.*, 
    c.* EXCEPT(customer_id)
FROM `{project_id}.olist.orders` ord LEFT JOIN custgeo c ON ord.customer_id=c.customer_id),

itemavg AS (SELECT 
  order_id,
  MAX(order_item_id) AS order_item_count,
  SUM(price) AS sum_price,
  SUM(freight_value) AS sum_freight_value,
  SUM(price)+SUM(freight_value) AS sum_price_freight

FROM `{project_id}.olist.order_items1`
GROUP BY
    order_id),  

orditem1 AS (
SELECT 
    o.*, 
    itemavg.* EXCEPT (order_id)
FROM ordcustgeo o LEFT JOIN itemavg ON o.order_id=itemavg.order_id),

ordprod AS (
SELECT 
  COUNT(*) as num_prod,
  order_id,
  product_id
  FROM `{project_id}.olist.order_items1`

GROUP BY
  order_id, 
  product_id
),

unprod AS(
SELECT 
  COUNT(*) as num_uniq_prod,
  order_id,
  FROM ordprod

GROUP BY
  order_id),

orditem2 AS (
SELECT 
    o.*, 
    unprod.* EXCEPT (order_id)
FROM orditem1 o LEFT JOIN unprod ON o.order_id=unprod.order_id),

tmp AS (
SELECT 
  MAX(review_answer_timestamp) as max_review_answer_date,
  order_id
  FROM `{project_id}.olist.order_reviews2`
GROUP BY
  order_id),

score AS(
SELECT 
  s.review_score AS review_score,
  s.sent_score AS sent_score, 
  s.sent_magnitude AS sent_magnitude,
  s.entities_list AS entities_list,
  s.sentences_count AS sentences_count,
  s.token_count AS token_count,
  s.sentlist AS sentlist,
  s.tokenlist AS tokenlist,
  s.review_id AS review_id,
  s.review_answer_timestamp AS review_answer_timestamp,
  s.order_id AS order_id
FROM `{project_id}.olist.order_reviews2` s RIGHT JOIN tmp ON s.review_answer_timestamp=tmp.max_review_answer_date AND s.order_id=tmp.order_id
),

avgr AS(
SELECT 
  order_id,
  COUNT(*) as num_reviews_per_order,
  STRING_AGG(message1, " ") as message,
  STRING_AGG(title11, " ") as title,
  MIN(review_creation_date) as min_review_create_date,
  MAX(review_creation_date) as max_review_create_date,
  MIN(review_answer_timestamp) as min_review_answer_date,
  MAX(review_answer_timestamp) as max_review_answer_date
  
FROM `{project_id}.olist.order_reviews2` 

GROUP BY
  order_id),

rev AS (
SELECT 
avgr.*,
score.review_id AS review_id,
score.review_score,
score.sent_score AS sent_score, 
score.sent_magnitude AS sent_magnitude,
score.entities_list AS entities_list,
score.sentences_count AS sentences_count,
score.token_count AS token_count,
score.sentlist AS sentlist,
score.tokenlist AS tokenlist,
FROM avgr LEFT JOIN score ON avgr.order_id=score.order_id)

SELECT 
    s.*,
    rev.* EXCEPT (order_id)
FROM orditem2 s LEFT JOIN rev ON s.order_id=rev.order_id
"""

## SQL query for a "RECORD" field

In [None]:
# Sub-fields of the nested 'order' record.
order_fields = [
    bigquery.SchemaField('amount', 'FLOAT', mode='NULLABLE'),
    bigquery.SchemaField('currency', 'STRING', mode='NULLABLE'),
    bigquery.SchemaField('created_at', 'DATETIME', mode='NULLABLE'),
]

# Example schema: one nested RECORD column plus an upload timestamp.
schema = [
    bigquery.SchemaField('order', 'RECORD', mode='NULLABLE', fields=order_fields),
    bigquery.SchemaField('date_upload', 'DATETIME', mode='NULLABLE'),
]

# ORDER is a reserved SQL keyword, so the record name has to be wrapped in
# backticks (`order`) when its sub-fields are selected.
query = """
    SELECT
        date_upload,
        `order`.amount,
        `order`.currency,
        `order`.created_at,
    FROM my_project.my_dataset.my_table
        
    """