In [None]:
## Setting up BigQuery
from google.cloud import bigquery

# Client > Project > Dataset > Tables
# Client objects hold projects and a connection to the BigQuery service.
# Project is a collection of datasets.
# Dataset is a collection of tables.
# Tables are composed of rows and columns.

#* Create a "Client" object
client = bigquery.Client()

#* Construct a reference to the "hacker_news" dataset contained within the "bigquery-public-data" project
#  Built with the DatasetReference constructor (project first, then dataset id)
#  because Client.dataset() is deprecated in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

#* API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

#* List all the tables in the "hacker_news" dataset
tables = list(client.list_tables(dataset))

#* Print names of all tables in the dataset
#  The loop variable is deliberately not named "table", so it cannot be
#  confused with the fetched "full" table that is assigned to `table` below.
for table_item in tables:
    print(table_item.table_id)

#* Construct a reference to the "full" table
table_ref = dataset_ref.table("full")

#* API request - fetch the table
table = client.get_table(table_ref)

In [None]:
## Understanding the table's schema

#* Describe the "full" table's columns
# Each schema field carries, among other attributes:
# name of the column,
# field type (or datatype) in the column,
# mode of the column ('NULLABLE' means that a column allows NULL values, and is the default),
# description of the data in that column.
# Printed explicitly (one field per line): a notebook cell only renders its
# LAST bare expression, so a bare `table.schema` here would be silently discarded.
print(*table.schema, sep="\n")

#* Preview the first five rows of the "full" table
# Also printed explicitly, for the same reason as above.
print(client.list_rows(table, max_results=5).to_dataframe())

#* Preview the first five rows of the "full" table's first column
# Left as the cell's final bare expression so the notebook renders the DataFrame.
client.list_rows(table, selected_fields=table.schema[:1], max_results=5).to_dataframe()

In [None]:
## Querying
# Template cell: builds two example SQL strings, submits the first one through
# the client, and loads its result into a pandas DataFrame.

#* Create a "Client" object
client = bigquery.Client()

#* Note: table paths passed to FROM and JOIN are wrapped in backticks (`), not quotation marks (' or ")
#* String values inside the SQL (e.g. 'ABC') use ordinary quotes.
#* The SQL is passed as a Python string; the examples here do not end with a semicolon (;)
#* NOTE(review): `project.database.table` appears to be a placeholder path -
#* replace it with a real table path before running.
query = """
        SELECT column1
        FROM `project.database.table`
        WHERE column2 = 'ABC'
        """

#* Example of chaining an (inner) JOIN and a LEFT JOIN using table aliases.
#* It is only defined here; this cell does not submit it.
JOIN_query = """
        SELECT abc.col1, xyz.colB, hta.colII
        FROM `project.database.tableABC` AS abc
        JOIN `project.database.tableXYZ` AS xyz
        ON abc.col0 = xyz.colA
        LEFT JOIN `project.database.tableHTA` AS hta
        ON abc.col0 = hta.colI
        """

#* Set up the query (only `query` is submitted, not `JOIN_query`)
query_job = client.query(query)

#* API request - run the query, and return a pandas DataFrame
df1 = query_job.to_dataframe()



# ## To estimate the size of any query before running it
# # Create a QueryJobConfig object to estimate the size of the query without running it
# dry_run_config = bigquery.QueryJobConfig(dry_run=True)

# # API request - dry run the query to estimate costs
# dry_run_query_job = client.query(query, job_config=dry_run_config)
# print(dry_run_query_job.total_bytes_processed)


# ## Limiting the size of a query
# # Only run the query if it's less than 1 MB (1000000 bytes)
# safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=1000000)

# # Set up the query (will only run if it's less than 1 MB)
# safe_query_job = client.query(query, job_config=safe_config)

# # API request - try to run the query, and return a pandas DataFrame
# df1 = safe_query_job.to_dataframe()

In [None]:
## BigQuery allows for nested columns (a column containing dictionaries; RECORD/STRUCT) and columns holding multiple values (REPEATED)
# The triple-quoted strings below are reference SQL snippets only: they are bare
# string expressions, not assigned to a name and not passed to client.query().

# Nested data (each row contains a dictionary that can have multiple keys, but each key only has 1 value)
#* To select the fields (keys) in nested data, use dot notation:
'''
SELECT nested_column.field1, nested_column.field2
FROM `project.database.table`
'''

# Repeated data (each row contains an array of values with the same datatype)
#* To flatten repeated data, UNNEST the column and give the result an alias:
'''
SELECT new_column
FROM `project.database.table`, UNNEST(nested_column) AS new_column
'''

# Nested and repeated data (each row contains an array of dictionaries, which may contain nested dictionaries within it)
#* To flatten and select the data, UNNEST first, then use dot notation on the alias:
'''
SELECT nc.field1, nc.field2
FROM `project.database.table`, 
    UNNEST(nested_column) AS nc
'''

#* Dot notation can be chained to reach a dictionary nested inside an unnested element,
#* and the unnested fields can also be used to filter rows in WHERE:
'''
SELECT nc.field1.nestedfield1
FROM `project.database.table`,
    UNNEST(nested_column) AS nc
WHERE nc.field2='AAA' AND nc.field3='ABC'
'''
