In [1]:
import sys

In [2]:
# Print the version of the Python interpreter that backs this kernel.
!{sys.executable} -V

Python 3.9.6


In [None]:
#!{sys.executable} -m pip install --upgrade google-cloud-bigquery

In [None]:
#!{sys.executable} -m pip install --upgrade python-dotenv

In [None]:
#!{sys.executable} -m pip install --upgrade pipreqs

In [None]:
#!{sys.executable} -m pip show google-cloud-bigquery

In [None]:
#!{sys.executable} -m pip show python-dotenv

In [None]:
#!{sys.executable} -m pip show pipreqs

In [3]:
from google.cloud import bigquery
from google.cloud.bigquery import Table

In [4]:
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment.
# NOTE(review): presumably this is where the Google Cloud credential settings come from — confirm.
load_dotenv()

True

In [5]:
# Identifiers of the BigQuery project, dataset and table this notebook works with.
project_id = 'bigquery-project-32089'
dataset_id = 'customers_dataset'
table_id = 'customers'

# Dotted, fully-qualified names: "<project>.<dataset>" and "<project>.<dataset>.<table>".
dataset_full_name = ".".join((project_id, dataset_id))
table_full_name = ".".join((dataset_full_name, table_id))

In [6]:
# Client object used by every BigQuery call below, bound to `project_id`.
# NOTE(review): no credentials are passed explicitly; presumably they are resolved from the environment — confirm.
client = bigquery.Client(project=project_id)

In [7]:
def create_dataset(dataset_id, location="US"):
    """Create a BigQuery dataset and print its fully-qualified name.

    Args:
        dataset_id: Dataset ID string. ``client.project`` is supplied as the
            default project when building the dataset reference.
        location: Location assigned to the new dataset. Defaults to ``"US"``,
            the value that used to be hard-coded here.

    Relies on the module-level ``client``. Nothing is returned; any exception
    raised by ``client.create_dataset`` propagates to the caller.
    """
    dataset_ref = bigquery.DatasetReference.from_string(
        dataset_id, default_project=client.project
    )
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    # Re-bind to the object returned by the API call before reporting.
    dataset = client.create_dataset(dataset)
    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

In [8]:
# Create the customers dataset in the configured project.
create_dataset(dataset_id)

Created dataset bigquery-project-32089.customers_dataset


In [9]:
def list_datasets():
    """Print the IDs of all datasets in the client's project.

    Uses the module-level ``client``. When the project holds no datasets,
    a single line saying so is printed instead. Nothing is returned.
    """
    found = list(client.list_datasets())
    project_name = client.project

    # Guard clause: nothing to enumerate.
    if not found:
        print("{} project does not contain any datasets.".format(project_name))
        return

    print("Datasets in project {}:".format(project_name))
    for entry in found:
        print("\t{}".format(entry.dataset_id))

In [10]:
# List the datasets available in the project (the new dataset should be among them).
list_datasets()

Datasets in project bigquery-project-32089:
	customer_dataset
	customers_dataset


In [11]:
# Create the customers table with an explicit schema; rows are inserted in later cells.
def _customers_schema():
    """Return the default customers schema as a list of ``bigquery.SchemaField``."""
    return [
        bigquery.SchemaField("_id", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("index", "INTEGER", mode="NULLABLE"),
        bigquery.SchemaField("guid", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("isActive", "BOOLEAN", mode="NULLABLE"),
        bigquery.SchemaField("age", "INTEGER", mode="NULLABLE"),
        bigquery.SchemaField("eyeColor", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("gender", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("company", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("email", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("phone", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("address", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("about", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("registered", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("latitude", "FLOAT", mode="NULLABLE"),
        bigquery.SchemaField("longitude", "FLOAT", mode="NULLABLE"),
        # Repeated scalar column.
        bigquery.SchemaField("tags", "STRING", mode="REPEATED"),
        # Repeated record column with two nested fields.
        bigquery.SchemaField(
            "friends",
            "RECORD",
            mode="REPEATED",
            fields=[
                bigquery.SchemaField("id", "INTEGER", mode="NULLABLE"),
                bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
            ],
        ),
        bigquery.SchemaField("greeting", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("favoriteFruit", "STRING", mode="NULLABLE"),
    ]


def create_table(tablefullname, schema=None):
    """Create a BigQuery table and print its fully-qualified name.

    Args:
        tablefullname: Table ID string handed to ``Table.from_string``.
        schema: Optional list of ``bigquery.SchemaField``. When omitted, the
            20-column customers schema from ``_customers_schema`` is used,
            which is what this function always used before.

    Relies on the module-level ``client``. Nothing is returned; any exception
    raised by ``client.create_table`` propagates to the caller.
    """
    # Build the table object once and attach the schema to it, instead of
    # wrapping an intermediate Table in a second Table.
    table = Table.from_string(tablefullname)
    table.schema = _customers_schema() if schema is None else schema

    table = client.create_table(table)
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )

In [12]:
# Create the customers table inside the dataset created above.
create_table(table_full_name)

Created table bigquery-project-32089.customers_dataset.customers


In [13]:
def insert_rows(tablefullname, json):
    """Insert ``json`` rows into the table ``tablefullname`` and print the outcome.

    Args:
        tablefullname: Table ID string handed to ``Table.from_string``.
        json: Rows handed to ``client.insert_rows_json`` (inside this
            function the name shadows the ``json`` module).

    Prints a success line when the call returns an empty list, otherwise
    prints the returned errors. Nothing is returned.
    """
    destination = Table.from_string(tablefullname)
    outcome = client.insert_rows_json(destination, json)

    report = (
        "New rows have been added."
        if outcome == []
        else "Encountered errors while inserting rows: {}".format(outcome)
    )
    print(report)

In [14]:
# Load customers_1.json (newline-delimited JSON: one record per line).
import json
# Path to the data file. Forward slashes keep it portable and avoid the
# invalid "\d" / "\c" escape sequences of the backslash spelling.
customers_1_json_file = './data/customers_1.json'
# The context manager closes the file handle; JSON text is read as UTF-8.
with open(customers_1_json_file, 'r', encoding='utf-8') as customers_1_fh:
    # Blank lines are skipped so they cannot break json.loads.
    customers_1_json_data = [json.loads(line) for line in customers_1_fh if line.strip()]

In [15]:
# Insert the customers_1 rows into the newly created table.
insert_rows(table_full_name,customers_1_json_data)

New rows have been added.


In [16]:
# Load customers_2.json, which has additional fields; the table schema must be updated for this change.
# Forward slashes keep the path portable and avoid the invalid "\d" / "\c"
# escape sequences of the backslash spelling.
customers_2_json_file = './data/customers_2.json'
# The context manager closes the file handle; JSON text is read as UTF-8.
with open(customers_2_json_file, 'r', encoding='utf-8') as customers_2_fh:
    # One JSON document per line; blank lines are skipped.
    customers_2_json_data = [json.loads(line) for line in customers_2_fh if line.strip()]

In [19]:
def update_schema_insert_fields(project_id, dataset_id, table_id, customers_2_json):
    """Append rows with a load job that is allowed to extend the table schema.

    Args:
        project_id: Project that owns the dataset. Falls back to
            ``client.project`` when falsy.
        dataset_id: ID of the dataset that contains the table.
        table_id: ID of the destination table.
        customers_2_json: Rows handed to ``client.load_table_from_json``.

    Prints the number of schema columns before and after the load and the
    number of rows loaded. Nothing is returned; exceptions raised by the
    client or by ``job.result()`` propagate to the caller.
    """
    # Honour the caller's project instead of silently ignoring the argument.
    project = project_id or client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)

    # Retrieve the destination table and report the current schema length.
    table_ref = dataset_ref.table(table_id)
    table = client.get_table(table_ref)
    print("Table {} contains {} columns.".format(table_id, len(table.schema)))

    # Append-mode load job that may add new fields or relax existing ones.
    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        schema_update_options=[
            bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
            bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
        ],
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    )

    # The two fields that are new relative to the existing table.
    job_config.schema = [
        bigquery.SchemaField("balance", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("picture", "STRING", mode="NULLABLE"),
    ]

    # Load the rows passed in by the caller, not a notebook-level global.
    job = client.load_table_from_json(
        customers_2_json, table_ref, location="US", job_config=job_config
    )

    job.result()  # Waits for table load to complete.
    print(
        "Loaded {} rows into {}:{}.".format(
            job.output_rows, dataset_id, table_ref.table_id
        )
    )

    # Re-fetch the table and report the updated schema length.
    table = client.get_table(table)
    print("Table {} now contains {} columns.".format(table_id, len(table.schema)))

In [20]:
# Load the customers_2 rows with a load job that also adds the new fields to the table schema.
update_schema_insert_fields(project_id,dataset_id,table_id,customers_2_json_data)

Table customers contains 20 columns.
Loaded 4 rows into customers_dataset:customers.
Table customers now contains 22 columns.


In [21]:
# Load customers_3.json, which includes another schema change.
# Forward slashes keep the path portable and avoid the invalid "\d" / "\c"
# escape sequences of the backslash spelling.
customers_3_json_file = './data/customers_3.json'
# The context manager closes the file handle; JSON text is read as UTF-8.
with open(customers_3_json_file, 'r', encoding='utf-8') as customers_3_fh:
    # One JSON document per line; blank lines are skipped.
    customers_3_json_data = [json.loads(line) for line in customers_3_fh if line.strip()]

In [22]:
# Streaming inserts are possible in two ways:
# 1.) Using the Legacy Streaming API  OR  2.) Using the BigQuery Storage Write API

# 1.) Using the Legacy Streaming API
def streaming_insert_rows(tablefullname, json):
    """Stream ``json`` rows into the table ``tablefullname`` and print the result.

    Args:
        tablefullname: Table ID string handed to ``Table.from_string``.
        json: Rows handed to ``client.insert_rows_json``.

    Errors returned by the call are printed, not raised. Nothing is returned.
    """
    target = Table.from_string(tablefullname)
    failures = client.insert_rows_json(target, json)

    # Anything other than an empty list is reported as-is.
    if failures != []:
        print("Encountered errors while inserting rows: {}".format(failures))
        return
    print("New rows have been added.")

In [24]:
# Try to stream the customers_3 rows into the table with its current schema.
streaming_insert_rows(table_full_name,customers_3_json_data)

Encountered errors while inserting rows: [{'index': 0, 'errors': [{'reason': 'invalid', 'location': 'foes', 'debugInfo': '', 'message': 'no such field: foes.'}]}, {'index': 1, 'errors': [{'reason': 'invalid', 'location': 'foes', 'debugInfo': '', 'message': 'no such field: foes.'}]}, {'index': 2, 'errors': [{'reason': 'invalid', 'location': 'foes', 'debugInfo': '', 'message': 'no such field: foes.'}]}, {'index': 3, 'errors': [{'reason': 'invalid', 'location': 'foes', 'debugInfo': '', 'message': 'no such field: foes.'}]}]


### As the example above shows, running this code produces errors, because the customers_3.json file includes another schema change (the new `foes` field).

# The Legacy Streaming API doesn't support schema updates while appending new rows to a table

In [26]:
# Solution - update the table schema first, then call the Legacy Streaming API again.
def update_table_schema(tableid):
    """Add the repeated ``foes`` RECORD column to an existing table.

    Args:
        tableid: Table identifier handed to ``client.get_table``.

    When a top-level column named ``foes`` is already present the table is
    left untouched, so re-running the cell does not try to add the same
    column twice. Prints whether the column was added. Nothing is returned.
    """
    table = client.get_table(tableid)
    original_schema = table.schema

    # Idempotency guard: the column may exist from an earlier run.
    if any(field.name == "foes" for field in original_schema):
        print('Column "foes" already exists; the schema was left unchanged.')
        return

    new_schema = list(original_schema)  # Work on a copy of the schema.

    # Append the new field: a repeated record with the nested fields id and name.
    new_schema.append(
        bigquery.SchemaField(
            "foes",
            "RECORD",
            mode="REPEATED",
            fields=[
                bigquery.SchemaField("id", "INTEGER", mode="NULLABLE"),
                bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
            ],
        )
    )

    # Send only the schema property to the API.
    table.schema = new_schema
    table = client.update_table(table, ["schema"])

    # The returned table must have exactly one column more than before.
    if len(table.schema) == len(original_schema) + 1 == len(new_schema):
        print("A new column has been added.")
    else:
        print("The column has not been added.")

In [28]:
# Add the new `foes` column to the table schema.
update_table_schema(table_full_name)

A new column has been added.


In [29]:
# Use the Legacy Streaming API again, now for rows that carry the additional fields.
# NOTE(review): this repeats the streaming_insert_rows definition from the earlier cell;
# that definition could be reused instead of redefining it here.
def streaming_insert_rows(tablefullname, json):
    """Stream ``json`` rows into the table ``tablefullname`` and print the result.

    Args:
        tablefullname: Table ID string handed to ``Table.from_string``.
        json: Rows handed to ``client.insert_rows_json``.

    Errors returned by the call are printed, not raised. Nothing is returned.
    """
    table_obj = Table.from_string(tablefullname)
    insert_errors = client.insert_rows_json(table_obj, json)

    if insert_errors == []:
        status = "New rows have been added."
    else:
        status = "Encountered errors while inserting rows: {}".format(insert_errors)
    print(status)

In [30]:
# Call streaming_insert_rows() again, now that the table schema contains the `foes` column.
streaming_insert_rows(table_full_name,customers_3_json_data)

New rows have been added.
