In [1]:
from bqt import bqt, Writer
from bqt.lib.table import BqSingleTable

In [2]:
# Source table: the genre entity table with the `_20190901` suffix.
project_name = "knowledge-graph-112233"
dataset_name = "genre_entity"
table_name = "genre_entity"
partition = "_20190901"
table_obj = BqSingleTable(
    table=table_name + partition,
    dataset=dataset_name,
    project=project_name,
)

# Destination for the scratch tables created below; they reuse the source table's base name.
output_project_name = "creator-insights"
output_dataset_name = "ad_hoc"
output_table_name = table_name

## Access the schema of a BqSingleTable object
Normally not needed by the end user.

In [3]:
# Raw schema held by the table object; the output below shows it as a list of SchemaField objects.
schema_raw = table_obj.schema_raw
schema_raw

[SchemaField(u'genre_gid', u'STRING', u'REQUIRED', u'Genre GID in hexadecimal string', ()),
 SchemaField(u'date', u'STRING', u'REQUIRED', u'Date of entity creation', ()),
 SchemaField(u'genre', u'RECORD', u'NULLABLE', u'{ policy: { accessTier: BROAD } }', (SchemaField(u'gid', u'STRING', u'REQUIRED', u'GID identifying genre', ()), SchemaField(u'name', u'STRING', u'REQUIRED', u'internal name identifying genre', ()), SchemaField(u'display_name', u'STRING', u'REQUIRED', u'genre name for display', ()), SchemaField(u'description', u'STRING', u'NULLABLE', u'verbose description of the genre', ()), SchemaField(u'short_description', u'STRING', u'NULLABLE', u'short description of the genre', ()), SchemaField(u'aliases', u'STRING', u'REPEATED', u'list of alternate display names for the genre', ()), SchemaField(u'status_flags', u'RECORD', u'REQUIRED', u'flags identifying properties of the genre', (SchemaField(u'no_extrapolate', u'BOOLEAN', u'REQUIRED', None, ()), SchemaField(u'no_infer', u'BOOLEAN'

## Get the table description

In [4]:
# Table-level description string of the source table.
table_obj.description

u"{policy: { accessTier: BROAD },description: 'Genre entity merge from various sources: genre, sims, mirrors'}"

## Get the schema as a list of dicts so you can easily edit
Schema for end user to edit and pass to functions

In [5]:
# The same schema as a list of plain dicts (nested records appear under 'fields'), which is easy to edit.
schema = table_obj.schema
schema

[{'description': u'Genre GID in hexadecimal string',
  'mode': u'REQUIRED',
  'name': u'genre_gid',
  'type': u'STRING'},
 {'description': u'Date of entity creation',
  'mode': u'REQUIRED',
  'name': u'date',
  'type': u'STRING'},
 {'description': u'{ policy: { accessTier: BROAD } }',
  'fields': [{'description': u'GID identifying genre',
    'mode': u'REQUIRED',
    'name': u'gid',
    'type': u'STRING'},
   {'description': u'internal name identifying genre',
    'mode': u'REQUIRED',
    'name': u'name',
    'type': u'STRING'},
   {'description': u'genre name for display',
    'mode': u'REQUIRED',
    'name': u'display_name',
    'type': u'STRING'},
   {'description': u'verbose description of the genre',
    'mode': u'NULLABLE',
    'name': u'description',
    'type': u'STRING'},
   {'description': u'short description of the genre',
    'mode': u'NULLABLE',
    'name': u'short_description',
    'type': u'STRING'},
   {'description': u'list of alternate display names for the genre',
  

## Checking that converting from one to the other is equivalent
`_schema_from_dict` is an internal method, typically not needed by the end user

In [6]:
# Round-trip check: convert the dict schema back to the raw form using Writer's internal helper.
writer = Writer(table_obj)
schema_bq = Writer._schema_from_dict(writer, schema)

## They match, good

In [7]:
# True when the dict -> raw conversion reproduces the raw schema exactly.
schema_bq == schema_raw

True

## You can now set a table with a schema and/or a description

In [8]:
# Query that pulls a single row from the source table; used to seed the scratch tables.
query_template = """
select * from `{project_name}.{dataset_name}.{table_name}{partition}` limit 1"""
q = query_template.format(
    partition=partition,
    table_name=table_name,
    dataset_name=dataset_name,
    project_name=project_name,
)

In [9]:
# Start from a clean slate: drop every scratch table this notebook (re)creates.
scratch_prefixes = ["temp_with_schema_", "temp_without_schema_", "dummy_", "dummy2_", "dummy3_"]
for prefix in scratch_prefixes:
    bqt.delete_table(
        tables=prefix + output_table_name,
        dataset=output_dataset_name,
        project=output_project_name,
        confirm=False,
    )

Deleting temp_with_schema_genre_entity ...
Deleting temp_without_schema_genre_entity ...
Deleting dummy_genre_entity ...
Deleting dummy2_genre_entity ...
Deleting dummy3_genre_entity ...


In [10]:
# Materialise the query result, passing the edited dict schema and a table description.
bqt.create_table(
    q,
    output_dataset_name,
    "temp_with_schema_" + output_table_name,
    project=output_project_name,
    schema=schema,
    description="blah",
    write_disposition='WRITE_TRUNCATE',
)

Creating table `creator-insights.ad_hoc.temp_with_schema_genre_entity` ...
[92mJob finished successfuly![0m


<bqt.lib.job.BqJobResult at 0x111faa750>

## Let's compare what happens when you create a table without explicitly sending a schema

In [11]:
# Same query, but with no schema or description passed, for comparison.
bqt.create_table(
    q,
    output_dataset_name,
    "temp_without_schema_" + output_table_name,
    project=output_project_name,
    write_disposition='WRITE_TRUNCATE',
)

Creating table `creator-insights.ad_hoc.temp_without_schema_genre_entity` ...
[92mJob finished successfuly![0m


<bqt.lib.job.BqJobResult at 0x1002ed990>

## Requiring that a created table keeps REQUIRED fields in its schema
The original genre entity table declares some fields as REQUIRED (for example, the nested `score` field shown below). When we created a copy with the default `create_table` settings, every REQUIRED field was written as NULLABLE. To explicitly keep certain fields REQUIRED, pass in a schema that says so; the table created with a schema matches the original.

In [12]:
# Pick one nested field from the source schema to use as the reference for the comparison.
original_field_example = schema[3]['fields'][1]['fields'][1]
original_field_example

{'description': None, 'mode': u'REQUIRED', 'name': u'score', 'type': u'FLOAT'}

In [13]:
# Re-read the table created with an explicit schema and look at the same nested field.
with_schema = BqSingleTable(
    table="temp_with_schema_" + output_table_name,
    dataset=output_dataset_name,
    project=output_project_name,
)
with_schema_fields = with_schema.schema
with_schema_fields[3]['fields'][1]['fields'][1]

{'description': None, 'mode': u'REQUIRED', 'name': u'score', 'type': u'FLOAT'}

In [14]:
# Re-read the table created without a schema and look at the same nested field.
without_schema = BqSingleTable(
    table="temp_without_schema_" + output_table_name,
    dataset=output_dataset_name,
    project=output_project_name,
)
without_schema_fields = without_schema.schema
without_schema_fields[3]['fields'][1]['fields'][1]

{'description': None, 'mode': u'NULLABLE', 'name': u'score', 'type': u'FLOAT'}

## The table we created with a description has a description

In [15]:
# Description of the table that was created with description="blah".
with_schema.description

u'blah'

## The table we created without one does not

In [16]:
# Description of the table created without one; the cell shows no output.
without_schema.description

## Let's update the table with no description and give it one

In [17]:
# Attach a description to the existing table that was created without one.
bqt.update_table_metadata(
    table="temp_without_schema_" + output_table_name,
    dataset=output_dataset_name,
    project=output_project_name,
    description='more blah',
)

## It has one now

In [18]:
# Build a fresh table object so the description is read again after the update.
without_schema_updated = BqSingleTable(
    table="temp_without_schema_" + output_table_name,
    dataset=output_dataset_name,
    project=output_project_name,
)
without_schema_updated.description

u'more blah'

## What if we wanted to change the description of a specific field?

In [19]:
# Current state of the nested field whose description is changed in the following cells.
with_schema.schema[3]['fields'][1]['fields'][1]

{'description': None, 'mode': u'REQUIRED', 'name': u'score', 'type': u'FLOAT'}

## No problem. Let's get the schema as a dict, then change the description

In [20]:
# Take the dict schema and overwrite the description of one nested field.
schema_with_new_description = with_schema.schema
score_field = schema_with_new_description[3]['fields'][1]['fields'][1]
score_field['description'] = 'a super strict field'

In [21]:
# Confirm the local dict schema now carries the new description (the table itself is not updated yet).
schema_with_new_description[3]['fields'][1]['fields'][1]

{'description': 'a super strict field',
 'mode': u'REQUIRED',
 'name': u'score',
 'type': u'FLOAT'}

## Update the table and verify it got changed

In [22]:
# Push the edited dict schema to the table.
bqt.update_table_metadata(
    table="temp_with_schema_" + output_table_name,
    dataset=output_dataset_name,
    project=output_project_name,
    schema=schema_with_new_description,
)

In [23]:
# Read the table back to confirm the nested field's new description was stored.
# The name is built from output_table_name (not the source table_name) so the read always
# targets the same table that update_table_metadata wrote to.
BqSingleTable(table="temp_with_schema_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema[3]['fields'][1]['fields'][1]

{'description': u'a super strict field',
 'mode': u'REQUIRED',
 'name': u'score',
 'type': u'FLOAT'}

## What if we just want to update one field's description without having to deal with schemas and what not...

In [24]:
# First top-level field of the output table, before its description is changed.
# Built from output_table_name (not the source table_name), matching how the table was created.
BqSingleTable(table="temp_with_schema_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema[0]

{'description': u'Genre GID in hexadecimal string',
 'mode': u'REQUIRED',
 'name': u'genre_gid',
 'type': u'STRING'}

In [25]:
# Change a single top-level field's description without handling the schema directly.
# The table name is built from output_table_name (not the source table_name), matching
# how the table was created.
bqt.update_field_description(dataset=output_dataset_name,
                             table="temp_with_schema_"+output_table_name,
                             project=output_project_name,
                             field="genre_gid",
                             description="something else"
                            )

In [26]:
# Read the first top-level field back to confirm the description change.
# Built from output_table_name (not the source table_name), matching how the table was created.
BqSingleTable(table="temp_with_schema_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema[0]

{'description': u'something else',
 'mode': u'REQUIRED',
 'name': u'genre_gid',
 'type': u'STRING'}

## Let's update a field within a struct; we can use dot notation to access it

In [27]:
# A field nested one level inside a record, before its description is changed.
# Built from output_table_name (not the source table_name), matching how the table was created.
BqSingleTable(table="temp_with_schema_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema[2]['fields'][1]

{'description': u'internal name identifying genre',
 'mode': u'REQUIRED',
 'name': u'name',
 'type': u'STRING'}

In [28]:
# Address a nested field with dot notation ("<record>.<field>") and change its description.
# The table name is built from output_table_name (not the source table_name), matching
# how the table was created.
bqt.update_field_description(dataset=output_dataset_name,
                             table="temp_with_schema_"+output_table_name,
                             project=output_project_name,
                             field="genre.name",
                             description="changing description"
                            )

In [29]:
# Read the nested field back to confirm the description change.
# Built from output_table_name (not the source table_name), matching how the table was created.
BqSingleTable(table="temp_with_schema_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema[2]['fields'][1]

{'description': u'changing description',
 'mode': u'REQUIRED',
 'name': u'name',
 'type': u'STRING'}

In [30]:
# A field nested several records deep; the output shows it currently has no description.
# Built from output_table_name (not the source table_name), matching how the table was created.
BqSingleTable(table="temp_with_schema_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema[4]['fields'][1]['fields'][0]['fields'][1]

{'description': None, 'mode': u'REQUIRED', 'name': u'name', 'type': u'STRING'}

In [31]:
# Dot notation also works across several levels of nesting.
# The table name is built from output_table_name (not the source table_name), matching
# how the table was created.
bqt.update_field_description(dataset=output_dataset_name,
                             table="temp_with_schema_"+output_table_name,
                             project=output_project_name,
                             field="mirrors.similars.genre.name",
                             description="a description added to empty field"
                            )

In [32]:
# Read the deeply nested field back to confirm it now has a description.
# Built from output_table_name (not the source table_name), matching how the table was created.
BqSingleTable(table="temp_with_schema_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema[4]['fields'][1]['fields'][0]['fields'][1]

{'description': u'a description added to empty field',
 'mode': u'REQUIRED',
 'name': u'name',
 'type': u'STRING'}

## Warning message when you try to update a non-existent field

In [33]:
# Ask for a field name that is not in the schema; the recorded output is a "Field not found" warning.
# The table name is built from output_table_name (not the source table_name), matching
# how the table was created.
bqt.update_field_description(dataset=output_dataset_name,
                             table="temp_with_schema_"+output_table_name,
                             project=output_project_name,
                             field="thisfielddoesntexist",
                             description="another description changed"
                            )

[93mField not found[0m


## Let's imagine we have a table with lots of fields and we don't want to run dozens of update_table requests or want to manually edit lots of common field descriptions...

In [34]:
# Create a small one-row scratch table with four literal columns and no explicit schema.
# WRITE_TRUNCATE is passed as the write disposition.
bqt.create_table("""select 'bob' as user_id,
                        "USA" as reg_country,
                        "BRAZIL" as reportingCountry,
                        112 as streams
                        
                 """,
                 output_dataset_name,
                 "dummy_"+output_table_name,
                 project=output_project_name,
                 write_disposition='WRITE_TRUNCATE'
                )

Creating table `creator-insights.ad_hoc.dummy_genre_entity` ...
[92mJob finished successfuly![0m


<bqt.lib.job.BqJobResult at 0x10bcb9f50>

### No field descriptions for this table

In [35]:
# Dict schema of the freshly created dummy table (no field descriptions yet).
# Built from output_table_name (not the source table_name), matching how the table was created.
schema_dict_2 = BqSingleTable(table="dummy_"+output_table_name,
                              dataset=output_dataset_name,
                              project=output_project_name).schema
schema_dict_2

[{'description': None,
  'mode': u'NULLABLE',
  'name': u'user_id',
  'type': u'STRING'},
 {'description': None,
  'mode': u'NULLABLE',
  'name': u'reg_country',
  'type': u'STRING'},
 {'description': None,
  'mode': u'NULLABLE',
  'name': u'reportingCountry',
  'type': u'STRING'},
 {'description': None,
  'mode': u'NULLABLE',
  'name': u'streams',
  'type': u'INTEGER'}]

### Specify the descriptions all at once. Now the schema is ready

In [36]:
# Descriptions to attach, keyed by field name.
my_field_description_map = {
    "user_id": "{ policy: { semanticType: userId } }",
    "reg_country": "{ policy: { semanticType: country } }",
    "reportingCountry": "{ policy: { semanticType: country } }",
    "streams": "streams from past day",
}

# Apply the map to the dict schema, then print the annotated result.
updated_schema_with_descriptions = bqt.add_annotation_to_schema(
    schema_dict_2, my_field_description_map
)
print(updated_schema_with_descriptions)

[{'description': '{ policy: { semanticType: userId } }', 'type': u'STRING', 'mode': u'NULLABLE', 'name': u'user_id'}, {'description': '{ policy: { semanticType: country } }', 'type': u'STRING', 'mode': u'NULLABLE', 'name': u'reg_country'}, {'description': '{ policy: { semanticType: country } }', 'type': u'STRING', 'mode': u'NULLABLE', 'name': u'reportingCountry'}, {'description': 'streams from past day', 'type': u'INTEGER', 'mode': u'NULLABLE', 'name': u'streams'}]


### Go ahead and pass this schema to update_table_metadata and verify that the schema is now good

In [37]:
bqt.update_table_metadata(table="dummy_"+output_table_name,
                 dataset=output_dataset_name,
                 project=output_project_name,
                 schema=updated_schema_with_descriptions)
# Read the dummy table's schema back to verify the stored descriptions.
# Built from output_table_name (not the source table_name), matching how the table was created.
BqSingleTable(table="dummy_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema

[{'description': u'{ policy: { semanticType: userId } }',
  'mode': u'NULLABLE',
  'name': u'user_id',
  'type': u'STRING'},
 {'description': u'{ policy: { semanticType: country } }',
  'mode': u'NULLABLE',
  'name': u'reg_country',
  'type': u'STRING'},
 {'description': u'{ policy: { semanticType: country } }',
  'mode': u'NULLABLE',
  'name': u'reportingCountry',
  'type': u'STRING'},
 {'description': u'streams from past day',
  'mode': u'NULLABLE',
  'name': u'streams',
  'type': u'INTEGER'}]

## Ok, but since some of these are super common field names, let's auto generate a schema where these common descriptions are automatically filled in

In [38]:
# Create a second one-row scratch table with the same four literal columns and no explicit schema.
# WRITE_TRUNCATE is passed as the write disposition.
bqt.create_table("""select 'bob' as user_id,
                        "USA" as reg_country,
                        "BRAZIL" as reportingCountry,
                        112 as streams
                        
                 """,
                 output_dataset_name,
                 "dummy2_"+output_table_name,
                 project=output_project_name,
                 write_disposition='WRITE_TRUNCATE'
                )

Creating table `creator-insights.ad_hoc.dummy2_genre_entity` ...
[92mJob finished successfuly![0m


<bqt.lib.job.BqJobResult at 0x11219d290>

In [39]:
# Dict schema of the second dummy table (no field descriptions yet).
# Built from output_table_name (not the source table_name), matching how the table was created.
schema_dict_3 = BqSingleTable(table="dummy2_"+output_table_name,
                              dataset=output_dataset_name,
                              project=output_project_name).schema
schema_dict_3

[{'description': None,
  'mode': u'NULLABLE',
  'name': u'user_id',
  'type': u'STRING'},
 {'description': None,
  'mode': u'NULLABLE',
  'name': u'reg_country',
  'type': u'STRING'},
 {'description': None,
  'mode': u'NULLABLE',
  'name': u'reportingCountry',
  'type': u'STRING'},
 {'description': None,
  'mode': u'NULLABLE',
  'name': u'streams',
  'type': u'INTEGER'}]

In [41]:
# No description map is passed here, only exact_match=False. In the recorded output,
# user_id, reg_country and reportingCountry receive descriptions while streams stays None.
updated_schema_with_descriptions2 = bqt.add_annotation_to_schema(schema_dict_3,exact_match=False)
updated_schema_with_descriptions2

[{'description': '{ policy: { semanticType: userId } }',
  'mode': u'NULLABLE',
  'name': u'user_id',
  'type': u'STRING'},
 {'description': '{ policy: { semanticType: country } }',
  'mode': u'NULLABLE',
  'name': u'reg_country',
  'type': u'STRING'},
 {'description': '{ policy: { semanticType: country } }',
  'mode': u'NULLABLE',
  'name': u'reportingCountry',
  'type': u'STRING'},
 {'description': None,
  'mode': u'NULLABLE',
  'name': u'streams',
  'type': u'INTEGER'}]

### You can now manually tweak anything else, in case it automatically applied the wrong semantic type to a field name or you want to add extra descriptions. Then update the table as you did before.

In [42]:
bqt.update_table_metadata(table="dummy2_"+output_table_name,
                 dataset=output_dataset_name,
                 project=output_project_name,
                 schema=updated_schema_with_descriptions2)
# Read the second dummy table's schema back to verify the stored descriptions.
# Built from output_table_name (not the source table_name), matching how the table was created.
BqSingleTable(table="dummy2_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema

[{'description': u'{ policy: { semanticType: userId } }',
  'mode': u'NULLABLE',
  'name': u'user_id',
  'type': u'STRING'},
 {'description': u'{ policy: { semanticType: country } }',
  'mode': u'NULLABLE',
  'name': u'reg_country',
  'type': u'STRING'},
 {'description': u'{ policy: { semanticType: country } }',
  'mode': u'NULLABLE',
  'name': u'reportingCountry',
  'type': u'STRING'},
 {'description': None,
  'mode': u'NULLABLE',
  'name': u'streams',
  'type': u'INTEGER'}]

## You can also save yourself the step of generating the automatically annotated schema and then updating the table. Instead, do it all in one go

In [43]:
# Create a third one-row scratch table with the same four literal columns and no explicit schema.
# WRITE_TRUNCATE is passed as the write disposition.
bqt.create_table("""select 'bob' as user_id,
                        "USA" as reg_country,
                        "BRAZIL" as reportingCountry,
                        112 as streams
                        
                 """,
                 output_dataset_name,
                 "dummy3_"+output_table_name,
                 project=output_project_name,
                 write_disposition='WRITE_TRUNCATE'
                )

Creating table `creator-insights.ad_hoc.dummy3_genre_entity` ...
[92mJob finished successfuly![0m


<bqt.lib.job.BqJobResult at 0x10bca4d90>

In [44]:
bqt.update_table_metadata(table="dummy3_"+output_table_name,
                 dataset=output_dataset_name,
                 project=output_project_name,
                 auto_annotate=True)
# Read the third dummy table's schema back to verify the stored descriptions.
# Built from output_table_name (not the source table_name), matching how the table was created.
BqSingleTable(table="dummy3_"+output_table_name,
              dataset=output_dataset_name,
              project=output_project_name).schema

[{'description': u'{ policy: { semanticType: userId } }',
  'mode': u'NULLABLE',
  'name': u'user_id',
  'type': u'STRING'},
 {'description': u'{ policy: { semanticType: country } }',
  'mode': u'NULLABLE',
  'name': u'reg_country',
  'type': u'STRING'},
 {'description': u'{ policy: { semanticType: country } }',
  'mode': u'NULLABLE',
  'name': u'reportingCountry',
  'type': u'STRING'},
 {'description': None,
  'mode': u'NULLABLE',
  'name': u'streams',
  'type': u'INTEGER'}]

In [None]:
# NOTE(review): unused, commented-out scratch helper; nothing in this notebook calls it.
# As written, it would set `val` at a dot-separated key path inside nested dicts and
# return the (mutated) top-level dictionary. Consider deleting this cell.
# def _update_key(dictionary,key,val):
#     key = key.split('.')
#     temp = dictionary
#     for i in key[:-1]:
#         temp = temp[i]
#     temp[key[-1]] = val
#     return dictionary