# Querying BigQuery for Building Blocks and Writing to Cloud Storage

Make sure the Dataflow API is enabled.

In [24]:
# Shell escape: ask gcloud to enable the Dataflow service so that the
# DataflowRunner job submitted later in this notebook can be created.
!gcloud services enable dataflow

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)


In [25]:
import apache_beam as beam
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.runners import DataflowRunner
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib
import google.auth

In [26]:
# Keyword settings for the interactive pipeline: the GCP project and the
# Cloud Storage path Beam uses as its temp location.
# The dict has its own name (instead of being called `options` and then
# overwritten) so this step can be re-run without trying to `**`-unpack an
# already-built PipelineOptions object.
option_kwargs = {
    'project': 'compound-enumeration',
    'temp_location': 'gs://compound-enumeration-dataflow-temp',
}

# An explicit empty flag list means no command-line arguments are parsed;
# every setting comes from the keyword arguments above.
options = pipeline_options.PipelineOptions(flags=[], **option_kwargs)

In [28]:
# Build the pipeline on the interactive runner so it can be inspected from
# the notebook before it is submitted anywhere else.
p = beam.Pipeline(runner=InteractiveRunner(), options=options)

In the query below, make sure the table name refers to the table you loaded with the test data.

In [29]:
# Read every row of the test building-blocks table from BigQuery into a
# PCollection.
# The query is written in standard SQL: the fully qualified table name is
# backtick-quoted and `use_standard_sql=True` is passed explicitly, rather
# than relying on the connector's legacy-SQL default and `[...]` quoting.
building_blocks = (
    p
    | 'QueryTable' >> beam.io.ReadFromBigQuery(
        query='SELECT * FROM '
              '`compound-enumeration.building_blocks.mcule_building_blocks_100`',
        use_standard_sql=True)
)
    

In [30]:
# Display the graph of transforms currently attached to pipeline `p`
# (at this point only the BigQuery read has been added).
ib.show_graph(p)

/usr/bin/dot


## Dataflow Additions

In [31]:
# Start from a fresh, empty set of Apache Beam pipeline options for the
# Dataflow run. `flags` is an argument list, so an empty list (not a dict)
# is passed, matching how the interactive options were built earlier.
options = pipeline_options.PipelineOptions(flags=[])

# google.auth.default() returns a (credentials, project_id) pair; only the
# project id is kept, so the job runs in the default project of the current
# Google Cloud environment.
_, options.view_as(GoogleCloudOptions).project = google.auth.default()

# Google Cloud region in which Cloud Dataflow runs the job.
options.view_as(GoogleCloudOptions).region = 'us-central1'

In [32]:
# IMPORTANT! Adjust the following to point at a Cloud Storage location you
# can write to. The staging, temp and output paths used by the Dataflow job
# below are all derived from this base path.
dataflow_gcs_location = 'gs://compound-enumeration-dataflow-temp'

In [33]:
# Staging location: where the Dataflow pipeline and SDK binary are staged.
options.view_as(GoogleCloudOptions).staging_location = dataflow_gcs_location + '/staging'

# Temp location: where temporary files and intermediate results are kept
# before the final output is written to the sink.
options.view_as(GoogleCloudOptions).temp_location = dataflow_gcs_location + '/temp'

In [34]:
# Directory under the Dataflow bucket that receives the job's output files.
output_gcs_location = dataflow_gcs_location + '/output'

# Attach a text sink to the `building_blocks` PCollection. The path given to
# WriteToText is a file-name prefix inside `output_gcs_location`; the shard
# suffix is appended by Beam (e.g. `building_blocks.txt-00000-of-00001`).
(building_blocks
 | 'Write building blocks to Cloud Storage' >> beam.io.WriteToText(
     '%s/building_blocks.txt' % output_gcs_location))


<PCollection[[34]: Write building blocks to Cloud Storage/Write/WriteImpl/FinalizeWrite.None] at 0x7f82835f7910>

In [35]:
# Display the pipeline graph again, now that the Cloud Storage write step
# has been attached after the BigQuery read.
ib.show_graph(p)

/usr/bin/dot


In [36]:
# Submit the pipeline that was built interactively to Cloud Dataflow, using
# the Dataflow-specific `options` (project, region, staging and temp
# locations) configured in the cells above. The returned handle is kept so
# the job can be waited on in the next cell.
pipeline_result = DataflowRunner().run_pipeline(p, options=options)


[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python -m pip install --upgrade pip

[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python -m pip install --upgrade pip


In [37]:
# Wait for the Dataflow job to finish. The call's return value is shown as
# the cell output ('DONE' in the recorded run).
pipeline_result.wait_until_finish()

'DONE'

## Check results

In [38]:
# List the objects the job wrote; IPython substitutes the Python variable
# `output_gcs_location` into the shell command.
!gsutil ls {output_gcs_location}

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
gs://compound-enumeration-dataflow-temp/output/building_blocks.txt-00000-of-00001


In [39]:
# Preview the first 10 lines of the output shard(s) whose names start with
# `building_blocks.txt`. Each line is the text form of one row dict.
!gsutil cat {output_gcs_location}/building_blocks.txt* | head -10

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
{'smiles': 'C(O)(=O)C(O)=O', 'mcule_id': 'MCULE-6647815245'}
{'smiles': 'C(O)(O)=O', 'mcule_id': 'MCULE-3743781291'}
{'smiles': 'OS(O)=O', 'mcule_id': 'MCULE-3188569799'}
{'smiles': 'OS(O)(=O)=O', 'mcule_id': 'MCULE-1504889908'}
{'smiles': 'ClC(Cl)Cl', 'mcule_id': 'MCULE-5607930311'}
{'smiles': 'CO', 'mcule_id': 'MCULE-1370061678'}
{'smiles': 'OCCO', 'mcule_id': 'MCULE-6366313128'}
{'smiles': 'ClC1C=CC=CC=1', 'mcule_id': 'MCULE-2469021740'}
{'smiles': 'ClC1C(Cl)=CC=CC=1', 'mcule_id': 'MCULE-4275292262'}
{'smiles': 'CI', 'mcule_id': 'MCULE-1718786667'}
