In [11]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.transforms.trigger import AfterWatermark, AfterCount
import logging

class PrintMessages(beam.DoFn):
    """Pass-through DoFn that prints each element it processes."""
    def process(self, element):
        """Print ``element`` to stdout, then yield it unchanged."""
        print(element)
        yield element

def run():
    """Build and submit the streaming Pub/Sub -> BigQuery pipeline.

    The pipeline reads messages from a Pub/Sub subscription, decodes them as
    UTF-8, groups them into 1-minute fixed windows, prints every message and
    appends each one as a row (column ``msg``) to a BigQuery table.

    The job is submitted with ``pipeline.run()``; this function does not call
    ``wait_until_finish()`` on the result and returns ``None``.
    """
    pipeline_options = PipelineOptions()

    # Set the Google Cloud project and the GCS locations used by the job.
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-another-394512'
    google_cloud_options.job_name = 'pubsub-to-bq-batch'
    google_cloud_options.staging_location = 'gs://maniprakash-bucket/staging'
    google_cloud_options.temp_location = 'gs://maniprakash-bucket/temp'
    google_cloud_options.region = 'europe-west2'

    # Enable streaming mode and select the Dataflow runner. Both settings live
    # on the same options view, so it is looked up once.
    standard_options = pipeline_options.view_as(
        beam.options.pipeline_options.StandardOptions)
    standard_options.streaming = True
    standard_options.runner = 'DataflowRunner'

    # Create the pipeline
    pipeline = beam.Pipeline(options=pipeline_options)

    # Read messages from the Pub/Sub subscription and decode the payloads.
    messages = (
        pipeline
        | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(subscription='projects/my-another-394512/subscriptions/my-topic-sub')
        | 'Decode message' >> beam.Map(lambda x: x.decode('utf-8'))
    )

    # Apply windowing and triggers: fire early every 10 elements, at the
    # watermark, and after every 20 late elements; each pane carries only the
    # elements that arrived since the previous firing (DISCARDING).
    # NOTE(review): no allowed_lateness is passed to WindowInto, so the late
    # trigger presumably never fires -- confirm against the intended behavior.
    windowed_messages = messages | 'Apply Windowing' >> beam.WindowInto(
        beam.window.FixedWindows(60),  # 1-minute window
        trigger=AfterWatermark(early=AfterCount(10), late=AfterCount(20)),
        accumulation_mode=beam.trigger.AccumulationMode.DISCARDING
    )

    # Print messages to console
    windowed_messages | 'Print Messages' >> beam.ParDo(PrintMessages())

    # Write messages to BigQuery. WriteToBigQuery expects each element to be a
    # dictionary keyed by column name, so the decoded string is wrapped in a
    # row that matches the 'msg:STRING' schema before it reaches the sink.
    table_spec = 'my-another-394512.mydataset.pubsub'
    (
        windowed_messages
        | 'Format as BigQuery row' >> beam.Map(lambda msg: {'msg': msg})
        | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
            table_spec,
            schema='msg:STRING',
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
        )
    )

    pipeline.run()

if __name__ == '__main__':
    # Raise the root logger to INFO so INFO-level records emitted while the
    # job is staged and submitted are shown, then launch the pipeline.
    logging.getLogger().setLevel(logging.INFO)
    run()


INFO:apache_beam.runners.portability.stager:Downloading source distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/jupyter/.kernels/apache-beam-2.49.0/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpl81p32at', 'apache-beam==2.49.0', '--no-deps', '--no-binary', ':all:']
INFO:apache_beam.runners.portability.stager:Staging SDK sources from PyPI: dataflow_python_sdk.tar
INFO:apache_beam.runners.portability.stager:Downloading binary distribution of the SDK from PyPi
INFO:apache_beam.runners.portability.stager:Executing command: ['/jupyter/.kernels/apache-beam-2.49.0/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpl81p32at', 'apache-beam==2.49.0', '--no-deps', '--only-binary', ':all:', '--python-version', '38', '--implementation', 'cp', '--abi', 'cp38', '--platform', 'manylinux2014_x86_64']
INFO:apache_beam.runners.portability.stager:Staging binary distribution of the SDK from PyPI: apache_beam-2.49.0-cp38-cp38-manylinux_2_17_

makjshsgyeadnaskdsakdnlkasn
mmmmmmmmmmnnnnnnnbvvvvvvvvvvvvvvvvassssssssssss
mmmmmmmmmmnnnnnnnbvvvvvvvvvvvvvvvvassssssssssss
mmmmmmmmmmnnnnnnnbvvvvvvvvvvvvvvvvassssssssssss
mmmmmmmmmmnnnnnnnbvvvvvvvvvvvvvvvvassssssssssss
mnbvcxz
mnbvcxz
mnbvcxz
mnbvcxz
mmmmmmmmmmnnnnnnnbvvvvvvvvvvvvvvvvassssssssssss
mnbvcxz
makjshsgyeadnaskdsakdnlkasn
mmmmmmmmmmnnnnnnnbvvvvvvvvvvvvvvvvassssssssssss
mnbvcxz
mnbvcxz
mnbvcxz
mnbvcxz
mnbvcxz
