In [None]:


import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


class FilterTextFn(beam.DoFn):
  """A DoFn that passes through only elements whose key matches a regex.

  Each element is a 2-item pair; the first item is tested against `pattern`.
  Two counters, 'matched_words' and 'umatched_words', are incremented for
  kept and dropped elements respectively.
  """
  def __init__(self, pattern):
    """Stores `pattern` and creates the two counters."""
    super().__init__()
    self.pattern = pattern
    counter_namespace = self.__class__
    self.matched_words = Metrics.counter(counter_namespace, 'matched_words')
    self.umatched_words = Metrics.counter(counter_namespace, 'umatched_words')

  def process(self, element):
    """Yields `element` unchanged when its key matches, otherwise nothing.

    `re.match` only anchors at the start of the string, so a key that merely
    begins with a match of the pattern is kept as well.
    """
    key, _value = element
    if not re.match(self.pattern, key):
      # Dropped elements are logged at DEBUG level.
      logging.debug('Did not match %s', key)
      self.umatched_words.inc()
      return

    # `logging` is the module imported at the top of the file.
    logging.info('Matched %s', key)
    self.matched_words.inc()
    yield element

#define a PTransform class
class CountWords(beam.PTransform):
  """Counts how many times each word occurs.

  Takes a PCollection of text lines and produces a PCollection of
  (word, count) tuples.
  """
  def expand(self, pcoll):
    def count_ones(word_ones):
      # word_ones is (word, iterable of 1s) as produced by GroupByKey.
      token, occurrences = word_ones
      return (token, sum(occurrences))

    # A word is a run of ASCII letters and apostrophes.
    words = pcoll | 'split' >> (
        beam.FlatMap(
            lambda line: re.findall(r'[A-Za-z\']+', line))
        .with_output_types(str))
    word_one_pairs = words | 'pair_with_one' >> beam.Map(lambda w: (w, 1))

    # The count is done in two explicit steps (GroupByKey, then a Map that
    # sums each group) rather than with a single combining transform.
    grouped = word_one_pairs | 'group' >> beam.GroupByKey()
    return grouped | 'count' >> beam.Map(count_ones)


def run(argv=None, save_main_session=True):
  """Runs the debugging wordcount pipeline.

  Reads text from `--input`, counts words, keeps only the words matching
  'Flourish|stomach', checks the result with `assert_that`, and writes
  "word: count" lines to `--output`.

  Args:
    argv: Command-line arguments. `--input` and `--output` are parsed here;
      all remaining arguments are handed to `PipelineOptions`. When None,
      argparse falls back to `sys.argv`.
    save_main_session: Value assigned to `SetupOptions.save_main_session`.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  # The pipeline is run when the `with` block exits.
  with beam.Pipeline(options=pipeline_options) as p:

    filtered_words = (
        p | 'read' >> ReadFromText(known_args.input)
        | CountWords()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

    # NOTE(review): the expected counts are hard-coded, so this check can only
    # pass for an input that yields exactly these counts -- presumably the
    # default King Lear text; confirm before running with another --input.
    assert_that(filtered_words, equal_to([('Flourish', 3), ('stomach', 1)]))

    def format_result(word_count):
      """Formats a (word, count) pair as 'word: count'."""
      (word, count) = word_count
      return '%s: %s' % (word, count)

    # Applying the transforms registers them on the pipeline; the resulting
    # PCollection is not used afterwards, so it is not bound to a name.
    (
        filtered_words
        | 'format' >> beam.Map(format_result)
        | 'write' >> WriteToText(known_args.output))


if __name__ == '__main__':
  # Set the root logger to INFO so the logging.info() calls made by the
  # pipeline are emitted. Background on the logging interface:
  # https://docs.python.org/3/howto/logging.html#basic-logging-tutorial
  logging.getLogger().setLevel(logging.INFO)
  run()

In [3]:
# A function can read a global name as long as it never assigns to that name.
# This cell demonstrates the failing case: print_a() assigns to `a`, which
# makes `a` a local variable for the whole function body.
a = 1

def print_a():
    a = a + 1 # raises UnboundLocalError: the assignment makes `a` local, so the right-hand side reads it before it has a value
    print(a)
print_a()
# Not reached when the cell is run, because the call above raises.
print(a)

UnboundLocalError: local variable 'a' referenced before assignment

**Summary note on wordcount_logging**
- We use the `logging` package to log events while the pipeline processes data.
- `logging.getLogger().setLevel(LOG_LEVEL)` sets the minimum severity of the events that get logged.
- We can also emit our own log messages with `logging.info(message)` or `logging.warning(message)`.