In [1]:
import apache_beam as beam

# beam.ParDo:
    # ParDo takes a class instance (a beam.DoFn subclass) as input, rather than a plain function as Map or FlatMap do.
    # With beam.ParDo we can write our own custom processing class or function.

# Example of building a beam.DoFn class
class SplitWords(beam.DoFn):
  """DoFn that breaks each incoming string into delimiter-separated pieces.

  Args:
    delimiter: Separator handed to ``str.split``; a comma unless overridden.
  """

  def __init__(self, delimiter=','):
    # Kept on the instance so `process` can read it for every element.
    self.delimiter = delimiter

  def process(self, text):
    """Yields each piece of `text`, one output element per piece."""
    yield from text.split(self.delimiter)

with beam.Pipeline() as pipeline:
  # Source: two comma-separated lines of produce names.
  plants = pipeline | 'Gardening plants' >> beam.Create([
      '🍓Strawberry,🥕Carrot,🍆Eggplant',
      '🍅Tomato,🥔Potato',
  ])
  # Fan each line out into one element per name using the custom DoFn.
  plants = plants | 'Split words' >> beam.ParDo(SplitWords(','))
  # Print every resulting element.
  plants = plants | beam.Map(print)



🍓Strawberry
🥕Carrot
🍆Eggplant
🍅Tomato
🥔Potato


In [2]:
import apache_beam as beam

# Problem setting:
    # Elements (pvalues) do not come with an event-time timestamp of their own, so we need to associate a timestamp with each element.
    # Inside a beam.DoFn, the `timestamp` and `window` parameters refer to the timestamp and the window of the element being processed.

# Requirement:
    # For each pvalue in the PCollection we have to know:
        # the timestamp associated with that element (event time)
        # which window that pvalue belongs to

class AnalyzeElement(beam.DoFn):
  """DoFn that describes the timestamp and window attached to an element."""

  def process(
      self,
      elem,
      timestamp=beam.DoFn.TimestampParam,
      window=beam.DoFn.WindowParam):
    """Yields one multi-line text report for the element being processed.

    Args:
      elem: The input element; it is not used in the report.
      timestamp: Declared with the `beam.DoFn.TimestampParam` default so the
        element's timestamp is available under this name.
      window: Declared with the `beam.DoFn.WindowParam` default so the
        element's window is available under this name.

    Yields:
      A single string with a "# timestamp" section and a "# window" section
      separated by an empty line.
    """
    timestamp_section = [
        '# timestamp',
        f'type(timestamp) -> {type(timestamp)!r}',
        f'timestamp.micros -> {timestamp.micros!r}',
        f'timestamp.to_rfc3339() -> {timestamp.to_rfc3339()!r}',
        f'timestamp.to_utc_datetime() -> {timestamp.to_utc_datetime()!r}',
    ]

    # Each window bound is shown as its own text form followed by the UTC
    # datetime equivalent in parentheses.
    first_instant = window.start
    end_instant = window.end
    last_instant = window.max_timestamp()
    window_section = [
        '# window',
        f'type(window) -> {type(window)!r}',
        f'window.start -> {first_instant} ({first_instant.to_utc_datetime()})',
        f'window.end -> {end_instant} ({end_instant.to_utc_datetime()})',
        f'window.max_timestamp() -> {last_instant} '
        f'({last_instant.to_utc_datetime()})',
    ]

    # The empty string in the middle becomes the blank separator line.
    yield '\n'.join(timestamp_section + [''] + window_section)

with beam.Pipeline() as pipeline:
  # A single element keeps the output down to one report.
  dofn_params = (
      pipeline | 'Create a single test element' >> beam.Create([':)']))
  # Attach an explicit event time; 1584675660 is in Unix seconds
  # (2020-03-20T03:41:00Z).
  dofn_params = dofn_params | 'Add timestamp (Spring equinox 2020)' >> beam.Map(
      lambda elem: beam.window.TimestampedValue(elem, 1584675660))
  # Assign the element to a 30-second fixed window based on that timestamp.
  dofn_params = dofn_params | 'Fixed 30sec windows' >> beam.WindowInto(
      beam.window.FixedWindows(30))
  # Produce the timestamp/window report, then print it.
  dofn_params = dofn_params | 'Analyze element' >> beam.ParDo(AnalyzeElement())
  dofn_params = dofn_params | beam.Map(print)

# timestamp
type(timestamp) -> <class 'apache_beam.utils.timestamp.Timestamp'>
timestamp.micros -> 1584675660000000
timestamp.to_rfc3339() -> '2020-03-20T03:41:00Z'
timestamp.to_utc_datetime() -> datetime.datetime(2020, 3, 20, 3, 41)

# window
type(window) -> <class 'apache_beam.transforms.window.IntervalWindow'>
window.start -> Timestamp(1584675660) (2020-03-20 03:41:00)
window.end -> Timestamp(1584675690) (2020-03-20 03:41:30)
window.max_timestamp() -> Timestamp(1584675689.999999) (2020-03-20 03:41:29.999999)
