**Discussion of beam.GroupBy**
- reference link: https://beam.apache.org/documentation/transforms/python/aggregation/groupby/

In [2]:
import apache_beam as beam


# beam.GroupBy() with a single custom key: the callable is evaluated on every
# element and elements that yield the same key are collected together.
# Here the key is the first letter of each fruit name; each printed result is
# a pair of the key and the elements that share it.
with beam.Pipeline() as p:
  fruits = p | beam.Create(
      ['strawberry', 'raspberry', 'blueberry', 'blackberry', 'banana'])
  by_first_letter = fruits | beam.GroupBy(lambda s: s[0])
  grouped = by_first_letter | beam.Map(print)
      

('s', ['strawberry'])
('r', ['raspberry'])
('b', ['blueberry', 'blackberry', 'banana'])


In [5]:
# beam.GroupBy() with several keys. Every keyword argument names one key field
# and supplies the callable that computes it, so elements are grouped by the
# combination (letter, is_berry).
with beam.Pipeline() as p:
    key_fields = {
        'letter': lambda s: s[0],            # first character of the name
        'is_berry': lambda s: 'berry' in s,  # True if the name contains "berry"
    }
    grouped = (
        p
        | beam.Create(
            ['strawberry', 'raspberry', 'blueberry', 'blackberry', 'banana'])
        | beam.GroupBy(**key_fields)
        | beam.Map(print)
    )

(BeamSchema_8a175560_cf66_42d4_b20d_31a1df12b01d(letter='s', is_berry=True), ['strawberry'])
(BeamSchema_8a175560_cf66_42d4_b20d_31a1df12b01d(letter='r', is_berry=True), ['raspberry'])
(BeamSchema_8a175560_cf66_42d4_b20d_31a1df12b01d(letter='b', is_berry=True), ['blueberry', 'blackberry'])
(BeamSchema_8a175560_cf66_42d4_b20d_31a1df12b01d(letter='b', is_berry=False), ['banana'])


In [7]:
# Shared sample data: one beam.Row per (recipe, fruit) line of a grocery list.
# Each tuple below is (recipe, fruit, quantity, unit_price).
GROCERY_LIST = [
    beam.Row(
        recipe=recipe, fruit=fruit, quantity=quantity, unit_price=unit_price)
    for recipe, fruit, quantity, unit_price in [
        ('pie', 'strawberry', 3, 1.50),
        ('pie', 'raspberry', 1, 3.50),
        ('pie', 'blackberry', 1, 4.00),
        ('pie', 'blueberry', 1, 2.00),
        ('muffin', 'blueberry', 2, 2.00),
        ('muffin', 'banana', 3, 1.00),
    ]
]

# Passing a field name (a string) to beam.GroupBy() groups the rows by the
# value of that field; here every row is collected under its recipe.
with beam.Pipeline() as p:
  groceries = p | beam.Create(GROCERY_LIST)
  grouped = groceries | beam.GroupBy('recipe') | beam.Map(print)

('pie', [BeamSchema_61bc2c04_ac8a_43bb_b2c1_32890c046053(recipe='pie', fruit='strawberry', quantity=3, unit_price=1.5), BeamSchema_61bc2c04_ac8a_43bb_b2c1_32890c046053(recipe='pie', fruit='raspberry', quantity=1, unit_price=3.5), BeamSchema_61bc2c04_ac8a_43bb_b2c1_32890c046053(recipe='pie', fruit='blackberry', quantity=1, unit_price=4.0), BeamSchema_61bc2c04_ac8a_43bb_b2c1_32890c046053(recipe='pie', fruit='blueberry', quantity=1, unit_price=2.0)])
('muffin', [BeamSchema_61bc2c04_ac8a_43bb_b2c1_32890c046053(recipe='muffin', fruit='blueberry', quantity=2, unit_price=2.0), BeamSchema_61bc2c04_ac8a_43bb_b2c1_32890c046053(recipe='muffin', fruit='banana', quantity=3, unit_price=1.0)])


In [10]:
# A field name and a computed key can be combined: rows are grouped by the
# existing `recipe` field together with a derived boolean `is_berry`.
with beam.Pipeline() as p:
  rows = p | beam.Create(GROCERY_LIST)
  by_recipe_and_berry = rows | beam.GroupBy(
      'recipe', is_berry=lambda row: 'berry' in row.fruit)
  grouped = by_recipe_and_berry | beam.Map(print)

(BeamSchema_5a65a388_5d0a_40dc_9fe7_86b3a1a81be8(recipe='pie', is_berry=True), [BeamSchema_27b5b7bf_ba0f_469d_844e_af7715c09c9f(recipe='pie', fruit='strawberry', quantity=3, unit_price=1.5), BeamSchema_27b5b7bf_ba0f_469d_844e_af7715c09c9f(recipe='pie', fruit='raspberry', quantity=1, unit_price=3.5), BeamSchema_27b5b7bf_ba0f_469d_844e_af7715c09c9f(recipe='pie', fruit='blackberry', quantity=1, unit_price=4.0), BeamSchema_27b5b7bf_ba0f_469d_844e_af7715c09c9f(recipe='pie', fruit='blueberry', quantity=1, unit_price=2.0)])
(BeamSchema_5a65a388_5d0a_40dc_9fe7_86b3a1a81be8(recipe='muffin', is_berry=True), [BeamSchema_27b5b7bf_ba0f_469d_844e_af7715c09c9f(recipe='muffin', fruit='blueberry', quantity=2, unit_price=2.0)])
(BeamSchema_5a65a388_5d0a_40dc_9fe7_86b3a1a81be8(recipe='muffin', is_berry=False), [BeamSchema_27b5b7bf_ba0f_469d_844e_af7715c09c9f(recipe='muffin', fruit='banana', quantity=3, unit_price=1.0)])


**Aggregation with beam.GroupBy**

In [11]:
# beam.GroupBy("group_key").aggregate_field("target_field", combine_fn, "result_field")
# computes one aggregated value per group:
#   target_field - the field whose values are aggregated (here 'quantity')
#   combine_fn   - how the values are combined (here the builtin sum)
#   result_field - name of the field holding the result (here 'total_quantity')
with beam.Pipeline() as p:
  total_quantity_per_fruit = beam.GroupBy('fruit').aggregate_field(
      'quantity', sum, 'total_quantity')
  grouped = (
      p
      | beam.Create(GROCERY_LIST)
      | total_quantity_per_fruit
      | beam.Map(print)
  )

Result(fruit='strawberry', total_quantity=3)
Result(fruit='raspberry', total_quantity=1)
Result(fruit='blackberry', total_quantity=1)
Result(fruit='blueberry', total_quantity=3)
Result(fruit='banana', total_quantity=3)


In [13]:
# Chaining .aggregate_field() calls computes several aggregates per group in
# one result row. The aggregated value may be a field name ('quantity') or a
# callable evaluated on each row (quantity * unit_price).
with beam.Pipeline() as p:
  summarize_recipe = (
      beam.GroupBy('recipe')
      .aggregate_field('quantity', sum, 'total_quantity')
      .aggregate_field(
          lambda row: row.quantity * row.unit_price, sum, 'price'))
  grouped = (
      p
      | beam.Create(GROCERY_LIST)
      | summarize_recipe
      | beam.Map(print)
  )

Result(recipe='pie', total_quantity=6, price=14.0)
Result(recipe='muffin', total_quantity=5, price=7.0)


In [14]:
from apache_beam.transforms.combiners import MeanCombineFn

# Aggregations are not limited to plain functions such as min/max: a Beam
# CombineFn (here MeanCombineFn) can be passed as well.
# beam.GroupBy() is called without any key, and the printed result is a single
# row summarizing unit_price over the whole grocery list.
with beam.Pipeline() as p:
  price_stats = (
      beam.GroupBy()
      .aggregate_field('unit_price', min, 'min_price')
      .aggregate_field('unit_price', MeanCombineFn(), 'mean_price')
      .aggregate_field('unit_price', max, 'max_price'))
  grouped = (
      p
      | beam.Create(GROCERY_LIST)
      | price_stats
      | beam.Map(print)
  )

Result(min_price=1.0, mean_price=2.3333333333333335, max_price=4.0)


**Discussion of beam.GroupByKey**
- reference link: https://beam.apache.org/documentation/transforms/python/aggregation/groupbykey/

In [15]:
import apache_beam as beam

# beam.GroupByKey() applies when every element is already a (key, value)
# tuple: the first item of the tuple is the grouping key, so no key function
# has to be supplied.
with beam.Pipeline() as pipeline:
  season_produce = [
      ('spring', '🍓'),
      ('spring', '🥕'),
      ('spring', '🍆'),
      ('spring', '🍅'),
      ('summer', '🥕'),
      ('summer', '🍅'),
      ('summer', '🌽'),
      ('fall', '🥕'),
      ('fall', '🍅'),
      ('winter', '🍆'),
  ]
  keyed = pipeline | 'Create produce counts' >> beam.Create(season_produce)
  per_season = keyed | 'Group counts per produce' >> beam.GroupByKey()
  # Sort the values of each group before printing.
  sorted_groups = per_season | beam.MapTuple(
      lambda season, items: (season, sorted(items)))
  produce_counts = sorted_groups | beam.Map(print)

('spring', ['🍅', '🍆', '🍓', '🥕'])
('summer', ['🌽', '🍅', '🥕'])
('fall', ['🍅', '🥕'])
('winter', ['🍆'])


**Discussion of beam.GroupIntoBatches**
- reference link: https://beam.apache.org/documentation/transforms/python/aggregation/groupintobatches/

In [17]:
import apache_beam as beam

# beam.GroupIntoBatches(n) takes (key, value) tuples and, for each key, emits
# the values in batches holding at most n values (n = 3 below), so a key with
# more than n values produces several output elements.
with beam.Pipeline() as pipeline:
  season_produce = [
      ('spring', '🍓'),
      ('spring', '🥕'),
      ('spring', '🍆'),
      ('spring', '🍅'),
      ('summer', '🥕'),
      ('summer', '🍅'),
      ('summer', '🌽'),
      ('fall', '🥕'),
      ('fall', '🍅'),
      ('winter', '🍆'),
  ]
  keyed = pipeline | 'Create produce' >> beam.Create(season_produce)
  batched = keyed | 'Group into batches' >> beam.GroupIntoBatches(3)
  batches_with_keys = batched | beam.Map(print)

('spring', ['🍓', '🥕', '🍆'])
('summer', ['🥕', '🍅', '🌽'])
('spring', ['🍅'])
('fall', ['🥕', '🍅'])
('winter', ['🍆'])


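The batching step above is `| 'Group into batches' >> beam.GroupIntoBatches(3)`: values are batched per key with at most 3 values per batch, which is why `spring` (4 values) appears twice in the output.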


In [18]:
import apache_beam as beam

# beam.CoGroupByKey() joins several keyed PCollections on their keys.
# The inputs are passed as a dict; its keys ("icons", "durations") become the
# field names of the per-key result, and a key missing from one input gets an
# empty list for that field.
with beam.Pipeline() as pipeline:
  icon_data = [
      ('Apple', '🍎'),
      ('Apple', '🍏'),
      ('Eggplant', '🍆'),
      ('Tomato', '🍅'),
  ]
  duration_data = [
      ('Apple', 'perennial'),
      ('Carrot', 'biennial'),
      ('Tomato', 'perennial'),
      ('Tomato', 'annual'),
  ]

  icon_pairs = pipeline | 'Create icons' >> beam.Create(icon_data)
  duration_pairs = pipeline | 'Create durations' >> beam.Create(duration_data)

  tagged_inputs = {'icons': icon_pairs, 'durations': duration_pairs}
  merged = tagged_inputs | 'Merge' >> beam.CoGroupByKey()
  plants = merged | beam.Map(print)

('Apple', {'icons': ['🍎', '🍏'], 'durations': ['perennial']})
('Eggplant', {'icons': ['🍆'], 'durations': []})
('Tomato', {'icons': ['🍅'], 'durations': ['perennial', 'annual']})
('Carrot', {'icons': [], 'durations': ['biennial']})
