In [1]:
!pip3 install apache_beam

Collecting apache_beam
  Downloading apache_beam-2.36.0-cp37-cp37m-manylinux2010_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 7.8 MB/s 
[?25hCollecting requests<3.0.0,>=2.24.0
  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.4 MB/s 
[?25hCollecting cloudpickle<3,>=2.0.0
  Downloading cloudpickle-2.0.0-py3-none-any.whl (25 kB)
Collecting fastavro<2,>=0.21.4
  Downloading fastavro-1.4.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 44.2 MB/s 
[?25hCollecting hdfs<3.0.0,>=2.1.0
  Downloading hdfs-2.6.0-py3-none-any.whl (33 kB)
Collecting orjson<4.0
  Downloading orjson-3.6.7-cp37-cp37m-manylinux_2_24_x86_64.whl (255 kB)
[K     |████████████████████████████████| 255 kB 53.5 MB/s 
Collecting proto-plus<2,>=1.7.1
  Downloading proto_plus-1.20.3-py3-none-any.whl (46 kB)
[K     |████████████████████████████████| 46 kB 2.3 MB/s 
Coll

In [1]:
import apache_beam as beam

## **GroupBy**:

*   Takes a collection of elements and produces a collection grouped by properties of those elements.
*   Unlike GroupByKey, the key is dynamically created from the elements themselves.





In [None]:
# Group a few fruit names by their first character and print every
# (key, [names...]) pair that comes out of the grouping.
with beam.Pipeline() as p:
  fruit_names = p | beam.Create(
      ['strawberry', 'raspberry', 'blueberry', 'blackberry', 'banana'])
  # The grouping key is computed from each element: its leading character.
  by_initial = fruit_names | beam.GroupBy(lambda fruit: fruit[0])
  grouped = by_initial | beam.Map(print)



('s', ['strawberry'])
('r', ['raspberry'])
('b', ['blueberry', 'blackberry', 'banana'])


## **GroupByKey**:

*   Takes a keyed collection of elements and produces a collection where each element consists of a key and all values associated with that key.



In [23]:
# Keyed input for the GroupByKey example. Every name appears twice: once with a
# [number, job title] list (the number is presumably an age) and once with an
# employment-status string.
person_profiles = [
    ("vignesh", [27, "engineer"]),
    ("neethu", [27, "developer"]),
    ("farooqui", [26, "data analyst"]),
    ("sai", [29, "web developer"]),
    ("tinkle", [28, "fullstack developer"]),
]

employment_status = [
    ("neethu", 'Employed'),
    ("sai", 'Unemployed'),
    ("tinkle", 'Employed'),
    ("farooqui", 'Employed'),
    ("vignesh", 'Unemployed'),
]

# Profiles first, then statuses, so element order matches the original list.
records = person_profiles + employment_status

In [24]:
# Group the (name, value) tuples in `records` by name and print each
# (name, [values...]) result.
# The step labels now describe the person records being processed; the previous
# labels talked about "produce counts", which this data does not contain.
with beam.Pipeline() as pipeline:
  # NOTE(review): the variable name is left unchanged in case another cell
  # refers to it; it holds the output of the final print step.
  produce_counts = (
      pipeline
      | 'Create person records' >> beam.Create(records)
      | 'Group records per person' >> beam.GroupByKey()
      | beam.Map(print))



('vignesh', [[27, 'engineer'], 'Unemployed'])
('neethu', [[27, 'developer'], 'Employed'])
('farooqui', [[26, 'data analyst'], 'Employed'])
('sai', [[29, 'web developer'], 'Unemployed'])
('tinkle', [[28, 'fullstack developer'], 'Employed'])


## **CoGroupByKey**:

*   Aggregates all input elements by their key and allows downstream processing to consume all values associated with the key. 
*   GroupByKey performs this operation over a single input collection, and thus over a single type of input values.
*   CoGroupByKey operates over multiple input collections. As a result, the result for each key groups the values associated with that key in each input collection (in the examples below, a dict keyed by the name given to each input).



In [8]:
# Join two keyed collections on student name and print, for every name, a dict
# holding the values found for that name in each input.
# NOTE(review): the 'icons' / 'durations' labels and tags look carried over from
# a different example; they are kept verbatim because the tags are part of the
# printed result.
with beam.Pipeline() as pipeline:
  # Student name paired with a place name.
  student_pairs = pipeline | 'Create icons' >> beam.Create([
      ('vignesh', 'bangalore'),
      ('khaula', 'hyderabad'),
      ('neethu', 'malapur'),
      ('sai', 'chennai'),
  ])

  # Student name paired with a [number, "PASS"/"FAIL"] list.
  student_result = pipeline | 'Create durations' >> beam.Create([
      ('vignesh', [15, "FAIL"]),
      ('khaula', [99, "PASS"]),
      ('neethu', [100, "PASS"]),
      ('sai', [37, "FAIL"]),
  ])

  # Each dict key tags one input and becomes a key of the per-name result dict.
  tagged_inputs = {'icons': student_pairs, 'durations': student_result}
  merged = tagged_inputs | 'Merge' >> beam.CoGroupByKey()
  plants = merged | beam.Map(print)



('vignesh', {'icons': ['bangalore'], 'durations': [[15, 'FAIL']]})
('khaula', {'icons': ['hyderabad'], 'durations': [[99, 'PASS']]})
('neethu', {'icons': ['malapur'], 'durations': [[100, 'PASS']]})
('sai', {'icons': ['chennai'], 'durations': [[37, 'FAIL']]})


In [12]:
# Join two keyed collections on student name: one carries each student's mark,
# the other the PASS/FAIL result. For every name the pipeline prints a dict with
# the values found under the 'Marks' and 'Result' tags.
# The step labels now match those tags; they previously read 'Create icons' and
# 'Create durations', which describe neither input.
with beam.Pipeline() as pipeline:
  student_pairs = pipeline | 'Create marks' >> beam.Create([
      ('vignesh', 15),
      ('khaula', 99),
      ('neethu', 100),
      ('sai', 37),
  ])

  student_result = pipeline | 'Create results' >> beam.Create([
      ('vignesh', "FAIL"),
      ('khaula', "PASS"),
      ('neethu', "PASS"),
      ('sai', "FAIL"),
  ])

  # Each dict key tags one input and becomes a key of the per-name result dict.
  # NOTE(review): the variable name is left unchanged in case another cell
  # refers to it; it holds the output of the final print step.
  plants = (
      {'Marks': student_pairs, 'Result': student_result}
      | 'Merge' >> beam.CoGroupByKey()
      | beam.Map(print))



('vignesh', {'Marks': [15], 'Result': ['FAIL']})
('khaula', {'Marks': [99], 'Result': ['PASS']})
('neethu', {'Marks': [100], 'Result': ['PASS']})
('sai', {'Marks': [37], 'Result': ['FAIL']})


## **GroupIntoBatches**:



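*   Batches the values of each key into lists of at most the desired batch size; a key with fewer values than the batch size yields a smaller batch.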



In [None]:
# Batch the produce emoji of each season key into lists of up to four items and
# print every (season, [items...]) batch.
with beam.Pipeline() as pipeline:
  seasonal_produce = pipeline | 'Create produce' >> beam.Create([
      ('spring', '🍓'), ('spring', '🥕'), ('spring', '🍆'), ('spring', '🍅'),
      ('summer', '🥕'), ('summer', '🍅'), ('summer', '🌽'),
      ('fall', '🥕'), ('fall', '🍅'),
      ('winter', '🍆'),
  ])
  # Batch size is 4. The original cell noted "#3, #2" next to it, presumably as
  # other sizes to try.
  batched = seasonal_produce | 'Group into batches' >> beam.GroupIntoBatches(4)
  batches_with_keys = batched | beam.Map(print)

('spring', ['🍓', '🥕', '🍆', '🍅'])
('summer', ['🥕', '🍅', '🌽'])
('fall', ['🥕', '🍅'])
('winter', ['🍆'])


