In [2]:
!pip3 install apache_beam

Collecting apache_beam
  Downloading apache_beam-2.36.0-cp37-cp37m-manylinux2010_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 5.3 MB/s 
[?25hCollecting pymongo<4.0.0,>=3.8.0
  Downloading pymongo-3.12.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (508 kB)
[K     |████████████████████████████████| 508 kB 68.9 MB/s 
Collecting cloudpickle<3,>=2.0.0
  Downloading cloudpickle-2.0.0-py3-none-any.whl (25 kB)
Collecting orjson<4.0
  Downloading orjson-3.6.7-cp37-cp37m-manylinux_2_24_x86_64.whl (255 kB)
[K     |████████████████████████████████| 255 kB 52.6 MB/s 
Collecting hdfs<3.0.0,>=2.1.0
  Downloading hdfs-2.6.0-py3-none-any.whl (33 kB)
Collecting fastavro<2,>=0.21.4
  Downloading fastavro-1.4.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 40.7 MB/s 
[?25hCollecting proto-plus<2,>=1.7.1
  Downloading proto_plus-1.20.3-py3-none-any.whl (46 kB)
[K     |███████████████████████

In [1]:
import apache_beam as beam

In [3]:
!ls

sample_data  students.txt


## Map 

•	Applies a simple 1-to-1 mapping function over each element in the collection.

In [13]:
def strip_header_and_newline(text):
  return text.strip('# \n')

with beam.Pipeline() as pipeline:
  plants = (
      pipeline
      | 'Gardening plants' >> beam.Create([
          '# 🍓Strawberry\n',
          '# 🥕Carrot\n',
          '# 🍆Eggplant\n',
          '# 🍅Tomato\n',
          '# 🥔Potato\n',
      ])
      | 'Strip header' >> beam.Map(strip_header_and_newline)
      | beam.Map(print))



🍓Strawberry
🥕Carrot
🍆Eggplant
🍅Tomato
🥔Potato


**MapTuple** for key-value pairs. If your PCollection consists of (key, value) pairs, you can use MapTuple to unpack them into different function arguments.

In [None]:
# MapTuple for key-value pairs
with beam.Pipeline() as pipeline:
  plants = (
      pipeline
      | 'Gardening plants' >> beam.Create([
          ('🍓', 'Strawberry'),
          ('🥕', 'Carrot'),
          ('🍆', 'Eggplant'),
          ('🍅', 'Tomato'),
          ('🥔', 'Potato'),
      ])
      | 'Format' >> beam.MapTuple(lambda icon, plant: '{}{}'.format(icon, plant))
      | beam.Map(print))

## FlatMap

•	Applies a simple 1-to-many mapping function over each element in the collection. The many elements are flattened into the resulting collection.

In [None]:
def split_words(text):
  return text.split(',')

with beam.Pipeline() as pipeline:
  plants = (
      pipeline
      | 'Gardening plants' >> beam.Create([
          '🍓Strawberry,🥕Carrot,🍆Eggplant',
          '🍅Tomato,🥔Potato',
      ])
      | 'Split words' >> beam.FlatMap(split_words)
      | beam.Map(print))

**FlatMapTuple for key-value pairs**

If your PCollection consists of (key, value) pairs, you can use FlatMapTuple to unpack them into different function arguments.

In [None]:
def format_plant(icon, plant):
  if icon:
    yield '{}{}'.format(icon, plant)

with beam.Pipeline() as pipeline:
  plants = (
      pipeline
      | 'Gardening plants' >> beam.Create([
          ('🍓', 'Strawberry'),
          ('🥕', 'Carrot'),
          ('🍆', 'Eggplant'),
          ('🍅', 'Tomato'),
          ('🥔', 'Potato'),
          (None, 'Invalid'),
      ])
      | 'Format' >> beam.FlatMapTuple(format_plant)
      | beam.Map(print))

## Filter

Given a predicate, filter out all elements that don’t satisfy that predicate. May also be used to filter based on an inequality with a given value based on the comparison ordering of the element.

In [None]:
def is_perennial(plant):
  return plant['duration'] == 'perennial'

with beam.Pipeline() as pipeline:
  perennials = (
      pipeline
      | 'Gardening plants' >> beam.Create([
          {
              'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'
          },
          {
              'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'
          },
          {
              'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'
          },
          {
              'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'
          },
          {
              'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'
          },
      ])
      | 'Filter perennials' >> beam.Filter(is_perennial)
      | beam.Map(print))

In [None]:
#Filtering with multiple arguments

def has_duration(plant, duration):
  return plant['duration'] == duration

with beam.Pipeline() as pipeline:
  perennials = (
      pipeline
      | 'Gardening plants' >> beam.Create([
          {
              'icon': '🍓', 'name': 'Strawberry', 'duration': 'perennial'
          },
          {
              'icon': '🥕', 'name': 'Carrot', 'duration': 'biennial'
          },
          {
              'icon': '🍆', 'name': 'Eggplant', 'duration': 'perennial'
          },
          {
              'icon': '🍅', 'name': 'Tomato', 'duration': 'annual'
          },
          {
              'icon': '🥔', 'name': 'Potato', 'duration': 'perennial'
          },
      ])
      | 'Filter perennials' >> beam.Filter(has_duration, 'perennial')
      | beam.Map(print))

In [5]:
with beam.Pipeline() as pipeline:
  students = (
      pipeline
      |"Read from text" >> beam.io.ReadFromText("students.txt", skip_header_lines= True)
      |"spliting the record" >> beam.Map(lambda record : record.split(','))
      |"filtering the data with PASS" >> beam.Filter(lambda record : record[5]=="FAIL")
      |"Write to text" >> beam.io.WriteToText("result/pass_students")
  )




In [3]:
!ls

result	sample_data  students.txt


In [6]:
!{('head -n 10 result/pass_students-00000-of-00001')}

['1', 'vignesh', 'chn', '27', '15', 'FAIL']
['2', 'joey', 'us', '51', '20', 'FAIL']
['6', 'sree', 'koc', '25', '27', 'FAIL']
['9', 'tinkle', 'ker', '27', '9', 'FAIL']
