<a href="https://colab.research.google.com/github/technoavengers/Apache_Beam/blob/main/apache_beam_ptransforms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install apache_beam

# ***Map PTransform***


***Map applies an operation to every record of a PCollection. It takes one record as input and produces exactly one record as output.***

(1,2,3,4,5)  =>  ***map***(lambda record : record*2) => (2,4,6,8,10)

In [None]:
import apache_beam as beam
# Read the customer file, split every CSV line into a list of fields, keep
# rows whose third field is 'NY' and whose fourth field parses to an int
# above 20, and write the surviving rows to files prefixed 'map'.
p1 = beam.Pipeline()
customers = (
    p1
    | beam.io.ReadFromText('data/Customers_age.txt')
    # Map emits exactly one output (the field list) per input line.
    | beam.Map(lambda record: record.split(','))
    | beam.Filter(lambda record: record[2] == 'NY' and int(record[3]) > 20)
    | beam.io.WriteToText('map')
)
# Block until the run has finished so the output shard exists before it is
# read back; run() alone is not guaranteed to wait on every runner.
p1.run().wait_until_finish()



In [None]:
!cat map-00000-of-00001

# ***FlatMap PTransform***

***FlatMap is like the Map PTransform in that it is applied to every record of a PCollection, but it can produce zero, one, or more records as output for each input record.***

1 2 3 <br>
4 5 6   => ***flatmap***(lambda record: record.split(' '))   => 1 2 3 4 5 6 7 8 9 <br>
7 8 9 <br>

In [None]:
import apache_beam as beam

words = ['peter', 'piper', 'pickled', 'picked', 'peck', 'pepper']


def FindWord(element):
    """Return True if ``element`` is one of the tracked ``words``, else False.

    Used as a ``beam.Filter`` predicate. The comparison is an exact,
    case-sensitive match against the module-level ``words`` list.
    """
    # Always return a real bool rather than falling through to None on a miss.
    return element in words

# Word-count over the tracked words: split each line on single spaces,
# keep only tokens accepted by FindWord, pair each with 1, sum per word,
# and write the (word, count) pairs to files prefixed 'flatmap'.
p1 = beam.Pipeline()

freq = (
    p1
    | beam.io.ReadFromText('data/Peter_Piper.txt')
    # FlatMap: every token of the line becomes its own element.
    | beam.FlatMap(lambda record: record.split(' '))
    | beam.Filter(FindWord)
    | beam.Map(lambda record: (record, 1))
    | beam.CombinePerKey(sum)
    | beam.io.WriteToText('flatmap')
)
# Block until the run has finished so the output shard exists before it is
# read back; run() alone is not guaranteed to wait on every runner.
p1.run().wait_until_finish()

In [None]:
# Mount Google Drive at /content/drive using Colab's helper module.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cat flatmap-00000-of-00001

# ***Flatten PTransform***

***The Flatten PTransform merges multiple PCollections into a single PCollection.***

In [None]:
import apache_beam as beam

# Three in-memory sources: two sets of integers and a tuple of names.
even = {2, 4, 6, 8}
odd = {1, 3, 5, 7, 9}
name = ('John', 'Jim', 'Mary')

p = beam.Pipeline()

# One PCollection per source, each created under its own label.
even_pc = p | 'Create Pcollection from Even number' >> beam.Create(even)
odd_pc = p | 'Create Pcollection from Odd Number' >> beam.Create(odd)
name_pc = p | 'Create Pcollection from Name' >> beam.Create(name)

# Flatten the three PCollections into one and print every element.
result = (
    (even_pc, odd_pc, name_pc)
    | beam.Flatten()
    | beam.Map(print)
)
p.run()

# ***Partition PTransform***

***The Partition PTransform divides the data into separate partitions based on some logic.***

In [None]:
import apache_beam as beam

# Pipeline for the Partition example and the integers 1-8 that it will split.
p = beam.Pipeline()
number = {1,2,3,4,5,6,7,8}

def partition_fn(element, num_partition):
    """Pick a partition index by parity: 0 for even elements, 1 otherwise.

    ``num_partition`` is part of the signature but is not consulted.
    """
    if element % 2 == 0:
        return 0
    return 1


# Split the numbers into two partitions with partition_fn:
# index 0 receives the even elements, index 1 receives the odd ones.
number_pc = p | beam.Create(number) | beam.Partition(partition_fn, 2)

# Index 1 is the second partition (odd numbers), so the label says exactly that.
number_pc[1] | 'Printing second partition (odd numbers)' >> beam.Map(print)

p.run()

## ***Composite PTransform***

***Using a composite PTransform, you can combine multiple PTransforms into one and apply them as a single PTransform.***

***Without Using Composite***

In [None]:
import apache_beam as beam
# New pipeline; the US and India branches built below are both attached to it.
p1 = beam.Pipeline()

def SplitRow(input_element):
    """Split one comma-separated line into a list of its fields."""
    fields = input_element.split(',')
    return fields

def FilterBasedonCountry(countryName, input_element):
    """Tell whether the record's second field equals ``countryName``."""
    record_country = input_element[1]
    return record_country == countryName

def CalculateSum(elem):
    """Return ``(elem[0], total)``, where total adds fields 2, 3 and 4 as ints."""
    total = sum(int(elem[position]) for position in (2, 3, 4))
    return elem[0], total

def FormatText(elem):
    """Render a ``(name, total)`` pair as a sentence.

    ``elem[0]`` must be a string because it is concatenated directly;
    ``elem[1]`` is converted with ``str``.
    """
    return elem[0] + ' has received ' + str(elem[1]) + ' marks'

# Shared source: read the marks file and split every line into fields.
input_collection = (
    p1
    | beam.io.ReadFromText('students_marks.txt')
    | beam.Map(SplitRow)
)

# US branch: keep US records, total the marks, format, and write them out.
US_pipeline = (
    input_collection
    | 'Filter US records' >> beam.Filter(
        lambda record: FilterBasedonCountry('US', record))
    | 'Calculate sum for US' >> beam.Map(CalculateSum)
    | 'Format output for US' >> beam.Map(FormatText)
    | 'Writing results to US File' >> beam.io.WriteToText('US_Result')
)

# India branch: the same steps for records whose country field is 'IN'.
India_pipeline = (
    input_collection
    | 'Filter India records' >> beam.Filter(
        lambda record: FilterBasedonCountry('IN', record))
    | 'Calculate sum for India' >> beam.Map(CalculateSum)
    | 'Format output for India' >> beam.Map(FormatText)
    | 'Writing results to India File' >> beam.io.WriteToText('IN_Result')
)

# Block until the run has finished so both result files exist before they
# are read back; run() alone is not guaranteed to wait on every runner.
p1.run().wait_until_finish()


In [None]:
!cat IN_Result-00000-of-00001

In [None]:
!cat US_Result-00000-of-00001

***Using Composite PTransform***

In [None]:
import apache_beam as beam
# New pipeline for the composite-transform version of the US/India branches.
p1 = beam.Pipeline()

class MyTransform(beam.PTransform):
    """Composite transform: apply CalculateSum, then FormatText, to every element."""

    def expand(self, input_col):
        # Two labelled Map steps chained one after the other.
        totals = input_col | 'Calculate Sum' >> beam.Map(CalculateSum)
        formatted = totals | 'Apply Formatting' >> beam.Map(FormatText)
        return formatted

def SplitRow(input_element):
    """Break a comma-delimited line into its list of column values."""
    columns = input_element.split(',')
    return columns

def FilterBasedonCountry(countryName, input_element):
    """Return True when the element's field at index 1 equals ``countryName``."""
    if input_element[1] == countryName:
        return True
    return False

def CalculateSum(elem):
    """Pair the first field with the integer sum of the fields at indexes 2-4."""
    key = elem[0]
    first = int(elem[2])
    second = int(elem[3])
    third = int(elem[4])
    return key, first + second + third

def FormatText(elem):
    """Render a ``(name, total)`` pair as a sentence.

    ``elem[0]`` must be a string because it is concatenated directly;
    ``elem[1]`` is converted with ``str``.
    """
    return elem[0] + ' has received ' + str(elem[1]) + ' marks'

# Shared source: read the marks file and split every line into fields.
input_collection = (
    p1
    | beam.io.ReadFromText('students_marks.txt')
    | beam.Map(SplitRow)
)

# US branch: filter by country, then run the composite transform and write.
US_pipeline = (
    input_collection
    | 'Filter US records' >> beam.Filter(
        lambda record: FilterBasedonCountry('US', record))
    | "Composite Transformation for US" >> MyTransform()
    | 'Writing results to US File' >> beam.io.WriteToText('US_Composite')
)

# India branch: the same steps for records whose country field is 'IN'.
India_pipeline = (
    input_collection
    | 'Filter India records' >> beam.Filter(
        lambda record: FilterBasedonCountry('IN', record))
    | "Composite Transformation for IN" >> MyTransform()
    | 'Writing results to India File' >> beam.io.WriteToText('IN_Composite')
)

# Block until the run has finished so both result files exist before they
# are read back; run() alone is not guaranteed to wait on every runner.
p1.run().wait_until_finish()

In [None]:
!cat IN_Composite-00000-of-00001

In [None]:
!cat US_Composite-00000-of-00001

# ***CoGroupByKey***

***CoGroupByKey is used to join two or more keyed PCollections on a common key.***

In [None]:
import apache_beam as beam

# Keyed inputs: (movie id, title) pairs and (movie id, rating) pairs.
movie_name = [(1, 'SpiderMan'), (2, 'Avenger'), (3, 'Titanic'), (4, 'Green Miles')]
movies_rating = [(1, 3.5), (2, 4), (1, 4.5), (3, 3.5), (2, 4.5)]

p = beam.Pipeline()

name = p | 'Create Name Pcollection' >> beam.Create(movie_name)
ratings = p | 'Create Rating Pcollection' >> beam.Create(movies_rating)

# Group both inputs by key under the tags 'movie_name' and 'movie_rating',
# then print each grouped element.
joinedResult = (
    {'movie_name': name, 'movie_rating': ratings}
    | beam.CoGroupByKey()
    | beam.Map(print)
)

p.run()

# ***ParDo PTransform***

***ParDo is a general-purpose PTransform in which you can write custom processing code. It can also be used in place of existing PTransforms such as Map and Filter.***

In [None]:
import apache_beam as beam
# New pipeline for the Map/Filter version of the customer filter built below.
p1 = beam.Pipeline()

class SplitRow(beam.DoFn):
    """DoFn that splits one comma-separated line into a list of fields."""

    def process(self, element):
        fields = element.split(',')
        # Returned inside a one-item list: the field list itself is the item.
        return [fields]

class FilterCustomer(beam.DoFn):
    """DoFn that passes through records with field 2 == 'NY' and int(field 3) > 20."""

    def process(self, element):
        # Guard clause: anything that fails either test produces no result.
        if element[2] != 'NY' or int(element[3]) <= 20:
            return None
        return [element]


# Same customer filter expressed with the built-in Map and Filter transforms,
# written step by step instead of as one chained expression.
lines = p1 | beam.io.ReadFromText('Customers_age.txt')
rows = lines | beam.Map(lambda record: record.split(','))
matching_rows = rows | beam.Filter(
    lambda record: record[2] == 'NY' and int(record[3]) > 20
)
customers = matching_rows | beam.io.WriteToText('result')
p1.run()



In [None]:
import apache_beam as beam
# New pipeline for the ParDo version of the customer filter built below.
p1 = beam.Pipeline()

class SplitRow(beam.DoFn):
    """Split each incoming comma-separated line into its fields."""

    def process(self, element):
        # A single-item list whose only item is the list of fields.
        outputs = [element.split(',')]
        return outputs

class FilterCustomer(beam.DoFn):
    """Keep only records whose field 2 is 'NY' and whose field 3, as an int, exceeds 20."""

    def process(self, element):
        if element[2] == 'NY':
            if int(element[3]) > 20:
                return [element]
        # Falling through means the record is dropped.
        return None


# Customer filter expressed with ParDo and the two DoFn classes,
# written step by step instead of as one chained expression.
lines = p1 | beam.io.ReadFromText('Customers_age.txt')
rows = lines | beam.ParDo(SplitRow())
matching_rows = rows | beam.ParDo(FilterCustomer())
customers = matching_rows | beam.io.WriteToText('result')
p1.run()