In [None]:
!pip3 install apache_beam



In [None]:
import apache_beam as beam

def SplitRow(element):
  """Break one comma-delimited text line into a list of its fields.

  Args:
    element: A string; it is split on every ',' with no trimming of
      surrounding whitespace.

  Returns:
    The list of substrings produced by ``str.split(',')``.
  """
  fields = element.split(',')
  return fields
 
  
def filter_on_count(element):
  """Keep a two-item ``(name, count)`` element only when count exceeds 30.

  Args:
    element: A two-item sequence; the second item is compared against 30.

  Returns:
    The element itself when its count is greater than 30, otherwise None.
    The call sites in this file pass it to ``beam.Filter`` as the predicate.
  """
  _, total = element
  return element if total > 30 else None
  
def format_output(element):
  """Turn a ``(name, count)`` pair into the three-field output record.

  Args:
    element: A two-item sequence of a string name and a count.

  Returns:
    A tuple of the name encoded to ASCII bytes, the count as a string, and
    the fixed tag 'Experienced employee'.

  Raises:
    UnicodeEncodeError: If the name contains non-ASCII characters.
  """
  label, total = element
  ascii_label = label.encode('ascii')
  return (ascii_label, str(total), 'Experienced employee')

## Using normal transforms such as:



1.   Map
2.   Filter
3.   CombinePerKey

Repeating these transforms separately for each of the three departments requires more code, more memory, and more time.


In [None]:
# Deliberately verbose version: each department branch spells out the same
# sum / filter / format steps, to contrast with the composite transform
# introduced later in the notebook.
#
# Every transform label is unique and department-specific because Beam
# rejects a second transform applied with a label that already exists in
# the same pipeline.
with beam.Pipeline() as p:
  # Shared source: each line of the text file, split on ','.
  input_data = (
      p
      | "Read from text file" >> beam.io.ReadFromText('dept_data.txt')
      | "Split rows" >> beam.Map(SplitRow)
  )

  # record[3] is compared with the department name and record[1] is used
  # as the person part of the key (presumably the employee name; confirm
  # against the layout of dept_data.txt).
  accounts_count = (
      input_data
      | 'Get all Accounts dept persons' >> beam.Filter(lambda record: record[3] == 'Accounts')
      | 'Pair each Accounts employee with 1' >> beam.Map(lambda record: ("Accounts, " + record[1], 1))
      | 'Group and sum Accounts' >> beam.CombinePerKey(sum)
      | 'Count filter Accounts' >> beam.Filter(filter_on_count)
      | 'Format Accounts employee' >> beam.Map(format_output)
      | 'Write results for Accounts' >> beam.io.WriteToText('data/Account_quick')
  )

  hr_count = (
      input_data
      | 'Get all HR dept persons' >> beam.Filter(lambda record: record[3] == 'HR')
      | 'Pair each HR employee with 1' >> beam.Map(lambda record: ("HR, " + record[1], 1))
      | 'Group and sum HR' >> beam.CombinePerKey(sum)
      | 'Count filter HR' >> beam.Filter(filter_on_count)
      | 'Format HR employee' >> beam.Map(format_output)
      | 'Write results for HR' >> beam.io.WriteToText('data/HR_quick')
  )

  finance_count = (
      input_data
      | 'Get all Finance dept persons' >> beam.Filter(lambda record: record[3] == 'Finance')
      | 'Pair each Finance employee with 1' >> beam.Map(lambda record: ("Finance, " + record[1], 1))
      | 'Group and sum Finance' >> beam.CombinePerKey(sum)
      | 'Count filter Finance' >> beam.Filter(filter_on_count)
      | 'Format Finance employee' >> beam.Map(format_output)
      | 'Write results for Finance' >> beam.io.WriteToText('data/Finance_quick')
  )



In [None]:
!{('head -n 10 data/Account_quick-00000-of-00001')}

(b'Accounts, Marco', '31', 'Experienced employee')
(b'Accounts, Rebekah', '31', 'Experienced employee')
(b'Accounts, Itoe', '31', 'Experienced employee')
(b'Accounts, Edouard', '31', 'Experienced employee')
(b'Accounts, Kyle', '62', 'Experienced employee')
(b'Accounts, Kumiko', '31', 'Experienced employee')
(b'Accounts, Gaston', '31', 'Experienced employee')


In [None]:
!{('head -n 10 data/Finance_quick-00000-of-00001')}

(b'Finance, Kumiko', '31', 'Experienced employee')
(b'Finance, Wendy', '31', 'Experienced employee')
(b'Finance, Cristobal', '31', 'Experienced employee')
(b'Finance, Erika', '31', 'Experienced employee')
(b'Finance, Sebastien', '31', 'Experienced employee')
(b'Finance, Valerie', '31', 'Experienced employee')
(b'Finance, Dolly', '31', 'Experienced employee')
(b'Finance, Emily', '31', 'Experienced employee')
(b'Finance, Kaori', '31', 'Experienced employee')
(b'Finance, Hitomi', '31', 'Experienced employee')


## **Composite Transforms:**

* Transforms can have a nested structure, where a complex transform performs multiple simpler transforms (such as more than one ParDo, Combine, GroupByKey, or even other composite transforms). These transforms are called composite transforms.

In [None]:
class MyTransform(beam.PTransform):
  """Composite transform bundling the sum, count-filter, and format steps.

  Applied to a collection of keyed pairs, it sums the values per key, keeps
  the pairs accepted by ``filter_on_count``, and maps each survivor through
  ``format_output``.
  """

  def expand(self, input_coll):
    """Chain the three inner transforms onto ``input_coll``.

    Args:
      input_coll: The collection this composite is applied to; the call
        sites in this file pass ``(key, 1)`` pairs.

    Returns:
      The collection produced by the final formatting step.
    """
    summed = input_coll | 'Group and sum1' >> beam.CombinePerKey(sum)
    kept = summed | 'count filter accounts' >> beam.Filter(filter_on_count)
    return kept | 'Regular accounts employee' >> beam.Map(format_output)

In [None]:
# Composite version of the report: each branch only filters and keys its
# department, then hands the shared sum / filter / format work to MyTransform.
with beam.Pipeline() as p:
  # Shared source: each line of the text file, split on ','.
  input_data = (
      p
      | "Read from text file" >> beam.io.ReadFromText('dept_data.txt')
      | "Split rows" >> beam.Map(SplitRow)
  )

  # Accounts branch: rows whose fourth field equals 'Accounts'.
  accounts_count = (
      input_data
      | 'Get all Accounts dept persons' >> beam.Filter(lambda row: row[3] == 'Accounts')
      | 'Pair each accounts employee with 1' >> beam.Map(lambda row: ("Accounts, " + row[1], 1))
      | 'composite accounts' >> MyTransform()
      | 'Write results for account' >> beam.io.WriteToText('data/Account')
  )

  # Finance branch: rows whose fourth field equals 'Finance'.
  finance_count = (
      input_data
      | 'Get all Finance dept persons' >> beam.Filter(lambda row: row[3] == 'Finance')
      | 'Pair each Finance employee with 1' >> beam.Map(lambda row: ("Finance, " + row[1], 1))
      | 'composite Finance' >> MyTransform()
      | 'Write results for Finance' >> beam.io.WriteToText('data/Finance')
  )



In [None]:
!{('head -n 10 data/Account-00000-of-00001')}

(b'Accounts, Marco', '31', 'Regular employee')
(b'Accounts, Rebekah', '31', 'Regular employee')
(b'Accounts, Itoe', '31', 'Regular employee')
(b'Accounts, Edouard', '31', 'Regular employee')
(b'Accounts, Kyle', '62', 'Regular employee')
(b'Accounts, Kumiko', '31', 'Regular employee')
(b'Accounts, Gaston', '31', 'Regular employee')


In [38]:
!{('head -n 10 data/Finance-00000-of-00001')}

(b'Finance, Kumiko', '31', 'Regular employee')
(b'Finance, Wendy', '31', 'Regular employee')
(b'Finance, Cristobal', '31', 'Regular employee')
(b'Finance, Erika', '31', 'Regular employee')
(b'Finance, Sebastien', '31', 'Regular employee')
(b'Finance, Valerie', '31', 'Regular employee')
(b'Finance, Dolly', '31', 'Regular employee')
(b'Finance, Emily', '31', 'Regular employee')
(b'Finance, Kaori', '31', 'Regular employee')
(b'Finance, Hitomi', '31', 'Regular employee')
