# Intro

In [1]:
# %pip installs into the running kernel's environment; the version is pinned to
# the release this notebook was executed with so that re-runs are reproducible.
%pip install apache-beam==2.61.0

Collecting apache-beam
  Downloading apache_beam-2.61.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting crcmod<2.0,>=1.7 (from apache-beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.2,>=0.3.1.1 (from apache-beam)
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cloudpickle~=2.2.1 (from apache-beam)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting fastavro<2,>=0.23.6 (from apache-beam)
  Downloading fastavro-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting fasteners<1.0,>=0.3 (from apache-beam)
  D

In [2]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.core import DoFn

In [3]:
# Create the local `data` directory that the pipeline cells below use as the
# prefix for their output files; -p keeps the command safe to re-run.
!mkdir -p data

In [4]:
# Colab-only helper: prompts for a file upload from the local machine. The
# captured output shows 005_dept_data.txt being saved, which is the file the
# later cells read.
# NOTE(review): `uploaded` is not referenced by any cell visible here.
from google.colab import files
uploaded = files.upload()

Saving 005_dept_data.txt to 005_dept_data.txt


In [5]:
import pandas as pd

# Labels for the five fields of the header-less input file.
columns = ['EmployeeID', 'Name', 'DepartmentID', 'Department', 'StartDate']

# Load the comma-separated records and attach the labels above, then preview
# the first rows.
df = pd.read_csv(
    '005_dept_data.txt',
    sep=',',
    names=columns,
    header=None,
)
df.head()

Unnamed: 0,EmployeeID,Name,DepartmentID,Department,StartDate
0,149633CM,Marco,10,Accounts,1-01-2019
1,212539MU,Rebekah,10,Accounts,1-01-2019
2,231555ZZ,Itoe,10,Accounts,1-01-2019
3,503996WI,Edouard,10,Accounts,1-01-2019
4,704275DC,Kyle,10,Accounts,1-01-2019


In [6]:
# describe() summarises numeric columns by default, which is why only
# DepartmentID shows up in the result.
df.describe()

Unnamed: 0,DepartmentID
count,898.0
mean,20.356347
std,8.088555
min,10.0
25%,10.0
50%,20.0
75%,30.0
max,30.0


# Code

In [20]:
class SplitRow(beam.DoFn):
    """Turn one comma-separated text line into a list of its fields."""

    def process(self, element):
        # Emit exactly one output per input line: the list of raw field strings.
        fields = element.split(',')
        yield fields

class FilterAccountsEmployee(beam.DoFn):
    """Pass through only the rows whose department field is ``'Accounts'``.

    Each element is expected to be a field list such as the one produced by
    ``SplitRow``; the department name is read from index 3.
    """

    def process(self, element):
        # A row with fewer than four fields (e.g. a blank line in the input)
        # cannot be an Accounts row; drop it instead of raising IndexError,
        # which would abort the whole pipeline.
        if len(element) > 3 and element[3] == 'Accounts':
            yield element

class PairEmployees(beam.DoFn):
    """Map a row to a ``('<department>: <name>', 1)`` pair ready for counting."""

    def process(self, element):
        department = element[3]
        name = element[1]
        # NOTE(review): the key is built from department and name only, so two
        # employees who share a name in one department would be counted
        # together -- confirm whether the EmployeeID (index 0) should be part
        # of the key.
        label = department + ": " + name
        yield (label, 1)

class Counting(beam.DoFn):
    """Collapse a grouped ``(key, values)`` pair into ``(key, sum_of_values)``."""

    def process(self, element):
        label, ones = element
        total = sum(ones)
        yield (label, total)

def run():
    """Build and execute the per-employee row count for the Accounts department.

    Reads ``005_dept_data.txt``, keeps the rows whose department is
    ``'Accounts'``, counts the rows per ``'<department>: <name>'`` key and
    writes the ``(key, count)`` pairs to text files prefixed with
    ``data/output_new_final``.
    """
    # Leaving the ``with`` block runs the pipeline and waits for it to finish,
    # so the output files are complete by the time this function returns.
    with beam.Pipeline() as pipeline:
        (
            pipeline
            | 'ReadData' >> beam.io.ReadFromText('005_dept_data.txt')
            | 'SplitRows' >> beam.ParDo(SplitRow())
            | 'FilterAccounts' >> beam.ParDo(FilterAccountsEmployee())
            | 'PairEmployees' >> beam.ParDo(PairEmployees())
            | 'GroupByKey' >> beam.GroupByKey()
            | 'CountOccurrences' >> beam.ParDo(Counting())
            | 'WriteOutput' >> beam.io.WriteToText('data/output_new_final')
        )

# Inside a notebook kernel __name__ is '__main__', so executing this cell runs
# the pipeline immediately.
if __name__ == '__main__':
    run()



In [21]:
# Preview the first lines of the output shard; the path is relative so it
# matches the 'data/output_new_final' prefix the pipeline writes to.
!head -n 20 data/output_new_final-00000-of-00001

('Accounts: Marco', 31)
('Accounts: Rebekah', 31)
('Accounts: Itoe', 31)
('Accounts: Edouard', 31)
('Accounts: Kyle', 62)
('Accounts: Kumiko', 31)
('Accounts: Gaston', 31)
('Accounts: Ayumi', 30)


In [29]:
class SplitRow(beam.DoFn):
    """Split a raw comma-separated line into its list of field strings."""

    def process(self, element):
        # One list per input line, e.g.
        # ['331593PS', 'Beryl', '20', 'HR', '1-01-2019'].
        parts = element.split(',')
        yield parts

class FilterAccountsEmployee(beam.DoFn):
    """Keep only the rows whose department field (index 3) is ``'Accounts'``.

    Elements are expected to be field lists such as
    ``['149633CM', 'Marco', '10', 'Accounts', '1-01-2019']``.
    """

    def process(self, element):
        # Rows too short to carry a department (e.g. from a blank input line)
        # are dropped rather than raising IndexError and failing the pipeline.
        if len(element) > 3 and element[3] == 'Accounts':
            yield element

class PairEmployees(beam.DoFn):
    """Emit ``('<department>: <name>', 1)`` for a row, e.g. ``('Accounts: Marco', 1)``."""

    def process(self, element):
        department = element[3]
        name = element[1]
        # NOTE(review): employees sharing a name within a department end up
        # under the same key -- confirm whether EmployeeID (index 0) should be
        # included.
        key = department + ": " + name
        yield (key, 1)

class Counting(beam.DoFn):
    """Sum the grouped values of one key.

    The element is a single ``(key, values)`` pair, e.g.
    ``('Accounts: Marco', [1, 1, 1, ...])``, and the output is
    ``(key, sum(values))``.
    """

    def process(self, element):
        label, ones = element
        total = sum(ones)
        yield (label, total)

def run():
    """Build and execute the per-employee row count for the Accounts department.

    Reads ``005_dept_data.txt``, keeps the rows whose department is
    ``'Accounts'``, counts the rows per ``'<department>: <name>'`` key and
    writes the ``(key, count)`` pairs to text files prefixed with
    ``data/output_test``. The trailing comments show an example element
    after each step.
    """
    # Leaving the ``with`` block runs the pipeline and waits for it to finish,
    # so the output files are complete by the time this function returns.
    with beam.Pipeline() as pipeline:
        (
            pipeline
            | 'ReadData' >> beam.io.ReadFromText('005_dept_data.txt')
            | 'SplitRows' >> beam.ParDo(SplitRow())  # ['331593PS', 'Beryl', '20', 'HR', '1-01-2019']
            | 'FilterAccounts' >> beam.ParDo(FilterAccountsEmployee())  # ['149633CM', 'Marco', '10', 'Accounts', '1-01-2019']
            | 'PairEmployees' >> beam.ParDo(PairEmployees())  # ('Accounts: Marco', 1)
            | 'GroupByKey' >> beam.GroupByKey()  # ('Accounts: Marco', [1, 1, ..., 1])
            | 'CountOccurrences' >> beam.ParDo(Counting())  # ('Accounts: Marco', 31)
            | 'WriteOutput' >> beam.io.WriteToText('data/output_test')
        )

# Inside a notebook kernel __name__ is '__main__', so executing this cell runs
# the pipeline immediately.
if __name__ == '__main__':
    run()



In [30]:
# Preview the first lines of the output shard; the path is relative so it
# matches the 'data/output_test' prefix the pipeline writes to.
!head -n 20 data/output_test-00000-of-00001

('Accounts: Marco', 31)
('Accounts: Rebekah', 31)
('Accounts: Itoe', 31)
('Accounts: Edouard', 31)
('Accounts: Kyle', 62)
('Accounts: Kumiko', 31)
('Accounts: Gaston', 31)
('Accounts: Ayumi', 30)
