# Intro

In [1]:
!pip install apache-beam

Collecting apache-beam
  Downloading apache_beam-2.61.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting crcmod<2.0,>=1.7 (from apache-beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.2,>=0.3.1.1 (from apache-beam)
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cloudpickle~=2.2.1 (from apache-beam)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting fastavro<2,>=0.23.6 (from apache-beam)
  Downloading fastavro-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting fasteners<1.0,>=0.3 (from apache-beam)
  D

In [2]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.core import DoFn

In [3]:
# Create the 'data' directory that the pipelines below write their output shards into (-p: no error if it already exists).
!mkdir -p data

In [4]:
# Colab-only upload prompt; the later cells expect '005_dept_data.txt' to be uploaded here.
from google.colab import files
uploaded = files.upload()

Saving 005_dept_data.txt to 005_dept_data.txt


In [5]:
import pandas as pd

# Column labels for the header-less department file, in on-disk order.
columns = [
    'EmployeeID',
    'Name',
    'DepartmentID',
    'Department',
    'StartDate',
]

# The file carries no header row, so the names are supplied explicitly.
df = pd.read_csv('005_dept_data.txt', sep=',', header=None, names=columns)
df.head()

Unnamed: 0,EmployeeID,Name,DepartmentID,Department,StartDate
0,149633CM,Marco,10,Accounts,1-01-2019
1,212539MU,Rebekah,10,Accounts,1-01-2019
2,231555ZZ,Itoe,10,Accounts,1-01-2019
3,503996WI,Edouard,10,Accounts,1-01-2019
4,704275DC,Kyle,10,Accounts,1-01-2019


In [6]:
# Summary statistics of the loaded frame.
df.describe()

Unnamed: 0,DepartmentID
count,898.0
mean,20.356347
std,8.088555
min,10.0
25%,10.0
50%,20.0
75%,30.0
max,30.0


# Code

In [9]:
def SplitRow(element):
    """Break one comma-separated input line into its list of fields."""
    fields = element.split(',')
    return fields

def _count_per_employee(records, department):
    """Count the rows of each employee that belongs to ``department``.

    Args:
        records: PCollection of rows already split into lists of fields.
            Field 1 is used as the employee name and field 3 as the
            department name.
        department: Department name to keep, e.g. ``'Accounts'``.

    Returns:
        PCollection of ``("<department>, <name>", count)`` pairs.
    """
    return (
        records
        # The length guard skips blank or truncated rows instead of raising IndexError.
        | f'Get all {department} dept persons' >> beam.Filter(
            lambda record: len(record) > 3 and record[3] == department)
        | f'Pair each {department} employee with 1' >> beam.Map(
            lambda record: (f"{department}, {record[1]}", 1))
        | f'Group and sum {department}' >> beam.CombinePerKey(sum)
    )


# The context manager runs the pipeline and waits for it to finish on exit,
# so the output files are complete before the next cell reads them.
with beam.Pipeline() as p:
    input_collection = (
        p
        | "Read from text file" >> beam.io.ReadFromText('005_dept_data.txt')
        | "Split rows" >> beam.Map(SplitRow)
    )

    # Keep the counts as their own PCollections. The result of WriteToText is
    # the written file names, so it must not be what gets flattened below.
    accounts_count = _count_per_employee(input_collection, 'Accounts')
    hr_count = _count_per_employee(input_collection, 'HR')

    accounts_count | 'Write results for account' >> beam.io.WriteToText('data/Account')
    hr_count | 'Write results for hr' >> beam.io.WriteToText('data/HR')

    # Merge both departments' counts into a single output.
    output = (
        (accounts_count, hr_count)
        | 'Merge departments' >> beam.Flatten()
        | 'Write results for both' >> beam.io.WriteToText('data/both')
    )



<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7e45f7a7f580>

In [8]:
!{('head -n 20 /content/data/both-00000-of-00001')}

('Accounts, Marco', 31)
('Accounts, Rebekah', 31)
('Accounts, Itoe', 31)
('Accounts, Edouard', 31)
('Accounts, Kyle', 62)
('Accounts, Kumiko', 31)
('Accounts, Gaston', 31)
('Accounts, Ayumi', 30)
('HR, Beryl', 62)
('HR, Olga', 31)
('HR, Leslie', 31)
('HR, Mindy', 31)
('HR, Vicky', 31)
('HR, Richard', 31)
('HR, Kirk', 31)
('HR, Kaori', 31)
('HR, Oscar', 31)


In [11]:
!{('head -n 20 /content/data/Account-00000-of-00001')}

('Accounts, Marco', 31)
('Accounts, Rebekah', 31)
('Accounts, Itoe', 31)
('Accounts, Edouard', 31)
('Accounts, Kyle', 62)
('Accounts, Kumiko', 31)
('Accounts, Gaston', 31)
('Accounts, Ayumi', 30)


In [10]:
!{('head -n 20 /content/data/HR-00000-of-00001')}

('HR, Beryl', 62)
('HR, Olga', 31)
('HR, Leslie', 31)
('HR, Mindy', 31)
('HR, Vicky', 31)
('HR, Richard', 31)
('HR, Kirk', 31)
('HR, Kaori', 31)
('HR, Oscar', 31)


# Word count

In [28]:
def run(input_file='/content/data.txt', output_file='data/wordcount_output_1'):
    """Count whitespace-separated words in a text file with Apache Beam.

    Words are split on whitespace only, so tokens are case-sensitive and keep
    any attached punctuation (e.g. ``KING`` and ``(KING`` are counted apart).

    Args:
        input_file: Path of the text file to read.
        output_file: Prefix of the output files; the shard suffix and
            ``.txt`` are appended to it.
    """
    # Leaving the ``with`` block runs the pipeline.
    with beam.Pipeline() as p:
        (
            p
            | "Read File" >> beam.io.ReadFromText(input_file)  # read the input file
            | "Split into Words" >> beam.FlatMap(lambda line: line.split())  # split each line into words
            | "Pair with 1" >> beam.Map(lambda word: (word, 1))  # map every word to a (word, 1) pair
            | "Count Words" >> beam.CombinePerKey(sum)  # sum the values for each word
            | "Format Results" >> beam.Map(lambda word_count: f"word: {word_count[0]} count: {word_count[1]}")  # format the results
            | "Write Results" >> beam.io.WriteToText(output_file, file_name_suffix='.txt')  # write the results to file
        )

# Run the function
if __name__ == "__main__":
    run()

In [29]:
!{('head -n 20 /content/data/wordcount_output_1-00000-of-00001.txt')}

word: KING count: 242
word: LEAR count: 222
word: DRAMATIS count: 1
word: PERSONAE count: 1
word: king count: 29
word: of count: 439
word: Britain count: 1
word: (KING count: 1
word: LEAR:) count: 1
word: OF count: 15
word: FRANCE: count: 1
word: DUKE count: 3
word: BURGUNDY count: 6
word: (BURGUNDY:) count: 1
word: CORNWALL count: 56
word: (CORNWALL:) count: 1
word: ALBANY count: 60
word: (ALBANY:) count: 1
word: EARL count: 2
word: KENT count: 136


In [30]:
import re

input_file = '/content/data.txt'
output_file = 'data/wordcount_output_2'


def _find_words(line):
    """Return every run of ASCII letters and apostrophes found in ``line``."""
    return re.findall(r"[a-zA-Z']+", line)


def _format_count(word_count):
    """Render a ``(word, count)`` pair as one output line."""
    return f"word: {word_count[0]} count: {word_count[1]}"


# Regex-based word count: punctuation is dropped, case is preserved.
with beam.Pipeline() as p:
    (
        p
        | 'Read lines' >> beam.io.ReadFromText(input_file)
        | 'Find words' >> beam.FlatMap(_find_words)
        | 'Pair words with 1' >> beam.Map(lambda word: (word, 1))
        | 'Group and sum' >> beam.CombinePerKey(sum)
        | "Format Results" >> beam.Map(_format_count)
        | 'Write results' >> beam.io.WriteToText(output_file, file_name_suffix='.txt')
    )

In [31]:
!{('head -n 20 /content/data/wordcount_output_2-00000-of-00001.txt')}

word: KING count: 243
word: LEAR count: 236
word: DRAMATIS count: 1
word: PERSONAE count: 1
word: king count: 65
word: of count: 447
word: Britain count: 2
word: OF count: 15
word: FRANCE count: 10
word: DUKE count: 3
word: BURGUNDY count: 8
word: CORNWALL count: 63
word: ALBANY count: 67
word: EARL count: 2
word: KENT count: 156
word: GLOUCESTER count: 141
word: EDGAR count: 126
word: son count: 29
word: to count: 438
word: Gloucester count: 26
