#Intro

In [1]:
!pip install apache-beam

Collecting apache-beam
  Downloading apache_beam-2.61.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting crcmod<2.0,>=1.7 (from apache-beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.2,>=0.3.1.1 (from apache-beam)
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cloudpickle~=2.2.1 (from apache-beam)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting fastavro<2,>=0.23.6 (from apache-beam)
  Downloading fastavro-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting fasteners<1.0,>=0.3 (from apache-beam)
  D

In [10]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.core import DoFn

In [5]:
!mkdir -p data

In [None]:
from google.colab import files
uploaded = files.upload()

#Code

##p2

In [21]:
p2 = beam.Pipeline()

# list/array = []
# set = ()
# dictionary = {}

lines = (
            p2
            | beam.Create([
               'Using create transform ',
               'to generate in memory data ',
               'This is 3rd line ',
               'Thanks '])

            | beam.io.WriteToText('data/outCreate1', file_name_suffix='.txt')
          )
p2.run()

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7f2b1e5b6f80>

In [22]:
!{('head -n 20 /content/data/outCreate1-00000-of-00001.txt')}

Using create transform 
to generate in memory data 
This is 3rd line 
Thanks 


In [19]:
class AddLineNumbersFn(DoFn):
    """Dodaje numer linii do każdego elementu."""
    def __init__(self):
        self.index = 0  # Inicjalizuje licznik indeksów jako 0.

    def process(self, element):
        # Zwiększa licznik o 1 dla każdego elementu.
        self.index += 1
        # Dodaje numer linii do elementu i zwraca w formacie "{numer}. {element}".
        yield f"{self.index}. {element}"


def main():
    # Ustawienia opcji potoku
    pipeline_options = PipelineOptions()
    p = beam.Pipeline(options=pipeline_options)

    # Dane wejściowe w pamięci
    input_data = [
        'Using create transform',
        'to generate in memory data',
        'This is 3rd line',
        'Thanks'
    ]

    lines = (
        p
        # Tworzenie danych wejściowych za pomocą transformacji `Create`
        | 'CreateInput' >> beam.Create(input_data)

        # Dodanie numerów linii (indeksu) za pomocą transformacji `ParDo`
        | 'AddLineNumbers' >> beam.ParDo(AddLineNumbersFn())

        # Filtracja krótkich linii
        | 'FilterShortLines' >> beam.Filter(
            lambda line: len(line) > 20
        )

        # Zapis wyników do pliku tekstowego
        | 'WriteToFiles' >> beam.io.WriteToText(
            'data/processed_text',
            file_name_suffix='.txt',  # Format pliku tekstowego
            num_shards=1,  # Jeden plik
            header='# Processed text data',  # Nagłówek pliku
            footer='# End of file'  # Stopka pliku
        )
    )

    # Uruchomienie potoku
    p.run().wait_until_finish()


if __name__ == '__main__':
    main()



In [20]:
!{('head -n 20 /content/data/processed_text-00000-of-00001.txt')}

# Processed text data
1. Using create transform
2. to generate in memory data
# End of file


##p3

In [23]:
p3 = beam.Pipeline()

lines1 = (p3

           | beam.Create([1,2,3,4,5,6,7,8,9])

           | beam.io.WriteToText('data/outCreate2', file_name_suffix='.txt')
          )
p3.run()

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7f2b1d451a20>

In [24]:
!{('head -n 20 /content/data/outCreate2-00000-of-00001.txt')}

1
2
3
4
5
6
7
8
9


##p4

In [25]:
p4 = beam.Pipeline()

lines = (p4
           | beam.Create([("maths",52),("english",75),("science",82), ("computer",65),("maths",85)])

            | beam.io.WriteToText('data/outCreate3', file_name_suffix='.txt')
          )
p4.run()

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7f2b1d0d8dc0>

In [26]:
!{('head -n 20 /content/data/outCreate3-00000-of-00001.txt')}

('maths', 52)
('english', 75)
('science', 82)
('computer', 65)
('maths', 85)


In [27]:
p5 = beam.Pipeline()

lines = ( p5

       | beam.Create({'row1':[1,2,3,4,5],
                     'row2':[1,2,3,4,5]})

          """Stosuje transformację Map, która w tym przypadku nie zmienia elementów,
          bo przekazywana jest funkcja tożsamościowa (lambda element: element).
          W efekcie każdy element pozostaje taki sam."""
       | beam.Map(lambda element: element)

       | beam.io.WriteToText('data/outCreate4', file_name_suffix='.txt')
  )

p5.run()

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x7f2b1d230430>

In [28]:
!{('head -n 20 /content/data/outCreate4-00000-of-00001.txt')}

('row1', [1, 2, 3, 4, 5])
('row2', [1, 2, 3, 4, 5])
