In [12]:
!pip install apache-beam[gcp]




In [9]:
# Materialise the sample corpus as input.txt in the working directory
sample_lines = (
    "his exercise is to learn apache beam\n",
    "I will understand apache beam after this assignment\n",
    "Lets find out what apache beam is\n",
)
with open('input.txt', 'w') as handle:
    # writelines emits each entry as-is; the newlines are already in the strings
    handle.writelines(sample_lines)


In [10]:
import apache_beam as beam
from apache_beam.transforms.window import FixedWindows
from apache_beam.options.pipeline_options import PipelineOptions
import time

# Composite transform that bundles the three word-count steps behind one name
class CountWords(beam.PTransform):
    """Turn a PCollection of text lines into (word, count) pairs."""

    def expand(self, pcoll):
        """Split each line on whitespace, pair every word with 1, then sum per word."""
        words = pcoll | "Split words" >> beam.FlatMap(lambda line: line.split())
        ones = words | "Pair with 1" >> beam.Map(lambda word: (word, 1))
        totals = ones | "Group and sum" >> beam.CombinePerKey(sum)
        return totals

# Tunable settings for the pipeline below
INPUT_PATH = 'input.txt'
OUTPUT_PREFIX = 'output.txt'  # the written shard is named '<prefix>-00000-of-00001'
WINDOW_SIZE_SECONDS = 60
TRIGGER_DELAY_SECONDS = 10


class ProcessWords(beam.DoFn):
    """Format each (word, count) pair as a printable line."""

    def process(self, element):
        """Yield 'Word: <word>, Count: <count>' for one (word, count) tuple."""
        word, count = element
        yield f"Word: {word}, Count: {count}"


# Create a pipeline
options = PipelineOptions()

with beam.Pipeline(options=options) as p:
    # Step 1: Read the input text file, one element per line
    lines = p | "Read Input" >> beam.io.ReadFromText(INPUT_PATH)

    # Step 2: Use the custom composite transform for word counting
    word_counts = lines | "Count words" >> CountWords()

    # Step 3: Windowing - assign the counts to fixed windows and re-sum per window
    fixed_window_counts = (
        word_counts
        | "Apply Fixed Window" >> beam.WindowInto(FixedWindows(WINDOW_SIZE_SECONDS))
        | "Sum counts in window" >> beam.CombinePerKey(sum)
    )

    # Step 4: Triggers - same window size, plus a processing-time trigger that
    # discards already-emitted panes
    windowed_counts = (
        fixed_window_counts
        | "Trigger after processing" >> beam.WindowInto(
            FixedWindows(WINDOW_SIZE_SECONDS),
            trigger=beam.transforms.trigger.AfterProcessingTime(TRIGGER_DELAY_SECONDS),
            accumulation_mode=beam.transforms.trigger.AccumulationMode.DISCARDING
        )
    )

    # Step 5: ParDo - format the windowed/triggered counts. Reading from
    # `windowed_counts` (rather than the raw `word_counts`) is what makes Steps 3
    # and 4 part of the path that produces the output file.
    # NOTE(review): for this small bounded input the written counts are expected
    # to be the same as without windowing - confirm after re-running.
    processed = windowed_counts | "Process Words with ParDo" >> beam.ParDo(ProcessWords())

    # Step 6: Write output to text file
    processed | "Write Output" >> beam.io.WriteToText(OUTPUT_PREFIX)





In [11]:
# Print the output shard written by the "Write Output" step (file prefix 'output.txt')
!cat output.txt-00000-of-00001


Word: his, Count: 1
Word: exercise, Count: 1
Word: is, Count: 2
Word: to, Count: 1
Word: learn, Count: 1
Word: apache, Count: 3
Word: beam, Count: 3
Word: I, Count: 1
Word: will, Count: 1
Word: understand, Count: 1
Word: after, Count: 1
Word: this, Count: 1
Word: assignment, Count: 1
Word: Lets, Count: 1
Word: find, Count: 1
Word: out, Count: 1
Word: what, Count: 1
