In [17]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import csv

dataset1 = read_csv_file('../input_data/dataset1.csv')
dataset2 = read_csv_file('../input_data/dataset2.csv')

def read_csv_file(file_path):
    with open(file_path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip header row
        return [tuple(row) for row in reader]

def merge_datasets(element):
    invoice_id, legal_entity, counter_party, rating, status, value, tier = element
    return legal_entity, counter_party, tier, rating, status, value

def filter_arap(element):
    legal_entity, counter_party, tier, rating, status, value = element
    return status == 'ARAP'

def filter_accr(element):
    legal_entity, counter_party, tier, rating, status, value = element
    return status == 'ACCR'

def sum_arap_values(elements):
    legal_entity, counter_party, tier, ratings, arap_values, _ = zip(*elements)
    return legal_entity[0], counter_party[0], tier[0], max(ratings), sum(arap_values), 0

def sum_accr_values(elements):
    legal_entity, counter_party, tier, ratings, _, accr_values = zip(*elements)
    return legal_entity[0], counter_party[0], tier[0], max(ratings), 0, sum(accr_values)

def write_to_csv(elements):
    with open('../output_data/output.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['legal_entity', 'counter_party', 'tier', 'max_rating', 'arap_sum', 'accr_sum'])
        for element in elements:
            writer.writerow(element)

with beam.Pipeline(options=PipelineOptions()) as pipeline:
    input1 = pipeline | 'Create dataset1' >> beam.Create(dataset1)
    input2 = pipeline | 'Create dataset2' >> beam.Create(dataset2)

    merged = (
        {'dataset1': input1, 'dataset2': input2}
        | 'CoGroupByKey' >> beam.CoGroupByKey()
        | 'FlatMap Merged Datasets' >> beam.FlatMap(lambda elem: [(elem[0], *value) for value in elem[1]['dataset1']])
        | 'Merge Datasets' >> beam.Map(merge_datasets)
    )

    arap_sums = (
        merged
        | 'Filter ARAP' >> beam.Filter(filter_arap)
        | 'Group ARAP' >> beam.GroupBy(lambda elem: (elem[0], elem[1], elem[2]))
        | 'Sum ARAP Values' >> beam.Map(sum_arap_values)
    )

    accr_sums = (
        merged
        | 'Filter ACCR' >> beam.Filter(filter_accr)
        | 'Group ACCR' >> beam.GroupBy(lambda elem: (elem[0], elem[1], elem[2]))
        | 'Sum ACCR Values' >> beam.Map(sum_accr_values)
    )

    result = (
        (arap_sums, accr_sums)
        | 'Flatten Results' >> beam.Flatten()
        | 'Group Results' >> beam.GroupBy(lambda elem: (elem[0], elem[1], elem[2]))
        | 'Combine Results' >> beam.Map(lambda elem: (elem[0], elem[1], elem[2], max([x[3] for x in elem[1]]), sum([x[4] for x in elem[1]]), sum([x[5] for x in elem[1]])))
    )

    output = result | 'Collect Output' >> beam.CombineGlobally(beam.combiners.ToListCombineFn())
    output | 'Write to CSV' >> beam.Map(write_to_csv)



Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "apache_beam/runners/common.py", line 1418, in apache_beam.runners.common.DoFnRunner.process
  File "apache_beam/runners/common.py", line 625, in apache_beam.runners.common.SimpleInvoker.invoke_process
  File "/Users/emmanuelsekyi/anaconda3/lib/python3.10/site-packages/apache_beam/transforms/core.py", line -1, in <lambda>
TypeError: <lambda>() takes 2 positional arguments but 6 were given

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/emmanuelsekyi/anaconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/_l/g109827s4sv_6p83rwst2h7h0000gp/T/ipykernel_31577/307886436.py", line 41, in <module>
    with beam.Pipeline(options=PipelineOptions()) as pipeline:
  File "/Users/emmanuelsekyi/anaconda3/lib/python3.10/site-packages/apache_beam/pipeline.py", line 600, in __