<a href="https://colab.research.google.com/github/thecodemancer/study-with-me/blob/main/apache-beam/read_from_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install apache-beam[interactive]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting apache-beam[interactive]
  Downloading apache_beam-2.46.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.5/14.5 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastavro<2,>=0.23.6
  Downloading fastavro-1.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.2,>=0.3.1.1
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 KB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hdfs<3.0.0,>=2.1.0
  Downloading hdfs-2.7.0-py3-none-any.whl (34 kB)
Collecting fasteners<1.0,>=0.3
  Downloading fastene

In [2]:
import apache_beam as beam
import csv
import json

In [3]:
# Define a function to parse the CSV rows into a dictionary format
def parse_csv(row):
    return {
        'name': row[0],
        'age': int(row[1]),
        'gender': row[2],
        'city': row[3]
    }

In [4]:
# Define the pipeline options
options = beam.options.pipeline_options.PipelineOptions()

In [17]:
# Define the pipeline
with beam.Pipeline(options=options) as p:

    # Read the CSV file
    lines = p | 'ReadCSV' >> beam.io.ReadFromText('/content/sample_data/input.csv', skip_header_lines=1)

    # Parse the CSV rows into a dictionary format
    data = lines | 'ParseCSV' >> beam.Map(lambda row: next(csv.reader([row]))) | beam.Map(parse_csv)
    
    # Filter out rows where the age is less than 18
    filtered_data = data | 'FilterData' >> beam.Filter(lambda row: row['age'] >= 18)

    # Write the remaining data to a JSON file
    filtered_data | 'WriteJSON' >> beam.io.WriteToText('/content/sample_data/output.json', num_shards=1, shard_name_template='')



In [18]:
!cat /content/sample_data/input.csv

﻿Alice,25,F,New York
Bob,30,M,San Francisco
Charlie,20,M,Los Angeles
Dave,15,M,Chicago
Emma,35,F,Miami
Frank,50,M,Dallas
Gina,45,F,Seattle
Hank,28,M,Denver
Irene,19,F,Boston

In [19]:
!cat /content/sample_data/output.json

{'name': 'Bob', 'age': 30, 'gender': 'M', 'city': 'San Francisco'}
{'name': 'Charlie', 'age': 20, 'gender': 'M', 'city': 'Los Angeles'}
{'name': 'Emma', 'age': 35, 'gender': 'F', 'city': 'Miami'}
{'name': 'Frank', 'age': 50, 'gender': 'M', 'city': 'Dallas'}
{'name': 'Gina', 'age': 45, 'gender': 'F', 'city': 'Seattle'}
{'name': 'Hank', 'age': 28, 'gender': 'M', 'city': 'Denver'}
{'name': 'Irene', 'age': 19, 'gender': 'F', 'city': 'Boston'}


---
If you made it this far, follow [David Regalado](https://beacons.ai/davidregalado) for more code!