<a href="https://colab.research.google.com/github/thecodemancer/study-with-me/blob/main/apache_beam/read_from_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install apache-beam

Collecting apache-beam
  Downloading apache_beam-2.57.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting crcmod<2.0,>=1.7 (from apache-beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting orjson<4,>=3.9.7 (from apache-beam)
  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.2,>=0.3.1.1 (from apache-beam)
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastavro<2,>=0.23.6 (from apache-beam)

In [25]:
!ls -l

total 4
-rw-r--r-- 1 root root 2981 Aug  2 03:14 data_input.csv


In [31]:
!head data_input.csv

name,age,gender,city
Person 72,40,Female,London
Person 337,61,Male,Chicago
Person 425,76,Female,Los Angeles
Person 485,55,Female,London
Person 253,65,Female,Tokyo
Person 991,44,Male,Chicago
Person 347,62,Other,Tokyo
Person 151,69,Other,London
Person 463,25,Male,Tokyo


In [26]:
import apache_beam as beam
import csv
import json

In [27]:
def parse_csv(row):
  return {
      'name': row[0],
      'age': int(row[1]),
      'gender' : row[2],
      'city' : row[3]
  }

In [28]:
# Define the pipeline options
options = beam.options.pipeline_options.PipelineOptions()

In [29]:
# Define the pipeline
with beam.Pipeline(options=options) as p:

  #Read the CSV file
  lines = p | 'Read CSV' >> beam.io.ReadFromText('data_input.csv', skip_header_lines=1)

  # Parse the CSV rows into a dictionary format
  data = lines | 'ParseCSV' >> beam.Map(lambda row: next(csv.reader([row]))) | beam.Map(parse_csv)

  # Filter out rows where the age is less than 18
  filtered_data = data | 'FilterData' >> beam.Filter(lambda row: row['age'] >= 18)

  #Write the remaining data to a JSON file
  filtered_data | 'Write JSON' >> beam.io.WriteToText('data_output.json', num_shards=1, shard_name_template='')



In [30]:
!ls -l

total 12
-rw-r--r-- 1 root root 2981 Aug  2 03:14 data_input.csv
-rw-r--r-- 1 root root 7259 Aug  2 03:20 data_output.json


In [32]:
!head data_output.json

{'name': 'Person 72', 'age': 40, 'gender': 'Female', 'city': 'London'}
{'name': 'Person 337', 'age': 61, 'gender': 'Male', 'city': 'Chicago'}
{'name': 'Person 425', 'age': 76, 'gender': 'Female', 'city': 'Los Angeles'}
{'name': 'Person 485', 'age': 55, 'gender': 'Female', 'city': 'London'}
{'name': 'Person 253', 'age': 65, 'gender': 'Female', 'city': 'Tokyo'}
{'name': 'Person 991', 'age': 44, 'gender': 'Male', 'city': 'Chicago'}
{'name': 'Person 347', 'age': 62, 'gender': 'Other', 'city': 'Tokyo'}
{'name': 'Person 151', 'age': 69, 'gender': 'Other', 'city': 'London'}
{'name': 'Person 463', 'age': 25, 'gender': 'Male', 'city': 'Tokyo'}
{'name': 'Person 492', 'age': 21, 'gender': 'Other', 'city': 'London'}


---
If you made it this far, follow [David Regalado](https://beacons.ai/davidregalado) for more code!