<a href="https://colab.research.google.com/github/thecodemancer/study-with-me/blob/main/apache-beam/read_from_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install apache-beam

Collecting apache-beam
  Downloading apache_beam-2.58.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting crcmod<2.0,>=1.7 (from apache-beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting orjson<4,>=3.9.7 (from apache-beam)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.2,>=0.3.1.1 (from apache-beam)
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastavro<2,>=0.23.6 (from apache-beam)

In [4]:
!ls -l

total 8
-rw-r--r-- 1 root root 2931 Aug 15 12:05 data_input.csv
drwxr-xr-x 1 root root 4096 Aug 13 13:26 sample_data


In [5]:
!head data_input.csv

name,age,gender,city
Person 164,61,Male,London
Person 81,30,Male,London
Person 742,65,Female,New York
Person 938,18,Female,Tokyo
Person 404,79,Male,Chicago
Person 981,60,Other,Los Angeles
Person 485,60,Other,Tokyo
Person 254,39,Other,New York
Person 281,53,Other,Chicago


In [2]:
import apache_beam as beam
import csv
import json

In [26]:
def parse_csv(row):
  temp = row.split(',')
  return {
          'name': temp[0],
          'age': int(temp[1]),
          'gender' : temp[2],
          'city' : temp[3]
  }

In [27]:
# Define the pipeline options
options = beam.options.pipeline_options.PipelineOptions()

In [28]:
# Define the pipeline
with beam.Pipeline(options=options) as p:

  # Read the CSV file
  lines = p | 'Read CSV' >> beam.io.ReadFromText('data_input.csv', skip_header_lines=1)

  # Parse the CSV rows into a dictionary format
  parsed_data = lines | 'Print CSV' >> beam.Map(lambda row: parse_csv(row))

  # Filter out rows where the age is less than 18
  filtered_data = parsed_data | 'Filter Data' >> beam.Filter(lambda row: row['age'] >= 18)

  # Group by city
  summarized_data_by_city = filtered_data | 'Summarize by City' >> beam.GroupBy(lambda row: row['city']) | beam.Map(lambda row: {'name': row[0], 'count': len(row[1])})

  # Group by gender
  summarized_data_by_gender = filtered_data | 'Summarize by Gender' >> beam.GroupBy(lambda row: row['gender']) | beam.Map(lambda row: {'name': row[0], 'count': len(row[1])})

  # Write the summarized data to a JSON file

  # By city
  summarized_data_by_city | 'Write total by city to a JSON file' >> beam.io.WriteToText('summarized_data_by_city.json', num_shards=1, shard_name_template='')

  # By Gender
  summarized_data_by_gender | 'Write total by gender to a JSON file' >> beam.io.WriteToText('summarized_data_by_gender.json', num_shards=1, shard_name_template='')



In [29]:
!ls -l

total 16
-rw-r--r-- 1 root root 2931 Aug 17 21:49 data_input.csv
drwxr-xr-x 1 root root 4096 Aug 14 13:23 sample_data
-rw-r--r-- 1 root root  167 Aug 17 22:05 summarized_data_by_city.json
-rw-r--r-- 1 root root   93 Aug 17 22:05 summarized_data_by_gender.json


In [30]:
!head summarized_data_by_city.json

{'name': 'London', 'count': 26}
{'name': 'New York', 'count': 17}
{'name': 'Tokyo', 'count': 17}
{'name': 'Chicago', 'count': 22}
{'name': 'Los Angeles', 'count': 18}


In [31]:
!head summarized_data_by_gender.json

{'name': 'Male', 'count': 38}
{'name': 'Female', 'count': 32}
{'name': 'Other', 'count': 30}


---
If you made it this far, follow [David Regalado](https://beacons.ai/davidregalado) for more code!