**Reading and writing data with Apache Beam:**
- reference link: https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/tour-of-beam/reading-and-writing-data.ipynb#scrollTo=xDXdE9uysriw

In [None]:
import apache_beam as beam

# Read text files with beam.io.ReadFromText(file_pattern).
# ReadFromText parses the matched files line by line; each line is printed.
input_files = 'data/*.txt'
with beam.Pipeline() as pipeline:
  lines = pipeline | 'Read files' >> beam.io.ReadFromText(input_files)
  lines | 'Print contents' >> beam.Map(print)

In [None]:
import apache_beam as beam

# Write text files with beam.io.WriteToText(): every element of the
# PCollection becomes one line of an output file.
output_file_name_prefix = 'outputs/file'
with beam.Pipeline() as pipeline:
  lines = pipeline | 'Create file lines' >> beam.Create([
      'Each element must be a string.',
      'It writes one element per line.',
      'There are no guarantees on the line order.',
      'The data might be written into multiple files.',
  ])
  lines | 'Write to files' >> beam.io.WriteToText(
      output_file_name_prefix, file_name_suffix='.txt')

In [1]:
import apache_beam as beam
from typing import Iterable

def count(n: int) -> Iterable[int]:
  """Yield the integers 0, 1, ..., n - 1 in increasing order."""
  yield from range(n)

# Start from a single element `n`, expand it into 0..n-1 with FlatMap(count),
# and print every generated element.
n = 5
with beam.Pipeline() as pipeline:
  seeds = pipeline | 'Create inputs' >> beam.Create([n])
  numbers = seeds | 'Generate elements' >> beam.FlatMap(count)
  numbers | 'Print elements' >> beam.Map(print)



0
1
2
3
4


In [None]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from typing import Iterable

# Define a custom PTransform with the @beam.ptransform_fn decorator.
@beam.ptransform_fn
@beam.typehints.with_input_types(beam.pvalue.PBegin)
@beam.typehints.with_output_types(int)
def Count(pbegin: beam.pvalue.PBegin, n: int) -> beam.PCollection[int]:
  """Source transform producing the integers 0, 1, ..., n - 1.

  Args:
    pbegin: The pipeline start this source is applied to.
    n: Exclusive upper bound of the generated sequence.

  Returns:
    The PCollection obtained by flat-mapping the single element `n` into
    the numbers below it.
  """
  def numbers_below(limit: int) -> Iterable[int]:
    yield from range(limit)

  # Create a one-element PCollection holding `n`, then let FlatMap expand that
  # element into the iterable of values yielded by the generator.
  seed = pbegin | 'Create inputs' >> beam.Create([n])
  return seed | 'Generate elements' >> beam.FlatMap(numbers_below)

# Run the custom Count source with additional type checking enabled and
# print each generated element.
n = 5
options = PipelineOptions(flags=[], type_check_additional='all')
with beam.Pipeline(options=options) as pipeline:
  numbers = pipeline | f'Count to {n}' >> Count(n)
  numbers | 'Print elements' >> beam.Map(print)

In [None]:
#read data from csv file
import apache_beam as beam
from apache_beam.io.filesystems import FileSystems as beam_fs
from apache_beam.options.pipeline_options import PipelineOptions
import codecs
import csv
from typing import Dict, Iterable, List

@beam.ptransform_fn
@beam.typehints.with_input_types(beam.pvalue.PBegin)
@beam.typehints.with_output_types(Dict[str, str])
def ReadCsvFiles(pbegin: beam.pvalue.PBegin, file_patterns: List[str]) -> beam.PCollection[Dict[str, str]]:
  """Source transform that reads CSV files and emits one dict per row.

  Args:
    pbegin: The pipeline start this source is applied to.
    file_patterns: File patterns (for example 'data/*.csv') to expand.

  Returns:
    A PCollection with one dict per CSV row, as produced by csv.DictReader.
  """
  def matching_paths(glob: str) -> Iterable[str]:
    # match() takes a list of patterns and returns one result per pattern;
    # only a single pattern is passed, so the first result is the one needed.
    found = beam_fs.match([glob])[0]
    yield from (metadata.path for metadata in found.metadata_list)

  def csv_rows(path: str) -> Iterable[Dict[str, str]]:
    with beam_fs.open(path) as handle:
      # Beam hands back bytes while the csv module works on text, so the
      # stream is decoded to UTF-8 strings before it reaches the reader.
      text_lines = codecs.iterdecode(handle, 'utf-8')
      reader = csv.DictReader(text_lines)
      for record in reader:
        yield dict(record)

  patterns = pbegin | 'Create file patterns' >> beam.Create(file_patterns)
  paths = patterns | 'Expand file patterns' >> beam.FlatMap(matching_paths)
  return paths | 'Read CSV lines' >> beam.FlatMap(csv_rows)

# Read every CSV file matching the patterns and print each row dict.
input_patterns = ['data/*.csv']
options = PipelineOptions(flags=[], type_check_additional='all')
with beam.Pipeline(options=options) as pipeline:
  rows = pipeline | 'Read CSV files' >> ReadCsvFiles(input_patterns)
  rows | 'Print elements' >> beam.Map(print)

In [None]:
import sqlite3
from typing import Dict, Iterable, List, Tuple

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
# How to read data from SQLite:
#   beam.Create builds a PCollection whose elements are
#   ('table_name', [column1, column2, ...]) tuples.
#   beam.ParDo applies a custom transformation supplied as a DoFn.
class SQLiteSelect(beam.DoFn):
    """DoFn that runs one SELECT against a SQLite file per input element.

    Each input element is a ``(table, columns)`` tuple. Every row returned by
    the query is emitted as a dict mapping the requested column names to the
    row's values.
    """

    def __init__(self, database_file: str):
        """Store the database path; the connection itself is opened in setup().

        Args:
            database_file: Path handed to ``sqlite3.connect``.
        """
        self.database_file = database_file
        self.connection = None

    def setup(self):
        """Open the SQLite connection used by process()."""
        self.connection = sqlite3.connect(self.database_file)

    def process(self, query: Tuple[str, List[str]]) -> Iterable[Dict[str, str]]:
        """Run ``SELECT <columns> FROM <table>`` and yield one dict per row.

        Args:
            query: A ``(table, columns)`` tuple.

        Yields:
            ``{column_name: value}`` for each selected row.
        """
        table, columns = query
        cursor = self.connection.cursor()
        try:
            # NOTE(review): identifiers cannot be bound as SQL parameters, so
            # the table and column names are interpolated directly. They must
            # come from trusted code, never from user input.
            cursor.execute(f"SELECT {','.join(columns)} FROM {table}")
            # Iterate the cursor instead of fetchall() so rows are streamed
            # rather than loaded into memory all at once.
            for row in cursor:
                yield dict(zip(columns, row))  # one data point as {'column_name': value}
        finally:
            # Release the cursor even if the query fails or the consumer stops
            # iterating early.
            cursor.close()

    def teardown(self):
        """Close the connection if one was opened."""
        # setup() may never have run (or may have failed), leaving None here.
        if self.connection is not None:
            self.connection.close()
            self.connection = None

@beam.ptransform_fn
@beam.typehints.with_input_types(beam.pvalue.PBegin)
@beam.typehints.with_output_types(Dict[str, str])
def SelectFromSQLite(
    pbegin: beam.pvalue.PBegin,
    database_file: str,
    queries: List[Tuple[str, List[str]]],
) -> beam.PCollection[Dict[str, str]]:
  """Source transform that runs SELECT queries against a SQLite file.

  Args:
    pbegin: The pipeline start this source is applied to.
    database_file: Path of the SQLite database; only used to open the
      connection inside SQLiteSelect.
    queries: ``(table_name, [column, ...])`` tuples, one per SELECT.

  Returns:
    The PCollection of row dicts emitted by SQLiteSelect.
  """
  query_elements = pbegin | 'Create None' >> beam.Create(queries)
  # Each query element is handed to SQLiteSelect.process, which queries the
  # database and emits the resulting rows.
  return query_elements | 'SQLite SELECT' >> beam.ParDo(
      SQLiteSelect(database_file))

# NOTE(review): `database_file` was used below without ever being defined in
# this file, so the cell failed with NameError. 'moon-phases.db' is assumed
# from the referenced Tour of Beam notebook -- confirm, or point it at your own
# SQLite file containing a `moon_phases` table.
database_file = 'moon-phases.db'

queries = [
    # (table_name, [column1, column2, ...])
    ('moon_phases', ['phase_emoji', 'peak_datetime', 'phase']),
    ('moon_phases', ['phase_emoji', 'phase']),
]

# Run every query against the database and print each resulting row dict.
options = PipelineOptions(flags=[], type_check_additional='all')
with beam.Pipeline(options=options) as pipeline:
  (
      pipeline
      | 'Read from SQLite' >> SelectFromSQLite(database_file, queries)
      | 'Print rows' >> beam.Map(print)
  )

In [1]:
# zip() pairs up the elements of two sequences position by position.
a = "John", "Charles", "Mike"
b = "Jenny", "Christy", "Monica"

for x, y in zip(a, b):
    print(f"{x} {y}")

John Jenny
Charles Christy
Mike Monica
