In [None]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
import pandas as pd
import tempfile
import os


# Default arguments handed to the DAG below.
# 'owner' is the developer of the DAG, e.g. employee 8703147.
default_args = dict(owner='EMPLOYEE_ID')


def read_s3_write_s3(source_s3_conn_id, source_bucket_name, source_object_key,
                     target_s3_conn_id, target_bucket_name, target_object_key):
    """Copy a parquet object from one S3 location to another via a DataFrame.

    The source object is downloaded to a local temp file, loaded with pandas,
    summarised on stdout, re-encoded as gzip-compressed parquet and uploaded
    to the target location (overwriting any existing object).

    Args:
        source_s3_conn_id: Airflow connection id used to read the source.
        source_bucket_name: Bucket that holds the source object.
        source_object_key: Key of the parquet object to read.
        target_s3_conn_id: Airflow connection id used to write the target.
        target_bucket_name: Bucket that receives the result.
        target_object_key: Key under which the result is stored.

    Raises:
        Exception: Any error raised while downloading, parsing or uploading is
            reported on stdout and then re-raised, so the Airflow task is
            marked as failed instead of silently succeeding.
    """
    # Obtain the S3Hook instances through the configured Airflow connections.
    s3_hook_source = S3Hook(aws_conn_id=source_s3_conn_id)
    s3_hook_target = S3Hook(aws_conn_id=target_s3_conn_id)

    # Download the S3 object to a local temp file; the hook returns its path.
    try:
        source_temp_filename = s3_hook_source.download_file(
            bucket_name=source_bucket_name,
            key=source_object_key)
    except Exception as e:
        print(f"Download s3 object fail:{e}")
        # Without re-raising, the code below would fail with a confusing
        # NameError on the unbound temp file name.
        raise
    print(f'Download s3 object to local: {source_temp_filename}, success!')

    # Read the parquet file into a DataFrame; always remove the temp file,
    # even when parsing fails, so nothing is leaked on the worker.
    try:
        df_nyc_taxi = pd.read_parquet(source_temp_filename)
    finally:
        os.remove(source_temp_filename)

    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print().
    df_nyc_taxi.info()
    print(df_nyc_taxi.head())

    # Write the DataFrame as parquet into a temp file and upload it to S3.
    # The temp file is deleted automatically when the with-block exits.
    with tempfile.NamedTemporaryFile('w+') as fp2:
        target_temp_filename = fp2.name  # path of the temp file
        try:
            df_nyc_taxi.to_parquet(target_temp_filename, compression='gzip', index=False)
            s3_hook_target.load_file(filename=target_temp_filename,
                                     bucket_name=target_bucket_name,
                                     key=target_object_key,
                                     replace=True)
        except Exception as e:
            print(f"Upload parquet to s3 fail:{e}")
            # Propagate so the task does not report success on a failed upload.
            raise
        print(f'Upload parquet to s3: {target_temp_filename}, success!')


# DAG object for the S3 parquet read/write demo.
dag = DAG(
    dag_id="deXX_s3_read_write_demo2",  # the prefix must be the tenant id, e.g. de00
    default_args=default_args,
    description="dag to demo read/write parquet from s3",
    schedule_interval=None,
    start_date=days_ago(2),
    catchup=False,
    access_control={
        # team [tenant id] this DAG belongs to, and the permissions it gets
        'deXX': {'can_read', 'can_edit'},
    },
)

# Task that runs read_s3_write_s3 with the source/target S3 locations below.
read_write_parquet_s3 = PythonOperator(
    dag=dag,
    task_id='read_s3_wirte_s3',
    python_callable=read_s3_write_s3,
    op_kwargs=dict(
        # where the parquet object is read from
        source_s3_conn_id='deXX_minio',
        source_bucket_name='public',
        source_object_key='de04/data/nyc_taxi_trip_duration.parquet.gz',
        # where the re-encoded parquet object is written to
        target_s3_conn_id='deXX_minio',
        target_bucket_name='EMPLOYEE_ID',
        target_object_key='de06/data/nyc_taxi_trip_duration2.parquet.gz',
    ),
)

# Placeholder task that runs after the S3 copy task.
notify_operator = DummyOperator(dag=dag, task_id='notify_someone')

# Task ordering: read_write_parquet_s3 first, then notify_operator.
read_write_parquet_s3.set_downstream(notify_operator)