# In [None]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
import boto3
import pandas as pd
import tempfile


# Default arguments handed to the DAG below.
# 'owner' identifies the developer of the DAG, e.g. employee 8703147.
default_args = dict(owner='EMPLOYEE_ID')


def _make_s3_resource(s3_url, access_key, secret):
    """Return a boto3 S3 resource bound to the given endpoint and credentials."""
    return boto3.resource(
        's3',
        endpoint_url=s3_url,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret,
        region_name='us-east-1',
    )


def read_s3_write_s3(source_s3_url, source_s3_access_key, source_s3_secret, source_bucket_name, source_object_key,
                     target_s3_url, target_s3_access_key, target_s3_secret, target_bucket_name, target_object_key):
    """Copy a parquet object from one S3 location to another via a DataFrame.

    The source object is downloaded to a local temporary file and loaded with
    ``pd.read_parquet``. The DataFrame is then written to a second temporary
    file as gzip-compressed parquet (without the index) and uploaded to the
    target bucket/key.

    Args:
        source_s3_url: Endpoint URL of the source S3 service.
        source_s3_access_key: Access key id for the source endpoint.
        source_s3_secret: Secret access key for the source endpoint.
        source_bucket_name: Bucket holding the source object.
        source_object_key: Key of the parquet object to read.
        target_s3_url: Endpoint URL of the target S3 service.
        target_s3_access_key: Access key id for the target endpoint.
        target_s3_secret: Secret access key for the target endpoint.
        target_bucket_name: Bucket to upload the result into.
        target_object_key: Key under which the result is stored.

    Raises:
        Exception: Any error raised while downloading, parsing, serializing
            or uploading is logged with ``print`` and then re-raised, so the
            caller (e.g. the Airflow task) fails instead of silently
            reporting success.
    """
    # create s3 connections (both use the hard-coded region 'us-east-1')
    s3_source_resource = _make_s3_resource(source_s3_url, source_s3_access_key, source_s3_secret)
    s3_target_resource = _make_s3_resource(target_s3_url, target_s3_access_key, target_s3_secret)

    # download s3 object to local temp file
    with tempfile.NamedTemporaryFile('w+') as fp:
        source_temp_filename = fp.name  # temp file name
        try:
            s3_source_resource.Object(source_bucket_name, source_object_key).download_file(source_temp_filename)
        except Exception as e:
            print(f"Download s3 object fail:{e}")
            # Re-raise: continuing would only make read_parquet fail on an
            # empty file with a misleading error.
            raise
        print(f'Download s3 object to local: {source_temp_filename}, success!')

        # read parquet to dataframe while the temp file still exists
        df_nyc_taxi = pd.read_parquet(source_temp_filename)

    # DataFrame.info() prints by itself and returns None, so it must not be
    # wrapped in print().
    df_nyc_taxi.info()
    print(df_nyc_taxi.head())

    # write dataframe as parquet to s3
    with tempfile.NamedTemporaryFile('w+') as fp2:
        target_temp_filename = fp2.name  # temp file name
        try:
            df_nyc_taxi.to_parquet(target_temp_filename, compression='gzip', index=False)
            s3_target_resource.Object(target_bucket_name, target_object_key).upload_file(target_temp_filename)
        except Exception as e:
            print(f"Upload parquet to s3 fail:{e}")
            # Re-raise so a failed upload is not reported as a success.
            raise
        print(f'Upload parquet to s3: {target_temp_filename}, success!')


# Demo DAG: one Python task that copies a parquet object between S3
# locations, followed by a placeholder notification task.
with DAG(
    dag_id="deXX_s3_read_write_demo",  # the prefix must be the tenant id, e.g. de00
    description="dag to demo read/write parquet from s3",
    default_args=default_args,
    start_date=days_ago(2),
    schedule_interval=None,  # no schedule is attached to this DAG
    catchup=False,
    access_control={
        # which team [tenant id] the DAG belongs to, and that team's permissions
        'deXX': {'can_read', 'can_edit'}
    },
) as dag:
    # Tasks created inside the `with` block are attached to `dag`.
    read_write_parquet_s3 = PythonOperator(
        task_id='read_s3_wirte_s3',
        python_callable=read_s3_write_s3,
        op_kwargs={
            'source_s3_url': 'http://10.34.124.114:9000',
            'source_s3_access_key': 'EMPLOYEE_ID',
            'source_s3_secret': 'xxxxxxxxx',
            'source_bucket_name': 'public',
            'source_object_key': 'de04/data/nyc_taxi_trip_duration.parquet.gz',
            'target_s3_url': 'http://10.34.124.114:9000',
            'target_s3_access_key': 'EMPLOYEE_ID',
            'target_s3_secret': 'xxxxxxxxx',
            'target_bucket_name': 'EMPLOYEE_ID',  # English letters must be converted to lowercase
            'target_object_key': 'de06/data/nyc_taxi_trip_duration.parquet.gz'
        },
    )
    notify_operator = DummyOperator(task_id='notify_someone')

# Set dependencies between all tasks: the copy task runs before the notification
read_write_parquet_s3.set_downstream(notify_operator)
