In [None]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.providers.postgres.hooks.postgres import PostgresHook
from airflow.utils.dates import days_ago
from airflow.exceptions import AirflowException
import tempfile
import pandas as pd

# read a postgres query into a dataframe and upload it to s3 as a gzipped csv
def get_data(s3_conn_id, bucket_name, object_key_prefix, pg_conn_id, sql, **context):
    """Run ``sql`` against Postgres and upload the result to S3 as a gzipped CSV.

    Args:
        s3_conn_id: Airflow connection id handed to ``S3Hook``.
        bucket_name: Destination bucket for the uploaded object.
        object_key_prefix: Leading path component of the destination object key.
        pg_conn_id: Airflow connection id handed to ``PostgresHook``.
        sql: Query whose result set is loaded into a DataFrame.
        **context: Task context; ``context['execution_date']`` must be present
            and supplies the year/month/day used in the object key.

    Raises:
        AirflowException: If the query cannot be read into a DataFrame, or if
            the CSV export or the S3 upload fails. The underlying error is
            chained as ``__cause__``.
    """
    # only the date part of the run timestamp is used in the object key
    year, month, day, *_ = context['execution_date'].timetuple()

    # create postgres connection and get a sqlalchemy engine for pandas
    pg_hook = PostgresHook(postgres_conn_id=pg_conn_id)
    sqlalchemy_engine = pg_hook.get_sqlalchemy_engine()
    try:
        # load the whole query result into a dataframe
        df = pd.read_sql(sql, sqlalchemy_engine)
    except Exception as e:
        raise AirflowException(f"read table to dataframe fail:{e}") from e
    finally:
        # release pooled connections held by the engine created for this call
        sqlalchemy_engine.dispose()

    # DataFrame.info() prints its summary itself and returns None,
    # so it must not be wrapped in print()
    df.info()
    print(df.head())

    # use the same file hierarchy pattern
    object_key = (
        f"{object_key_prefix}/de00_readsql:get_data/"
        f"{year}/{year}-{month:0>2}/"
        f"payment-{year}{month:0>2}{day:0>2}.csv.gz"
    )

    # write query result to s3 via a temporary local file
    # NOTE: the file is re-opened by name while this handle is still open,
    # which the tempfile documentation says is not supported on Windows.
    with tempfile.NamedTemporaryFile('wb+') as fp:
        temp_filename = fp.name  # temporary file name
        try:
            # export dataframe to csv; compression is explicit because the
            # temporary file name has no .gz suffix for pandas to infer from
            df.to_csv(temp_filename, index=False, compression='gzip')
            # upload to s3
            s3_hook = S3Hook(aws_conn_id=s3_conn_id)
            s3_hook.load_file(filename=temp_filename,
                              bucket_name=bucket_name,
                              key=object_key,
                              replace=True)
            print(f'put query data to s3: [{bucket_name}] -> {object_key}, success!')
        except Exception as e:
            raise AirflowException(f"put query data to s3 fail:{e}") from e


# Query run by the get_data task: joins customer, payment and staff and
# returns customer/staff names, amount and payment_date, ordered by payment_date.
sql_stmt = """
SELECT
	c.customer_id,
	c.first_name customer_first_name,
	c.last_name customer_last_name,
	s.first_name staff_first_name,
	s.last_name staff_last_name,
	amount,
	payment_date
FROM
	customer c
INNER JOIN payment p 
    ON p.customer_id = c.customer_id
INNER JOIN staff s 
    ON p.staff_id = s.staff_id
ORDER BY payment_date;
"""

default_args = dict(
    owner='EMPLOYEE_ID',  # owner is the developer of the DAG, e.g. employee 8703147
)

# Tasks created inside the ``with`` block are attached to this DAG automatically.
with DAG(
    dag_id="deXX_readsql",  # the prefix must be the tenant id, e.g. de00
    description="dag to read dvdrental database to s3",
    start_date=days_ago(2),
    schedule_interval="@daily",
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    access_control={
        # which team [tenant id] the DAG belongs to, and its permissions
        'deXX': {'can_read', 'can_edit'},
    },
    tags=['de08'],
) as dag:
    # task that runs sql_stmt against postgres and uploads the result to s3
    task_get_data = PythonOperator(
        task_id='get_data',
        python_callable=get_data,
        op_kwargs=dict(
            s3_conn_id='deXX_minio',
            bucket_name='EMPLOYEE_ID',
            object_key_prefix='de08/dvdrental',
            pg_conn_id='dxlab_dvdrental',
            sql=sql_stmt,
        ),
    )