# In [None]:
import pandas as pd
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.dates import days_ago
from datetime import timedelta
from airflow.exceptions import AirflowException
import requests
import tempfile


# function to retrieve user event data & store it in s3
def fetch_events(api_url, s3_conn_id, bucket_name, object_key, timeout=60):
    """Fetch user events over HTTP and upload the raw response body to S3.

    Args:
        api_url: URL requested with an HTTP GET.
        s3_conn_id: Airflow connection id handed to ``S3Hook``.
        bucket_name: Bucket the object is written to.
        object_key: Key of the object; an existing object is replaced.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled API cannot block the task forever.

    Raises:
        AirflowException: If the API does not answer with status 200, or if
            staging/uploading the data fails.
    """
    resp = requests.get(api_url, timeout=timeout)
    if resp.status_code != 200:
        raise AirflowException(f"user event api call fail, the resp status[{resp.status_code}]")

    # Obtain the S3Hook instance through the Airflow connection.
    s3_hook = S3Hook(aws_conn_id=s3_conn_id)

    # load_file uploads from a path, so stage the body in a temporary file.
    with tempfile.NamedTemporaryFile('wb+') as fp:
        try:
            fp.write(resp.content)
            # Flush so the bytes are on disk before load_file reads by name.
            fp.flush()
            s3_hook.load_file(filename=fp.name,
                              bucket_name=bucket_name,
                              key=object_key,
                              replace=True)
        except Exception as e:
            # Fail the task instead of only printing: a swallowed error would
            # let downstream tasks run against missing or stale data.
            raise AirflowException(f"upload user event data to s3 fail:{e}") from e

    print(f'upload user event data to s3: [{bucket_name}] -> {object_key}, success!')


# function to retrieve user event data & calculate stats
def calculate_stats(s3_conn_id, bucket_name, data_object_key, result_object_key):
    """Download user events from S3, count events per (date, user), upload a CSV.

    Args:
        s3_conn_id: Airflow connection id handed to ``S3Hook``.
        bucket_name: Bucket used for both the input and the result object.
        data_object_key: Key of the JSON events object to read.
        result_object_key: Key the CSV result is written to; an existing
            object is replaced.

    Raises:
        AirflowException: If the download, the statistics calculation or the
            upload fails; the original error is chained as ``__cause__``.
    """
    s3_hook = S3Hook(aws_conn_id=s3_conn_id)

    # Download into a directory owned by this call so the downloaded file is
    # removed when the block exits instead of accumulating on the worker.
    with tempfile.TemporaryDirectory() as work_dir:
        try:
            events_path = s3_hook.download_file(
                bucket_name=bucket_name,
                key=data_object_key,
                local_path=work_dir)
        except Exception as e:
            raise AirflowException(f"download user event data from s3 fail:{e}") from e
        print(f'download user event data to local: {data_object_key}, success!')

        # Calculate event statistics: number of rows per (date, user) pair.
        try:
            events = pd.read_json(events_path)
            stats = events.groupby(["date", "user"]).size().reset_index()
        except Exception as e:
            raise AirflowException(f"calculate user event stats fail:{e}") from e

        # Write the result to a temporary CSV and upload it to s3.
        try:
            with tempfile.NamedTemporaryFile('w+') as stats_file:
                stats.to_csv(stats_file.name, index=False)
                s3_hook.load_file(filename=stats_file.name,
                                  bucket_name=bucket_name,
                                  key=result_object_key,
                                  replace=True)
        except Exception as e:
            raise AirflowException(f"upload user event stats to s3 fail:{e}") from e


# Arguments applied to every task of the DAG.
default_args = dict(
    owner='EMPLOYEE_ID',  # owner is the DAG developer, e.g. employee 8703147
)

# DAG that fetches user events and computes statistics every three days.
# NOTE(review): start_date is computed with days_ago(30) at parse time, so it
# moves every day — confirm this is intended rather than a fixed date.
with DAG(
    dag_id="deXX_03_every3days",  # prefix must be the tenant id, e.g. de00
    description="dag to calculate user events",
    default_args=default_args,
    start_date=days_ago(30),
    schedule_interval=timedelta(days=3),
    catchup=True,  # let Airflow go back and run the intervals since start_date
    max_active_runs=1,  # at most one active run of this DAG at a time
    access_control={
        # team [tenant id] the DAG belongs to, and its permissions
        'deXX': {'can_read', 'can_edit'},
    },
) as dag:
    # task to retrieve user event data and save it to s3
    task_fetch_events = PythonOperator(
        task_id='fetch_events',
        python_callable=fetch_events,
        op_kwargs=dict(
            api_url='http://10.34.124.114:5000/events',
            s3_conn_id='deXX_minio',
            bucket_name='EMPLOYEE_ID',
            object_key='de07/data/user_events.json',
        ),
    )

    # task to compute per-(date, user) statistics from the stored events
    task_calculate_stats = PythonOperator(
        task_id='calculate_stats',
        python_callable=calculate_stats,
        op_kwargs=dict(
            s3_conn_id='deXX_minio',
            bucket_name='EMPLOYEE_ID',
            data_object_key='de07/data/user_events.json',
            result_object_key='de07/data/stats.csv',
        ),
    )

    # placeholder task marking the point where a notification would be sent
    task_notify = DummyOperator(task_id='notify')

    # Dependencies: fetch_events -> calculate_stats -> notify
    task_fetch_events.set_downstream(task_calculate_stats)
    task_calculate_stats.set_downstream(task_notify)

