In [None]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
import datetime as dt
import requests
from airflow.exceptions import AirflowException
import tempfile
import pandas as pd
import os


# download wikipedia pageviews data
def get_data(s3_conn_id, bucket_name, object_key_prefix, **context):
    """Download one hour of Wikipedia pageviews and upload the archive to S3.

    The hour to fetch is taken from ``context['execution_date']``. The gzip
    archive is fetched over HTTP and stored unchanged under
    ``<object_key_prefix>/task:get_data/<year>/<year>-<month>/``, using the
    same file name as the source.

    Args:
        s3_conn_id: Airflow connection id passed to ``S3Hook``.
        bucket_name: Destination bucket.
        object_key_prefix: Leading path component of the destination key.
        **context: Airflow task context; only ``execution_date`` is read.

    Raises:
        AirflowException: If the HTTP status is not 200, or if writing the
            temp file or uploading it to S3 fails.
        requests.RequestException: If the HTTP request itself fails
            (connection error, timeout); propagated unchanged.
    """
    year, month, day, hour, *_ = context['execution_date'].timetuple()
    url = (
        "http://10.34.124.114:7080/other/pageviews/"
        f"{year}/{year}-{month:0>2}/"
        f"pageviews-{year}{month:0>2}{day:0>2}-{hour:0>2}0000.gz"
    )
    print(url)  # print the URL so it can be checked in the task log
    # (connect, read) timeouts: without them a stalled server would block
    # the task forever instead of failing and becoming eligible for retry.
    resp = requests.get(url, timeout=(10, 60))
    if resp.status_code != 200:
        raise AirflowException(f"wikipedia pageviews data download fail, the resp status[{resp.status_code}]")
    # Build the S3Hook from the Airflow connection.
    s3_hook = S3Hook(aws_conn_id=s3_conn_id)
    # Use the same file hierarchy pattern as the source URL.
    object_key = (
        f"{object_key_prefix}/task:get_data/"
        f"{year}/{year}-{month:0>2}/"
        f"pageviews-{year}{month:0>2}{day:0>2}-{hour:0>2}0000.gz"
    )
    # Stage the downloaded archive in a temp file and upload it to S3.
    with tempfile.NamedTemporaryFile('wb+') as fp:
        temp_filename = fp.name  # path of the temp file
        try:
            fp.write(resp.content)
            # Flush so the upload, which reopens the file by name, sees all bytes.
            fp.flush()
            s3_hook.load_file(filename=temp_filename,
                              bucket_name=bucket_name,
                              key=object_key,
                              replace=True)
            print(f'put wikipedia pageviews data to s3: [{bucket_name}] -> {object_key}, success!')
        except Exception as e:
            # Chain the cause so the original traceback stays in the task log.
            raise AirflowException(f"put wikipedia pageviews data to s3 fail:{e}") from e

# task to unzip and extract data
def extract_data(s3_conn_id, bucket_name, object_key_prefix, **context):
    """Filter one hour of pageviews down to a few titles and upload it as CSV.

    Downloads the gzip archive stored by the ``get_data`` task for the hour in
    ``context['execution_date']``, keeps rows whose ``domain`` is ``en`` and
    whose ``title`` is one of Google, Amazon, Apple, Microsoft or Facebook,
    and uploads the result as CSV under
    ``<object_key_prefix>/task:extract_data/<year>/<year>-<month>/``.

    Args:
        s3_conn_id: Airflow connection id passed to ``S3Hook``.
        bucket_name: Bucket holding both the source and the target object.
        object_key_prefix: Leading path component of both object keys.
        **context: Airflow task context; only ``execution_date`` is read.

    Raises:
        AirflowException: If downloading/reading the source archive fails, or
            if writing/uploading the extracted CSV fails. The message names
            the phase that failed.
    """
    # Datetime elements for the data partition.
    year, month, day, hour, *_ = context['execution_date'].timetuple()
    # Create the S3 connection.
    s3_hook = S3Hook(aws_conn_id=s3_conn_id)
    # Both keys use the same file hierarchy pattern.
    source_object_key = (
        f"{object_key_prefix}/task:get_data/"
        f"{year}/{year}-{month:0>2}/"
        f"pageviews-{year}{month:0>2}{day:0>2}-{hour:0>2}0000.gz"
    )
    target_object_key = (
        f"{object_key_prefix}/task:extract_data/"
        f"{year}/{year}-{month:0>2}/"
        f"pageviews-{year}{month:0>2}{day:0>2}-{hour:0>2}0000.csv"
    )

    # Phase 1: download the archive to a local temp file and filter it.
    source_temp_filename = None
    try:
        source_temp_filename = s3_hook.download_file(bucket_name=bucket_name, key=source_object_key)
        print(f'download wikipedia pageviews data to local: {source_temp_filename}, success!')
        # Read the wiki pageviews into a dataframe.
        print(f'read and extract:{source_temp_filename}!')
        df = pd.read_csv(source_temp_filename,
                         names=['domain', 'title', 'view_count', 'response_size'],
                         compression='gzip',
                         delimiter=' ')
        df2 = df[
            (df['domain'] == 'en') &
            (df['title'].isin(["Google", "Amazon", "Apple", "Microsoft", "Facebook"]))
        ]
        # info() writes its summary to stdout itself and returns None.
        df2.info()
        print(df2.head())
    except Exception as e:
        raise AirflowException(f"download wikipedia pageviews data fail:{e}") from e
    finally:
        # Always remove the downloaded temp file, including when parsing fails.
        if source_temp_filename is not None and os.path.exists(source_temp_filename):
            os.remove(source_temp_filename)

    # Phase 2: write the filtered dataframe as CSV and upload it to S3.
    # Kept outside the download try-block so an upload failure is not
    # reported as a download failure.
    try:
        with tempfile.NamedTemporaryFile('w+') as fp2:
            target_temp_filename = fp2.name  # path of the temp file
            df2.to_csv(target_temp_filename, index=False)
            s3_hook.load_file(filename=target_temp_filename,
                              bucket_name=bucket_name,
                              key=target_object_key,
                              replace=True)
            print(f'upload wikipedia pageviews extract data to s3: {target_temp_filename}, success!')
    except Exception as e:
        raise AirflowException(f"upload wikipedia pageviews data to s3 fail:{e}") from e



# Default arguments applied to every task in the DAG.
# 'owner' is the DAG developer, e.g. employee 8703147.
default_args = {'owner': 'EMPLOYEE_ID'}

# Hourly DAG over the fixed window 2019-07-01 .. 2019-07-02; catchup is on so
# every hour in the window is scheduled, one active run at a time.
with DAG(
    dag_id="deXX_pageviews_etl",  # the prefix must be the tenant id, e.g. de00
    description="dag to download wikipedia pageviews",
    default_args=default_args,
    schedule_interval="@hourly",
    start_date=dt.datetime(2019, 7, 1),
    end_date=dt.datetime(2019, 7, 2),
    catchup=True,
    max_active_runs=1,
    # Assign the DAG to a team (tenant id) and set that team's permissions.
    access_control={'deXX': {'can_read', 'can_edit'}},
    tags=['de08'],
) as dag:
    # Download the wikipedia pageviews archive and store it in S3.
    task_get_data = PythonOperator(
        task_id='get_data',
        python_callable=get_data,
        op_kwargs=dict(
            s3_conn_id='deXX_minio',
            bucket_name='EMPLOYEE_ID',  # any alphabetic characters must be lowercase
            object_key_prefix='de08',
        ),
    )

    # Unzip the archive, extract the rows of interest and store them in S3.
    task_extract_data = PythonOperator(
        task_id='extract_data',
        python_callable=extract_data,
        op_kwargs=dict(
            s3_conn_id='deXX_minio',
            bucket_name='EMPLOYEE_ID',  # any alphabetic characters must be lowercase
            object_key_prefix='de08',
        ),
    )

    # Placeholder for the load step.
    task_load_data = DummyOperator(task_id='load_data')

    # Dependencies: get_data -> extract_data -> load_data.
    task_get_data >> task_extract_data >> task_load_data