Pulling madmen variables in GCP

In [None]:
def pull_madmen_monthly(params):
    """Pull madmen variables for one month of driver records and save them.

    This function is handed to a Dataproc Spark job as its ``func``, so every
    import it needs is done inside the body. It reads a global ``spark`` that
    is not defined in this function (presumably the session supplied by the
    job runtime -- confirm).

    Steps: load the driver set, keep the rows whose monthly column equals
    ``monthly``, call ``loadByDriver`` keyed on the cleaned transaction id,
    write the result to ``madmen_gcs_dir`` with ``save_pig`` and send a
    completion e-mail.

    Args:
        params: dict with the following keys.
            var_list: variables passed to ``loadByDriver``.
            driver_date_column: driver date column, without the ``driver_``
                prefix (the prefix is added here).
            driver_monthly_column: driver month column, without the
                ``driver_`` prefix (the prefix is added here).
            checkpoint: checkpoint passed to ``loadByDriver``.
            madmen_time_range: time range passed to ``loadByDriver``; also
                used in the e-mail subject.
            email_to: recipient of the completion e-mail.
            group_size: optional, defaults to -1; cast to ``int`` and passed
                as ``groupSize``.
            driver_gcs_dir: location the driver set is loaded from.
            madmen_gcs_dir: output location. ``<madmen_gcs_dir>_parquet`` is
                passed to ``loadByDriver`` as ``outputPath`` and is deleted
                both before the pull and after the final save.
            monthly: value the monthly column must equal.

    Returns:
        None.
    """
    import sys
    from datetime import datetime
    # Must happen before the py_dpu import below (presumably py_dpu is
    # resolved from this jar -- confirm).
    sys.path.append("dpu-latest.jar")

    start_ts = datetime.now()

    from pyspark.sql.functions import udf, col
    from pyspark.sql.types import StringType

    from py_dpu import load_pig, save_pig, rename_df, loadByDriver, HdfsManager
    from automation_utils.common.mail import send_email

    var_list = params.get("var_list")
    driver_date_column = params.get('driver_date_column')
    driver_monthly_column = params.get('driver_monthly_column')
    checkpoint = params.get('checkpoint')
    madmen_time_range = params.get('madmen_time_range')
    email_to = params.get("email_to")
    group_size = params.get("group_size", -1)
    driver_gcs_dir = params.get("driver_gcs_dir")
    madmen_gcs_dir = params.get("madmen_gcs_dir")
    monthly = params.get("monthly")

    # Intermediate output location handed to loadByDriver.
    tmp_parquet_dir = madmen_gcs_dir + "_parquet"

    @udf(returnType=StringType())
    def clean_trans_id(trans_id):
        # Spark hands SQL NULL to a Python UDF as None; propagate it instead
        # of failing the whole job with AttributeError on `.split`.
        if trans_id is None:
            return None
        # Keep only the part before the first '.'.
        return trans_id.split('.')[0]

    driver_df = load_pig(spark, driver_gcs_dir)
    driver_set = rename_df(spark, driver_df, prefix='driver_')
    driver_set = driver_set.withColumn('driver_trans_id_clean', clean_trans_id(driver_set['driver_trans_id']))
    # Column names from params are given without the prefix applied above.
    driver_date_column = 'driver_' + driver_date_column
    driver_monthly_column = 'driver_' + driver_monthly_column

    # Restrict the driver set to the requested month.
    filter_expr = f"{driver_monthly_column}='{monthly}'"
    print('filter expr', filter_expr)
    driver_set = driver_set.filter(filter_expr)
    print('filtered rec num:')
    driver_set.groupBy(driver_date_column).count().sort(col(driver_date_column)).show(1000)

    hdfs = HdfsManager(spark)
    # Clear any intermediate output left behind by a previous run.
    hdfs.delete(tmp_parquet_dir)

    print(f"{driver_date_column} from {madmen_time_range}")
    final_df = loadByDriver(
        spark,
        driver_set,
        checkpoint=checkpoint,
        time=madmen_time_range,
        variables=var_list,
        dateColumn=driver_date_column,
        driverKeys=['driver_trans_id_clean'],
        madmenKeys=['transaction'],
        outputPath=tmp_parquet_dir,
        groupSize=int(group_size),
    )
    print('pulled variable rec num:')
    final_df.groupBy(driver_date_column).count().sort(col(driver_date_column)).show(100)

    # Replace any previous final output, written with \x07 as the delimiter.
    hdfs.delete(madmen_gcs_dir)
    save_pig(spark, final_df, madmen_gcs_dir, "\x07")

    # The intermediate output is no longer needed once the final save is done.
    hdfs.delete(tmp_parquet_dir)

    run_seconds = (datetime.now() - start_ts).total_seconds()
    run_time_expr = f"job {run_seconds / 60:.2f} minutes to finish"
    send_email(email_to,
               subject=f"[{madmen_time_range}] data pull routine job done",
               content=f"data saved to {madmen_gcs_dir}, {run_time_expr}",
              )

    


In [None]:
import json
import os
from datetime import datetime

import pandas as pd
import pytz
from dateutil.relativedelta import relativedelta

import aml.cloud_v1 as cloud
from model_automation.gcp import dataproc_config


# Model name and the GCP project used when creating the Dataproc client.
model_name = 'guxia_ucc23'
gcp_project_id = 'ccg24-hrzana-gds-focus'

# Run date is pinned; it names the dated madmen output folder below.
# To stamp with the current time instead:
# run_dt = datetime.now(pytz.utc) + relativedelta(hours=8)
run_dt = datetime(2023, 11, 3)

# Driver input location and the dated madmen output root on GCS.
driver_gcs_dir = 'gs://pypl-bkt-rsh-row-std-gds-focus/user/guxia/UCC23_V2/data/driver/20231101/20220401_20230731'
madmen_gcs_dir = f"gs://pypl-bkt-rsh-row-std-gds-focus/user/guxia/UCC23_V2/data/madmen/{run_dt:%Y%m%d}"
print(madmen_gcs_dir)

# One month-end timestamp per month between the driver start and end dates.
driver_start_dt = datetime(2022, 4, 1)
driver_end_dt = datetime(2022, 6, 30)
dates = pd.date_range(driver_start_dt, driver_end_dt, freq='1M')

# Madmen variables to pull.
all_var = ['fd_super_cookie']



In [None]:
# Build one parameter dict per month; each dict is later passed as `params`
# to its own job.
job_params = []
for d in dates[:2]:  # NOTE(review): only the first two entries are used -- confirm this is intended
    # `d` is a month-end date: snap to day 1 for the window start and extend
    # to the last second of `d` for the window end.
    job_start_dt = d + relativedelta(day=1)
    job_end_dt = d + relativedelta(days=1, seconds=-1)
    monthly = f"{d:%Y-%m}"
    madmen_time_range = f"{job_start_dt:%Y-%m-%d %H:%M:%S} to {job_end_dt:%Y-%m-%d %H:%M:%S}"
    print(monthly, madmen_time_range)

    # Each month gets its own output sub-folder named <start>_<end>.
    params = dict(
        var_list=all_var[:10],
        driver_date_column='pmt_start_date',
        driver_monthly_column='monthly',
        checkpoint='ConsolidatedFunding',
        madmen_time_range=madmen_time_range,
        email_to='guxia@paypal.com',
        driver_gcs_dir=driver_gcs_dir,
        madmen_gcs_dir=f"{madmen_gcs_dir}/{job_start_dt:%Y%m%d}_{job_end_dt:%Y%m%d}",
        monthly=monthly,
    )
    print(json.dumps(params, indent=4))
    job_params.append(params)


In [None]:

# Submit every job without blocking, then wait on them one at a time and
# store each job's log under ./log.
gcp_client = cloud.DataProcClient(gcp_project=gcp_project_id)

job_ids = []
for params in job_params:
    job_id = gcp_client.create_spark_job(
        func=pull_madmen_monthly,  # function run on dataproc
        params=params,  # function kwargs
        packages_to_install=['automation_utils==0.3.0', 'gcsfs'],
        wait_for_completion=False,
        **dataproc_config['medium'],  # dataproc config
    )
    print(f'job created: {job_id}')
    job_ids.append(job_id)

for job_id in job_ids:
    print('job id', job_id)

    # Wait until the job has finished; a failure is reported but the log is
    # still fetched below.
    status = gcp_client.wait_job_for_completion(job_id)
    if status == "FAILED":
        print(f"job failed, job id: {job_id}")

    # The last path component of the job id is used as the local log file name.
    log_name = job_id.rsplit(os.sep, 1)[-1]
    os.makedirs('log', exist_ok=True)
    with open(os.path.join('log', log_name), 'w') as log_file:
        gcp_client.get_job_logs(job_id, file=log_file)

    print('finish running data job')
