# data movement

## copy from HDFS to BQ

In [None]:
%%bash
#!/usr/bin/env bash

# 2023-11-02: deprecated
#
# Runs a distcp copy from ${source_dir} to ${target_dir} as the dt_hdp_ops
# account, after obtaining a Kerberos ticket from that account's keytab.

# Stop at the first failing command (e.g. a failed kinit) instead of carrying
# on to distcp, and treat unset variables as errors.
set -euo pipefail

cd "$(dirname "$0")"

pwd

# Fill in both paths before running.
source_dir=''
target_dir=''

# distcp is invoked with -overwrite and -delete, so refuse to run with a
# blank path rather than hand it an incomplete argument list.
if [[ -z "${source_dir}" || -z "${target_dir}" ]]; then
    echo "source_dir and target_dir must both be set before running distcp" >&2
    exit 1
fi

sudo -Eu dt_hdp_ops /opt/conda/bin/kinit -kt /etl/LVS/dt_etldata6/dt_hdp_ops/.keytabs/dt_hdp_ops.keytab dt_hdp_ops

# Paths are quoted so that spaces or glob characters are passed through to
# distcp unchanged.
sudo -Eu dt_hdp_ops \
    /etl/LVS/dt_etldata6/dt_hdp_ops/distcp/chadoop distcp \
    -Dmapreduce.map.java.opts=-Xmx2048m \
    -Dmapreduce.map.memory.mb=2048 \
    -overwrite \
    -delete \
    "${source_dir}" "${target_dir}"

## V2: use the `maglev` magic in the maglev kernel

In [None]:
# Placeholders for the `%maglev copy` call in the next cell; set both before running it.
source_data_dir: str = ''  # directory the data is read from
target_data_dir: str = ''  # directory the data is written to

In [None]:
# Invoke the `maglev` line magic's `copy` command on the two paths defined above.
# NOTE(review): relies on IPython expanding $source_data_dir / $target_data_dir from the
# Python namespace, and on the kernel providing the `maglev` magic - confirm.
%maglev copy $source_data_dir $target_data_dir

# submit job to GCP cluster

In [None]:
import importlib

import aml.cloud_v1 as cloud
from model_automation.gcp import dataproc_config

import job_funcs
# Re-import job_funcs so edits to job_funcs.py are picked up without restarting the kernel.
importlib.reload(job_funcs)
from job_funcs import data_func


# Client used by the following cell to submit the job and poll its status.
gcp_project_id = 'ccg24-hrzana-gds-focus'
gcp_client = cloud.DataProcClient(gcp_project_id)

bq_project = 'pypl-bods'
bq_dataset = 'rsh_row_gds_focus'
packages_to_install = ['automation_utils==0.3.0', 'gcsfs']

# Keyword arguments handed to the job function.
# object here must be json serializable
params = {
    'gcs_data_dir': 'gs://pypl-bkt-rsh-row-std-gds-focus/user/guxia/ucc_latam/cc_trust/simulation',
    'bq_project': bq_project,
    'bq_dataset': bq_dataset,
}


In [None]:



import os

# create a one-time cluster and submit a job
job_id = gcp_client.create_spark_job(
    # function run on dataproc: `data_func` is the function this notebook imports
    # from job_funcs.py, the same file listed in pyfiles_to_import below
    func=data_func,
    packages_to_install=packages_to_install,
    pyfiles_to_import=['job_funcs.py'],
    custom_billing_tag='guxia',
    # function kwargs
    params=params,
    # dataproc config
    **dataproc_config['bq_only'],
)

# wait until job finished; a failure is reported but the log is still saved below
status = gcp_client.wait_job_for_completion(job_id)
if status == "FAILED":
    print(f"job failed, job id: {job_id}")

# save job log to local, under ./log/<last path component of the job id>
log_name = job_id.split(os.sep)[-1]
os.makedirs('log', exist_ok=True)
with open(os.path.join('log', log_name), 'w') as f:
    gcp_client.get_job_logs(job_id, file=f)

print('finish running data job')


# move driver from BQ to GCS

In [None]:
from datetime import datetime

import aml.cloud_v1 as cloud
from model_automation.gcp import dataproc_config


# Run date; it appears in the GCS output path built below.
run_dt = datetime(2023, 11, 1)
print('running dt', run_dt)

# Date window of driver rows to export.
driver_start_dt = datetime(2023, 7, 1)
driver_end_dt = datetime(2023, 7, 2)
print(driver_end_dt)

bq_driver_table = 'pypl-bods.rsh_row_gds_focus.ucc_rmr_driver_sampled'
driver_dt_column = 'pmt_start_date'
# Output layout: .../driver/<run date>/<window start>_<window end>, all as YYYYMMDD.
gcs_driver_dir = f"gs://pypl-bkt-rsh-row-std-gds-focus/user/guxia/UCC23_V2/data/driver/{run_dt.strftime('%Y%m%d')}/{driver_start_dt.strftime('%Y%m%d')}_{driver_end_dt.strftime('%Y%m%d')}"
print(gcs_driver_dir)

gcp_project_id = 'ccg24-hrzana-gds-focus'
bq_project = 'pypl-bods'
bq_dataset = 'rsh_row_gds_focus'

In [None]:
def data_move_job(params):
    """Export a date-filtered slice of a BigQuery table to a GCS directory.

    Steps, in order:
      1. Build a ``select *`` query over ``bq_driver_table`` restricted to rows
         whose ``driver_dt_column`` is between ``driver_start_dt`` and
         ``driver_end_dt``.
      2. Store the query result under ``<gcs_driver_dir>_tmp``.
      3. Read it back as BEL-delimited (0x07) CSV with a header row and print
         the record count per ``driver_dt_column`` value (up to 1000 rows).
      4. Coalesce to 16 partitions and write to ``gcs_driver_dir`` with
         ``save_pig``.
      5. Delete the temporary directory, whether or not the steps above
         succeeded.

    Args:
        params: dict of job arguments. Required keys: ``driver_start_dt``,
            ``driver_end_dt``, ``bq_driver_table``, ``gcs_driver_dir`` and
            ``driver_dt_column``. Optional keys: ``bq_project`` and
            ``bq_dataset`` (passed through as ``None`` when absent).

    Raises:
        ValueError: if any required key is missing or empty.

    NOTE(review): ``spark`` is not defined in this function; presumably the
    Dataproc job runner provides it as a global - confirm.
    """
    # Fail fast with a clear message instead of interpolating None into the
    # query or hitting a TypeError when the temp path is built.
    required_keys = (
        'driver_start_dt',
        'driver_end_dt',
        'bq_driver_table',
        'gcs_driver_dir',
        'driver_dt_column',
    )
    missing_keys = [key for key in required_keys if not params.get(key)]
    if missing_keys:
        raise ValueError(f"missing required job params: {', '.join(missing_keys)}")

    import sys
    sys.path.append("dpu-latest.jar")

    from pyspark.sql.functions import col

    from py_dpu import BigqueryManager, HdfsManager
    from py_dpu import save_pig

    driver_start_dt = params['driver_start_dt']
    driver_end_dt = params['driver_end_dt']
    bq_driver_table = params['bq_driver_table']
    gcs_driver_dir = params['gcs_driver_dir']
    driver_dt_column = params['driver_dt_column']
    bq_project = params.get('bq_project', None)
    bq_dataset = params.get('bq_dataset', None)

    print('driver start dt', driver_start_dt)
    print('driver end dt', driver_end_dt)

    # write your job code here
    query = f"""
        select *
        from {bq_driver_table}
        where
            {driver_dt_column} between '{driver_start_dt}' and '{driver_end_dt}'
    """
    print('query to run', query)

    bq_mgr = BigqueryManager(spark)
    hdfs_mgr = HdfsManager(spark)

    tmp_gcs_driver_dir = gcs_driver_dir + '_tmp'
    # Clear anything left at the temp path before writing to it.
    hdfs_mgr.delete(tmp_gcs_driver_dir)
    try:
        bq_mgr.store_query_to_hdfs(query, project=bq_project, dataset=bq_dataset, output_path=tmp_gcs_driver_dir)
        print(f'data saved to {tmp_gcs_driver_dir}')

        driver_df = spark.read.options(delimiter='\x07', header=True).csv(f"{tmp_gcs_driver_dir}/*csv")

        print('driver daily rec num')
        driver_df.groupby(driver_dt_column).count().sort(col(driver_dt_column)).show(1000)

        save_pig(spark, driver_df.coalesce(16), gcs_driver_dir, delimiter='\x07')
        print(f'data saved to {gcs_driver_dir}')
    finally:
        # Remove the temp copy even when a step above fails, so a failed run
        # does not leave it behind.
        hdfs_mgr.delete(tmp_gcs_driver_dir)


In [None]:


import json
import os

gcp_client = cloud.DataProcClient(gcp_project_id)

# Keyword arguments for data_move_job; dates are sent as YYYY-MM-DD strings.
# object here must be json serializable
params = {
    'driver_start_dt': driver_start_dt.strftime('%Y-%m-%d'),
    'driver_end_dt': driver_end_dt.strftime('%Y-%m-%d'),
    'bq_driver_table': bq_driver_table,
    'gcs_driver_dir': gcs_driver_dir,
    'driver_dt_column': driver_dt_column,
    'bq_project': bq_project,
    'bq_dataset': bq_dataset,
}

print('job params', json.dumps(params, indent=4))

packages_to_install = ['automation_utils==0.3.0', 'gcsfs']

# submit job
job_id = gcp_client.create_spark_job(
    # function run on dataproc
    func=data_move_job,
    packages_to_install=packages_to_install,
    pyfiles_to_import=[],
    custom_billing_tag='guxia',
    # function kwargs
    params=params,
    # dataproc config
    **dataproc_config['bq_only'],
)

# wait until job finished; a failure is reported but the log is still saved below
status = gcp_client.wait_job_for_completion(job_id)
if status == "FAILED":
    print(f"job failed, job id: {job_id}")

# save job log to local, under ./log/<last path component of the job id>
log_name = job_id.split(os.sep)[-1]
os.makedirs('log', exist_ok=True)
with open(os.path.join('log', log_name), 'w') as f:
    gcp_client.get_job_logs(job_id, file=f)

print('finish running data movement job')

