Glue job
Vaquar Khan edited this page on Oct 28, 2022 · 2 revisions
import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME]
# Glue job: read every table registered in one Glue Data Catalog database
# and write each of them to S3 in Parquet format, one S3 prefix per table.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Standard Glue job bootstrap: Spark context -> Glue context -> job handle.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

client = boto3.client('glue', region_name='ap-southeast-2')
databaseName = 'tpc-ds-csv'
# print() with a single argument behaves the same on Python 2 and Python 3.
print('\ndatabaseName: ' + databaseName)

# get_tables is a paginated API: a single call only returns the first page.
# Walk all pages so that no table in the database is silently skipped.
tableList = []
for page in client.get_paginator('get_tables').paginate(DatabaseName=databaseName):
    tableList.extend(page['TableList'])

for table in tableList:
    tableName = table['Name']
    print('\n-- tableName: ' + tableName)

    # transformation_ctx is suffixed with the table name so that each table
    # gets its own context key instead of all tables sharing one.
    # NOTE(review): this changes the bookmark keys if job bookmarks were
    # already enabled for this job - confirm before deploying.
    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database=databaseName,
        table_name=tableName,
        transformation_ctx="datasource0_" + tableName,
    )

    # Each table is written under its own prefix in the target bucket.
    datasink4 = glueContext.write_dynamic_frame.from_options(
        frame=datasource0,
        connection_type="s3",
        connection_options={"path": "s3://aws-glue-tpcds-parquet/" + tableName + "/"},
        format="parquet",
        transformation_ctx="datasink4_" + tableName,
    )

# Commit once, after all tables have been processed.
job.commit()