# ETL Glue Job for Harmonizing NYC Taxi Data Schema
We need to unify the schema of different parquet files of NYC Taxi data before we can analyze it using Athena or any other tool
- We process 5 years of data that is around 100-150M rows in total
- A pyspark script based Glue ETL job is used to harmonize the schema of the data
- Reads data from S3, harmonizes the schema, and writes the corrected data back to S3
- Create a Glue data catalog table from the processed data (s3)

In [112]:
import boto3
import sagemaker

In [113]:
# Configurations
job_name = "harmonize-nyc-taxi"
role_name = "AmazonSageMakerServiceCatalogProductsGlueRole"  # Change to your IAM role name
script_local_path = "scripts/harmonize_schema.py"
BUCKET = "sagemaker-us-east-1-205930620783"
PREFIX = "NYC_Taxi_Prediction"
script_local_path = "harmonize_schema_nyc_data.py"
script_s3_path = "s3://{}/{}/scripts".format(BUCKET, PREFIX)
input_path = "s3://{}/{}/data/raw_data".format(BUCKET, PREFIX)
output_path = "s3://{}/{}/data/schema_corrected_data".format(BUCKET, PREFIX)
glue_version = "4.0"
worker_type = "G.1X"
number_of_workers = 5
region = "us-east-1"

In [114]:
s3 = boto3.client("s3")
glue = boto3.client("glue")

### First Copy the Script to S3 Location

In [None]:
bucket, key = script_s3_path.replace("s3://", "").split("/",1)
print(bucket,key)
#help(s3.put_object)
s3.upload_file(Filename=script_local_path, Bucket=bucket, Key=f"{key}/harmonize_schema_nyc_data.py")

In [None]:
bucket, key = input_path.replace("s3://", "").split("/",1)
print(bucket,key)
response = s3.list_objects_v2(Bucket=bucket, Prefix=key)
#print(response)
files = [f"s3://{bucket}/{obj['Key']}" for obj in response['Contents'] if obj['Key'].endswith('.parquet')]
print(files[0])
print(len(files))

In [None]:
f"s3://{script_s3_path.split('/')[2]}/glue-temp/"

### Create Glue ETL Job

In [118]:
try: 
    response = glue.create_job(
        Name=job_name,
        Role=role_name,
        ExecutionProperty={"MaxConcurrentRuns": 1}, # This says if job is already running the second instance won't run 
        Command={
            "Name": "glueetl",
            "ScriptLocation": f"{script_s3_path}/harmonize_schema_nyc_data.py",
            "PythonVersion":"3"
        },
        GlueVersion=glue_version,
        WorkerType=worker_type,
        NumberOfWorkers=number_of_workers,
        DefaultArguments={
            "--job-language": "python",
            "--enable-continuous-cloudwatch-log": "true",
            "--enable-metrics": "",
            "--input_path": input_path,
            "--output_path": output_path,
            "--TempDir": f"s3://{script_s3_path.split('/')[2]}/glue-temp/"
        },
        MaxRetries=0,
        Timeout=60
    )
    print(f"Glue Job created: {response['Name']}")

except glue.exceptions.AlreadyExistsException:
    print(f"Glue Job already exists: {job_name}")

Glue Job created: harmonize-nyc-taxi


### Run the Glue Job

In [None]:
response = glue.start_job_run(JobName=job_name)
run_id = response["JobRunId"] 
print(f"Job started: {job_name}, Run ID: {run_id}")

In [120]:
import time
while True:
    response = glue.get_job_run(JobName=job_name, RunId=run_id)
    state = response["JobRun"]["JobRunState"]
    print(f"Job status: {state}")
    if state in ["SUCCEEDED", "FAILED", "STOPPED", "TIMEOUT"]:
        break
    time.sleep(30)
    print(f"Final status: {state}")

Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: RUNNING
Final status: RUNNING
Job status: SUCCEEDED
