In [3]:
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Source: the public Azure Open Datasets storage account holding the NYC yellow-taxi
# data (hosted in EastUS, roughly 50GB).
# The SAS token is intentionally left empty.
blob_account_name = "azureopendatastorage"
blob_container_name = "nyctlc"
blob_relative_path = "yellow"
blob_sas_token = r""

# (container, account) pair shared by the path and the SAS configuration key.
_blob_location = (blob_container_name, blob_account_name)

# Let Spark read the blob container remotely by registering the SAS token for it.
# NOTE: a Synapse workspace with DEP (data exfiltration protection) enabled blocks
# this access, so run this in a workspace without DEP.
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (_blob_location + (blob_relative_path,))
_sas_conf_key = 'fs.azure.sas.%s.%s.blob.core.windows.net' % _blob_location
spark.conf.set(_sas_conf_key, blob_sas_token)
print('Source blob path: ' + wasbs_path)

# Destination: your own ADLS Gen2 account, container and folder.
# Synapse authenticates to its default ADLS storage automatically as the current user.
# On Databricks or another Spark runtime, supply a SAS token through the Spark conf
# instead, the same way it is done for the source container.
adls_account_name = '<your-adls-account-name>'
adls_container_name = '<your-container-name>'
adls_relative_path = '<your-folder-name>'

# abfss URI layout: container@account host, followed by the folder (trailing slash kept).
_adls_template = 'abfss://%s@%s.dfs.core.windows.net/%s/'
adls_path = _adls_template % (
    adls_container_name,
    adls_account_name,
    adls_relative_path,
)
print('Target blob path: ', adls_path, sep='')

# Read the source dataset (parquet) from the remote blob container.
df = spark.read.parquet(wasbs_path)

# Add hashCol: the pickup and dropoff timestamps, each rendered as a 14-digit
# string and concatenated.
# "HH" (hour-of-day, 00-23) is used instead of "hh" (clock-hour-of-am-pm, 01-12):
# with "hh", 01:00 and 13:00 both format as "01", so trips twelve hours apart
# would produce the same hashCol value.
timestamp_pattern = "yyyyMMddHHmmss"
df1 = df.withColumn(
    "hashCol",
    concat(
        date_format('tpepPickupDateTime', timestamp_pattern),
        date_format('tpepDropoffDateTime', timestamp_pattern),
    ),
)

# Copy the ~50GB dataset from the Azure open dataset source (EastUS) into your ADLS
# storage as parquet, without partitioning (50GB ~ 500 files ~ 100MB per file).
# mode='overwrite' replaces whatever already exists at adls_path.
df1.write.parquet(adls_path, mode='overwrite')

StatementMeta(spark32large, 10, 3, Finished, Available)

Source blob path: wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/yellow
