In [1]:
import os

# Root URIs of the two file systems used in this notebook.
# The V3IO home URL is read from the environment (os.getenv yields None when
# V3IO_HOME_URL is unset); the HDFS namenode address is a fixed literal.
v3io_fs = os.getenv('V3IO_HOME_URL')
hdfs_fs = 'hdfs://hadoop-master.hadoop-domain.default-tenant.svc.cluster.local:9000'

# Echo both roots, one per line.
print(f"HDFS: {hdfs_fs}", f"V3IO: {v3io_fs}", sep="\n")

HDFS: hdfs://hadoop-master.hadoop-domain.default-tenant.svc.cluster.local:9000
V3IO: v3io://users/admin


In [2]:
# Kerberos / Hadoop client settings. These values are exported to the process
# environment and passed to the Spark session configuration in later cells.
hadoop_conf_dir = '/User/spark/hadoop/'
krb5_config_file = '/User/spark/hadoop/krb5.conf'
krb5_cc_name = 'FILE:/User/spark/krb5kdc_ccache'

# JVM flags: point Java at the krb5.conf above and switch on the Kerberos and
# SPNEGO debug properties. Adjacent literals are concatenated into one string.
jvm_config_option = (
    "-Dsun.zip.disableMemoryMapping=true "
    f"-Djava.security.krb5.conf={krb5_config_file} "
    "-Dsun.security.krb5.debug=true "
    "-Dsun.security.spnego.debug=true"
)

# Echo the effective settings, one per line.
print(
    f"KRB5CCNAME: {krb5_cc_name}",
    f"HADOOP_CONF_DIR: {hadoop_conf_dir}",
    f"KRB5_CONFIG: {krb5_config_file}",
    f"JVM config: {jvm_config_option}",
    sep="\n",
)

KRB5CCNAME: FILE:/User/spark/krb5kdc_ccache
HADOOP_CONF_DIR: /User/spark/hadoop/
KRB5_CONFIG: /User/spark/hadoop/krb5.conf
JVM config: -Dsun.zip.disableMemoryMapping=true -Djava.security.krb5.conf=/User/spark/hadoop/krb5.conf -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug=true


In [3]:
# Export the Kerberos/Hadoop settings into this process's environment in a
# single call.
os.environ.update({
    'KRB5CCNAME': krb5_cc_name,
    'HADOOP_CONF_DIR': hadoop_conf_dir,
    'KRB5_CONFIG': krb5_config_file,
})

# Left disabled: passing the JVM flags through SPARK_SUBMIT_OPTS.
# os.environ['SPARK_SUBMIT_OPTS'] = jvm_config_option

In [4]:
# Request a Kerberos ticket for the HDFS service principal using the keytab
# file (shell escape), then list the ticket cache to confirm it was issued.
!kinit -k -t /User/spark/krb5.keytab hdfs/hadoop-master.hadoop-domain.default-tenant.svc.cluster.local@EXAMPLE.COM
!klist

Ticket cache: FILE:/User/spark/krb5kdc_ccache
Default principal: hdfs/hadoop-master.hadoop-domain.default-tenant.svc.cluster.local@EXAMPLE.COM

Valid starting     Expires            Service principal
12/22/20 12:04:53  12/23/20 12:04:53  krbtgt/EXAMPLE.COM@EXAMPLE.COM


In [5]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session: register the V3IO file system
# implementations and pass the Kerberos/Hadoop settings to the driver and
# executor JVM options and to the executor environment.
spark = (
    SparkSession.builder
    .appName("Example")
    .config('fs.v3io.impl', 'io.iguaz.v3io.hcfs.V3IOFileSystem')
    .config('fs.AbstractFileSystem.v3io.impl', 'io.iguaz.v3io.hcfs.V3IOAbstractFileSystem')
    .config('spark.driver.extraJavaOptions', jvm_config_option)
    .config('spark.executor.extraJavaOptions', jvm_config_option)
    .config('spark.executorEnv.KRB5_CONFIG', krb5_config_file)
    .config('spark.executorEnv.KRB5CCNAME', krb5_cc_name)
    .config('spark.executorEnv.HADOOP_CONF_DIR', hadoop_conf_dir)
    # --- Optional settings, left disabled; uncomment a line to enable it ---
    # This will enable using the Hadoop native libs (rather than the libs packaged with Spark)
    # .config('spark.executorEnv.LD_LIBRARY_PATH', '/hadoop/lib/native')
    # Run in local mode (not using remote executors)
    # .master('local')
    # These seem to only work in Spark >=3.0
    # .config('spark.kerberos.access.hadoopFileSystems', hdfs_fs)
    # .config('spark.kerberos.renewal.credentials', 'ccache')
    .getOrCreate()
)

In [None]:
# Build the HDFS URI with an explicit "/" separator. os.path.join uses the
# host OS path separator, which is not guaranteed to be "/" as a URI requires;
# rstrip avoids a doubled slash if the root ever gains a trailing "/".
hdfs_path = f"{hdfs_fs.rstrip('/')}/data.csv"
print(hdfs_path)

# Read the CSV over HDFS (no reader options are passed) and display the rows
# that show() prints by default.
hdfs_df = spark.read.csv(hdfs_path)
hdfs_df.show()

In [None]:
# Stop the Spark session.
# NOTE(review): the cells below this one still use `spark` and `hdfs_df`, so
# running the notebook top to bottom stops the session before they execute —
# consider moving this cell to the end of the notebook.
spark.stop()

In [None]:
# v3io_fs is read with os.getenv('V3IO_HOME_URL') and is therefore None when
# the variable is missing; fail with a clear message instead of an opaque
# TypeError from the path join.
if not v3io_fs:
    raise RuntimeError("V3IO_HOME_URL is not set; cannot build the V3IO path")

# Join URI segments with an explicit "/". os.path.join uses the host OS path
# separator, which is not guaranteed to be "/" as a URI requires.
v3io_path = '/'.join([v3io_fs.rstrip('/'), 'examples', 'demo.csv'])
print(v3io_path)

# Read the CSV from V3IO (no reader options are passed) and display it.
v3io_df = spark.read.csv(v3io_path)
v3io_df.show()

In [None]:
# Write the DataFrame that was read from HDFS back to HDFS in two formats,
# using overwrite mode for both targets.
parquet_target = f'{hdfs_fs}/output.parquet'
csv_target = f'{hdfs_fs}/output.csv'

hdfs_df.write.parquet(parquet_target, mode='overwrite')
hdfs_df.write.csv(csv_target, mode='overwrite')

In [None]:
import os

# Remove the Kerberos/Hadoop variables from this process's environment.
# pop() with a default is a no-op for names that are not set, and it also
# clears variables set to an empty string, which a truthiness check on
# os.environ.get() would skip.
for env_name in ('HADOOP_CONF_DIR', 'KRB5CCNAME', 'KRB5_CONFIG'):
    os.environ.pop(env_name, None)