In [1]:
import os

# Base URIs of the two filesystems this notebook moves data between.
# The V3IO home URL is taken from the environment (None when the variable is unset);
# the HDFS endpoint is hard-coded.
v3io_fs = os.environ.get('V3IO_HOME_URL')
hdfs_fs = 'hdfs://hadoop-master.hadoop-domain.default-tenant.svc.cluster.local:9000'

# Echo both values so the resolved locations are visible in the cell output.
print("HDFS:", hdfs_fs)
print("V3IO:", v3io_fs)

HDFS: hdfs://hadoop-master.hadoop-domain.default-tenant.svc.cluster.local:9000
V3IO: v3io://users/admin


In [2]:
# Obtain a Kerberos ticket for the HDFS service principal using the keytab file
# (-k -t <keytab>), so no interactive password prompt is needed.
!kinit -k -t /User/conf/kerberos/krb5.keytab hdfs/hadoop-master.hadoop-domain.default-tenant.svc.cluster.local@EXAMPLE.COM
# List the ticket cache to confirm a ticket was issued and to see its expiry time.
!klist

Ticket cache: FILE:/User/conf/kerberos/krb5_ccache
Default principal: hdfs/hadoop-master.hadoop-domain.default-tenant.svc.cluster.local@EXAMPLE.COM

Valid starting     Expires            Service principal
01/14/21 17:32:54  01/15/21 17:32:54  krbtgt/EXAMPLE.COM@EXAMPLE.COM


In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session. It targets the in-cluster
# Kubernetes API server as master and passes the Kerberos principal as a Spark setting.
spark = (
    SparkSession.builder
    .appName("Example")
    .master('k8s://https://kubernetes.default.svc:443')
    .config(
        'spark.kerberos.principal',
        'hdfs/hadoop-master.hadoop-domain.default-tenant.svc.cluster.local@EXAMPLE.COM',
    )
    .getOrCreate()
)


In [4]:
# v3io_fs comes from the V3IO_HOME_URL environment variable and is None when that
# variable is unset. Fail here with a clear message instead of an opaque TypeError
# from path building.
if not v3io_fs:
    raise EnvironmentError("V3IO_HOME_URL is not set; cannot locate the V3IO input file")

# Build the URI with explicit '/' separators. os.path.join applies the host OS's
# filesystem path rules, which are not guaranteed to match URI syntax.
v3io_base = v3io_fs if v3io_fs.endswith('/') else v3io_fs + '/'
v3io_path = v3io_base + 'examples/demo.csv'
print(v3io_path)

# Load the CSV from V3IO and preview the first rows.
# NOTE(review): no header option is passed, so the file's header line is loaded as
# an ordinary data row under the default _c0.._cN column names.
v3io_df = spark.read.csv(v3io_path)
v3io_df.show()

v3io://users/admin/examples/demo.csv
+---+--------------------+--------------+-----+-----+----+-----+------+-----------+--------------------+------+---------+-----------+
|_c0|                 _c1|           _c2|  _c3|  _c4| _c5|  _c6|   _c7|        _c8|                 _c9|  _c10|     _c11|       _c12|
+---+--------------------+--------------+-----+-----+----+-----+------+-----------+--------------------+------+---------+-----------+
| id|              street|          city|  zip|state|beds|baths|sq__ft|       type|           sale_date| price| latitude|  longitude|
|  1|        3526 HIGH ST|    SACRAMENTO|95838|   CA|   2|    1|   836|Residential|Wed May 21 00:00:...| 59222|38.631913|-121.434879|
|  2|         51 OMAHA CT|    SACRAMENTO|95823|   CA|   3|    1|  1167|Residential|Wed May 21 00:00:...| 68212|38.478902|-121.431028|
|  3|      2796 BRANCH ST|    SACRAMENTO|95815|   CA|   2|    1|   796|Residential|Wed May 21 00:00:...| 68880|38.618305|-121.443839|
|  4|    2805 JANETTE WAY

In [5]:
# Build the HDFS destination URI with explicit '/' separators. os.path.join applies
# the host OS's filesystem path rules, which are not guaranteed to match URI syntax.
hdfs_base = hdfs_fs if hdfs_fs.endswith('/') else hdfs_fs + '/'
output_path = hdfs_base + 'user/hdfs/output.csv'
print(output_path)

# Write the DataFrame read from V3IO to HDFS as CSV. mode='overwrite' replaces any
# existing output at this location, which keeps the cell safe to re-run.
v3io_df.write.csv(output_path, mode='overwrite')

hdfs://hadoop-master.hadoop-domain.default-tenant.svc.cluster.local:9000/user/hdfs/output.csv


In [6]:
# Read back the CSV that was just written to HDFS and preview it, to confirm
# the copy from V3IO round-trips.
hdfs_df = spark.read.csv(path=output_path)
hdfs_df.show()

+---+--------------------+--------------+-----+-----+----+-----+------+-----------+--------------------+------+---------+-----------+
|_c0|                 _c1|           _c2|  _c3|  _c4| _c5|  _c6|   _c7|        _c8|                 _c9|  _c10|     _c11|       _c12|
+---+--------------------+--------------+-----+-----+----+-----+------+-----------+--------------------+------+---------+-----------+
| id|              street|          city|  zip|state|beds|baths|sq__ft|       type|           sale_date| price| latitude|  longitude|
|  1|        3526 HIGH ST|    SACRAMENTO|95838|   CA|   2|    1|   836|Residential|Wed May 21 00:00:...| 59222|38.631913|-121.434879|
|  2|         51 OMAHA CT|    SACRAMENTO|95823|   CA|   3|    1|  1167|Residential|Wed May 21 00:00:...| 68212|38.478902|-121.431028|
|  3|      2796 BRANCH ST|    SACRAMENTO|95815|   CA|   2|    1|   796|Residential|Wed May 21 00:00:...| 68880|38.618305|-121.443839|
|  4|    2805 JANETTE WAY|    SACRAMENTO|95815|   CA|   2|    

In [7]:
# Shut down the Spark session now that the notebook is done with it.
spark.stop()