In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Default location for managed databases and tables.
# Port 9000 matches every other hdfs:// URI used in this notebook; the
# previous value used 9870, which is the default NameNode *web UI* port in
# Hadoop 3 rather than the RPC endpoint (NOTE(review): confirm against the
# cluster's fs.defaultFS).
warehouse_location = 'hdfs://hdfs-nn:9000/trabalhoarvores'

# Session builder with Hive metastore support and the Delta Lake SQL
# extension/catalog. The chain is parenthesised so no trailing backslash is
# left dangling after the last call.
# NOTE(review): configure_spark_with_delta_pip may set spark.jars.packages
# itself -- confirm whether the explicit setting below is still needed.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Create the session, or reuse one that is already running.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
from pyspark.sql.functions import substring, avg, sum

# Load the 2015 New York tree census from the silver layer.
hdfs_path = "hdfs://hdfs-nn:9000/trabalhoarvores/silver/Projeto.db/new_york_tree_census_2015"

# No format is passed to the reader, so the session's default data source
# format applies.
tree_census = spark.read.load(hdfs_path)


In [3]:
from pyspark.sql.functions import count
# Gold aggregate: one row per tree status with the number of tree_id values.
status_count = count(tree_census.tree_id).alias("contagem_status")
gold_tree_census = (
    tree_census
    .groupBy("status")
    .agg(status_count)
)

# Bring the (small) aggregate to the driver to display it.
gold_tree_census.toPandas()


Unnamed: 0,status,contagem_status
0,Dead,13961
1,Alive,652173
2,Stump,17654


In [4]:
# (Re)create the gold Delta table that holds the tree count per status.
drop_gold_table_sql = """
    DROP TABLE IF EXISTS Projeto_gold.Tabela_arvores4
    """
create_gold_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_arvores4 (
        status VARCHAR(250),
        contagem_status LONG
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/trabalhoarvores/gold/Projeto_gold.db/Tabela_arvores4/'
    """

# Remove any previous definition before creating the table again.
spark.sql(drop_gold_table_sql)
spark.sql(create_gold_table_sql)
        

DataFrame[]

In [5]:
# Overwrite the Delta data at the gold table's location with the aggregate.
gold_table_path = "hdfs://hdfs-nn:9000/trabalhoarvores/gold/Projeto_gold.db/Tabela_arvores4/"
(
    gold_tree_census.write
    .format("delta")
    .mode("overwrite")
    .save(gold_table_path)
)

In [6]:
# Read the gold table back through the catalog to verify what was written.
gold_table_df = spark.table("Projeto_gold.Tabela_arvores4")
gold_table_df.toPandas()


Unnamed: 0,status,contagem_status
0,Dead,13961
1,Alive,652173
2,Stump,17654


In [7]:
# Generate the symlink-format manifest for the Delta table at the gold path.
generate_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/trabalhoarvores/gold/Projeto_gold.db/Tabela_arvores4/`
"""
spark.sql(generate_manifest_sql).show()

++
||
++
++



In [8]:
# (Re)create an external table whose location is the Delta table's
# _symlink_format_manifest directory (presumably so Presto can query the
# data, given the table name -- confirm).
drop_presto_table_sql = """
DROP TABLE IF EXISTS Projeto_gold.Tabela_arvores_presto4
"""
create_presto_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_arvores_presto4 (
        status VARCHAR(250),
        contagem_status LONG
        )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/trabalhoarvores/gold/Projeto_gold.db/Tabela_arvores4/_symlink_format_manifest/'
"""

# Run the statements in order, showing the (empty) result of each.
for statement in (drop_presto_table_sql, create_presto_table_sql):
    spark.sql(statement).show()

++
||
++
++

++
||
++
++

