In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# Root HDFS directory used as the default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/Projeto'

# The chain is wrapped in parentheses so it ends explicitly; a trailing
# backslash after the last call would silently depend on the next line being blank.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    # Hive metastore that stores the table definitions created by this notebook.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Enable Delta Lake SQL commands and the Delta-aware catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip() below is documented to set
    # spark.jars.packages itself; confirm whether this explicit pin is still needed.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment of the session used by every following cell.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
from pyspark.sql.functions import substring, avg, sum

# Load the squirrel census ("esquilos") data from its silver-layer location.
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/esquilos"

# NOTE(review): no explicit format is given, so the reader uses the session's
# default data source; confirm that this matches how the silver table was written.
silver_reader = spark.read
squirrel_census = silver_reader.load(hdfs_path)

# Display the loaded rows as a pandas DataFrame.
squirrel_census.toPandas()

Unnamed: 0,X,Y,Unique_Squirrel_ID,Hectare,Shift,Date,Hectare_Squirrel_Number,Age,Primary_Fur_Color,Highlight_Fur_Color,...,Kuks,Quaas,Moans,Tail_Flags,Tail_Twitches,Approaches,Indifferent,Runs_From,Other_Interactions,mes_registo
0,-73.9561344937861,40.7940823884086,37F-PM-1014-03,37F,PM,10142018,3,Sem Valor,Sem Valor,Sem Valor,...,False,False,False,False,False,False,False,False,Sem Valor,10
1,-73.9688574691102,40.7837825208444,21B-AM-1019-04,21B,AM,10192018,4,Sem Valor,Sem Valor,Sem Valor,...,False,False,False,False,False,False,False,False,Sem Valor,10
2,-73.9742811484852,40.775533619083,11B-PM-1014-08,11B,PM,10142018,8,Sem Valor,Gray,Sem Valor,...,False,False,False,False,False,False,False,False,Sem Valor,10
3,-73.9596413903948,40.7903128889029,32E-PM-1017-14,32E,PM,10172018,14,Adult,Gray,Sem Valor,...,False,False,False,False,False,False,False,True,Sem Valor,10
4,-73.9702676472613,40.7762126854894,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,Sem Valor,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3018,-73.9639431360458,40.7908677445466,30B-AM-1007-04,30B,AM,10072018,4,Adult,Gray,Sem Valor,...,False,False,False,False,False,False,False,True,Sem Valor,10
3019,-73.9704015859639,40.7825600069973,19A-PM-1013-05,19A,PM,10132018,5,Adult,Gray,White,...,False,False,False,False,False,False,True,False,Sem Valor,10
3020,-73.9665871993517,40.7836775064883,22D-PM-1012-07,22D,PM,10122018,7,Adult,Gray,"Black, Cinnamon, White",...,False,False,False,False,False,False,True,False,Sem Valor,10
3021,-73.9639941227864,40.7899152327912,29B-PM-1010-02,29B,PM,10102018,2,Sem Valor,Gray,"Cinnamon, White",...,False,False,False,False,False,False,True,False,Sem Valor,10


In [4]:
from pyspark.sql.functions import count
# Count the Unique_Squirrel_ID values within each Hectare; the result column
# is named "Contagem".
sightings_per_hectare = count(squirrel_census.Unique_Squirrel_ID).alias("Contagem")
gold_squirrel_census = squirrel_census.groupBy("Hectare").agg(sightings_per_hectare)

# Display the aggregated result as a pandas DataFrame.
gold_squirrel_census.toPandas()


Unnamed: 0,Hectare,Contagem
0,13C,9
1,08C,10
2,31I,4
3,20E,2
4,01B,27
...,...,...
334,38A,10
335,13E,24
336,30C,3
337,23D,1


In [5]:
# (Re)create the gold-layer table that holds the per-hectare squirrel counts.
# The table is dropped first so the CREATE statement can be executed again.
drop_gold_table_sql = """
    DROP TABLE IF EXISTS Projeto_gold.Tabela_esquilos
    """

# External Delta table: the metastore entry points at the HDFS location below.
create_gold_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_esquilos (
        Hectare String,
        Contagem LONG
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/gold/Projeto_gold.db/Tabela_esquilos/'
    """

spark.sql(drop_gold_table_sql)

# Kept as the last expression so the cell still shows the (empty) result frame.
spark.sql(create_gold_table_sql)
        

DataFrame[]

In [6]:
# Persist the aggregated counts in Delta format at the gold table's location,
# replacing whatever data was stored there before.
gold_writer = gold_squirrel_census.write.format("delta").mode("overwrite")
gold_writer.save("hdfs://hdfs-nn:9000/Projeto/gold/Projeto_gold.db/Tabela_esquilos/")

In [7]:
# Read the gold table back through the metastore and display its contents.
gold_table = spark.table("Projeto_gold.Tabela_esquilos")
gold_table.toPandas()


Unnamed: 0,Hectare,Contagem
0,13C,9
1,08C,10
2,31I,4
3,20E,2
4,01B,27
...,...,...
334,38A,10
335,13E,24
336,30C,3
337,23D,1


In [8]:
# Generate the symlink-format manifest for the gold Delta table; the external
# table defined in the next cell points at the resulting manifest directory.
generate_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/Projeto/gold/Projeto_gold.db/Tabela_esquilos/`
"""
manifest_result = spark.sql(generate_manifest_sql)
manifest_result.show()

++
||
++
++



In [9]:
# Recreate the external table that exposes the gold data through the symlink
# manifest (presumably for Presto, going by the table name).
# The DROP must name exactly the table created below; otherwise re-running this
# cell fails on CREATE because the table already exists.
spark.sql("""
DROP TABLE IF EXISTS Projeto_gold.Tabela_esquilos_presto
""").show()

# The table reads Parquet data files via the manifest written under
# _symlink_format_manifest/ in the gold table's directory.
spark.sql("""
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_esquilos_presto (
        Hectare String,  
        Contagem LONG
        )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/Projeto/gold/Projeto_gold.db/Tabela_esquilos/_symlink_format_manifest/'
""").show()

++
||
++
++

++
||
++
++

