In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9870/trabalhoarvores'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
from pyspark.sql.functions import substring, avg, sum


hdfs_path = "hdfs://hdfs-nn:9000/trabalhoarvores/silver/Projeto.db/new_york_tree_census_2015"

tree_census = spark\
             .read\
             .load(hdfs_path)

tree_census.toPandas()

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,nta,nta_name,boro_ct,state,latitude,longitude,x_sp,y_sp,mes_registo,boroname
0,593226,312503,19-06-2016,,0,OnCurb,Alive,Good,Quercus rubra,northern red oak,...,QN51,Murray Hill,4115500,New York,407.638.694,-7.381.264.965,1.036.148.853,2.176.304.614,06,Queens
1,606945,305778,28-06-2016,,0,OnCurb,Alive,Good,Fraxinus pennsylvanica,green ash,...,QN37,Kew Gardens Hills,4125700,New York,4.072.433.932,-7.380.518.011,1.038.250.055,2.032.329.417,06,Queens
2,664717,346386,14-08-2016,,0,OnCurb,Alive,Good,Quercus palustris,pin oak,...,QN21,Middle Village,4066501,New York,4.071.850.431,-7.388.729.218,1.015.493.035,2.010.671.173,08,Queens
3,160321,341273,19-08-2015,,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,QN28,Jackson Heights,4030902,New York,4.075.662.595,-7.389.416.679,1.013.570.588,2.149.536.472,08,Queens
4,384135,344621,27-10-2015,,0,OnCurb,Alive,Good,Platanus x acerifolia,London planetree,...,QN19,Glendale,4063900,New York,4.070.214.731,-7.385.802.834,1.023.614.734,1.951.195.685,10,Queens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683783,238461,101080,19-09-2015,,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,MN24,SoHo-TriBeCa-Civic Center-Little Italy,1004100,New York,4.072.004.767,-7.399.587.402,985393.71,201.609.336,09,Manhattan
683784,153416,102158,17-08-2015,,0,OnCurb,Alive,Good,Ginkgo biloba,ginkgo,...,MN23,West Village,1007700,New York,4.073.845.527,-7.400.173.293,9.837.697.687,208.315.778,08,Manhattan
683785,22883,107355,16-06-2015,,0,OnCurb,Dead,Desconhecida,Desconhecido,Desconhecido,...,MN31,Lenox Hill-Roosevelt Island,1012600,New York,407.671.907,-7.395.849.465,9.957.470.397,2.187.877.455,06,Manhattan
683786,68840,109537,15-07-2015,,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,MN03,Central Harlem North-Polo Grounds,1022102,New York,4.081.949.976,-7.394.448.698,9.996.151.028,2.378.478.882,07,Manhattan


In [3]:
from pyspark.sql.functions import count
gold_tree_census = tree_census \
    .groupBy("boroname")\
    .agg(
        count(tree_census.tree_id).alias("Contagem")
    ) \
   
gold_tree_census.toPandas()


Unnamed: 0,boroname,Contagem
0,Queens,250551
1,Brooklyn,177293
2,Staten Island,105318
3,Bronx,85203
4,Manhattan,65423


In [4]:
# create air_quality table in gold
spark.sql(
    """
    DROP TABLE IF EXISTS Projeto_gold.Tabela_arvores
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_arvores (
        boroname VARCHAR(250),
        Contagem LONG
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/trabalhoarvores/gold/Projeto_gold.db/Tabela_arvores/'
    """
)
        

DataFrame[]

In [5]:
# write to delta table
gold_tree_census \
    .write  \
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/trabalhoarvores/gold/Projeto_gold.db/Tabela_arvores/")

In [6]:
# check the results in the table
spark.table("Projeto_gold.Tabela_arvores").toPandas()


Unnamed: 0,boroname,Contagem
0,Queens,250551
1,Brooklyn,177293
2,Staten Island,105318
3,Bronx,85203
4,Manhattan,65423


In [7]:
spark.sql("""
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/trabalhoarvores/gold/Projeto_gold.db/Tabela_arvores/`
""").show()

++
||
++
++



In [8]:
spark.sql("""
DROP TABLE IF EXISTS Projeto_gold.Tabela_arvores_presto 
""").show()

spark.sql("""
    CREATE EXTERNAL TABLE Projeto_gold.Tabela_arvores_presto (
        boroname VARCHAR(250),  
        Contagem LONG
        )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/trabalhoarvores/gold/Projeto_gold.db/Tabela_arvores/_symlink_format_manifest/'
""").show()

++
||
++
++

++
||
++
++

