In [1]:
# Install dependencies
# pip is invoked through the interpreter that runs this kernel (sys.executable),
# so the package is installed for that same interpreter.
# NOTE(review): `hdfs` is not imported in any of the visible cells — confirm it
# is still needed.

import sys
!{sys.executable} -m pip install hdfs



In [2]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/trabalho/gold'

# Hive-enabled session builder with the Delta Lake SQL extension and catalog.
# The chain is wrapped in parentheses rather than backslash continuations, so
# no dangling continuation is left after the final call.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (used below) presumably sets
    # spark.jars.packages from the installed delta-spark version — confirm
    # whether this explicit pin still takes effect.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment: the session is created (or the existing one reused) from
# the builder after the delta helper has processed it.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
from pyspark.sql.functions import substring, avg, sum

# Location of the DrinkingFountains table in the silver layer
hdfs_path = "hdfs://hdfs-nn:9000/trabalho/silver/Projeto.db/DrinkingFountains"

# No format is passed to the reader, so Spark's default data source applies.
# NOTE(review): confirm the silver table is stored in that default format.
DrinkingFountains = spark.read.load(hdfs_path)

# Collect to pandas so the notebook renders the table
DrinkingFountains.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Painted,GISPROPNUM,SIGNNAME,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA,Data_Collection,Ano,BOROUGH
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,no,B100,Seth Low Playground/ Bealin Square,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active,2018-02-10,2018,Brooklyn
1,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",,B302,Charlie's Place,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active,2018-05-29,2018,Brooklyn
2,E Wheelchair,POINT (-73.92412001948213 40.703148631662614),23,Out in Open,yes,B016,Maria Hernandez Park,1,100037411.0,B016-DF0013,B-04,B016,"E Wheelchair, Out in Open",Active,2018-01-11,2018,Brooklyn
3,C,POINT (-74.01152114448477 40.630582530228764),1898,In Playground,no,B052,Leif Ericson Park,1,100039289.0,B052-DF0762,B-10,B052,"C, In Playground",Active,2018-02-15,2018,Brooklyn
4,C,POINT (-73.98647089425904 40.660201255822955),2894,In Shade,,B255G,Butterfly Gardens,1,100040265.0,B255G-DF0895,B-07,B255G,"C, In Shade",Active,2018-05-29,2018,Brooklyn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,E Wheelchair,POINT (-74.0792508385633 40.62266148329357),1533,In Playground,yes,R061,Stapleton Playground,1,100038927.0,R061-DF0068,R-01,R061,"E Wheelchair, In Playground",Active,2018-02-08,2018,Staten Island
3116,D,POINT (-74.21031591361488 40.533181277394675),1868,Near Ballfield,yes,R106,Bloomingdale Park,1,100039195.0,R106-DF0138,R-03,R106,"D, Near Ballfield",Active,2018-02-15,2018,Staten Island
3117,D,POINT (-74.09878669564895 40.613386552308874),1762,Out in Open,no,R069,Terrace Playground,1,100039136.0,R069-DF0110,R-01,R069,"D, Out in Open",Active,2018-02-13,2018,Staten Island
3118,A,POINT (-74.16297563648502 40.616399379236725),1427,Near Ballfield,yes,R075A,Father Macris Park,1,100038865.0,R075A-DF0021,R-02,R075A,"A, Near Ballfield",Active,2018-02-05,2018,Staten Island


In [4]:
from pyspark.sql.functions import count
# Number of fountains per (FountainTy, FEATURESTA) pair.
# count("*") counts the rows of each group; counting the FountainTy column
# instead would skip nulls and report 0 for a group whose FountainTy is null.
gold_DrinkingFountains = (
    DrinkingFountains
    .groupBy("FountainTy", "FEATURESTA")
    .agg(count("*").alias("Contagem"))
)

# Render the aggregate in the notebook
gold_DrinkingFountains.toPandas()


Unnamed: 0,FountainTy,FEATURESTA,Contagem
0,F Dog,Active,10
1,B,Active,125
2,Trough,Active,57
3,Pedestal,Active,28
4,F High Low,Active,36
5,A,Active,986
6,Bottle Filler Fountain,Active,16
7,CS Concrete,Active,60
8,E Wheelchair,Active,374
9,E,Active,7


In [5]:
# (Re)create the DrinkingFountains2 table definition in the gold database.
# The table is dropped first because the CREATE statement has no IF NOT EXISTS.
drop_gold_table_sql = """
    DROP TABLE IF EXISTS Projeto_gold.DrinkingFountains2
    """

# External Delta table whose columns mirror the aggregate:
# FountainTy, FEATURESTA and the Contagem count.
create_gold_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.DrinkingFountains2 (
        FountainTy VARCHAR(250),
        FEATURESTA VARCHAR(500), 
        Contagem LONG
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/trabalho/gold/Projeto_gold.db/DrinkingFountains2/'
    """

spark.sql(drop_gold_table_sql)
spark.sql(create_gold_table_sql)
        

DataFrame[]

In [6]:
# Save the aggregate in Delta format at the gold path, in overwrite mode
delta_writer = gold_DrinkingFountains.write.format("delta").mode("overwrite")
delta_writer.save("hdfs://hdfs-nn:9000/trabalho/gold/Projeto_gold.db/DrinkingFountains2/")

In [7]:
# Read the table back by name and render it to verify what was written
written_gold_table = spark.table("Projeto_gold.DrinkingFountains2")
written_gold_table.toPandas()


Unnamed: 0,FountainTy,FEATURESTA,Contagem
0,F Dog,Active,10
1,B,Active,125
2,Trough,Active,57
3,Pedestal,Active,28
4,F High Low,Active,36
5,A,Active,986
6,Bottle Filler Fountain,Active,16
7,CS Concrete,Active,60
8,E Wheelchair,Active,374
9,E,Active,7


In [8]:
# Generate a symlink-format manifest for the Delta table stored at this path.
# The table is addressed by path (delta.`...`) rather than by its catalog name.
manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/trabalho/gold/Projeto_gold.db/DrinkingFountains2/`
"""
manifest_result = spark.sql(manifest_sql)
manifest_result.show()

++
||
++
++



In [9]:
# Recreate the DrinkingFountains_Presto2 table, a Hive-format external table
# whose LOCATION is the _symlink_format_manifest directory of the Delta table.
drop_presto_table_sql = """
DROP TABLE IF EXISTS Projeto_gold.DrinkingFountains_Presto2 
"""

# Same three columns as the Delta table; the symlink input format reads the
# manifest entries and the Parquet SerDe decodes the files.
create_presto_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.DrinkingFountains_Presto2 (
        FountainTy VARCHAR(250),
        FEATURESTA VARCHAR(500), 
        Contagem LONG
        )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/trabalho/gold/Projeto_gold.db/DrinkingFountains2/_symlink_format_manifest/'
"""

# Run the drop first, then the create, showing each statement's result
for ddl_statement in (drop_presto_table_sql, create_presto_table_sql):
    spark.sql(ddl_statement).show()

++
||
++
++

++
||
++
++

