In [54]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables.
# It uses the same NameNode endpoint (hdfs-nn:9000) as every other HDFS URI in this
# notebook; 9870 is Hadoop 3's default NameNode HTTP port, not an hdfs:// RPC endpoint.
warehouse_location = 'hdfs://hdfs-nn:9000/Trabalho'

# Session builder: Hive metastore support plus the Delta Lake SQL extension and catalog.
# The chain is parenthesised so no trailing line-continuation backslash is needed.
# NOTE(review): configure_spark_with_delta_pip presumably sets spark.jars.packages
# itself from the installed delta-spark version - confirm whether the explicit
# 1.0.0 pin below still has any effect.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Single assignment (the original repeated the target: `spark = spark = ...`).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# (Re)create the gold database.
# WARNING: the DROP ... CASCADE statement removes the database together with
# every table registered in it before the database is created again.
drop_gold_database_sql = """
    DROP DATABASE IF EXISTS Projeto_gold CASCADE
    """
create_gold_database_sql = """
    create database Projeto_gold location 'hdfs://hdfs-nn:9000/Trabalho/gold/Projeto_gold.db'
    """

spark.sql(drop_gold_database_sql)
spark.sql(create_gold_database_sql)


DataFrame[]

In [55]:
from pyspark.sql.functions import substring, avg, sum

# HDFS directory of the silver-layer table that feeds this gold aggregation.
hdfs_path = "hdfs://hdfs-nn:9000/Trabalho/silver/Projeto.db/urban_park_ranger_animal_condition_response"

# NOTE(review): no explicit .format() is given, so the read relies on the
# session's default data source - confirm it matches how the silver table was written.
Animal_Condition = spark.read.load(hdfs_path)

# Convert to a pandas DataFrame so the notebook renders it as a table.
Animal_Condition.toPandas()

Unnamed: 0,Date_and_Time_of_initial_call,Date_and_time_of_Ranger_response,Borough,Property,Location,Species_Description,Call_Source,Species_Status,Animal_Condition,Duration_of_Response,...,Final_Ranger_Action,of_Animals,PEP_Response,Animal_Monitored,Rehabilitator,Hours_spent_monitoring,Police_Response,ESU_Response,ACC_Intake_Number,registo_mes
0,06/23/2021 04:45:00 PM,06/24/2021 08:00:00 AM,Brooklyn,Sternberg Park,Inside locked athletic field under construction,Chukar,Other,Exotic,Healthy,6,...,ACC,6,false,false,Desconhecido/a,,False,False,163537.0,06
1,06/24/2021 10:00:00 AM,06/24/2021 11:00:00 AM,Bronx,Haffen Park,Haffen Pool,Sparrow,Central,Native,Healthy,1.75,...,Rehabilitator,4,false,false,Desconhecido/a,,False,False,,06
2,06/23/2021 02:30:00 PM,06/23/2021 02:30:00 PM,Bronx,Pelham Bay Park,Pelham Bay South,White-tailed Deer,Employee,Native,,1,...,Unfounded,0,false,false,Desconhecido/a,,False,False,,06
3,06/23/2021 01:00:00 PM,06/23/2021 01:10:00 PM,Staten Island,Willowbrook Park,The carousel,Raccoon,Employee,Native,,2,...,Unfounded,0,false,false,Desconhecido/a,,False,False,,06
4,06/23/2021 09:20:00 AM,06/23/2021 09:20:00 AM,Queens,Judge Moses Weinstein Playground,Garbage can,Virginia Opossum,Central,Native,Healthy,2.25,...,ACC,1,false,false,Desconhecido/a,,False,False,119833.0,06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,06/05/2018 12:00:00 AM,06/05/2018 12:01:00 AM,Manhattan,Abingdon Square,Desconhecido/a,raccoon,Central,Native,Healthy,0.75,...,Relocated/Condition Corrected,1,false,true,Desconhecido/a,1.0,False,False,,06
2605,06/01/2018 12:00:00 PM,06/01/2018 12:30:00 PM,Manhattan,Central Park,park,Raccoon,Employee,Native,Injured,1.25,...,ACC,1,false,false,Desconhecido/a,,False,False,36061.0,06
2606,05/16/2018 09:00:00 AM,05/17/2018 10:10:00 AM,Manhattan,Morningside Park,Desconhecido/a,Raccoon,Employee,Native,DOA,1.5,...,ACC,2,false,true,Desconhecido/a,0.5,False,False,28316.0,05
2607,05/02/2018 09:30:00 AM,05/02/2018 12:00:00 PM,Manhattan,Central Park,park sewer drain,Raccoon,Public,Native,Healthy,0.75,...,Unfounded,0,,,Desconhecido/a,,,False,,05


In [56]:
from pyspark.sql.functions import count
# Question: which borough has the highest number of calls made to the park rangers?
# One output row per Borough; "Contagem" is the count over the initial-call timestamp column.
gold_Animal_Condition = (
    Animal_Condition
    .groupBy("Borough")
    .agg(count(Animal_Condition["Date_and_Time_of_initial_call"]).alias("Contagem"))
)

gold_Animal_Condition.toPandas()

Unnamed: 0,Borough,Contagem
0,Queens,458
1,Brooklyn,461
2,Staten Island,452
3,Manhattan,960
4,Bronx,278


In [57]:
# Re-register the gold table in the metastore: drop any previous definition,
# then declare an external Delta table over its fixed HDFS location.
drop_animal_condition_table_sql = """
    DROP TABLE IF EXISTS Projeto_gold.TabelaAnimalCondition2
    """
create_animal_condition_table_sql = """
    CREATE EXTERNAL TABLE Projeto_gold.TabelaAnimalCondition2 (
        Borough VARCHAR(250),
        Contagem LONG
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Trabalho/gold/Projeto_gold.db/TabelaAnimalCondition2/'
    """

spark.sql(drop_animal_condition_table_sql)
spark.sql(create_animal_condition_table_sql)
       

DataFrame[]

In [58]:
# Persist the per-borough counts in Delta format, replacing whatever is
# already stored at the table's location.
gold_table_path = "hdfs://hdfs-nn:9000/Trabalho/gold/Projeto_gold.db/TabelaAnimalCondition2"

(
    gold_Animal_Condition.write
    .format("delta")
    .mode("overwrite")
    .save(gold_table_path)
)

In [59]:
# Read the table back through the metastore to verify what was written.
gold_table_check = spark.table("Projeto_gold.TabelaAnimalCondition2")
gold_table_check.toPandas()

Unnamed: 0,Borough,Contagem
0,Queens,458
1,Brooklyn,461
2,Staten Island,452
3,Manhattan,960
4,Bronx,278


In [60]:
# Generate the symlink-format manifest for the Delta table at this path; the
# Presto-facing table defined later points at its _symlink_format_manifest directory.
generate_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/Trabalho/gold/Projeto_gold.db/TabelaAnimalCondition2/`
"""

spark.sql(generate_manifest_sql).show()

++
||
++
++



In [61]:
# Recreate the Presto-facing external table. It is declared over the
# _symlink_format_manifest directory of the Delta table, using the Parquet SerDe
# with the symlink text input format. Statements run in order: drop, then create.
presto_table_statements = (
    """
DROP TABLE IF EXISTS Projeto_gold.TabelaAnimalCondition2_presto 
""",
    """
    CREATE EXTERNAL TABLE Projeto_gold.TabelaAnimalCondition2_presto (
        Borough VARCHAR(250),  
        Contagem LONG
        )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/Trabalho/gold/Projeto_gold.db/TabelaAnimalCondition2/_symlink_format_manifest/'
""",
)

for presto_statement in presto_table_statements:
    spark.sql(presto_statement).show()

++
||
++
++

++
||
++
++

