In [38]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.types import StructType


# Root HDFS directory used as the default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/Trabalho'

# Local two-thread session with Hive support; the config entries set the
# warehouse directory, the Hive metastore URI and the Delta Lake
# extension, catalog and package coordinates.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Let the delta helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [39]:
hdfs_path = "hdfs://hdfs-nn:9000/Trabalho/bronze/Urban_Park_Ranger_Animal_Condition_Response.csv"

# (column name, Spark type) pairs, in the order the fields appear in the schema.
# NOTE(review): the read cell visible in this notebook uses inferSchema and does
# not reference customSchema -- confirm whether this schema is still needed.
_schema_columns = [
    ("Date and Time of initial call", StringType()),
    ("Date and time of Ranger response", StringType()),
    ("Borough", StringType()),
    ("Property", StringType()),
    ("Location", StringType()),
    ("Species Description", StringType()),
    ("Call Source", StringType()),
    ("Species Status", StringType()),
    ("Animal Condition", StringType()),
    ("Duration of Response", FloatType()),
    ("Age", StringType()),
    ("Animal Class", StringType()),
    ("311SR Number", StringType()),
    ("Final Ranger Action", StringType()),
    ("# of Animals", IntegerType()),
    ("PEP Response", StringType()),
    ("Animal Monitored", StringType()),
    ("Rehabilitator", StringType()),
    ("Hours spent monitoring", FloatType()),
    ("Police Response", StringType()),
    ("ESU Response", StringType()),
    ("ACC Intake Number", IntegerType()),
]

# Every field is declared nullable (third StructField argument is True).
customSchema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _schema_columns]
)

In [40]:
hdfs_path = "hdfs://hdfs-nn:9000/Trabalho/bronze/Urban_Park_Ranger_Animal_Condition_Response.csv"

# Reader for a comma-delimited file whose first row is the header;
# column types are inferred by Spark rather than declared.
csv_reader = spark.read.options(delimiter=",", header="true", inferSchema=True)

Urban_Park_Ranger_Animal_Condition_Response = csv_reader.csv(hdfs_path)

In [41]:
# Placeholder stored in text columns whose value is missing.
UNKNOWN_LABEL = "Desconhecido/a"


def _fill_missing_text(df, column_name, placeholder=UNKNOWN_LABEL):
    """Return ``df`` with NULLs in ``column_name`` replaced by ``placeholder``.

    The resulting column is cast to string.

    Args:
        df: Spark DataFrame that contains ``column_name``.
        column_name: Name of the column to fill.
        placeholder: Text written where the column is NULL.

    Returns:
        The DataFrame produced by ``withColumn`` with the filled column.
    """
    source = col(column_name)
    # isNull() is the NULL test to use here: in Spark SQL `col == None`
    # evaluates to NULL (never true), so it contributes nothing to the condition.
    filled = when(source.isNull(), placeholder).otherwise(source).cast("String")
    return df.withColumn(column_name, filled)


# Text columns: missing values become the placeholder label.
Urban_Park_Ranger_Animal_Condition_Response1 = _fill_missing_text(
    Urban_Park_Ranger_Animal_Condition_Response, "Final Ranger Action"
)
Urban_Park_Ranger_Animal_Condition_Response2 = _fill_missing_text(
    Urban_Park_Ranger_Animal_Condition_Response1, "Location"
)
Urban_Park_Ranger_Animal_Condition_Response3 = _fill_missing_text(
    Urban_Park_Ranger_Animal_Condition_Response2, "311SR Number"
)
Urban_Park_Ranger_Animal_Condition_Response4 = _fill_missing_text(
    Urban_Park_Ranger_Animal_Condition_Response3, "Rehabilitator"
)

# Numeric columns: only cast. A text placeholder cannot be stored in a
# FLOAT/INT column -- casting it yields NULL again (or an error when ANSI
# mode is enabled) -- so missing values are intentionally left as NULL.
Urban_Park_Ranger_Animal_Condition_Response5 = Urban_Park_Ranger_Animal_Condition_Response4.withColumn(
    "Hours spent monitoring",
    col("Hours spent monitoring").cast("FLOAT"),
)
Urban_Park_Ranger_Animal_Condition_Response6 = Urban_Park_Ranger_Animal_Condition_Response5.withColumn(
    "ACC Intake Number",
    col("ACC Intake Number").cast("INT"),
)

Urban_Park_Ranger_Animal_Condition_Response6.toPandas()

Unnamed: 0,Date and Time of initial call,Date and time of Ranger response,Borough,Property,Location,Species Description,Call Source,Species Status,Animal Condition,Duration of Response,...,311SR Number,Final Ranger Action,# of Animals,PEP Response,Animal Monitored,Rehabilitator,Hours spent monitoring,Police Response,ESU Response,ACC Intake Number
0,06/23/2021 04:45:00 PM,06/24/2021 08:00:00 AM,Brooklyn,Sternberg Park,Inside locked athletic field under construction,Chukar,Other,Exotic,Healthy,6,...,311-06712416,ACC,6,false,false,Desconhecido/a,,False,False,163537.0
1,06/24/2021 10:00:00 AM,06/24/2021 11:00:00 AM,Bronx,Haffen Park,Haffen Pool,Sparrow,Central,Native,Healthy,1.75,...,311-06714879,Rehabilitator,4,false,false,Desconhecido/a,,False,False,
2,06/23/2021 02:30:00 PM,06/23/2021 02:30:00 PM,Bronx,Pelham Bay Park,Pelham Bay South,White-tailed Deer,Employee,Native,,1,...,Desconhecido/a,Unfounded,0,false,false,Desconhecido/a,,False,False,
3,06/23/2021 01:00:00 PM,06/23/2021 01:10:00 PM,Staten Island,Willowbrook Park,The carousel,Raccoon,Employee,Native,,2,...,Desconhecido/a,Unfounded,0,false,false,Desconhecido/a,,False,False,
4,06/23/2021 09:20:00 AM,06/23/2021 09:20:00 AM,Queens,Judge Moses Weinstein Playground,Garbage can,Virginia Opossum,Central,Native,Healthy,2.25,...,311-06699415,ACC,1,false,false,Desconhecido/a,,False,False,119833.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,06/05/2018 12:00:00 AM,06/05/2018 12:01:00 AM,Manhattan,Abingdon Square,Desconhecido/a,raccoon,Central,Native,Healthy,0.75,...,Desconhecido/a,Relocated/Condition Corrected,1,false,true,Desconhecido/a,1.0,False,False,
2605,06/01/2018 12:00:00 PM,06/01/2018 12:30:00 PM,Manhattan,Central Park,park,Raccoon,Employee,Native,Injured,1.25,...,1-1-1568786600,ACC,1,false,false,Desconhecido/a,,False,False,36061.0
2606,05/16/2018 09:00:00 AM,05/17/2018 10:10:00 AM,Manhattan,Morningside Park,Desconhecido/a,Raccoon,Employee,Native,DOA,1.5,...,Desconhecido/a,ACC,2,false,true,Desconhecido/a,0.5,False,False,28316.0
2607,05/02/2018 09:30:00 AM,05/02/2018 12:00:00 PM,Manhattan,Central Park,park sewer drain,Raccoon,Public,Native,Healthy,0.75,...,Desconhecido/a,Unfounded,0,,,Desconhecido/a,,,False,


In [42]:
# Column names with every space replaced by an underscore
# (one-shot generator, consumed by toDF below).
NewColumns = (
    original_name.replace(' ', '_')
    for original_name in Urban_Park_Ranger_Animal_Condition_Response6.columns
)
Urban_Park_Ranger_Animal_Condition_Response7 = (
    Urban_Park_Ranger_Animal_Condition_Response6.toDF(*NewColumns)
)

# Drop the leading "#_" from the animal-count column name.
Urban_Park_Ranger_Animal_Condition_Response8 = (
    Urban_Park_Ranger_Animal_Condition_Response7
    .withColumnRenamed("#_of_Animals", "of_Animals")
)

In [43]:
# 'registo_mes' holds the text before the first '/' of the initial-call
# timestamp string (the month in the MM/DD/YYYY values shown in the data).
initial_call_parts = split(
    Urban_Park_Ranger_Animal_Condition_Response8['Date_and_Time_of_initial_call'],
    '/',
)
Urban_Park_Ranger_Animal_Condition_Response9 = (
    Urban_Park_Ranger_Animal_Condition_Response8
    .withColumn('registo_mes', initial_call_parts.getItem(0))
)
Urban_Park_Ranger_Animal_Condition_Response9.toPandas()

Unnamed: 0,Date_and_Time_of_initial_call,Date_and_time_of_Ranger_response,Borough,Property,Location,Species_Description,Call_Source,Species_Status,Animal_Condition,Duration_of_Response,...,Final_Ranger_Action,of_Animals,PEP_Response,Animal_Monitored,Rehabilitator,Hours_spent_monitoring,Police_Response,ESU_Response,ACC_Intake_Number,registo_mes
0,06/23/2021 04:45:00 PM,06/24/2021 08:00:00 AM,Brooklyn,Sternberg Park,Inside locked athletic field under construction,Chukar,Other,Exotic,Healthy,6,...,ACC,6,false,false,Desconhecido/a,,False,False,163537.0,06
1,06/24/2021 10:00:00 AM,06/24/2021 11:00:00 AM,Bronx,Haffen Park,Haffen Pool,Sparrow,Central,Native,Healthy,1.75,...,Rehabilitator,4,false,false,Desconhecido/a,,False,False,,06
2,06/23/2021 02:30:00 PM,06/23/2021 02:30:00 PM,Bronx,Pelham Bay Park,Pelham Bay South,White-tailed Deer,Employee,Native,,1,...,Unfounded,0,false,false,Desconhecido/a,,False,False,,06
3,06/23/2021 01:00:00 PM,06/23/2021 01:10:00 PM,Staten Island,Willowbrook Park,The carousel,Raccoon,Employee,Native,,2,...,Unfounded,0,false,false,Desconhecido/a,,False,False,,06
4,06/23/2021 09:20:00 AM,06/23/2021 09:20:00 AM,Queens,Judge Moses Weinstein Playground,Garbage can,Virginia Opossum,Central,Native,Healthy,2.25,...,ACC,1,false,false,Desconhecido/a,,False,False,119833.0,06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,06/05/2018 12:00:00 AM,06/05/2018 12:01:00 AM,Manhattan,Abingdon Square,Desconhecido/a,raccoon,Central,Native,Healthy,0.75,...,Relocated/Condition Corrected,1,false,true,Desconhecido/a,1.0,False,False,,06
2605,06/01/2018 12:00:00 PM,06/01/2018 12:30:00 PM,Manhattan,Central Park,park,Raccoon,Employee,Native,Injured,1.25,...,ACC,1,false,false,Desconhecido/a,,False,False,36061.0,06
2606,05/16/2018 09:00:00 AM,05/17/2018 10:10:00 AM,Manhattan,Morningside Park,Desconhecido/a,Raccoon,Employee,Native,DOA,1.5,...,ACC,2,false,true,Desconhecido/a,0.5,False,False,28316.0,05
2607,05/02/2018 09:30:00 AM,05/02/2018 12:00:00 PM,Manhattan,Central Park,park sewer drain,Raccoon,Public,Native,Healthy,0.75,...,Unfounded,0,,,Desconhecido/a,,,False,,05


In [44]:
# Create the database (if it does not exist yet) at its HDFS location.
create_database_sql = """
    CREATE DATABASE IF NOT EXISTS Projeto LOCATION "hdfs://hdfs-nn:9000/Trabalho/silver/Projeto.db/"
     """
spark.sql(create_database_sql)

DataFrame[]

In [45]:
# Remove any previous definition of the table so the CREATE below starts clean.
drop_table_sql = """
    DROP TABLE IF EXISTS Projeto.urban_park_ranger_animal_condition_response
    """
spark.sql(drop_table_sql)

# Delta table definition: one column per renamed CSV column plus 'registo_mes',
# stored at an explicit HDFS location under the database directory.
create_table_sql = """
    CREATE TABLE Projeto.urban_park_ranger_animal_condition_response (
    Date_and_Time_of_initial_call VARCHAR(300),
    Date_and_time_of_Ranger_response VARCHAR(300),
    Borough VARCHAR(300),
    Property VARCHAR(300),
    Location VARCHAR(300),
    Species_Description VARCHAR(300),
    Call_Source VARCHAR(300),
    Species_Status VARCHAR(300),
    Animal_Condition VARCHAR(300),
    Duration_of_Response VARCHAR(300),
    Age VARCHAR(300),
    Animal_Class VARCHAR(300),
    311SR_Number VARCHAR(300), 
    Final_Ranger_Action VARCHAR(300),
    of_Animals VARCHAR(30),
    PEP_Response VARCHAR(300),
    Animal_Monitored VARCHAR(30),
    Rehabilitator VARCHAR(300),
    Hours_spent_monitoring FLOAT,
    Police_Response Boolean,
    ESU_Response Boolean,
    ACC_Intake_Number INT,
    registo_mes VARCHAR(50)
    )
    USING DELTA
    LOCATION "hdfs://hdfs-nn:9000/Trabalho/silver/Projeto.db/urban_park_ranger_animal_condition_response/"
    """
spark.sql(create_table_sql)

DataFrame[]

In [46]:
# Make 'Projeto' the current database and list the tables it contains.
spark.sql("USE Projeto")
project_tables = spark.sql("SHOW tables")
project_tables.show()

# Schema of the cleaned frame as it was before the column-renaming step.
Urban_Park_Ranger_Animal_Condition_Response6.printSchema()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  projeto|urban_park_ranger...|      false|
+---------+--------------------+-----------+

root
 |-- Date and Time of initial call: string (nullable = true)
 |-- Date and time of Ranger response: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Property: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Species Description: string (nullable = true)
 |-- Call Source: string (nullable = true)
 |-- Species Status: string (nullable = true)
 |-- Animal Condition: string (nullable = true)
 |-- Duration of Response: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Animal Class: string (nullable = true)
 |-- 311SR Number: string (nullable = true)
 |-- Final Ranger Action: string (nullable = true)
 |-- # of Animals: string (nullable = true)
 |-- PEP Response: string (nullable = true)
 |-- Animal Monitored: 

In [47]:
# Destination directory of the Delta data.
silver_table_path = "hdfs://hdfs-nn:9000/Trabalho/silver/Projeto.db/urban_park_ranger_animal_condition_response"

# Columns written out, in the order they are selected.
silver_columns = [
    "Date_and_Time_of_initial_call",
    "Date_and_time_of_Ranger_response",
    "Borough",
    "Property",
    "Location",
    "Species_Description",
    "Call_Source",
    "Species_Status",
    "Animal_Condition",
    "Duration_of_Response",
    "Age",
    "Animal_Class",
    "311SR_Number",
    "Final_Ranger_Action",
    "of_Animals",
    "PEP_Response",
    "Animal_Monitored",
    "Rehabilitator",
    "Hours_spent_monitoring",
    "Police_Response",
    "ESU_Response",
    "ACC_Intake_Number",
    "registo_mes",
]

# Overwrite the existing data in Delta format with the mergeSchema option enabled.
silver_writer = (
    Urban_Park_Ranger_Animal_Condition_Response9
    .select(*silver_columns)
    .write
    .mode("overwrite")
    .format("delta")
    .option("mergeSchema", "true")
)
silver_writer.save(silver_table_path)

In [48]:
# Read the table back by name and display it as a pandas DataFrame.
silver_table = spark.table("Projeto.urban_park_ranger_animal_condition_response")
silver_table.toPandas()

Unnamed: 0,Date_and_Time_of_initial_call,Date_and_time_of_Ranger_response,Borough,Property,Location,Species_Description,Call_Source,Species_Status,Animal_Condition,Duration_of_Response,...,Final_Ranger_Action,of_Animals,PEP_Response,Animal_Monitored,Rehabilitator,Hours_spent_monitoring,Police_Response,ESU_Response,ACC_Intake_Number,registo_mes
0,06/23/2021 04:45:00 PM,06/24/2021 08:00:00 AM,Brooklyn,Sternberg Park,Inside locked athletic field under construction,Chukar,Other,Exotic,Healthy,6,...,ACC,6,false,false,Desconhecido/a,,False,False,163537.0,06
1,06/24/2021 10:00:00 AM,06/24/2021 11:00:00 AM,Bronx,Haffen Park,Haffen Pool,Sparrow,Central,Native,Healthy,1.75,...,Rehabilitator,4,false,false,Desconhecido/a,,False,False,,06
2,06/23/2021 02:30:00 PM,06/23/2021 02:30:00 PM,Bronx,Pelham Bay Park,Pelham Bay South,White-tailed Deer,Employee,Native,,1,...,Unfounded,0,false,false,Desconhecido/a,,False,False,,06
3,06/23/2021 01:00:00 PM,06/23/2021 01:10:00 PM,Staten Island,Willowbrook Park,The carousel,Raccoon,Employee,Native,,2,...,Unfounded,0,false,false,Desconhecido/a,,False,False,,06
4,06/23/2021 09:20:00 AM,06/23/2021 09:20:00 AM,Queens,Judge Moses Weinstein Playground,Garbage can,Virginia Opossum,Central,Native,Healthy,2.25,...,ACC,1,false,false,Desconhecido/a,,False,False,119833.0,06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,06/05/2018 12:00:00 AM,06/05/2018 12:01:00 AM,Manhattan,Abingdon Square,Desconhecido/a,raccoon,Central,Native,Healthy,0.75,...,Relocated/Condition Corrected,1,false,true,Desconhecido/a,1.0,False,False,,06
2605,06/01/2018 12:00:00 PM,06/01/2018 12:30:00 PM,Manhattan,Central Park,park,Raccoon,Employee,Native,Injured,1.25,...,ACC,1,false,false,Desconhecido/a,,False,False,36061.0,06
2606,05/16/2018 09:00:00 AM,05/17/2018 10:10:00 AM,Manhattan,Morningside Park,Desconhecido/a,Raccoon,Employee,Native,DOA,1.5,...,ACC,2,false,true,Desconhecido/a,0.5,False,False,28316.0,05
2607,05/02/2018 09:30:00 AM,05/02/2018 12:00:00 PM,Manhattan,Central Park,park sewer drain,Raccoon,Public,Native,Healthy,0.75,...,Unfounded,0,,,Desconhecido/a,,,False,,05
