In [1]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [2]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.functions import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType,FloatType

warehouse_location = 'hdfs://hdfs-nn:9870/trabalho'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
hdfs_path = "hdfs://hdfs-nn:9000/trabalho/bronze/DrinkingFountains.csv"

customSchema = StructType([
    StructField("FountainTy", StringType(), True),        
    StructField("the_geom", StringType(), True),
    StructField("OBJECTID", IntegerType(), True),
    StructField("Position", StringType(), True),
    StructField("Collection", StringType(), True),
    StructField("Painted", StringType(), True),
    StructField("GISPROPNUM", StringType(), True), 
    StructField("SIGNNAME", StringType(), True), 
    StructField("BOROUGH", StringType(), True), 
    StructField("FountainCo", IntegerType(), True), 
    StructField("GISOBJID", IntegerType(), True),
    StructField("SYSTEM", StringType(), True),
    StructField("DEPARTMENT", StringType(), True),
    StructField("PARENTID", StringType(), True),
    StructField("DESCRIPTIO", StringType(), True),
    StructField("FEATURESTA", StringType(), True),
    

])

projeto_green = spark \
            .read\
            .option("delimiter",";")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)
projeto_green.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Collection,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,02/10/2018 12:00:00 AM +0000,no,B100,Seth Low Playground/ Bealin Square,B,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,02/22/2018 12:00:00 AM +0000,no,M158,Robert Moses Playground,M,1,100039303.0,M158-DF0068,M-06,M158,"C, In Playground",Active
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",02/27/2018 12:00:00 AM +0000,yes,M011,Chelsea Park,M,1,100039632.0,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",04/04/2018 12:00:00 AM +0000,no,M144,John V. Lindsay East River Park,M,1,100039936.0,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",05/29/2018 12:00:00 AM +0000,,B302,Charlie's Place,B,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",03/05/2018 12:00:00 AM +0000,no,B018,Canarsie Park,B,1,100039874.0,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,05/01/2018 12:00:00 AM +0000,no,Q012,Crocheron Park,Q,1,100040158.0,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,,11/30/1899 12:00:00 AM +0000,,X148L1,Virginia Park,X,1,100040464.0,X148L1-DF0470,X-09,X148L1,A,Active
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,04/04/2018 12:00:00 AM +0000,yes,M144,John V. Lindsay East River Park,M,1,100039941.0,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active


In [4]:
projeto_green1 = projeto_green.withColumn(
    "BOROUGH",
    when(projeto_green.BOROUGH.endswith('B'),regexp_replace(projeto_green.BOROUGH,'B','Brooklyn')) \
    .when(projeto_green.BOROUGH.endswith('M'),regexp_replace(projeto_green.BOROUGH,'M','Manhattan')) \
    .when(projeto_green.BOROUGH.endswith('Q'),regexp_replace(projeto_green.BOROUGH,'Q','Queens')) \
    .when(projeto_green.BOROUGH.endswith('R'),regexp_replace(projeto_green.BOROUGH,'R','Staten Island')) \
    .when(projeto_green.BOROUGH.endswith('X'),regexp_replace(projeto_green.BOROUGH,'X','Bronx')) \
)

projeto_green1.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Collection,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,02/10/2018 12:00:00 AM +0000,no,B100,Seth Low Playground/ Bealin Square,Brooklyn,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,02/22/2018 12:00:00 AM +0000,no,M158,Robert Moses Playground,Manhattan,1,100039303.0,M158-DF0068,M-06,M158,"C, In Playground",Active
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",02/27/2018 12:00:00 AM +0000,yes,M011,Chelsea Park,Manhattan,1,100039632.0,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",04/04/2018 12:00:00 AM +0000,no,M144,John V. Lindsay East River Park,Manhattan,1,100039936.0,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",05/29/2018 12:00:00 AM +0000,,B302,Charlie's Place,Brooklyn,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",03/05/2018 12:00:00 AM +0000,no,B018,Canarsie Park,Brooklyn,1,100039874.0,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,05/01/2018 12:00:00 AM +0000,no,Q012,Crocheron Park,Queens,1,100040158.0,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,,11/30/1899 12:00:00 AM +0000,,X148L1,Virginia Park,Bronx,1,100040464.0,X148L1-DF0470,X-09,X148L1,A,Active
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,04/04/2018 12:00:00 AM +0000,yes,M144,John V. Lindsay East River Park,Manhattan,1,100039941.0,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active


In [5]:
projeto_green2 = projeto_green1.withColumn(
    "Position",
    when(
        (col("Position").isNull() | (col("Position") == None)), 
        "Desconhecida"
    ).otherwise(col("Position")))
projeto_green2.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Collection,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,02/10/2018 12:00:00 AM +0000,no,B100,Seth Low Playground/ Bealin Square,Brooklyn,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,02/22/2018 12:00:00 AM +0000,no,M158,Robert Moses Playground,Manhattan,1,100039303.0,M158-DF0068,M-06,M158,"C, In Playground",Active
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",02/27/2018 12:00:00 AM +0000,yes,M011,Chelsea Park,Manhattan,1,100039632.0,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",04/04/2018 12:00:00 AM +0000,no,M144,John V. Lindsay East River Park,Manhattan,1,100039936.0,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",05/29/2018 12:00:00 AM +0000,,B302,Charlie's Place,Brooklyn,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",03/05/2018 12:00:00 AM +0000,no,B018,Canarsie Park,Brooklyn,1,100039874.0,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,05/01/2018 12:00:00 AM +0000,no,Q012,Crocheron Park,Queens,1,100040158.0,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,Desconhecida,11/30/1899 12:00:00 AM +0000,,X148L1,Virginia Park,Bronx,1,100040464.0,X148L1-DF0470,X-09,X148L1,A,Active
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,04/04/2018 12:00:00 AM +0000,yes,M144,John V. Lindsay East River Park,Manhattan,1,100039941.0,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active


In [6]:
projeto_green2 = projeto_green1.withColumn(
    "Painted",
    when(
        (col("Painted").isNull() | (col("Painted") == None)), 
        "Desconhecido"
    ).otherwise(col("Painted")))
projeto_green2.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Collection,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,02/10/2018 12:00:00 AM +0000,no,B100,Seth Low Playground/ Bealin Square,Brooklyn,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,02/22/2018 12:00:00 AM +0000,no,M158,Robert Moses Playground,Manhattan,1,100039303.0,M158-DF0068,M-06,M158,"C, In Playground",Active
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",02/27/2018 12:00:00 AM +0000,yes,M011,Chelsea Park,Manhattan,1,100039632.0,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",04/04/2018 12:00:00 AM +0000,no,M144,John V. Lindsay East River Park,Manhattan,1,100039936.0,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",05/29/2018 12:00:00 AM +0000,Desconhecido,B302,Charlie's Place,Brooklyn,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",03/05/2018 12:00:00 AM +0000,no,B018,Canarsie Park,Brooklyn,1,100039874.0,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,05/01/2018 12:00:00 AM +0000,no,Q012,Crocheron Park,Queens,1,100040158.0,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,,11/30/1899 12:00:00 AM +0000,Desconhecido,X148L1,Virginia Park,Bronx,1,100040464.0,X148L1-DF0470,X-09,X148L1,A,Active
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,04/04/2018 12:00:00 AM +0000,yes,M144,John V. Lindsay East River Park,Manhattan,1,100039941.0,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active


In [7]:
projeto_green2 = projeto_green1.withColumn(
    "GISPROPNUM",
    when(
        (col("GISPROPNUM").isNull() | (col("GISPROPNUM") == None)), 
        "Desconhecido"
    ).otherwise(col("GISPROPNUM")))
projeto_green2.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Collection,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,02/10/2018 12:00:00 AM +0000,no,B100,Seth Low Playground/ Bealin Square,Brooklyn,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,02/22/2018 12:00:00 AM +0000,no,M158,Robert Moses Playground,Manhattan,1,100039303.0,M158-DF0068,M-06,M158,"C, In Playground",Active
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",02/27/2018 12:00:00 AM +0000,yes,M011,Chelsea Park,Manhattan,1,100039632.0,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",04/04/2018 12:00:00 AM +0000,no,M144,John V. Lindsay East River Park,Manhattan,1,100039936.0,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",05/29/2018 12:00:00 AM +0000,,B302,Charlie's Place,Brooklyn,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",03/05/2018 12:00:00 AM +0000,no,B018,Canarsie Park,Brooklyn,1,100039874.0,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,05/01/2018 12:00:00 AM +0000,no,Q012,Crocheron Park,Queens,1,100040158.0,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,,11/30/1899 12:00:00 AM +0000,,X148L1,Virginia Park,Bronx,1,100040464.0,X148L1-DF0470,X-09,X148L1,A,Active
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,04/04/2018 12:00:00 AM +0000,yes,M144,John V. Lindsay East River Park,Manhattan,1,100039941.0,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active


In [8]:
projeto_green2 = projeto_green1.withColumn(
    "SIGNNAME",
    when(
        (col("SIGNNAME").isNull() | (col("SIGNNAME") == None)), 
        "Desconhecido"
    ).otherwise(col("SIGNNAME")))
projeto_green2.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Collection,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,02/10/2018 12:00:00 AM +0000,no,B100,Seth Low Playground/ Bealin Square,Brooklyn,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,02/22/2018 12:00:00 AM +0000,no,M158,Robert Moses Playground,Manhattan,1,100039303.0,M158-DF0068,M-06,M158,"C, In Playground",Active
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",02/27/2018 12:00:00 AM +0000,yes,M011,Chelsea Park,Manhattan,1,100039632.0,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",04/04/2018 12:00:00 AM +0000,no,M144,John V. Lindsay East River Park,Manhattan,1,100039936.0,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",05/29/2018 12:00:00 AM +0000,,B302,Charlie's Place,Brooklyn,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",03/05/2018 12:00:00 AM +0000,no,B018,Canarsie Park,Brooklyn,1,100039874.0,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,05/01/2018 12:00:00 AM +0000,no,Q012,Crocheron Park,Queens,1,100040158.0,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,,11/30/1899 12:00:00 AM +0000,,X148L1,Virginia Park,Bronx,1,100040464.0,X148L1-DF0470,X-09,X148L1,A,Active
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,04/04/2018 12:00:00 AM +0000,yes,M144,John V. Lindsay East River Park,Manhattan,1,100039941.0,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active


In [9]:
projeto_green2 = projeto_green1.withColumn(
    "GISOBJID",
    when(
        (col("GISOBJID").isNull() | (col("GISOBJID") == None)), 
        "Desconhecido"
    ).otherwise(col("GISOBJID")))
projeto_green2.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Collection,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,02/10/2018 12:00:00 AM +0000,no,B100,Seth Low Playground/ Bealin Square,Brooklyn,2,100038957,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,02/22/2018 12:00:00 AM +0000,no,M158,Robert Moses Playground,Manhattan,1,100039303,M158-DF0068,M-06,M158,"C, In Playground",Active
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",02/27/2018 12:00:00 AM +0000,yes,M011,Chelsea Park,Manhattan,1,100039632,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",04/04/2018 12:00:00 AM +0000,no,M144,John V. Lindsay East River Park,Manhattan,1,100039936,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",05/29/2018 12:00:00 AM +0000,,B302,Charlie's Place,Brooklyn,1,100040304,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",03/05/2018 12:00:00 AM +0000,no,B018,Canarsie Park,Brooklyn,1,100039874,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,05/01/2018 12:00:00 AM +0000,no,Q012,Crocheron Park,Queens,1,100040158,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,,11/30/1899 12:00:00 AM +0000,,X148L1,Virginia Park,Bronx,1,100040464,X148L1-DF0470,X-09,X148L1,A,Active
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,04/04/2018 12:00:00 AM +0000,yes,M144,John V. Lindsay East River Park,Manhattan,1,100039941,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active


In [10]:
projeto_green2 = projeto_green1.withColumn(
    "PARENTID",
    when(
        (col("PARENTID").isNull() | (col("PARENTID") == None)), 
        "Desconhecido"
    ).otherwise(col("PARENTID")))
projeto_green2.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Collection,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,02/10/2018 12:00:00 AM +0000,no,B100,Seth Low Playground/ Bealin Square,Brooklyn,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,02/22/2018 12:00:00 AM +0000,no,M158,Robert Moses Playground,Manhattan,1,100039303.0,M158-DF0068,M-06,M158,"C, In Playground",Active
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",02/27/2018 12:00:00 AM +0000,yes,M011,Chelsea Park,Manhattan,1,100039632.0,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",04/04/2018 12:00:00 AM +0000,no,M144,John V. Lindsay East River Park,Manhattan,1,100039936.0,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",05/29/2018 12:00:00 AM +0000,,B302,Charlie's Place,Brooklyn,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",03/05/2018 12:00:00 AM +0000,no,B018,Canarsie Park,Brooklyn,1,100039874.0,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,05/01/2018 12:00:00 AM +0000,no,Q012,Crocheron Park,Queens,1,100040158.0,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,,11/30/1899 12:00:00 AM +0000,,X148L1,Virginia Park,Bronx,1,100040464.0,X148L1-DF0470,X-09,X148L1,A,Active
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,04/04/2018 12:00:00 AM +0000,yes,M144,John V. Lindsay East River Park,Manhattan,1,100039941.0,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active


In [11]:
projeto_green3 = projeto_green2.withColumn('Data_Collection', split(projeto_green2['Collection'], ' ').getItem(0)) \
                                                .drop(col("Collection"))
projeto_green3.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA,Data_Collection
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,no,B100,Seth Low Playground/ Bealin Square,Brooklyn,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active,02/10/2018
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,no,M158,Robert Moses Playground,Manhattan,1,100039303.0,M158-DF0068,M-06,M158,"C, In Playground",Active,02/22/2018
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",yes,M011,Chelsea Park,Manhattan,1,100039632.0,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active,02/27/2018
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",no,M144,John V. Lindsay East River Park,Manhattan,1,100039936.0,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active,04/04/2018
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",,B302,Charlie's Place,Brooklyn,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active,05/29/2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",no,B018,Canarsie Park,Brooklyn,1,100039874.0,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active,03/05/2018
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,no,Q012,Crocheron Park,Queens,1,100040158.0,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active,05/01/2018
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,,,X148L1,Virginia Park,Bronx,1,100040464.0,X148L1-DF0470,X-09,X148L1,A,Active,11/30/1899
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,yes,M144,John V. Lindsay East River Park,Manhattan,1,100039941.0,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active,04/04/2018


In [12]:
projeto_green4 = projeto_green3.withColumn("Data_Collection", to_date(col("Data_Collection"), "MM/dd/yyyy")) \


projeto_green4 = projeto_green4.withColumn('Mes', (split(projeto_green4['Data_Collection'], '-').getItem(1)).cast(IntegerType()))
projeto_green4.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Painted,GISPROPNUM,SIGNNAME,BOROUGH,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA,Data_Collection,Mes
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,no,B100,Seth Low Playground/ Bealin Square,Brooklyn,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active,2018-02-10,2
1,C,POINT (-73.96862765692859 40.7480871896291),1921,In Playground,no,M158,Robert Moses Playground,Manhattan,1,100039303.0,M158-DF0068,M-06,M158,"C, In Playground",Active,2018-02-22,2
2,D,POINT (-74.0003631604654 40.750036026114756),2253,"Under Tree, Near Ballfield, Just Outside Playg...",yes,M011,Chelsea Park,Manhattan,1,100039632.0,M011-DF0209,M-04,M011,"D, Under Tree, Near Ballfield, Just Outside Pl...",Active,2018-02-27,2
3,D,POINT (-73.97290101781864 40.72386196549101),2585,"Just Outside Playground, Near Ballfield",no,M144,John V. Lindsay East River Park,Manhattan,1,100039936.0,M144-DF0392,M-03,M144,"D, Just Outside Playground, Near Ballfield",Active,2018-04-04,4
4,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",,B302,Charlie's Place,Brooklyn,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active,2018-05-29,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,D,POINT (-73.90020355545877 40.62611516481206),2418,"Out in Open, Near Ballfield",no,B018,Canarsie Park,Brooklyn,1,100039874.0,B018-DF0817,B-18,B018,"D, Out in Open, Near Ballfield",Active,2018-03-05,3
3116,C,POINT (-73.7688486244796 40.77023172437061),2750,Just Outside Playground,no,Q012,Crocheron Park,Queens,1,100040158.0,Q012-DF0750,Q-11,Q012,"C, Just Outside Playground",Active,2018-05-01,5
3117,A,POINT (-73.86256527267528 40.83342686523247),3082,,,X148L1,Virginia Park,Bronx,1,100040464.0,X148L1-DF0470,X-09,X148L1,A,Active,1899-11-30,11
3118,E Wheelchair,POINT (-73.97375555702659 40.7182972633732),2564,Near Ballfield,yes,M144,John V. Lindsay East River Park,Manhattan,1,100039941.0,M144-DF0371,M-03,M144,"E Wheelchair, Near Ballfield",Active,2018-04-04,4


In [17]:
spark.sql(
    """
    DROP TABLE IF EXISTS Projeto.DrinkingFountains
    """
)

spark.sql(
    """
    DROP DATABASE IF EXISTS Projeto CASCADE
    """
)

DataFrame[]

In [18]:
spark.sql(
    """
    create database Projeto location 'hdfs://hdfs-nn:9000/trabalho/silver/Projeto.db'
    """
)

DataFrame[]

In [19]:
spark.sql(
    """
    CREATE EXTERNAL TABLE Projeto.DrinkingFountains (
        FountainTy VARCHAR(50),
        the_geom VARCHAR(50),
        OBJECTID INT,
        Position VARCHAR(250),
        Painted VARCHAR(50),
        GISPROPNUM VARCHAR(250),
        SIGNNAME VARCHAR(250),
        FountainCo INT,
        GISOBJID INT,
        SYSTEM VARCHAR(250),
        DEPARTMENT VARCHAR(250),
        PARENTID VARCHAR(250),
        DESCRIPTIO VARCHAR(250),
        FEATURESTA VARCHAR(250),
        Data_Collection date,
        Mes INT
    )
    USING DELTA
    PARTITIONED BY (
        BOROUGH VARCHAR(250)
        
    )
    LOCATION 'hdfs://hdfs-nn:9000/trabalho/silver/Projeto.db/DrinkingFountains'
    """
)

DataFrame[]

In [20]:
projeto_green4 \
    .select("FountainTy","the_geom","OBJECTID","Position","Painted",
            "GISPROPNUM","SIGNNAME", "BOROUGH", "FountainCo","GISOBJID","SYSTEM","DEPARTMENT",
            "PARENTID", "DESCRIPTIO", "FEATURESTA", "Data_Collection", "Mes") \
    .write \
    .mode("overwrite") \
    .partitionBy("BOROUGH") \
    .format("delta") \
    .save("hdfs://hdfs-nn:9000/trabalho/silver/Projeto.db/DrinkingFountains")

In [21]:
spark.table("Projeto.DrinkingFountains").toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Painted,GISPROPNUM,SIGNNAME,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,PARENTID,DESCRIPTIO,FEATURESTA,Data_Collection,Mes,BOROUGH
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,no,B100,Seth Low Playground/ Bealin Square,2,100038957.0,B100-DF0647,B-11,B100,"F High Low, Out in Open",Active,2018-02-10,2,Brooklyn
1,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",,B302,Charlie's Place,1,100040304.0,B302-DF0897,B-03,B302,"C, In Shade, In Playground",Active,2018-05-29,5,Brooklyn
2,E Wheelchair,POINT (-73.92412001948213 40.703148631662614),23,Out in Open,yes,B016,Maria Hernandez Park,1,100037411.0,B016-DF0013,B-04,B016,"E Wheelchair, Out in Open",Active,2018-01-11,1,Brooklyn
3,C,POINT (-74.01152114448477 40.630582530228764),1898,In Playground,no,B052,Leif Ericson Park,1,100039289.0,B052-DF0762,B-10,B052,"C, In Playground",Active,2018-02-15,2,Brooklyn
4,C,POINT (-73.98647089425904 40.660201255822955),2894,In Shade,,B255G,Butterfly Gardens,1,100040265.0,B255G-DF0895,B-07,B255G,"C, In Shade",Active,2018-05-29,5,Brooklyn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,E Wheelchair,POINT (-74.0792508385633 40.62266148329357),1533,In Playground,yes,R061,Stapleton Playground,1,100038927.0,R061-DF0068,R-01,R061,"E Wheelchair, In Playground",Active,2018-02-08,2,Staten Island
3116,D,POINT (-74.21031591361488 40.533181277394675),1868,Near Ballfield,yes,R106,Bloomingdale Park,1,100039195.0,R106-DF0138,R-03,R106,"D, Near Ballfield",Active,2018-02-15,2,Staten Island
3117,D,POINT (-74.09878669564895 40.613386552308874),1762,Out in Open,no,R069,Terrace Playground,1,100039136.0,R069-DF0110,R-01,R069,"D, Out in Open",Active,2018-02-13,2,Staten Island
3118,A,POINT (-74.16297563648502 40.616399379236725),1427,Near Ballfield,yes,R075A,Father Macris Park,1,100038865.0,R075A-DF0021,R-02,R075A,"A, Near Ballfield",Active,2018-02-05,2,Staten Island
