In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() hands back an already
# existing session if there is one, otherwise it builds a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Regex for the `location` column, which holds text of the form
# "POINT (<x> <y>)": capture group 1 is the x token, group 2 the y token.
# Written as a raw string so the backslashes in `\(` / `\)` reach the regex
# engine untouched and Python does not warn about invalid escape sequences.
POINT_PATTERN = r'POINT \(([^ ]+) ([^ ]+)\)'

# Explicit schema for Apartments.csv (the file's header row is skipped via
# header=True below); `location` is read as plain text and parsed afterwards.
apartments_schema = StructType([
    StructField('apartmentId', IntegerType()),
    StructField('rentalCost', FloatType()),
    StructField('maxOccupancy', IntegerType()),
    StructField('numberOfRooms', IntegerType()),
    StructField('location', StringType()),
    StructField('buildingId', IntegerType()),
])

# Read the CSV, then derive two float columns from the two captured
# coordinate tokens of `location`. The original `location` column is kept.
df = (
    spark.read.csv(
        "./work/datasets/Attributes/Apartments.csv",
        header=True,
        schema=apartments_schema,
    )
    .withColumn('locationX', F.regexp_extract('location', POINT_PATTERN, 1).cast('float'))
    .withColumn('locationY', F.regexp_extract('location', POINT_PATTERN, 2).cast('float'))
)

# Quick sanity check: print the resulting schema and the first two rows.
df.printSchema()
df.show(2)

root
 |-- apartmentId: integer (nullable = true)
 |-- rentalCost: float (nullable = true)
 |-- maxOccupancy: integer (nullable = true)
 |-- numberOfRooms: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- buildingId: integer (nullable = true)
 |-- locationX: float (nullable = true)
 |-- locationY: float (nullable = true)

+-----------+----------+------------+-------------+--------------------+----------+----------+---------+
|apartmentId|rentalCost|maxOccupancy|numberOfRooms|            location|buildingId| locationX|locationY|
+-----------+----------+------------+-------------+--------------------+----------+----------+---------+
|          1|    768.16|           2|            4|POINT (1077.69794...|       340|  1077.698| 648.4427|
|          2|   1014.55|           2|            1|POINT (-185.92928...|       752|-185.92929|1520.3271|
+-----------+----------+------------+-------------+--------------------+----------+----------+---------+
only showing top 2 rows



In [3]:
# Save the apartments DataFrame (including the derived locationX/locationY
# columns) as Parquet; 'overwrite' mode replaces any existing output there.
(
    df.write
    .mode('overwrite')
    .parquet("./work/tobi/parquet/apartments.parquet")
)

In [15]:
# Regex for the raw `units` text: a value wrapped in square brackets, with
# capture group 1 holding everything between the brackets.
# Written as a raw string so `\[` / `\]` are passed to the regex engine as-is
# and Python does not warn about invalid escape sequences.
BRACKETED_LIST_PATTERN = r'^\[(.*)\]$'

# Explicit schema for Buildings.csv; `units` is first read as plain text and
# converted to an array below.
buildings_schema = StructType([
    StructField('buildingId', IntegerType()),
    StructField('location', StringType()),
    StructField('buildingType', StringType()),
    StructField('maxOccupancy', IntegerType()),
    StructField('units', StringType()),
])

# Read the CSV, then replace `units` with an array of strings: take the text
# between the brackets and split it on commas.
df2 = (
    spark.read.csv(
        "./work/datasets/Attributes/Buildings.csv",
        header=True,
        schema=buildings_schema,
    )
    .withColumn('units', F.split(F.regexp_extract('units', BRACKETED_LIST_PATTERN, 1), ','))
)

# Quick sanity check: print the resulting schema and the first two rows.
df2.printSchema()
df2.show(2)

# Save the buildings DataFrame as Parquet; 'overwrite' mode replaces any
# existing output at this path.
df2.write.mode('overwrite').parquet("./work/tobi/parquet/buildings.parquet")

root
 |-- buildingId: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- buildingType: string (nullable = true)
 |-- maxOccupancy: integer (nullable = true)
 |-- units: array (nullable = true)
 |    |-- element: string (containsNull = false)

+----------+--------------------+------------+------------+--------------------+
|buildingId|            location|buildingType|maxOccupancy|               units|
+----------+--------------------+------------+------------+--------------------+
|         1|POLYGON ((350.063...|  Commercial|        null|                null|
|         2|POLYGON ((-1926.9...|  Residental|          12|[481, 498, 534, 6...|
+----------+--------------------+------------+------------+--------------------+
only showing top 2 rows

