In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: getOrCreate() hands back an
# already-running session when there is one and builds a new one otherwise.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Regex for the `location` column, which holds text of the form "POINT (<x> <y>)":
# group 1 captures X, group 2 captures Y.
# Written as a raw string so `\(` / `\)` are passed to the regex engine verbatim
# instead of relying on Python tolerating unknown escape sequences
# (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
point_pattern = r'POINT \(([^ ]+) ([^ ]+)\)'

# Load Apartments.csv with an explicit schema (no type inference). The original
# `location` string is kept, and numeric locationX / locationY columns are
# derived from it.
df = (
    spark.read.csv("./work/datasets/Attributes/Apartments.csv", header=True, schema=StructType([
        StructField('apartmentId', IntegerType()),
        StructField('rentalCost', FloatType()),
        StructField('maxOccupancy', IntegerType()),
        StructField('numberOfRooms', IntegerType()),
        StructField('location', StringType()),
        StructField('buildingId', IntegerType()),
    ]))
    # NOTE(review): 'float' is 32-bit, so coordinates are rounded
    # (e.g. "1077.69794..." is displayed as 1077.698). Consider 'double' if
    # downstream consumers need the full precision -- confirm before changing,
    # since it alters the exported column type.
    .withColumn('locationX', F.regexp_extract('location', point_pattern, 1).cast('float'))
    .withColumn('locationY', F.regexp_extract('location', point_pattern, 2).cast('float'))
)

# Sanity check: print the resulting schema and preview the first two rows.
df.printSchema()
df.show(2)

root
 |-- apartmentId: integer (nullable = true)
 |-- rentalCost: float (nullable = true)
 |-- maxOccupancy: integer (nullable = true)
 |-- numberOfRooms: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- buildingId: integer (nullable = true)
 |-- locationX: float (nullable = true)
 |-- locationY: float (nullable = true)

+-----------+----------+------------+-------------+--------------------+----------+----------+---------+
|apartmentId|rentalCost|maxOccupancy|numberOfRooms|            location|buildingId| locationX|locationY|
+-----------+----------+------------+-------------+--------------------+----------+----------+---------+
|          1|    768.16|           2|            4|POINT (1077.69794...|       340|  1077.698| 648.4427|
|          2|   1014.55|           2|            1|POINT (-185.92928...|       752|-185.92929|1520.3271|
+-----------+----------+------------+-------------+--------------------+----------+----------+---------+
only showing top 2 rows



In [3]:
# Export the apartments DataFrame as Parquet; 'overwrite' replaces whatever
# already exists at the target path.
(
    df.write
    .mode('overwrite')
    .parquet("./work/tobi/parquet/apartments.parquet")
)

In [4]:
# Regex for a value wrapped in square brackets ("[...]"); group 1 captures the
# text between the brackets.
# Raw string so `\[` / `\]` are not treated as (invalid) Python escape
# sequences (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
bracket_list_pattern = r'^\[(.*)\]$'

# Load Buildings.csv with an explicit schema. `units` is read as a string and
# then converted to an array of strings by stripping the surrounding brackets
# and splitting on commas. Rows with no `units` value stay null.
df2 = (
    spark.read.csv("./work/datasets/Attributes/Buildings.csv", header=True, schema=StructType([
        StructField('buildingId', IntegerType()),
        StructField('location', StringType()),
        StructField('buildingType', StringType()),
        StructField('maxOccupancy', IntegerType()),
        StructField('units', StringType()),
    ]))
    # NOTE(review): elements are kept as strings and are not trimmed; verify
    # whether the source uses "," or ", " as the separator.
    .withColumn('units', F.split(F.regexp_extract('units', bracket_list_pattern, 1), ','))
)

# Sanity check: print the resulting schema and preview the first two rows.
df2.printSchema()
df2.show(2)

# Export as Parquet, replacing any previous export at this path.
df2.write.mode('overwrite').parquet("./work/tobi/parquet/buildings.parquet")

root
 |-- buildingId: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- buildingType: string (nullable = true)
 |-- maxOccupancy: integer (nullable = true)
 |-- units: array (nullable = true)
 |    |-- element: string (containsNull = false)

+----------+--------------------+------------+------------+--------------------+
|buildingId|            location|buildingType|maxOccupancy|               units|
+----------+--------------------+------------+------------+--------------------+
|         1|POLYGON ((350.063...|  Commercial|        null|                null|
|         2|POLYGON ((-1926.9...|  Residental|          12|[481, 498, 534, 6...|
+----------+--------------------+------------+------------+--------------------+
only showing top 2 rows



In [5]:
# Regex for the `location` column, which holds text of the form "POINT (<x> <y>)":
# group 1 captures X, group 2 captures Y.
# Raw string so `\(` / `\)` are not treated as (invalid) Python escape
# sequences (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
point_pattern = r'POINT \(([^ ]+) ([^ ]+)\)'

# Load Employers.csv with an explicit schema and derive numeric
# locationX / locationY columns from the `location` string.
df3 = (
    spark.read.csv("./work/datasets/Attributes/Employers.csv", header=True, schema=StructType([
        StructField('employerId', IntegerType()),
        StructField('location', StringType()),
        StructField('buildingId', IntegerType()),
    ]))
    # NOTE(review): 'float' is 32-bit, so coordinates are rounded on cast;
    # consider 'double' if full precision is needed -- confirm with consumers.
    .withColumn('locationX', F.regexp_extract('location', point_pattern, 1).cast('float'))
    .withColumn('locationY', F.regexp_extract('location', point_pattern, 2).cast('float'))
)

# Sanity check: print the resulting schema and preview the first two rows.
df3.printSchema()

df3.show(2)

# Export as Parquet, replacing any previous export at this path.
df3.write.mode('overwrite').parquet("./work/tobi/parquet/employers.parquet")

root
 |-- employerId: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- buildingId: integer (nullable = true)
 |-- locationX: float (nullable = true)
 |-- locationY: float (nullable = true)

+----------+--------------------+----------+----------+---------+
|employerId|            location|buildingId| locationX|locationY|
+----------+--------------------+----------+----------+---------+
|       379|POINT (-1849.9971...|       823|-1849.9972|1744.6011|
|       380|POINT (41.5178376...|       154| 41.517838|418.72647|
+----------+--------------------+----------+----------+---------+
only showing top 2 rows



In [9]:
# Regex for a value wrapped in square brackets ("[...]"); group 1 captures the
# text between the brackets.
# Raw string so `\[` / `\]` are not treated as (invalid) Python escape
# sequences (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
bracket_list_pattern = r'^\[(.*)\]$'

# Load Jobs.csv with an explicit schema. `startTime` / `endTime` are kept as
# strings (values look like "7:46:00 AM"); `daysToWork` is converted from a
# bracketed, comma-separated string into an array of strings.
df4 = (
    spark.read.csv("./work/datasets/Attributes/Jobs.csv", header=True, schema=StructType([
        StructField('jobId', IntegerType()),
        StructField('employerId', IntegerType()),
        StructField('hourlyRate', FloatType()),
        StructField('startTime', StringType()),
        StructField('endTime', StringType()),
        StructField('daysToWork', StringType()),
        StructField('educationRequirement', StringType()),
    ]))
    # NOTE(review): elements are not trimmed; verify whether the source uses
    # "," or ", " as the separator.
    .withColumn('daysToWork', F.split(F.regexp_extract('daysToWork', bracket_list_pattern, 1), ','))
)

# Sanity check: print the resulting schema and preview the first two rows.
df4.printSchema()

df4.show(2)

# Export as Parquet, replacing any previous export at this path.
df4.write.mode('overwrite').parquet("./work/tobi/parquet/jobs.parquet")

root
 |-- jobId: integer (nullable = true)
 |-- employerId: integer (nullable = true)
 |-- hourlyRate: float (nullable = true)
 |-- startTime: string (nullable = true)
 |-- endTime: string (nullable = true)
 |-- daysToWork: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- educationRequirement: string (nullable = true)

+-----+----------+----------+----------+----------+--------------------+--------------------+
|jobId|employerId|hourlyRate| startTime|   endTime|          daysToWork|educationRequirement|
+-----+----------+----------+----------+----------+--------------------+--------------------+
|    0|       379|      10.0|7:46:00 AM|3:46:00 PM|[Monday, Tuesday,...| HighSchoolOrCollege|
|    1|       379| 22.217634|7:31:00 AM|3:31:00 PM|[Monday, Tuesday,...|           Bachelors|
+-----+----------+----------+----------+----------+--------------------+--------------------+
only showing top 2 rows



In [10]:
# Column layout of Participants.csv, declared explicitly so Spark does not
# have to infer the types.
participants_schema = StructType([
    StructField('participantId', IntegerType()),
    StructField('householdSize', IntegerType()),
    StructField('haveKids', BooleanType()),
    StructField('age', IntegerType()),
    StructField('educationLevel', StringType()),
    StructField('interestGroup', StringType()),
    StructField('joviality', FloatType()),
])

# Read the participants table; the first line of the file is the header row.
df5 = spark.read.csv(
    "./work/datasets/Attributes/Participants.csv",
    header=True,
    schema=participants_schema,
)

# Sanity check: print the schema and preview the first two rows.
df5.printSchema()
df5.show(2)

# Export as Parquet, replacing any previous export at this path.
(
    df5.write
    .mode('overwrite')
    .parquet("./work/tobi/parquet/participants.parquet")
)

root
 |-- participantId: integer (nullable = true)
 |-- householdSize: integer (nullable = true)
 |-- haveKids: boolean (nullable = true)
 |-- age: integer (nullable = true)
 |-- educationLevel: string (nullable = true)
 |-- interestGroup: string (nullable = true)
 |-- joviality: float (nullable = true)

+-------------+-------------+--------+---+-------------------+-------------+-----------+
|participantId|householdSize|haveKids|age|     educationLevel|interestGroup|  joviality|
+-------------+-------------+--------+---+-------------------+-------------+-----------+
|            0|            3|    true| 36|HighSchoolOrCollege|            H|0.001626703|
|            1|            3|    true| 25|HighSchoolOrCollege|            B|  0.3280865|
+-------------+-------------+--------+---+-------------------+-------------+-----------+
only showing top 2 rows



In [11]:
# Regex for the `location` column, which holds text of the form "POINT (<x> <y>)":
# group 1 captures X, group 2 captures Y.
# Raw string so `\(` / `\)` are not treated as (invalid) Python escape
# sequences (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
point_pattern = r'POINT \(([^ ]+) ([^ ]+)\)'

# Load Pubs.csv with an explicit schema and derive numeric
# locationX / locationY columns from the `location` string.
df6 = (
    spark.read.csv("./work/datasets/Attributes/Pubs.csv", header=True, schema=StructType([
        StructField('pubId', IntegerType()),
        StructField('hourlyCost', FloatType()),
        StructField('maxOccupancy', IntegerType()),
        StructField('location', StringType()),
        StructField('buildingId', IntegerType()),
    ]))
    # NOTE(review): 'float' is 32-bit, so coordinates are rounded on cast;
    # consider 'double' if full precision is needed -- confirm with consumers.
    .withColumn('locationX', F.regexp_extract('location', point_pattern, 1).cast('float'))
    .withColumn('locationY', F.regexp_extract('location', point_pattern, 2).cast('float'))
)

# Sanity check: print the resulting schema and preview the first two rows.
df6.printSchema()

df6.show(2)

# Export as Parquet, replacing any previous export at this path.
df6.write.mode('overwrite').parquet("./work/tobi/parquet/pubs.parquet")

root
 |-- pubId: integer (nullable = true)
 |-- hourlyCost: float (nullable = true)
 |-- maxOccupancy: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- buildingId: integer (nullable = true)
 |-- locationX: float (nullable = true)
 |-- locationY: float (nullable = true)

+-----+----------+------------+--------------------+----------+---------+---------+
|pubId|hourlyCost|maxOccupancy|            location|buildingId|locationX|locationY|
+-----+----------+------------+--------------------+----------+---------+---------+
|  442|  8.281103|          64|POINT (964.438023...|       556|964.43805|3991.6035|
|  443|  6.417435|          64|POINT (1809.88017...|        29|1809.8801|4339.1724|
+-----+----------+------------+--------------------+----------+---------+---------+
only showing top 2 rows



In [12]:
# Regex for the `location` column, which holds text of the form "POINT (<x> <y>)":
# group 1 captures X, group 2 captures Y.
# Raw string so `\(` / `\)` are not treated as (invalid) Python escape
# sequences (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
point_pattern = r'POINT \(([^ ]+) ([^ ]+)\)'

# Load Restaurants.csv with an explicit schema and derive numeric
# locationX / locationY columns from the `location` string.
df7 = (
    spark.read.csv("./work/datasets/Attributes/Restaurants.csv", header=True, schema=StructType([
        StructField('restaurantId', IntegerType()),
        StructField('foodCost', FloatType()),
        StructField('maxOccupancy', IntegerType()),
        StructField('location', StringType()),
        StructField('buildingId', IntegerType()),
    ]))
    # NOTE(review): 'float' is 32-bit, so coordinates are rounded on cast;
    # consider 'double' if full precision is needed -- confirm with consumers.
    .withColumn('locationX', F.regexp_extract('location', point_pattern, 1).cast('float'))
    .withColumn('locationY', F.regexp_extract('location', point_pattern, 2).cast('float'))
)

# Sanity check: print the resulting schema and preview the first two rows.
df7.printSchema()

df7.show(2)

# Export as Parquet, replacing any previous export at this path.
df7.write.mode('overwrite').parquet("./work/tobi/parquet/restaurants.parquet")

root
 |-- restaurantId: integer (nullable = true)
 |-- foodCost: float (nullable = true)
 |-- maxOccupancy: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- buildingId: integer (nullable = true)
 |-- locationX: float (nullable = true)
 |-- locationY: float (nullable = true)

+------------+--------+------------+--------------------+----------+---------+---------+
|restaurantId|foodCost|maxOccupancy|            location|buildingId|locationX|locationY|
+------------+--------+------------+--------------------+----------+---------+---------+
|         445|    5.15|          71|POINT (631.513072...|       304|631.51306|2001.4772|
|         446|    4.17|          82|POINT (413.840000...|       308|   413.84|1194.1287|
+------------+--------+------------+--------------------+----------+---------+---------+
only showing top 2 rows



In [13]:
# Regex for the `location` column, which holds text of the form "POINT (<x> <y>)":
# group 1 captures X, group 2 captures Y.
# Raw string so `\(` / `\)` are not treated as (invalid) Python escape
# sequences (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
point_pattern = r'POINT \(([^ ]+) ([^ ]+)\)'

# Load Schools.csv with an explicit schema and derive numeric
# locationX / locationY columns from the `location` string.
df8 = (
    spark.read.csv("./work/datasets/Attributes/Schools.csv", header=True, schema=StructType([
        StructField('schoolId', IntegerType()),
        StructField('monthlyFees', FloatType()),
        StructField('maxEnrollment', IntegerType()),
        StructField('location', StringType()),
        StructField('buildingId', IntegerType()),
    ]))
    # NOTE(review): 'float' is 32-bit, so coordinates are rounded on cast;
    # consider 'double' if full precision is needed -- confirm with consumers.
    .withColumn('locationX', F.regexp_extract('location', point_pattern, 1).cast('float'))
    .withColumn('locationY', F.regexp_extract('location', point_pattern, 2).cast('float'))
)

# Sanity check: print the resulting schema and preview the first two rows.
df8.printSchema()

df8.show(2)

# Export as Parquet, replacing any previous export at this path.
df8.write.mode('overwrite').parquet("./work/tobi/parquet/schools.parquet")

root
 |-- schoolId: integer (nullable = true)
 |-- monthlyFees: float (nullable = true)
 |-- maxEnrollment: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- buildingId: integer (nullable = true)
 |-- locationX: float (nullable = true)
 |-- locationY: float (nullable = true)

+--------+-----------+-------------+--------------------+----------+----------+---------+
|schoolId|monthlyFees|maxEnrollment|            location|buildingId| locationX|locationY|
+--------+-----------+-------------+--------------------+----------+----------+---------+
|       0|  12.812445|          242|POINT (-376.75050...|       662|-376.75052|1607.9844|
|     450|   91.14352|          418|POINT (-2597.4476...|       943|-2597.4478|3194.1548|
+--------+-----------+-------------+--------------------+----------+----------+---------+
only showing top 2 rows

