In [1]:
from pyspark.sql import SparkSession

# Obtain the SparkSession that the CSV reads below go through; getOrCreate()
# returns an already-running session if there is one, otherwise it builds one.
spark = SparkSession.builder.getOrCreate()

In [3]:
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Load the check-in journal with an explicit, typed schema (all columns
# nullable) and treat the first CSV line as a header row.
df = (
    spark.read
    .schema(
        StructType()
        .add('participantId', IntegerType())
        .add('timestamp', TimestampType())
        .add('venueId', IntegerType())
        .add('venueType', StringType())
    )
    .option("header", True)
    .csv("./work/datasets/Journals/CheckinJournal.csv")
)

# Quick sanity check: print the applied schema and the first two rows.
df.printSchema()
df.show(2)

# Persist as Parquet, replacing whatever already exists at the target path.
df.write.parquet("./work/tobi/parquet/checkin_journal.parquet", mode='overwrite')

root
 |-- participantId: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- venueId: integer (nullable = true)
 |-- venueType: string (nullable = true)

+-------------+-------------------+-------+---------+
|participantId|          timestamp|venueId|venueType|
+-------------+-------------------+-------+---------+
|          619|2022-03-01 05:35:00|   1798|      Pub|
|           15|2022-03-01 05:50:00|   1798|      Pub|
+-------------+-------------------+-------+---------+
only showing top 2 rows



In [4]:
# Load the financial journal with an explicit, typed schema (all columns
# nullable) and treat the first CSV line as a header row.
df2 = (
    spark.read
    .schema(
        StructType()
        .add('participantId', IntegerType())
        .add('timestamp', TimestampType())
        .add('category', StringType())
        .add('amount', DoubleType())
    )
    .option("header", True)
    .csv("./work/datasets/Journals/FinancialJournal.csv")
)

# Quick sanity check: print the applied schema and the first two rows.
df2.printSchema()
df2.show(2)

# Persist as Parquet, replacing whatever already exists at the target path.
df2.write.parquet("./work/tobi/parquet/financial_journal.parquet", mode='overwrite')

root
 |-- participantId: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: double (nullable = true)

+-------------+-------------------+--------+------------------+
|participantId|          timestamp|category|            amount|
+-------------+-------------------+--------+------------------+
|            0|2022-03-01 00:00:00|    Wage| 2472.507558921935|
|            0|2022-03-01 00:00:00| Shelter|-554.9886216892347|
+-------------+-------------------+--------+------------------+
only showing top 2 rows



In [5]:
# Load the social-network edge list with an explicit, typed schema (all
# columns nullable) and treat the first CSV line as a header row.
df3 = (
    spark.read
    .schema(
        StructType()
        .add('timestamp', TimestampType())
        .add('participantIdFrom', IntegerType())
        .add('participantIdTo', IntegerType())
    )
    .option("header", True)
    .csv("./work/datasets/Journals/SocialNetwork.csv")
)

# Quick sanity check: print the applied schema and the first two rows.
df3.printSchema()
df3.show(2)

# Persist as Parquet, replacing whatever already exists at the target path.
df3.write.parquet("./work/tobi/parquet/social_network.parquet", mode='overwrite')

root
 |-- timestamp: timestamp (nullable = true)
 |-- participantIdFrom: integer (nullable = true)
 |-- participantIdTo: integer (nullable = true)

+-------------------+-----------------+---------------+
|          timestamp|participantIdFrom|participantIdTo|
+-------------------+-----------------+---------------+
|2022-03-01 00:00:00|              173|            180|
|2022-03-01 00:00:00|              178|            183|
+-------------------+-----------------+---------------+
only showing top 2 rows



In [7]:
# Load the travel journal with an explicit, typed schema (all columns
# nullable) and treat the first CSV line as a header row.
df4 = (
    spark.read
    .schema(
        StructType()
        .add('participantId', IntegerType())
        .add('travelStartTime', TimestampType())
        .add('travelStartLocationId', IntegerType())
        .add('travelEndTime', TimestampType())
        .add('travelEndLocationId', IntegerType())
        .add('purpose', StringType())
        .add('checkInTime', TimestampType())
        .add('checkOutTime', TimestampType())
        .add('startingBalance', DoubleType())
        .add('endingBalance', DoubleType())
    )
    .option("header", True)
    .csv("./work/datasets/Journals/TravelJournal.csv")
)

# Quick sanity check: print the applied schema and the first two rows.
df4.printSchema()
df4.show(2)

# Persist as Parquet, replacing whatever already exists at the target path.
df4.write.parquet("./work/tobi/parquet/travel_journal.parquet", mode='overwrite')

root
 |-- participantId: integer (nullable = true)
 |-- travelStartTime: timestamp (nullable = true)
 |-- travelStartLocationId: integer (nullable = true)
 |-- travelEndTime: timestamp (nullable = true)
 |-- travelEndLocationId: integer (nullable = true)
 |-- purpose: string (nullable = true)
 |-- checkInTime: timestamp (nullable = true)
 |-- checkOutTime: timestamp (nullable = true)
 |-- startingBalance: double (nullable = true)
 |-- endingBalance: double (nullable = true)

+-------------+-------------------+---------------------+-------------------+-------------------+--------------------+-------------------+-------------------+------------------+-----------------+
|participantId|    travelStartTime|travelStartLocationId|      travelEndTime|travelEndLocationId|             purpose|        checkInTime|       checkOutTime|   startingBalance|    endingBalance|
+-------------+-------------------+---------------------+-------------------+-------------------+--------------------+----------