In [1]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('younicam-AI')
    .getOrCreate()
)

# Bare expression: lets the notebook render the session summary.
spark

In [2]:
# Load the presence records. multiLine=True is needed when a JSON document
# spans several lines instead of holding one record per line.
presencesDF = spark.read.json(
    "./data/presences.json",
    multiLine=True,
)

# Preview the first five rows without truncating long cell values.
presencesDF.show(n=5, truncate=False)

+------------------------+----+------------------------+------------------------+------------------------+----+-----+----+
|_id                     |aula|date                    |inDate                  |outDate                 |polo|posto|sede|
+------------------------+----+------------------------+------------------------+------------------------+----+-----+----+
|5fa8ef7d1bd2a03f4641a15e|1   |2020-11-09T07:27:57.078Z|2020-11-09T07:27:57.078Z|2020-11-09T12:05:00.362Z|1   |1    |1   |
|5fa8efa51bd2a03f4641a15f|1   |2020-11-09T07:28:37.074Z|2020-11-09T07:28:37.074Z|2020-11-09T12:05:00.363Z|1   |2    |1   |
|5fa8f0751bd2a03f4641a160|1   |2020-11-09T07:32:05.879Z|2020-11-09T07:32:05.878Z|2020-11-09T12:05:00.364Z|1   |3    |1   |
|5fa8f0811bd2a03f4641a161|1   |2020-11-09T07:32:17.390Z|2020-11-09T07:32:17.390Z|2020-11-09T07:32:20.897Z|1   |4    |1   |
|5fa8f0891bd2a03f4641a162|1   |2020-11-09T07:32:25.980Z|2020-11-09T07:32:25.980Z|2020-11-09T07:32:36.245Z|1   |5    |1   |
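+------------------------+----+------------------------+------------------------+------------------------+----+-----+----+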

In [3]:
# Total number of presence records loaded from the JSON file.
presencesDF.count()

9841

In [4]:
# List the column names inferred from the JSON file.
presencesDF.columns

['_id', 'aula', 'date', 'inDate', 'outDate', 'polo', 'posto', 'sede']

In [5]:
# Inspect the inferred column types (the output shows every field read as a string).
presencesDF.dtypes

[('_id', 'string'),
 ('aula', 'string'),
 ('date', 'string'),
 ('inDate', 'string'),
 ('outDate', 'string'),
 ('polo', 'string'),
 ('posto', 'string'),
 ('sede', 'string')]

In [6]:
# Count the null entries in every column.

from pyspark.sql.functions import isnull, when, count, col

# `when(cond, c)` is given the column *name* (a Python str), so it produces
# that name as a non-null literal where the cell is null and NULL elsewhere.
# `count` skips NULLs, so each aggregate is the number of null cells.
presencesDF.select(
    [
        count(when(col(c).isNull(), c)).alias(c)
        for c in presencesDF.columns
    ]
).show()

+---+----+----+------+-------+----+-----+----+
|_id|aula|date|inDate|outDate|polo|posto|sede|
+---+----+----+------+-------+----+-----+----+
|  0|   0|   0|     0|    223|   0|    0|   0|
+---+----+----+------+-------+----+-----+----+



In [7]:
# Drop every row that contains a missing value. The placeholder string '?'
# is turned into null first so that it is dropped as well.
# NOTE(review): the row count after this step (9618 = 9841 - 223) matches the
# null outDate count alone, so '?' does not seem to occur in this data set.

presencesDF = presencesDF.na.replace('?', None).na.drop(how='any')

# Row count after the clean-up.
presencesDF.count()

9618

In [8]:
# Drop the "date" column; no later visible cell reads it.
# NOTE(review): in the sample rows it matches inDate to within a millisecond — confirm it is redundant.

presencesDF = presencesDF.drop("date")

# Show the remaining column names to confirm the drop.
presencesDF.columns

['_id', 'aula', 'inDate', 'outDate', 'polo', 'posto', 'sede']

In [9]:
# Convert the ISO-8601 strings in inDate and outDate into timestamp columns.

presencesDF = (
    presencesDF
    .withColumn("inDate", col("inDate").cast("timestamp"))
    .withColumn("outDate", col("outDate").cast("timestamp"))
)

# Confirm the new column types.
presencesDF.dtypes



[('_id', 'string'),
 ('aula', 'string'),
 ('inDate', 'timestamp'),
 ('outDate', 'timestamp'),
 ('polo', 'string'),
 ('posto', 'string'),
 ('sede', 'string')]

In [10]:
# Break inDate / outDate into numeric parts (date from inDate, hour and minute
# from both), then drop the columns that are no longer needed:
# _id, posto, inDate, outDate.
#
# NOTE(review): hour() is evaluated in the Spark session time zone. The sample
# output shows 07:27Z as hour 8, so the session is apparently UTC+1 — confirm
# this is intended, since results will differ on a machine with another zone.

from pyspark.sql.functions import year, month, dayofmonth, hour, minute

presencesDF = (
    presencesDF
    .withColumn("day", dayofmonth("inDate"))
    .withColumn("month", month("inDate"))
    .withColumn("year", year("inDate"))
    .withColumn("inHour", hour("inDate"))
    .withColumn("inMinute", minute("inDate"))
    .withColumn("outHour", hour("outDate"))
    .withColumn("outMinute", minute("outDate"))
    .drop("_id", "posto", "inDate", "outDate")
)

# Confirm the resulting schema.
presencesDF.dtypes

[('aula', 'string'),
 ('polo', 'string'),
 ('sede', 'string'),
 ('day', 'int'),
 ('month', 'int'),
 ('year', 'int'),
 ('inHour', 'int'),
 ('inMinute', 'int'),
 ('outHour', 'int'),
 ('outMinute', 'int')]

In [11]:
# Cast the remaining string columns (aula, polo, sede) to integers.
# NOTE(review): monotonically_increasing_id is imported but not used in this
# cell, and _id was already dropped earlier, so no progressive id is created.

from pyspark.sql.types import IntegerType
from pyspark.sql.functions import monotonically_increasing_id

presencesDF = (
    presencesDF
    .withColumn("aula", col("aula").cast(IntegerType()))
    .withColumn("polo", col("polo").cast(IntegerType()))
    .withColumn("sede", col("sede").cast(IntegerType()))
)

# Preview the fully numeric frame.
presencesDF.show(n=5, truncate=False)

+----+----+----+---+-----+----+------+--------+-------+---------+
|aula|polo|sede|day|month|year|inHour|inMinute|outHour|outMinute|
+----+----+----+---+-----+----+------+--------+-------+---------+
|1   |1   |1   |9  |11   |2020|8     |27      |13     |5        |
|1   |1   |1   |9  |11   |2020|8     |28      |13     |5        |
|1   |1   |1   |9  |11   |2020|8     |32      |13     |5        |
|1   |1   |1   |9  |11   |2020|8     |32      |8      |32       |
|1   |1   |1   |9  |11   |2020|8     |32      |8      |32       |
+----+----+----+---+-----+----+------+--------+-------+---------+
only showing top 5 rows



In [22]:
# Number of presence records for each (aula, polo, sede) combination and
# calendar day.
aggregated = (
    presencesDF
    .groupBy("aula", "polo", "sede", "day", "month", "year")
    .count()
)

# Preview the aggregated counts.
aggregated.show()

+----+----+----+---+-----+----+-----+
|aula|polo|sede|day|month|year|count|
+----+----+----+---+-----+----+-----+
|  10|   5|   1| 25|   11|2020|    2|
|  32|  13|   1| 26|   11|2020|    3|
|  19|   5|   1| 26|   11|2020|   10|
|   1|   1|   1| 27|   11|2020|   46|
|  21|   1|   1|  3|   12|2020|    3|
|  51|   7|   1| 19|   11|2020|    2|
|   7|   7|   1| 23|   11|2020|   28|
|  19|   5|   1| 25|   11|2020|   17|
|  29|   7|   1| 26|   11|2020|    1|
|   2|   2|   1| 27|   11|2020|    5|
|  26|  14|   2| 11|   11|2020|    1|
|   1|   5|   1|  9|   11|2020|   98|
|  16|   2|   1| 24|   11|2020|    3|
|  54|  12|   1| 30|   11|2020|    1|
|  57|  12|   1| 20|   11|2020|    1|
|  15|  10|   1| 24|   11|2020|    5|
|  43|   7|   1|  1|   12|2020|   44|
|   1|   1|   1|  3|   12|2020|   75|
|   4|   4|   2| 14|   12|2020|   21|
|  17|  11|   1|  9|   11|2020|   13|
+----+----+----+---+-----+----+-----+
only showing top 20 rows



In [15]:
# Preview the cleaned, fully numeric frame.
# NOTE(review): this cell's execution count (15) is lower than the previous
# cell's (22), so the notebook was run out of order — verify with Restart & Run All.
presencesDF.show()

+----+----+----+---+-----+----+------+--------+-------+---------+
|aula|polo|sede|day|month|year|inHour|inMinute|outHour|outMinute|
+----+----+----+---+-----+----+------+--------+-------+---------+
|   1|   1|   1|  9|   11|2020|     8|      27|     13|        5|
|   1|   1|   1|  9|   11|2020|     8|      28|     13|        5|
|   1|   1|   1|  9|   11|2020|     8|      32|     13|        5|
|   1|   1|   1|  9|   11|2020|     8|      32|      8|       32|
|   1|   1|   1|  9|   11|2020|     8|      32|      8|       32|
|   1|   1|   1|  9|   11|2020|     8|      36|      8|       53|
|   2|   2|   1|  9|   11|2020|     8|      36|     11|        3|
|   2|   2|   1|  9|   11|2020|     8|      36|     11|        3|
|   1|   1|   1|  9|   11|2020|     8|      45|      8|       52|
|   1|   1|   1|  9|   11|2020|     8|      45|      8|       52|
|   1|   1|   1|  9|   11|2020|     8|      45|     13|        5|
|   3|   3|   1|  9|   11|2020|     8|      46|     13|        5|
|   3|   3