In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('younicam-AI').getOrCreate()

spark

#### Import the data related to the registered presences which is composed of: 
 - _id: the unique id given by MongoDB
 - aula: the room
 - polo: the building
 - sede: the city
 - inDate: the datetime for the room access
 - outDate: the datetime for the room exit
 - date: the datetime for the last modification made on the record

In [2]:
presencesDF = spark.read.json("./data/presences.json", multiLine=True)

presencesDF.show(5)

+--------------------+----+--------------------+--------------------+--------------------+----+-----+----+
|                 _id|aula|                date|              inDate|             outDate|polo|posto|sede|
+--------------------+----+--------------------+--------------------+--------------------+----+-----+----+
|5fa8ef7d1bd2a03f4...|   1|2020-11-09T07:27:...|2020-11-09T07:27:...|2020-11-09T12:05:...|   1|    1|   1|
|5fa8efa51bd2a03f4...|   1|2020-11-09T07:28:...|2020-11-09T07:28:...|2020-11-09T12:05:...|   1|    2|   1|
|5fa8f0751bd2a03f4...|   1|2020-11-09T07:32:...|2020-11-09T07:32:...|2020-11-09T12:05:...|   1|    3|   1|
|5fa8f0811bd2a03f4...|   1|2020-11-09T07:32:...|2020-11-09T07:32:...|2020-11-09T07:32:...|   1|    4|   1|
|5fa8f0891bd2a03f4...|   1|2020-11-09T07:32:...|2020-11-09T07:32:...|2020-11-09T07:32:...|   1|    5|   1|
+--------------------+----+--------------------+--------------------+--------------------+----+-----+----+
only showing top 5 rows



#### Perform some operations to check the state of the data and change the names to improve readibility

In [3]:
presencesDF.count()

9841

In [4]:
presencesDF.columns

['_id', 'aula', 'date', 'inDate', 'outDate', 'polo', 'posto', 'sede']

In [5]:
presencesDF = presencesDF.withColumnRenamed("aula", "room")
presencesDF = presencesDF.withColumnRenamed("polo", "building")
presencesDF = presencesDF.withColumnRenamed("sede", "city")

presencesDF.columns

['_id', 'room', 'date', 'inDate', 'outDate', 'building', 'posto', 'city']

In [6]:
presencesDF.dtypes

[('_id', 'string'),
 ('room', 'string'),
 ('date', 'string'),
 ('inDate', 'string'),
 ('outDate', 'string'),
 ('building', 'string'),
 ('posto', 'string'),
 ('city', 'string')]

#### Look for null values inside each columns and, if present, delete them.

In [7]:
from pyspark.sql.functions import isnull, when, count, col

presencesDF.select([count(when(isnull(c), c)).alias(c) for c in presencesDF.columns]).show()

+---+----+----+------+-------+--------+-----+----+
|_id|room|date|inDate|outDate|building|posto|city|
+---+----+----+------+-------+--------+-----+----+
|  0|   0|   0|     0|    223|       0|    0|   0|
+---+----+----+------+-------+--------+-----+----+



In [8]:
presencesDF = presencesDF.replace('?', None).dropna(how='any')

presencesDF.count()

9618

#### Drop the column date because stores just the date of the last modification made on the record, so it is redundant since the last modification made on the record is the exit saved with outDate

In [9]:
presencesDF = presencesDF.drop("date")

presencesDF.columns

['_id', 'room', 'inDate', 'outDate', 'building', 'posto', 'city']

#### Cast inDate and outDate into timestamp in order to extrapolate day, month, hour and minutes. Then, delete not needed columns

In [10]:
from pyspark.sql.functions import year, month, dayofmonth, hour, minute

presencesDF = presencesDF.withColumn("inDate", presencesDF["inDate"].cast("timestamp"))

presencesDF = presencesDF.withColumn("outDate", presencesDF["outDate"].cast("timestamp"))

presencesDF = presencesDF.withColumn("day", dayofmonth(presencesDF["inDate"]))
presencesDF = presencesDF.withColumn("month", month(presencesDF["inDate"]))
presencesDF = presencesDF.withColumn("inHour", hour(presencesDF["inDate"]))
presencesDF = presencesDF.withColumn("inMinute", minute(presencesDF["inDate"]))
presencesDF = presencesDF.withColumn("outHour", hour(presencesDF["outDate"]))
presencesDF = presencesDF.withColumn("outMinute", minute(presencesDF["outDate"]))

presencesDF = presencesDF.drop("_id", "posto", "inDate", "outDate")

presencesDF.columns

['room',
 'building',
 'city',
 'day',
 'month',
 'inHour',
 'inMinute',
 'outHour',
 'outMinute']

#### Cast the column room, building and city into integer because the machine learning works only with integer values

In [12]:
from pyspark.sql.types import IntegerType

presencesDF = presencesDF.withColumn("room", presencesDF["room"].cast(IntegerType()))
presencesDF = presencesDF.withColumn("building", presencesDF["building"].cast(IntegerType()))
presencesDF = presencesDF.withColumn("city", presencesDF["city"].cast(IntegerType()))

presencesDF.dtypes

[('room', 'int'),
 ('building', 'int'),
 ('city', 'int'),
 ('day', 'int'),
 ('month', 'int'),
 ('inHour', 'int'),
 ('inMinute', 'int'),
 ('outHour', 'int'),
 ('outMinute', 'int')]

#### In order to get the number of people present in a time interval, we can explode a sequence of hours (e.g. for a record with inHour: 8 and outHour 13, the sequence of hours will be: [8,9,10,11,12,13]), group by the hour (and other columns) and get the aggregate count for each group.

In [13]:
import pyspark.sql.functions as F

finalDF = presencesDF.withColumn(
    'hour',
    F.explode(F.sequence('inHour', 'outHour'))
).groupBy(
    'room', 'building', 'city', 'day', 'month', 'hour'
).count()

finalDF.show()

+----+--------+----+---+-----+----+-----+
|room|building|city|day|month|hour|count|
+----+--------+----+---+-----+----+-----+
|   5|       3|   1|  9|   11|   8|    4|
|  16|       2|   1| 10|   11|  13|    1|
|  11|       6|   1| 10|   11|  15|    1|
|  22|       4|   2| 13|   11|  11|    7|
|  26|      14|   2| 13|   11|  11|    1|
|   9|       4|   2| 16|   11|   9|   10|
|   1|       1|   1| 19|   11|  11|   41|
|  21|      16|   3| 24|   11|  18|    8|
|  52|      10|   1| 24|   11|  19|    1|
|   4|       4|   2| 24|   11|  15|   22|
|  60|      11|   1| 25|   11|  19|   16|
|  22|       4|   2| 25|   11|  17|    1|
|  64|      12|   1| 30|   11|  12|   13|
|   7|       7|   1| 30|   11|  16|   11|
|  36|       1|   1|  1|   12|  13|    3|
|   1|       5|   1|  1|   12|  19|   14|
|   4|       4|   2|  1|   12|  18|   23|
|  19|       5|   1|  2|   12|   9|   15|
|  46|       1|   1|  2|   12|  15|    3|
|  13|       9|   4|  2|   12|  19|   13|
+----+--------+----+---+-----+----

#### Create the 1-D array containing the target values and the 2-D array with all the features

In [15]:
import numpy as np

target = np.array(finalDF.select("count").collect()).ravel()

data = np.array(finalDF.select("room", "building", "city", "day", "month", "hour").collect())

In [16]:
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(
    generations=5,
    population_size=100,
    verbosity=2,
    warm_start=True
)

tpot.fit(X_train, y_train)
preds = tpot.predict(X_test)
print(r2_score(y_test, preds))

Optimization Progress:   0%|          | 0/600 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -105.42216235574115

Generation 2 - Current best internal CV score: -104.62318388038477

Generation 3 - Current best internal CV score: -104.16304407489217

Generation 4 - Current best internal CV score: -102.43771964358295

Generation 5 - Current best internal CV score: -96.32285518017609

Best pipeline: DecisionTreeRegressor(AdaBoostRegressor(SelectFwe(input_matrix, alpha=0.02), learning_rate=0.1, loss=exponential, n_estimators=100), max_depth=10, min_samples_leaf=7, min_samples_split=18)
0.48120333243312263
