In [1]:
from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session from the builder.
session_builder = SparkSession.builder.appName('younicam-AI')
spark = session_builder.getOrCreate()

# Leave the session as the last expression so the notebook renders its summary.
spark

#### Import the registered-presence data. Each record is composed of:
 - _id: the unique id given by MongoDB
 - aula: the room
 - polo: the building
 - sede: the city
 - inDate: the datetime for the room access
 - outDate: the datetime for the room exit
 - date: the datetime for the last modification made on the record
 - posto: the seat/place number (present in the data but not used in this analysis)

In [2]:
# Location of the JSON file holding the registered presences.
presences_path = "./data/presences.json"

# multiLine=True is required when the JSON document is spread over several lines.
json_reader = spark.read
presencesDF = json_reader.json(presences_path, multiLine=True)

# Preview the first five records.
presencesDF.show(5)

+--------------------+----+--------------------+--------------------+--------------------+----+-----+----+
|                 _id|aula|                date|              inDate|             outDate|polo|posto|sede|
+--------------------+----+--------------------+--------------------+--------------------+----+-----+----+
|5fa8ef7d1bd2a03f4...|   1|2020-11-09T07:27:...|2020-11-09T07:27:...|2020-11-09T12:05:...|   1|    1|   1|
|5fa8efa51bd2a03f4...|   1|2020-11-09T07:28:...|2020-11-09T07:28:...|2020-11-09T12:05:...|   1|    2|   1|
|5fa8f0751bd2a03f4...|   1|2020-11-09T07:32:...|2020-11-09T07:32:...|2020-11-09T12:05:...|   1|    3|   1|
|5fa8f0811bd2a03f4...|   1|2020-11-09T07:32:...|2020-11-09T07:32:...|2020-11-09T07:32:...|   1|    4|   1|
|5fa8f0891bd2a03f4...|   1|2020-11-09T07:32:...|2020-11-09T07:32:...|2020-11-09T07:32:...|   1|    5|   1|
+--------------------+----+--------------------+--------------------+--------------------+----+-----+----+
only showing top 5 rows



#### Perform some operations to check the state of the data and rename the columns to improve readability

In [3]:
# Number of presence records loaded from the JSON file.
presencesDF.count()

9841

In [4]:
# Column names exactly as they were read from the file, before any renaming.
presencesDF.columns

['_id', 'aula', 'date', 'inDate', 'outDate', 'polo', 'posto', 'sede']

In [5]:
# Original field name -> English name used in the rest of the notebook.
column_translations = {
    "aula": "room",
    "polo": "building",
    "sede": "city",
}

for original_name, english_name in column_translations.items():
    presencesDF = presencesDF.withColumnRenamed(original_name, english_name)

# Show the resulting column names to confirm the renaming.
presencesDF.columns

['_id', 'room', 'date', 'inDate', 'outDate', 'building', 'posto', 'city']

In [6]:
# Inspect the column types inferred by the JSON reader.
presencesDF.dtypes

[('_id', 'string'),
 ('room', 'string'),
 ('date', 'string'),
 ('inDate', 'string'),
 ('outDate', 'string'),
 ('building', 'string'),
 ('posto', 'string'),
 ('city', 'string')]

#### Look for null values in each column and, if any are present, drop the rows that contain them.

In [7]:
from pyspark.sql.functions import isnull, when, count, col

# Build one aggregate per column. when() without otherwise() yields null unless the
# condition holds and count() skips nulls, so each aggregate is the number of rows
# in which that column is null.
null_counters = []
for column_name in presencesDF.columns:
    null_marker = when(isnull(column_name), column_name)
    null_counters.append(count(null_marker).alias(column_name))

presencesDF.select(null_counters).show()

+---+----+----+------+-------+--------+-----+----+
|_id|room|date|inDate|outDate|building|posto|city|
+---+----+----+------+-------+--------+-----+----+
|  0|   0|   0|     0|    223|       0|    0|   0|
+---+----+----+------+-------+--------+-----+----+



In [8]:
# Step 1: turn the '?' placeholder into a proper null.
placeholders_as_null = presencesDF.replace('?', None)

# Step 2: keep only the rows in which every column has a value.
presencesDF = placeholders_as_null.dropna(how='any')

# Number of records left after the clean-up.
presencesDF.count()

9618

#### Drop the `date` column: it only stores the datetime of the last modification made on the record, which is redundant because the last modification is the exit already saved in `outDate`

In [9]:
# Columns carrying no information beyond what the other columns already hold.
redundant_columns = ("date",)

presencesDF = presencesDF.drop(*redundant_columns)

# Confirm which columns remain.
presencesDF.columns

['_id', 'room', 'inDate', 'outDate', 'building', 'posto', 'city']

#### Cast inDate and outDate to timestamp in order to extract the day, month, hour and minute. Then drop the columns that are no longer needed

In [10]:
from pyspark.sql.functions import year, month, dayofmonth, hour, minute

# The access/exit datetimes were read as strings; cast both to timestamp so the
# date-part functions below can be applied to them.
for timestamp_column in ("inDate", "outDate"):
    presencesDF = presencesDF.withColumn(
        timestamp_column, presencesDF[timestamp_column].cast("timestamp")
    )

# New column name -> expression that computes it (insertion order is the column order).
# Day and month are taken from the access time only.
date_parts = {
    "day": dayofmonth("inDate"),
    "month": month("inDate"),
    "inHour": hour("inDate"),
    "inMinute": minute("inDate"),
    "outHour": hour("outDate"),
    "outMinute": minute("outDate"),
}
for part_name, part_expression in date_parts.items():
    presencesDF = presencesDF.withColumn(part_name, part_expression)

# The raw timestamps and the remaining non-feature columns are not needed any more.
presencesDF = presencesDF.drop("_id", "posto", "inDate", "outDate")

presencesDF.columns

['room',
 'building',
 'city',
 'day',
 'month',
 'inHour',
 'inMinute',
 'outHour',
 'outMinute']

#### Cast the room, building and city columns to integer, because the machine learning models used below require numeric features

In [11]:
from pyspark.sql.types import IntegerType

# These identifiers were read as strings; convert each one to an integer column.
for identifier_column in ("room", "building", "city"):
    presencesDF = presencesDF.withColumn(
        identifier_column, presencesDF[identifier_column].cast(IntegerType())
    )

# Every column should now be reported as 'int'.
presencesDF.dtypes

[('room', 'int'),
 ('building', 'int'),
 ('city', 'int'),
 ('day', 'int'),
 ('month', 'int'),
 ('inHour', 'int'),
 ('inMinute', 'int'),
 ('outHour', 'int'),
 ('outMinute', 'int')]

#### In order to get the number of people present in each time interval, we explode a sequence of hours for every record (e.g. for a record with inHour 8 and outHour 13, the sequence of hours is [8, 9, 10, 11, 12, 13]), group by the hour (and the other columns) and compute the count of each group.

In [12]:
import pyspark.sql.functions as F

# Expand every presence into one row per hour between entry and exit (both included).
# NOTE(review): a record whose outHour is smaller than its inHour (e.g. a stay that
# crosses midnight) is not treated specially here — verify that such records do not occur.
hourly_presences = presencesDF.withColumn(
    'hour', F.explode(F.sequence('inHour', 'outHour'))
)

# Count how many presences fall in each (room, building, city, day, month, hour) slot.
group_keys = ['room', 'building', 'city', 'day', 'month', 'hour']
finalDF = hourly_presences.groupBy(*group_keys).count()

finalDF.show(5)

+----+--------+----+---+-----+----+-----+
|room|building|city|day|month|hour|count|
+----+--------+----+---+-----+----+-----+
|   5|       3|   1|  9|   11|   8|    4|
|  16|       2|   1| 10|   11|  13|    1|
|  11|       6|   1| 10|   11|  15|    1|
|  22|       4|   2| 13|   11|  11|    7|
|  26|      14|   2| 13|   11|  11|    1|
+----+--------+----+---+-----+----+-----+
only showing top 5 rows



#### Create the 1-D array containing the target values and the 2-D array with all the features

In [13]:
import numpy as np

# Feature columns, in the order they appear in the `data` matrix.
feature_columns = ["room", "building", "city", "day", "month", "hour"]

# Collect features and target together in a single Spark action. Collecting them with
# two separate actions re-runs the aggregation twice, and Spark does not guarantee the
# same row order across runs, so features and targets could end up misaligned.
collected = np.array(finalDF.select(*feature_columns, "count").collect())

# Keep a well-defined 2-D shape even when the DataFrame is empty.
collected = collected.reshape(-1, len(feature_columns) + 1)

# 2-D feature matrix: one row per group, one column per feature.
data = collected[:, :-1].copy()

# 1-D target vector: the number of presences of the matching row in `data`.
target = collected[:, -1].copy()

In [14]:
import os

import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Fixed seed so that the train/test split and the TPOT search give the same result
# when the notebook is re-run.
SEED = 42

# Folder that receives the predictions on the held-out set.
PREDICTIONS_DIR = "prediction"

# Create the output folder before the search starts: discovering a missing directory
# only after the long-running optimisation would lose the whole run.
os.makedirs(PREDICTIONS_DIR, exist_ok=True)

# Hold out 25% of the rows to evaluate the best pipeline found.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, train_size=0.75, test_size=0.25, random_state=SEED
)

tpot = TPOTRegressor(
    verbosity=2,
    warm_start=True,
    random_state=SEED,
)

# Search for the best pipeline on the training set, then score it (R^2) on the held-out set.
tpot.fit(X_train, y_train)
preds = tpot.predict(X_test)
print(r2_score(y_test, preds))

# Save the best pipeline as a standalone Python script.
tpot.export('tpot_exported_pipeline.py')

np.savetxt(os.path.join(PREDICTIONS_DIR, "preds.csv"), preds, delimiter=",")

Optimization Progress:   0%|          | 0/10100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -50.68914859113579

Generation 2 - Current best internal CV score: -50.68914859113579

Generation 3 - Current best internal CV score: -50.42684286055529

Generation 4 - Current best internal CV score: -50.42684286055529

Generation 5 - Current best internal CV score: -50.15621012008095

Generation 6 - Current best internal CV score: -50.15621012008095

Generation 7 - Current best internal CV score: -49.87747294021666

Generation 8 - Current best internal CV score: -49.68786907567703

Generation 9 - Current best internal CV score: -49.68786907567703

Generation 10 - Current best internal CV score: -49.68786907567703

Generation 11 - Current best internal CV score: -49.552822076213126

Generation 12 - Current best internal CV score: -49.552822076213126

Generation 13 - Current best internal CV score: -49.552822076213126

Generation 14 - Current best internal CV score: -49.31867454752522

Generation 15 - Current best internal CV score: -49.3

#### The execution of TPOT outputs, as the best solution, a pipeline whose final estimator is RandomForestRegressor, together with a Python file that reproduces it. Below, the exported pipeline is executed

In [26]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from tpot.builtins import StackingEstimator

# Single seed shared by the split and by every stochastic estimator, so that the score
# displayed by this cell is the same on every run.
PIPELINE_SEED = 42

training_features, testing_features, training_target, testing_target = \
            train_test_split(data, target, random_state=PIPELINE_SEED)

# Average CV score on the training set was: -47.52158656317905
# (reported by TPOT for the pipeline it exported, before the seed was fixed here)
# NOTE(review): loss="squared_loss" is the name used by the scikit-learn release this
# pipeline was exported with; newer releases call it "squared_error" — adjust if the
# installed version rejects the old name.
exported_pipeline = make_pipeline(
    MinMaxScaler(),
    RobustScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.1, fit_intercept=False, l1_ratio=0.5, learning_rate="invscaling", loss="squared_loss", penalty="elasticnet", power_t=0.1, random_state=PIPELINE_SEED)),
    SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.9500000000000001, n_estimators=100, random_state=PIPELINE_SEED), threshold=0.05),
    RandomForestRegressor(bootstrap=True, max_features=0.45, min_samples_leaf=1, min_samples_split=5, n_estimators=100, random_state=PIPELINE_SEED)
)

# Fit on the training split, predict the held-out split and display the score on it.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
exported_pipeline.score(testing_features, testing_target)