#### The following notebook perform a machine learning pipeline using TPOT on the presences.json file which list all the presences registered in the Unicam buildings. 

In [63]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('younicam-AI').getOrCreate()

spark

## 1 - File preparation

#### Import the data related to the registered presences which is composed of: 
 - _id: the unique id given by MongoDB
 - aula: the room
 - polo: the building
 - sede: the city
 - posto: the seat
 - inDate: the datetime for the room access
 - outDate: the datetime for the room exit
 - date: the datetime for the last modification made on the record

In [97]:
presencesDF = spark.read.json("../data/raw/presences.json", multiLine=True)

presencesDF.toPandas()

Unnamed: 0,_id,aula,date,inDate,outDate,polo,posto,sede
0,5fa8ef7d1bd2a03f4641a15e,1,2020-11-09T07:27:57.078Z,2020-11-09T07:27:57.078Z,2020-11-09T12:05:00.362Z,1,1,1
1,5fa8efa51bd2a03f4641a15f,1,2020-11-09T07:28:37.074Z,2020-11-09T07:28:37.074Z,2020-11-09T12:05:00.363Z,1,2,1
2,5fa8f0751bd2a03f4641a160,1,2020-11-09T07:32:05.879Z,2020-11-09T07:32:05.878Z,2020-11-09T12:05:00.364Z,1,3,1
3,5fa8f0811bd2a03f4641a161,1,2020-11-09T07:32:17.390Z,2020-11-09T07:32:17.390Z,2020-11-09T07:32:20.897Z,1,4,1
4,5fa8f0891bd2a03f4641a162,1,2020-11-09T07:32:25.980Z,2020-11-09T07:32:25.980Z,2020-11-09T07:32:36.245Z,1,5,1
...,...,...,...,...,...,...,...,...
39359,5fd9dcb7ff3b76b96dd7987b,17,2020-12-16T10:08:55.168Z,2020-12-16T10:08:55.168Z,,11,13,1
39360,5fd9ddcfff3b76b96dd7988b,19,2020-12-16T10:13:35.299Z,2020-12-16T10:13:35.299Z,,5,12,1
39361,5fd9ddf1ff3b76b96dd7988e,1,2020-12-16T10:14:09.471Z,2020-12-16T10:14:09.471Z,,1,46,1
39362,5fd9de6cff3b76b96dd79891,13,2020-12-16T10:16:12.267Z,2020-12-16T10:16:12.267Z,,9,15,4


#### Perform some operations to check the state of the data and change the names to improve readibility

In [98]:
presencesDF.dtypes

[('_id', 'string'),
 ('aula', 'string'),
 ('date', 'string'),
 ('inDate', 'string'),
 ('outDate', 'string'),
 ('polo', 'string'),
 ('posto', 'string'),
 ('sede', 'string')]

In [99]:
presencesDF.count()

39364

## 2 - Data cleaning and preparation

In [82]:
presencesDF = presencesDF.withColumnRenamed("aula", "room")
presencesDF = presencesDF.withColumnRenamed("polo", "building")
presencesDF = presencesDF.withColumnRenamed("sede", "city")
presencesDF = presencesDF.withColumnRenamed("posto", "seat")

presencesDF.columns

['_id', 'room', 'date', 'inDate', 'outDate', 'building', 'seat', 'city']

#### Let's look for null values inside each column

In [83]:
from pyspark.sql.functions import isnull, when, count, col

presencesDF.select([count(when(isnull(c), c)).alias(c) for c in presencesDF.columns]).toPandas()

Unnamed: 0,_id,room,date,inDate,outDate,building,seat,city
0,0,0,0,0,892,0,0,0


#### As you can see there are some null values inside the outDate column. This can happen because at the moment of data extraction there were some "active" presences that, of course cannot have the outDate. 
#### Those null values needs to be removed.

In [84]:
presencesDF = presencesDF.replace('?', None).dropna(how='any')

presencesDF.count()

38472

#### Drop the date column because it stores the date of the last modification made on the record and it is redundant since the last update made on the record is perfomed at the exit time that saved in the outDate field.

In [85]:
presencesDF = presencesDF.drop("date")

presencesDF.columns

['_id', 'room', 'inDate', 'outDate', 'building', 'seat', 'city']

#### Cast inDate and outDate into timestamp in order to extrapolate day, month, hour and minutes either for the entrance datetime and the exit datetime. 
#### Then the columns _id, posto, inDate and outDate_ are deleted since they are not needed for the analysis.

In [86]:
from pyspark.sql.functions import year, month, dayofmonth, hour, minute

presencesDF = presencesDF.withColumn("inDate", presencesDF["inDate"].cast("timestamp"))

presencesDF = presencesDF.withColumn("outDate", presencesDF["outDate"].cast("timestamp"))

presencesDF = presencesDF.withColumn("day", dayofmonth(presencesDF["inDate"]))
presencesDF = presencesDF.withColumn("month", month(presencesDF["inDate"]))
presencesDF = presencesDF.withColumn("inHour", hour(presencesDF["inDate"]))
presencesDF = presencesDF.withColumn("inMinute", minute(presencesDF["inDate"]))
presencesDF = presencesDF.withColumn("outHour", hour(presencesDF["outDate"]))
presencesDF = presencesDF.withColumn("outMinute", minute(presencesDF["outDate"]))

presencesDF = presencesDF.drop("_id", "posto", "inDate", "outDate")

presencesDF.dtypes

[('room', 'string'),
 ('building', 'string'),
 ('seat', 'string'),
 ('city', 'string'),
 ('day', 'int'),
 ('month', 'int'),
 ('inHour', 'int'),
 ('inMinute', 'int'),
 ('outHour', 'int'),
 ('outMinute', 'int')]

#### Cast the column room, building and city to integer because the machine learning model works only with integer values

In [87]:
from pyspark.sql.types import IntegerType

presencesDF = presencesDF.withColumn("room", presencesDF["room"].cast(IntegerType()))
presencesDF = presencesDF.withColumn("building", presencesDF["building"].cast(IntegerType()))
presencesDF = presencesDF.withColumn("city", presencesDF["city"].cast(IntegerType()))

presencesDF.dtypes

[('room', 'int'),
 ('building', 'int'),
 ('seat', 'string'),
 ('city', 'int'),
 ('day', 'int'),
 ('month', 'int'),
 ('inHour', 'int'),
 ('inMinute', 'int'),
 ('outHour', 'int'),
 ('outMinute', 'int')]

## 3 - Feature engineering

#### Since we need the number of people for a given room in a certain day and in a certain hour, we have to count the number of presences considering room, building, city, day, month, hour. 
#### In order to get the number of people present in a time interval, we can explode a sequence of hours (e.g. for a record with inHour: 8 and outHour 13, the sequence of hours will be: [8,9,10,11,12,13]), group by the hour (and other columns) and get the aggregate count for each group.

In [88]:
import pyspark.sql.functions as F

finalDF = presencesDF.withColumn(
    'hour',
    F.explode(F.sequence('inHour', 'outHour'))
).groupBy(
    'room', 'building', 'city', 'day', 'month', 'hour'
).count()

finalDF.toPandas()

Unnamed: 0,room,building,city,day,month,hour,count
0,5,3,1,9,11,8,16
1,16,2,1,10,11,13,4
2,11,6,1,10,11,15,4
3,22,4,2,13,11,11,28
4,26,14,2,13,11,11,4
...,...,...,...,...,...,...,...
4562,1,1,1,1,12,9,168
4563,9,4,2,2,12,17,64
4564,1,1,1,3,12,8,188
4565,7,7,1,14,12,17,36


#### The generated column _count_ is our target features and all the others are the input values.
#### The dataframe is ready for the ML model so we save it in a dedicated folder.

In [76]:
finalDF.coalesce(1).write.format("json").mode("overwrite").save('../data/processed/presences.json')

In [90]:
finalDF.toPandas().describe()

Unnamed: 0,room,building,city,day,month,hour,count
count,4567.0,4567.0,4567.0,4567.0,4567.0,4567.0,4567.0
mean,23.323407,7.363039,1.423254,14.557697,11.400044,13.389096,35.751259
std,17.275671,5.009367,0.78572,8.203988,0.489961,3.273675,41.163896
min,1.0,1.0,1.0,1.0,11.0,0.0,4.0
25%,9.0,3.0,1.0,9.0,11.0,11.0,8.0
50%,21.0,7.0,1.0,14.0,11.0,13.0,20.0
75%,36.0,12.0,2.0,20.0,12.0,16.0,48.0
max,68.0,21.0,4.0,30.0,12.0,19.0,256.0


## 4 - Model training

#### Create the 1-D array containing the target values and the 2-D array with all the features

In [17]:
import numpy as np

target = np.array(finalDF.select("count").collect()).ravel()

data = np.array(finalDF.select("room", "building", "city", "day", "month", "hour").collect())

In [14]:
/from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTRegressor(
    verbosity=2,
    warm_start=True
)

tpot.fit(X_train, y_train)
preds = tpot.predict(X_test)
print(r2_score(y_test, preds))

tpot.export('tpot_exported_pipeline.py')

np.savetxt("../prediction/preds.csv", preds, delimiter=",")

Optimization Progress:   0%|          | 0/10100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -50.68914859113579

Generation 2 - Current best internal CV score: -50.68914859113579

Generation 3 - Current best internal CV score: -50.42684286055529

Generation 4 - Current best internal CV score: -50.42684286055529

Generation 5 - Current best internal CV score: -50.15621012008095

Generation 6 - Current best internal CV score: -50.15621012008095

Generation 7 - Current best internal CV score: -49.87747294021666

Generation 8 - Current best internal CV score: -49.68786907567703

Generation 9 - Current best internal CV score: -49.68786907567703

Generation 10 - Current best internal CV score: -49.68786907567703

Generation 11 - Current best internal CV score: -49.552822076213126

Generation 12 - Current best internal CV score: -49.552822076213126

Generation 13 - Current best internal CV score: -49.552822076213126

Generation 14 - Current best internal CV score: -49.31867454752522

Generation 15 - Current best internal CV score: -49.3

#### The execution of TPOT outputs RandomForestRegressor as the best algorithm with also a python file to execute this algorithm. Below, the execution of RandomForestRegressor

In [28]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from tpot.builtins import StackingEstimator

training_features, testing_features, training_target, testing_target = \
            train_test_split(data, target, random_state=None)

# Average CV score on the training set was: -47.52158656317905
exported_pipeline = make_pipeline(
    MinMaxScaler(),
    RobustScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.1, fit_intercept=False, l1_ratio=0.5, learning_rate="invscaling", loss="squared_loss", penalty="elasticnet", power_t=0.1)),
    SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.9500000000000001, n_estimators=100), threshold=0.05),
    RandomForestRegressor(bootstrap=True, max_features=0.45, min_samples_leaf=1, min_samples_split=5, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
exported_pipeline.score(testing_features, testing_target)

0.5805560845350553