#### The following notebook performs data collection, data preparation and feature engineering on the presences.json file, which lists all the presences registered in the Unicam buildings.

In [15]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName('younicam-AI')
    .getOrCreate()
)

## 1 - Data collection

#### Import the data related to the registered presences which is composed of: 
 - _id: the unique id given by MongoDB
 - aula: the room
 - polo: the building
 - sede: the city
 - posto: the seat
 - inDate: the datetime for the room access
 - outDate: the datetime for the room exit
 - date: the datetime for the last modification made on the record

In [16]:
# Read the raw export into a Spark DataFrame. The multiLine option tells the
# JSON reader that a single record may span several lines of the file.
raw_reader = spark.read.option("multiLine", True)
presencesDF = raw_reader.json("../data/raw/presences.json")

# Convert to pandas so the notebook renders the records as a table.
presencesDF.toPandas()

Unnamed: 0,_id,aula,date,inDate,outDate,polo,posto,sede
0,5fa8ef7d1bd2a03f4641a15e,1,2020-11-09T07:27:57.078Z,2020-11-09T07:27:57.078Z,2020-11-09T12:05:00.362Z,1,1,1
1,5fa8efa51bd2a03f4641a15f,1,2020-11-09T07:28:37.074Z,2020-11-09T07:28:37.074Z,2020-11-09T12:05:00.363Z,1,2,1
2,5fa8f0751bd2a03f4641a160,1,2020-11-09T07:32:05.879Z,2020-11-09T07:32:05.878Z,2020-11-09T12:05:00.364Z,1,3,1
3,5fa8f0811bd2a03f4641a161,1,2020-11-09T07:32:17.390Z,2020-11-09T07:32:17.390Z,2020-11-09T07:32:20.897Z,1,4,1
4,5fa8f0891bd2a03f4641a162,1,2020-11-09T07:32:25.980Z,2020-11-09T07:32:25.980Z,2020-11-09T07:32:36.245Z,1,5,1
...,...,...,...,...,...,...,...,...
14424,6038fa9255deb9a07cdb7bdb,43,2021-02-26T13:41:38.717Z,2021-02-26T13:41:38.717Z,2021-02-26T18:05:00.406Z,7,3,1
14425,6038fadd55deb9a07cdb7bdc,43,2021-02-26T13:42:53.234Z,2021-02-26T13:42:53.234Z,2021-02-26T18:05:00.407Z,7,4,1
14426,6038faef55deb9a07cdb7bdd,43,2021-02-26T13:43:11.421Z,2021-02-26T13:43:11.420Z,2021-02-26T18:05:00.408Z,7,5,1
14427,6038fcb855deb9a07cdb7bde,43,2021-02-26T13:50:48.374Z,2021-02-26T13:50:48.373Z,2021-02-26T18:05:00.409Z,7,6,1


#### Perform some operations to check the state of the data

In [17]:
# List (column name, Spark type) pairs; every field is read as a string at this stage.
presencesDF.dtypes

[('_id', 'string'),
 ('aula', 'string'),
 ('date', 'string'),
 ('inDate', 'string'),
 ('outDate', 'string'),
 ('polo', 'string'),
 ('posto', 'string'),
 ('sede', 'string')]

In [18]:
# Total number of presence records loaded from the raw file.
presencesDF.count()

14429

## 2 - Data cleaning and preparation
#### Start by renaming some columns to improve readability

In [19]:
# Original field name -> English name used in the rest of the notebook.
column_renames = {
    "aula": "room",
    "polo": "building",
    "sede": "city",
    "posto": "seat",
}
for original_name, english_name in column_renames.items():
    presencesDF = presencesDF.withColumnRenamed(original_name, english_name)

presencesDF.columns

['_id', 'room', 'date', 'inDate', 'outDate', 'building', 'seat', 'city']

#### Let's look for null values inside each column

In [20]:
from pyspark.sql.functions import isnull, when, count, col

# One aggregate per column: when() produces a (non-null) literal only for rows
# where the column is null, and count() tallies non-null values, so each
# result cell is the number of nulls in that column.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in presencesDF.columns
]
presencesDF.select(null_count_exprs).toPandas()

Unnamed: 0,_id,room,date,inDate,outDate,building,seat,city
0,0,0,0,0,0,0,0,0


#### It can happen that there are some null values inside the outDate column because, at the moment of data extraction, there were some "active" presences that, of course, cannot have an outDate yet.
#### If any, these null values need to be removed.

In [21]:
# Treat '?' placeholders as missing values...
with_missing_marked = presencesDF.na.replace('?', None)
# ...then discard every row that has at least one null field.
presencesDF = with_missing_marked.na.drop(how='any')

presencesDF.count()

14429

#### Drop the date column because it stores the date of the last modification made on the record, which is redundant: the last update on a record is performed at exit time, which is already saved in the outDate field.

In [22]:
# The last-modification timestamp duplicates outDate, so it is discarded.
redundant_columns = ["date"]
presencesDF = presencesDF.drop(*redundant_columns)

presencesDF.columns

['_id', 'room', 'inDate', 'outDate', 'building', 'seat', 'city']

#### Cast inDate and outDate to timestamp in order to extract the day and month of the entrance, plus the hour and minute of both the entrance datetime and the exit datetime.
#### Then the columns `_id`, `seat` (originally `posto`), `inDate` and `outDate` are deleted since they are not needed for the analysis.

In [23]:
from pyspark.sql.functions import year, month, dayofmonth, hour, minute

# Parse the string datetimes into Spark timestamps.
for timestamp_name in ("inDate", "outDate"):
    presencesDF = presencesDF.withColumn(
        timestamp_name, presencesDF[timestamp_name].cast("timestamp")
    )

# NOTE(review): the extracted hours follow the Spark session time zone, not
# UTC (the raw 07:27Z record shows up with inHour 8 in the output below), so
# results depend on where the notebook runs. Consider pinning
# spark.sql.session.timeZone; confirm the intended zone first.
# (new column, extractor function, source timestamp column)
derived_fields = [
    ("day", dayofmonth, "inDate"),
    ("month", month, "inDate"),
    ("inHour", hour, "inDate"),
    ("inMinute", minute, "inDate"),
    ("outHour", hour, "outDate"),
    ("outMinute", minute, "outDate"),
]
for new_name, extractor, source_name in derived_fields:
    presencesDF = presencesDF.withColumn(new_name, extractor(presencesDF[source_name]))

# The identifier, the seat and the raw timestamps are not used downstream.
presencesDF = presencesDF.drop("_id", "seat", "inDate", "outDate")

presencesDF.toPandas()

Unnamed: 0,room,building,city,day,month,inHour,inMinute,outHour,outMinute
0,1,1,1,9,11,8,27,13,5
1,1,1,1,9,11,8,28,13,5
2,1,1,1,9,11,8,32,13,5
3,1,1,1,9,11,8,32,8,32
4,1,1,1,9,11,8,32,8,32
...,...,...,...,...,...,...,...,...,...
14424,43,7,1,26,2,14,41,19,5
14425,43,7,1,26,2,14,42,19,5
14426,43,7,1,26,2,14,43,19,5
14427,43,7,1,26,2,14,50,19,5


#### Cast the columns room, building and city to integer because the machine learning model works only with integer values

In [24]:
from pyspark.sql.types import IntegerType

# These identifiers were read as strings; convert each one to a Spark integer.
for id_column in ("room", "building", "city"):
    presencesDF = presencesDF.withColumn(
        id_column, presencesDF[id_column].cast(IntegerType())
    )

presencesDF.dtypes

[('room', 'int'),
 ('building', 'int'),
 ('city', 'int'),
 ('day', 'int'),
 ('month', 'int'),
 ('inHour', 'int'),
 ('inMinute', 'int'),
 ('outHour', 'int'),
 ('outMinute', 'int')]

#### Store the processed data in a dedicated folder.

In [25]:
import os

NEW_DATA_DIR = "../data/processed/"

# Create the output folder if it is missing. exist_ok=True ignores only the
# "already exists" case, so genuine failures (e.g. permissions) surface here
# instead of being silently swallowed by a bare except.
os.makedirs(NEW_DATA_DIR, exist_ok=True)

# Write the cleaned records as JSON via pandas.
presencesDF.toPandas().to_json(os.path.join(NEW_DATA_DIR, "presences.json"))

## 3 - Feature engineering

#### Since we need the number of people for a given room in a certain day and in a certain hour, we have to count the number of presences considering room, building, city, day, month, hour. 
#### In order to get the number of people present in a time interval, we can explode a sequence of hours (e.g. for a record with inHour: 8 and outHour 13, the sequence of hours will be: [8,9,10,11,12,13]), group by the hour (and other columns) and get the aggregate count for each group.

In [26]:
import pyspark.sql.functions as F

# Expand every presence into one row per hour from inHour to outHour, inclusive.
# NOTE(review): sequence() counts downward when inHour > outHour (e.g. a stay
# that ends after midnight), which would attribute the presence to the wrong
# hours. Confirm the data contains no such rows.
hourly_presences = presencesDF.withColumn(
    'hour', F.explode(F.sequence('inHour', 'outHour'))
)

# Count how many presences fall into each room/day/hour slot.
slot_keys = ['room', 'building', 'city', 'day', 'month', 'hour']
occupancy = hourly_presences.groupBy(*slot_keys).count()

# The per-slot count is the value the model has to predict.
finalDF = occupancy.withColumnRenamed("count", "target")

finalDF.toPandas()

Unnamed: 0,room,building,city,day,month,hour,target
0,5,3,1,9,11,8,4
1,16,2,1,10,11,13,1
2,11,6,1,10,11,15,1
3,22,4,2,13,11,11,7
4,26,14,2,13,11,11,1
...,...,...,...,...,...,...,...
8035,14,1,1,22,2,10,1
8036,7,7,1,23,2,8,1
8037,58,4,2,26,2,9,3
8038,36,1,1,26,2,16,1


#### The generated column _count_ (renamed to _target_) is our target feature and all the others are the input values.
#### The dataframe is ready for the ML model so we save it in a dedicated folder.

In [13]:
NEW_DATA_DIR = "../data/engineered/"

# Create the output folder if it is missing. exist_ok=True ignores only the
# "already exists" case, so genuine failures (e.g. permissions) surface here
# instead of being silently swallowed by a bare except.
os.makedirs(NEW_DATA_DIR, exist_ok=True)

# Write the model-ready dataset as JSON via pandas.
finalDF.toPandas().to_json(os.path.join(NEW_DATA_DIR, "presences.json"))