# Build complete Dataframe

Our first goal is to obtain the complete DataFrame of a building: an hourly time series that starts at the first hour registered in the database and ends at the last one. Whenever an hour in between is missing, we fill it with `NaN` for later processing.

#### Directory structure

./<br></br>
notebook/<br></br>
    &emsp;|--- data-preprocessing<br></br>
    &emsp;&emsp;&emsp;&emsp;|--- complete_dataframe.ipynb<br></br>
out/

In [2]:
import pandas as pd
import numpy as np
import pymongo as pm
import datetime

In [3]:
# Address and port of the MongoDB server
HOST = '161.67.142.141'
PORT = 27017
# Database name, and the collection queried for the consumption records
DB = 'differential_uclm_db'
DB_COUNTERRAW = 'CounterRawConsumption'

START_DAY = 5 # Day starts at 5:00 am

### Database connection

In [4]:
def connectDB() -> pm.database.Database:
    """Create a MongoDB client for HOST:PORT and return the DB database.

    The returned object is the database selected from the client, not the
    client itself.
    """
    return pm.MongoClient(host=HOST, port=PORT)[DB]

In [5]:
# Database handle passed to the query helpers below
db = connectDB()

## 1. Create hour index

First, we must obtain the building's first and last registered hours, and then build the hourly index spanning those two dates.

### First and last registered hours
Find first and last registered hours for the specified building ID

In [6]:
def firstHour(db: pm.database.Database, counter_id: int) -> datetime.datetime:
    """Return the earliest `timestamp` stored for a counter.

    Args:
        db: Database handle, as returned by `connectDB`.
        counter_id: Value matched against the `counterinfo_id` field.

    Returns:
        The `timestamp` field of the oldest matching document.

    Raises:
        IndexError: If no document matches `counter_id`.
    """
    # find_one with a sort fetches exactly one document, with no cursor or
    # intermediate list to build
    doc = db[DB_COUNTERRAW].find_one({'counterinfo_id': counter_id}, sort=[('timestamp', pm.ASCENDING)])
    if doc is None:
        raise IndexError(f'no records found for counter {counter_id}')
    return doc['timestamp']

def lastHour(db: pm.database.Database, counter_id: int) -> datetime.datetime:
    """Return the latest `timestamp` stored for a counter.

    Args:
        db: Database handle, as returned by `connectDB`.
        counter_id: Value matched against the `counterinfo_id` field.

    Returns:
        The `timestamp` field of the newest matching document.

    Raises:
        IndexError: If no document matches `counter_id`.
    """
    # find_one with a sort fetches exactly one document, with no cursor or
    # intermediate list to build
    doc = db[DB_COUNTERRAW].find_one({'counterinfo_id': counter_id}, sort=[('timestamp', pm.DESCENDING)])
    if doc is None:
        raise IndexError(f'no records found for counter {counter_id}')
    return doc['timestamp']

In [7]:
counter_id = 27 # Building ID example

# Fix hours to have 24h days: the range starts at START_DAY on the first date
# and ends one hour before START_DAY on the last date. Both bounds are derived
# from START_DAY so they stay in sync with the day calculation further down.
start = firstHour(db, counter_id).replace(hour=START_DAY)
end = lastHour(db, counter_id).replace(hour=(START_DAY - 1) % 24)

start, end

(datetime.datetime(2011, 7, 26, 5, 0), datetime.datetime(2020, 2, 27, 4, 0))

### Build hour index
From `firstHour` to `lastHour`, with a 1-hour step

In [8]:
def createIndex(first: datetime.datetime, last: datetime.datetime) -> pd.DatetimeIndex:
    """Build an hourly index from `first` to `last`, both included.

    Args:
        first: First timestamp of the index.
        last: Upper bound of the index; it is included when it lies a whole
            number of hours after `first`.

    Returns:
        A `DatetimeIndex` with one entry per hour.
    """
    # An offset object is used instead of the '1H' string alias, which recent
    # pandas releases deprecate
    return pd.date_range(start=first, end=last, freq=pd.offsets.Hour())

In [9]:
# Hourly index covering the whole [start, end] range
index = createIndex(start, end)

index

DatetimeIndex(['2011-07-26 05:00:00', '2011-07-26 06:00:00',
               '2011-07-26 07:00:00', '2011-07-26 08:00:00',
               '2011-07-26 09:00:00', '2011-07-26 10:00:00',
               '2011-07-26 11:00:00', '2011-07-26 12:00:00',
               '2011-07-26 13:00:00', '2011-07-26 14:00:00',
               ...
               '2020-02-26 19:00:00', '2020-02-26 20:00:00',
               '2020-02-26 21:00:00', '2020-02-26 22:00:00',
               '2020-02-26 23:00:00', '2020-02-27 00:00:00',
               '2020-02-27 01:00:00', '2020-02-27 02:00:00',
               '2020-02-27 03:00:00', '2020-02-27 04:00:00'],
              dtype='datetime64[ns]', length=75312, freq='H')

## 2. Build complete Dataframe
Now we rebuild the complete DataFrame with the consumption for every hour in the index, filling with `NaN` when the value is not found in the database or when it is a negative consumption. This is done by reindexing the DataFrame with the index obtained above. Note that the reindexing step itself only fills the missing hours; it does not mask negative values.

In [10]:
def getDataFrame(db: pm.database.Database, counter_id: int) -> pd.DataFrame:
    """Load every record of a counter into a DataFrame indexed by `timestamp`.

    Args:
        db: Database handle, as returned by `connectDB`.
        counter_id: Value matched against the `counterinfo_id` field.

    Returns:
        A DataFrame holding all document fields except `_id` and
        `counterinfo_id`, with `timestamp` as its index.

    Raises:
        KeyError: If no document matches, since there is then no `timestamp`
            column to index by.
    """
    # Exclude the two unused fields server-side instead of downloading them
    # only to delete the columns afterwards
    cursor = db[DB_COUNTERRAW].find(
        {'counterinfo_id': counter_id},
        projection={'_id': False, 'counterinfo_id': False},
    )

    return pd.DataFrame(list(cursor)).set_index('timestamp') # Indexing dataframe by timestamp

In [11]:
# Records of the example counter, indexed by timestamp
df = getDataFrame(db, counter_id)
df

Unnamed: 0_level_0,consumption
timestamp,Unnamed: 1_level_1
2011-07-26 17:00:00,111.000000
2011-07-26 18:00:00,43.348334
2011-07-26 19:00:00,41.846246
2011-07-26 20:00:00,22.805419
2011-07-26 21:00:00,20.887574
...,...
2020-02-27 19:00:00,56.849149
2020-02-27 20:00:00,33.789260
2020-02-27 21:00:00,27.924604
2020-02-27 22:00:00,25.509020


### Reindex Dataframe

In [12]:
# Align the records to the hourly index: hours with no record become NaN rows,
# and records whose timestamp is not in the index are dropped
df = df.reindex(index=index)
df

Unnamed: 0,consumption
2011-07-26 05:00:00,
2011-07-26 06:00:00,
2011-07-26 07:00:00,
2011-07-26 08:00:00,
2011-07-26 09:00:00,
...,...
2020-02-27 00:00:00,23.935151
2020-02-27 01:00:00,25.577308
2020-02-27 02:00:00,24.292991
2020-02-27 03:00:00,24.935243


### Calculate day
Day recalculation needed because days will start, as defined in `START_DAY`, at 5:00 am

In [13]:
def calcDay(df: pd.DataFrame, start_hour: "int | None" = None) -> pd.DataFrame:
    """Add a `day` column with the day each row belongs to.

    A day runs from `start_hour` up to, but not including, `start_hour` on the
    next calendar date, so rows before `start_hour` are assigned to the
    previous date.

    Args:
        df: DataFrame indexed by timestamp. It is modified in place.
        start_hour: Hour at which a day begins. Defaults to `START_DAY`.

    Returns:
        The same `df` object, with the `day` column set to midnight
        timestamps.
    """
    if start_hour is None:
        start_hour = START_DAY

    # Shifting back by the start hour puts every row on the calendar date of
    # its day, and normalize() truncates that to midnight. The whole index is
    # processed at once instead of calling a Python lambda per row.
    df['day'] = (df.index - pd.Timedelta(hours=start_hour)).normalize()

    return df

In [14]:
# Tag every hourly row with the day it belongs to
df = calcDay(df)
df

Unnamed: 0,consumption,day
2011-07-26 05:00:00,,2011-07-26
2011-07-26 06:00:00,,2011-07-26
2011-07-26 07:00:00,,2011-07-26
2011-07-26 08:00:00,,2011-07-26
2011-07-26 09:00:00,,2011-07-26
...,...,...
2020-02-27 00:00:00,23.935151,2020-02-26
2020-02-27 01:00:00,25.577308,2020-02-26
2020-02-27 02:00:00,24.292991,2020-02-26
2020-02-27 03:00:00,24.935243,2020-02-26


## 3. Reshape Dataframe into TimeSeries
Get a new DataFrame indexed by `day`, with that day's 24 hourly consumptions as columns

In [15]:
# One row per day and one column per hour: the rows of df are consecutive hours,
# so every run of 24 values becomes one row of the matrix
n_days = len(df['day']) // 24
consumption = df['consumption'].to_numpy().reshape(n_days, 24)

# Columns are labelled with the strings '0' to '23'
consumptions = pd.DataFrame(
    consumption,
    index=np.arange(n_days),
    columns=[str(hour) for hour in range(24)],
)
consumptions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,,,,,,,,,,,...,41.846246,22.805419,20.887574,18.846172,18.846057,18.420198,18.000000,18.118729,17.881271,18.000000
1,17.000000,19.000000,18.350795,35.846313,47.846263,50.846379,149.316601,137.956260,144.118651,131.264305,...,39.845466,21.500125,19.000000,20.000000,18.000000,18.728452,17.845798,19.425750,18.000000,17.111296
2,18.888704,18.803009,18.845892,35.845713,47.845857,51.845696,162.877204,147.690832,146.690908,133.258761,...,42.795578,23.204422,21.000000,22.000000,21.000000,21.023872,19.976128,20.714894,20.285106,20.000000
3,20.000000,21.000000,20.000000,37.788779,45.845705,50.845726,162.822891,143.039176,139.690738,129.690998,...,38.845451,21.910895,19.000000,19.000000,19.000000,18.318000,17.845896,17.836104,18.855870,17.846017
4,17.298113,17.000000,17.239697,17.845833,17.914470,18.000000,18.623151,19.376849,21.000000,21.000000,...,20.000000,19.235492,19.764508,19.000000,19.000000,19.000000,18.000000,19.000000,18.000000,17.000549
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3133,21.935497,21.935320,21.064881,21.946185,20.946242,21.784327,21.935417,22.118729,22.752089,23.140177,...,22.687338,21.935400,22.215645,21.946152,22.644214,22.247960,21.946055,21.611551,22.280723,20.946171
3134,21.946155,21.568840,21.323437,21.946155,21.536587,22.355722,21.946187,22.504338,22.935415,22.935470,...,22.452667,21.418263,22.935361,21.935446,21.495630,22.375282,21.935428,21.935402,21.935434,21.935470
3135,21.935432,24.299828,42.046687,56.888749,67.824054,77.116559,75.473832,73.795139,74.217150,66.816719,...,59.236310,32.813565,24.935450,25.127386,25.743428,23.935426,24.935405,23.935371,24.935494,24.808040
3136,24.179701,27.030773,43.485146,63.186978,77.027611,86.836100,88.814914,84.587054,75.734233,61.777340,...,59.872437,36.163066,24.980219,23.935414,24.935374,24.847482,25.023232,24.935294,24.815093,24.055454


### Index by day

In [16]:
# Distinct days in order of appearance, and the weekday number of each
days = df['day'].drop_duplicates().tolist()
weekdays = [day.weekday() for day in days]

# Put the day columns in front of the hourly columns, then index by day
day_info = pd.DataFrame({'day': days, 'weekday': weekdays})
consumptions = pd.concat([day_info, consumptions], axis=1).set_index(['day'])

# The building identifier goes in the first column
consumptions.insert(0, 'building_id', counter_id)

consumptions

Unnamed: 0_level_0,building_id,weekday,0,1,2,3,4,5,6,7,...,14,15,16,17,18,19,20,21,22,23
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-07-26,27,1,,,,,,,,,...,41.846246,22.805419,20.887574,18.846172,18.846057,18.420198,18.000000,18.118729,17.881271,18.000000
2011-07-27,27,2,17.000000,19.000000,18.350795,35.846313,47.846263,50.846379,149.316601,137.956260,...,39.845466,21.500125,19.000000,20.000000,18.000000,18.728452,17.845798,19.425750,18.000000,17.111296
2011-07-28,27,3,18.888704,18.803009,18.845892,35.845713,47.845857,51.845696,162.877204,147.690832,...,42.795578,23.204422,21.000000,22.000000,21.000000,21.023872,19.976128,20.714894,20.285106,20.000000
2011-07-29,27,4,20.000000,21.000000,20.000000,37.788779,45.845705,50.845726,162.822891,143.039176,...,38.845451,21.910895,19.000000,19.000000,19.000000,18.318000,17.845896,17.836104,18.855870,17.846017
2011-07-30,27,5,17.298113,17.000000,17.239697,17.845833,17.914470,18.000000,18.623151,19.376849,...,20.000000,19.235492,19.764508,19.000000,19.000000,19.000000,18.000000,19.000000,18.000000,17.000549
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-02-22,27,5,21.935497,21.935320,21.064881,21.946185,20.946242,21.784327,21.935417,22.118729,...,22.687338,21.935400,22.215645,21.946152,22.644214,22.247960,21.946055,21.611551,22.280723,20.946171
2020-02-23,27,6,21.946155,21.568840,21.323437,21.946155,21.536587,22.355722,21.946187,22.504338,...,22.452667,21.418263,22.935361,21.935446,21.495630,22.375282,21.935428,21.935402,21.935434,21.935470
2020-02-24,27,0,21.935432,24.299828,42.046687,56.888749,67.824054,77.116559,75.473832,73.795139,...,59.236310,32.813565,24.935450,25.127386,25.743428,23.935426,24.935405,23.935371,24.935494,24.808040
2020-02-25,27,1,24.179701,27.030773,43.485146,63.186978,77.027611,86.836100,88.814914,84.587054,...,59.872437,36.163066,24.980219,23.935414,24.935374,24.847482,25.023232,24.935294,24.815093,24.055454
