# Build complete Dataframe

Our first goal is to obtain the complete Dataframe of a building, that is, getting a time series starting from the first existing hour in the database, and finishing with the last one. Whenever the information in between is missing, we'll fill it with `NaN`, for later processing.

#### Directory structure

./<br></br>
notebook/<br></br>
    &emsp;|--- data-preprocessing<br></br>
    &emsp;&emsp;&emsp;&emsp;|--- complete_dataframe.ipynb<br></br>
out/

In [1]:
import pandas as pd
import numpy as np
import pymongo as pm
import datetime

In [2]:
HOST = '161.67.142.141'
PORT = 27017
DB = 'differential_uclm_db'
DB_COUNTERRAW = 'CounterRawConsumption'

START_DAY = 5 # Day starts at 5:00 am

### Database connection

In [3]:
def connectDB() -> pm.MongoClient:
    return pm.MongoClient(host=HOST, port=PORT)[DB]

In [4]:
db = connectDB()

## 1. Create hour index

First, we must obtain the building's first and last registered hour, building then the hour index between these dates.

### First and last registered hours
Find first and last registered hours for the specified building ID

In [5]:
def firstHour(db: pm.MongoClient, counter_id: int) -> datetime.datetime:
    return list(db[DB_COUNTERRAW].find({'counterinfo_id': counter_id}).sort('timestamp', pm.ASCENDING).limit(1))[0]['timestamp']

def lastHour(db: pm.MongoClient, counter_id: int) -> datetime.datetime:
    return list(db[DB_COUNTERRAW].find({'counterinfo_id': counter_id}).sort('timestamp', pm.DESCENDING).limit(1))[0]['timestamp']

In [6]:
counter_id = 487 # Building ID example
start, end = firstHour(db, counter_id).replace(hour=5), lastHour(db, counter_id).replace(hour=4) # Fix hours to have 24h days

start, end

(datetime.datetime(2013, 12, 17, 5, 0), datetime.datetime(2020, 5, 30, 4, 0))

### Build hour index
From firstHour lastHour with 1 hour step

In [7]:
def createIndex(first: datetime.datetime, last: datetime.datetime) -> pd.DatetimeIndex:
    return pd.date_range(start=first, end=last, freq='1H')

In [8]:
index = createIndex(start, end)

index

DatetimeIndex(['2013-12-17 05:00:00', '2013-12-17 06:00:00',
               '2013-12-17 07:00:00', '2013-12-17 08:00:00',
               '2013-12-17 09:00:00', '2013-12-17 10:00:00',
               '2013-12-17 11:00:00', '2013-12-17 12:00:00',
               '2013-12-17 13:00:00', '2013-12-17 14:00:00',
               ...
               '2020-05-29 19:00:00', '2020-05-29 20:00:00',
               '2020-05-29 21:00:00', '2020-05-29 22:00:00',
               '2020-05-29 23:00:00', '2020-05-30 00:00:00',
               '2020-05-30 01:00:00', '2020-05-30 02:00:00',
               '2020-05-30 03:00:00', '2020-05-30 04:00:00'],
              dtype='datetime64[ns]', length=56544, freq='H')

## 2. Build complete Dataframe
Now we rebuild complete Dataframe with the consumptions for every hour we got in the index, filling with `NaN` when the value is not found on the database or if it is a negative consumption. This is reindexing the Dataframe with the previous index we obtained

In [9]:
def getDataFrame(db: pm.MongoClient, counter_id: int) -> pd.DataFrame:
    cursor = db[DB_COUNTERRAW].find({'counterinfo_id': counter_id})
    df = pd.DataFrame(list(cursor))
    del df['_id']
    del df['counterinfo_id']
    
    df = df.set_index('timestamp') # Indexing dataframe by timestamp
    
    return df

In [10]:
df = getDataFrame(db, counter_id)
df

Unnamed: 0_level_0,consumption
timestamp,Unnamed: 1_level_1
2013-12-17 12:00:00,4.497335
2013-12-17 13:00:00,12.102932
2013-12-17 14:00:00,12.102932
2013-12-17 15:00:00,12.102932
2013-12-17 16:00:00,12.102932
...,...
2020-05-30 19:00:00,10.000000
2020-05-30 20:00:00,9.000000
2020-05-30 21:00:00,8.358873
2020-05-30 22:00:00,9.641127


### Reindex Dataframe

In [11]:
df = df.reindex(index=index)
df

Unnamed: 0,consumption
2013-12-17 05:00:00,
2013-12-17 06:00:00,
2013-12-17 07:00:00,
2013-12-17 08:00:00,
2013-12-17 09:00:00,
...,...
2020-05-30 00:00:00,8.924032
2020-05-30 01:00:00,10.000000
2020-05-30 02:00:00,10.000000
2020-05-30 03:00:00,9.000000


### Calculate day
Day recalculation needed because days will start, as defined in `START_DAY`, at 5:00 am

In [12]:
def calcDay(df: pd.DataFrame) -> pd.DataFrame:
    df['day'] = df.apply(lambda x: (x.name - pd.DateOffset(hours=START_DAY)).date(), axis= 1)
    df['day'] = pd.to_datetime(df['day'])
    
    return df

In [13]:
df = calcDay(df)
df

Unnamed: 0,consumption,day
2013-12-17 05:00:00,,2013-12-17
2013-12-17 06:00:00,,2013-12-17
2013-12-17 07:00:00,,2013-12-17
2013-12-17 08:00:00,,2013-12-17
2013-12-17 09:00:00,,2013-12-17
...,...,...
2020-05-30 00:00:00,8.924032,2020-05-29
2020-05-30 01:00:00,10.000000,2020-05-29
2020-05-30 02:00:00,10.000000,2020-05-29
2020-05-30 03:00:00,9.000000,2020-05-29


## 3. Reshape Dataframe into TimeSeries
Get new Dataframe with indexed with `day`, and its 24 consumptions

In [14]:
consumption = np.asarray(df['consumption'])
consumption = consumption.reshape((len(df['day']) // 24, 24)) # Reshape each day with its 24 consumptions

consumptions = pd.DataFrame({'consumptions': consumption.tolist()})

consumptions

Unnamed: 0,consumptions
0,"[nan, nan, nan, nan, nan, nan, nan, 4.49733527..."
1,"[12.1029321298894, 12.1029321298894, 12.102932..."
2,"[11.264909064798, 11.264909064798, 11.26490906..."
3,"[10.9838823956164, 10.9838823956164, 10.983882..."
4,"[6.93115242178077, 7.59915393780765, 7.5991539..."
...,...
2351,"[11.0, 9.0, 9.01754998254973, 12.9824500174503..."
2352,"[10.0, 9.0, 9.0, 12.7908828026325, 12.20911719..."
2353,"[9.59337189203563, 9.0, 9.0, 11.0, 12.56089413..."
2354,"[9.17706203985309, 9.82293796014691, 9.0, 11.0..."


### Index by day

In [15]:
days = df['day'].drop_duplicates().tolist()

consumptions = pd.concat([pd.DataFrame({'day': days}), consumptions], axis=1)
consumptions = consumptions.set_index('day')

consumptions.insert(0, 'building_id', counter_id)

consumptions

Unnamed: 0_level_0,building_id,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-12-17,487,"[nan, nan, nan, nan, nan, nan, nan, 4.49733527..."
2013-12-18,487,"[12.1029321298894, 12.1029321298894, 12.102932..."
2013-12-19,487,"[11.264909064798, 11.264909064798, 11.26490906..."
2013-12-20,487,"[10.9838823956164, 10.9838823956164, 10.983882..."
2013-12-21,487,"[6.93115242178077, 7.59915393780765, 7.5991539..."
...,...,...
2020-05-25,487,"[11.0, 9.0, 9.01754998254973, 12.9824500174503..."
2020-05-26,487,"[10.0, 9.0, 9.0, 12.7908828026325, 12.20911719..."
2020-05-27,487,"[9.59337189203563, 9.0, 9.0, 11.0, 12.56089413..."
2020-05-28,487,"[9.17706203985309, 9.82293796014691, 9.0, 11.0..."


### Clean Data
Remove negative consumptions and their large positives related consumptions

In [16]:
def cleanData(df: pd.DataFrame) -> pd.DataFrame:
    consumptions = df['consumptions']
    
    for row in range(df.shape[0]):
        cons = np.asarray(consumptions[row])
        negatives = np.less(cons, 0)   # Negative values

        cons_clean = cons[~negatives]
        clean_mean, clean_std = np.mean(cons_clean), np.std(cons_clean)

        positives = np.greater(cons, clean_std * 3 + clean_mean)
        invalids = positives + negatives
        
        cons[invalids] = np.nan

        consumptions[row] = cons
    
    df['consumptions'] = consumptions
    return df

In [17]:
consumptions = cleanData(consumptions)
consumptions

  
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Unnamed: 0_level_0,building_id,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-12-17,487,"[nan, nan, nan, nan, nan, nan, nan, 4.49733527..."
2013-12-18,487,"[12.1029321298894, 12.1029321298894, 12.102932..."
2013-12-19,487,"[11.264909064798, 11.264909064798, 11.26490906..."
2013-12-20,487,"[10.9838823956164, 10.9838823956164, 10.983882..."
2013-12-21,487,"[6.93115242178077, 7.59915393780765, 7.5991539..."
...,...,...
2020-05-25,487,"[11.0, 9.0, 9.01754998254973, 12.9824500174503..."
2020-05-26,487,"[10.0, 9.0, 9.0, 12.7908828026325, 12.20911719..."
2020-05-27,487,"[9.59337189203563, 9.0, 9.0, 11.0, 12.56089413..."
2020-05-28,487,"[9.17706203985309, 9.82293796014691, 9.0, 11.0..."
