# Build complete Dataframe

Our first goal is to obtain the complete Dataframe of a building, that is, getting a time series starting from the first existing hour in the database, and finishing with the last one. Whenever the information in between is missing, we'll fill it with `NaN`, for later processing.

#### Directory structure

./<br></br>
notebook/<br></br>
    &emsp;|--- data-preprocessing<br></br>
    &emsp;&emsp;&emsp;&emsp;|--- complete_dataframe.ipynb<br></br>
out/

In [2]:
import pandas as pd
import numpy as np
import pymongo as pm
import datetime

In [3]:
HOST = '161.67.142.141'
PORT = 27017
DB = 'differential_uclm_db'
DB_COUNTERRAW = 'CounterRawConsumption'

START_DAY = 5 # Day starts at 5:00 am

### Database connection

In [4]:
def connectDB() -> pm.MongoClient:
    return pm.MongoClient(host=HOST, port=PORT)[DB]

In [5]:
db = connectDB()

## 1. Create hour index

First, we must obtain the building's first and last registered hour, building then the hour index between these dates.

### First and last registered hours
Find first and last registered hours for the specified building ID

In [6]:
def firstHour(db: pm.MongoClient, counter_id: int) -> datetime.datetime:
    return list(db[DB_COUNTERRAW].find({'counterinfo_id': counter_id}).sort('timestamp', pm.ASCENDING).limit(1))[0]['timestamp']

def lastHour(db: pm.MongoClient, counter_id: int) -> datetime.datetime:
    return list(db[DB_COUNTERRAW].find({'counterinfo_id': counter_id}).sort('timestamp', pm.DESCENDING).limit(1))[0]['timestamp']

In [7]:
counter_id = 27 # Building ID example
start, end = firstHour(db, counter_id).replace(hour=5), lastHour(db, counter_id).replace(hour=4) # Fix hours to have 24h days

start, end

(datetime.datetime(2011, 7, 26, 5, 0), datetime.datetime(2020, 4, 2, 4, 0))

### Build hour index
From firstHour lastHour with 1 hour step

In [8]:
def createIndex(first: datetime.datetime, last: datetime.datetime) -> pd.DatetimeIndex:
    return pd.date_range(start=first, end=last, freq='1H')

In [9]:
index = createIndex(start, end)

index

DatetimeIndex(['2011-07-26 05:00:00', '2011-07-26 06:00:00',
               '2011-07-26 07:00:00', '2011-07-26 08:00:00',
               '2011-07-26 09:00:00', '2011-07-26 10:00:00',
               '2011-07-26 11:00:00', '2011-07-26 12:00:00',
               '2011-07-26 13:00:00', '2011-07-26 14:00:00',
               ...
               '2020-04-01 19:00:00', '2020-04-01 20:00:00',
               '2020-04-01 21:00:00', '2020-04-01 22:00:00',
               '2020-04-01 23:00:00', '2020-04-02 00:00:00',
               '2020-04-02 01:00:00', '2020-04-02 02:00:00',
               '2020-04-02 03:00:00', '2020-04-02 04:00:00'],
              dtype='datetime64[ns]', length=76152, freq='H')

## 2. Build complete Dataframe
Now we rebuild complete Dataframe with the consumptions for every hour we got in the index, filling with `NaN` when the value is not found on the database or if it is a negative consumption. This is reindexing the Dataframe with the previous index we obtained

In [10]:
def getDataFrame(db: pm.MongoClient, counter_id: int) -> pd.DataFrame:
    cursor = db[DB_COUNTERRAW].find({'counterinfo_id': counter_id})
    df = pd.DataFrame(list(cursor))
    del df['_id']
    del df['counterinfo_id']
    
    df = df.set_index('timestamp') # Indexing dataframe by timestamp
    
    return df

In [11]:
df = getDataFrame(db, counter_id)
df

Unnamed: 0_level_0,consumption
timestamp,Unnamed: 1_level_1
2011-07-26 17:00:00,111.000000
2011-07-26 18:00:00,43.348334
2011-07-26 19:00:00,41.846246
2011-07-26 20:00:00,22.805419
2011-07-26 21:00:00,20.887574
...,...
2020-04-02 19:00:00,10.550545
2020-04-02 20:00:00,10.385033
2020-04-02 21:00:00,10.967781
2020-04-02 22:00:00,10.967779


### Reindex Dataframe

In [12]:
df = df.reindex(index=index)
df

Unnamed: 0,consumption
2011-07-26 05:00:00,
2011-07-26 06:00:00,
2011-07-26 07:00:00,
2011-07-26 08:00:00,
2011-07-26 09:00:00,
...,...
2020-04-02 00:00:00,10.967850
2020-04-02 01:00:00,10.356770
2020-04-02 02:00:00,10.578497
2020-04-02 03:00:00,10.967849


### Calculate day
Day recalculation needed because days will start, as defined in `START_DAY`, at 5:00 am

In [13]:
def calcDay(df: pd.DataFrame) -> pd.DataFrame:
    df['day'] = df.apply(lambda x: (x.name - pd.DateOffset(hours=START_DAY)).date(), axis= 1)
    df['day'] = pd.to_datetime(df['day'])
    
    return df

In [14]:
df = calcDay(df)
df

Unnamed: 0,consumption,day
2011-07-26 05:00:00,,2011-07-26
2011-07-26 06:00:00,,2011-07-26
2011-07-26 07:00:00,,2011-07-26
2011-07-26 08:00:00,,2011-07-26
2011-07-26 09:00:00,,2011-07-26
...,...,...
2020-04-02 00:00:00,10.967850,2020-04-01
2020-04-02 01:00:00,10.356770,2020-04-01
2020-04-02 02:00:00,10.578497,2020-04-01
2020-04-02 03:00:00,10.967849,2020-04-01


## 3. Reshape Dataframe into TimeSeries
Get new Dataframe with indexed with `day`, and its 24 consumptions

In [15]:
consumption = np.asarray(df['consumption'])
consumption = consumption.reshape((len(df['day']) // 24, 24)) # Reshape each day with its 24 consumptions

consumptions = pd.DataFrame({'consumptions': consumption.tolist()})

consumptions

Unnamed: 0,consumptions
0,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1,"[17.0, 19.0, 18.3507946535444, 35.846312818818..."
2,"[18.8887041808661, 18.8030088936913, 18.845892..."
3,"[20.0, 21.0, 20.0, 37.7887789876153, 45.845704..."
4,"[17.2981132075472, 17.0, 17.2396974482587, 17...."
...,...
3168,"[9.96776406754223, 10.9677114155888, 10.141369..."
3169,"[10.36707127443, 10.5683449246069, 10.96773053..."
3170,"[10.9677773145956, 10.635913161463, 9.97850891..."
3171,"[9.96774546503054, 10.9676962021865, 10.904799..."


### Index by day

In [16]:
days = df['day'].drop_duplicates().tolist()

weekdays = []
for day in days:
    weekdays.append(day.weekday())

consumptions = pd.concat([pd.DataFrame({'day': days, 'weekday': weekdays}), consumptions], axis=1)
consumptions = consumptions.set_index(['day'])

consumptions.insert(0, 'building_id', counter_id)

consumptions

Unnamed: 0_level_0,building_id,weekday,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-07-26,27,1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2011-07-27,27,2,"[17.0, 19.0, 18.3507946535444, 35.846312818818..."
2011-07-28,27,3,"[18.8887041808661, 18.8030088936913, 18.845892..."
2011-07-29,27,4,"[20.0, 21.0, 20.0, 37.7887789876153, 45.845704..."
2011-07-30,27,5,"[17.2981132075472, 17.0, 17.2396974482587, 17...."
...,...,...,...
2020-03-28,27,5,"[9.96776406754223, 10.9677114155888, 10.141369..."
2020-03-29,27,6,"[10.36707127443, 10.5683449246069, 10.96773053..."
2020-03-30,27,0,"[10.9677773145956, 10.635913161463, 9.97850891..."
2020-03-31,27,1,"[9.96774546503054, 10.9676962021865, 10.904799..."


### Clean Data
Remove negative consumptions and their large positives related consumptions

In [17]:
def cleanData(df: pd.DataFrame) -> pd.DataFrame:
    consumptions = df['consumptions']
    
    for row in range(df.shape[0]):
        cons = np.asarray(consumptions[row])
        negatives = np.less(cons, 0)   # Negative values

        cons_clean = cons[~negatives]
        clean_mean, clean_std = np.mean(cons_clean), np.std(cons_clean)

        positives = np.greater(cons, clean_std * 3 + clean_mean)
        invalids = positives + negatives
        
        cons[invalids] = np.nan

        consumptions[row] = cons
    
    df['consumptions'] = consumptions
    return df

In [18]:
consumptions = cleanData(consumptions)
consumptions

  
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Unnamed: 0_level_0,building_id,weekday,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-07-26,27,1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2011-07-27,27,2,"[17.0, 19.0, 18.3507946535444, 35.846312818818..."
2011-07-28,27,3,"[18.8887041808661, 18.8030088936913, 18.845892..."
2011-07-29,27,4,"[20.0, 21.0, 20.0, 37.7887789876153, 45.845704..."
2011-07-30,27,5,"[17.2981132075472, 17.0, 17.2396974482587, 17...."
...,...,...,...
2020-03-28,27,5,"[9.96776406754223, 10.9677114155888, 10.141369..."
2020-03-29,27,6,"[10.36707127443, 10.5683449246069, 10.96773053..."
2020-03-30,27,0,"[10.9677773145956, 10.635913161463, 9.97850891..."
2020-03-31,27,1,"[9.96774546503054, 10.9676962021865, 10.904799..."
