In [13]:
import pandas as pd
import numpy as np
import torch
import seaborn as sns;sns.set()
import matplotlib.pyplot as plt
import Graphs
import networkx as nx
import torch.optim as optim
import os
import time
import pickle as pkl
import pywt
from tsmoothie.smoother import ExponentialSmoother
from river import preprocessing,compose, base
from datetime import timedelta
from torch import nn
from torch.utils.data import TensorDataset,DataLoader
from torchsummaryX import summary
from operator import add
from copy import deepcopy
from scipy import signal, fftpack
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose

In [14]:
# Columns of the CSV that must be parsed as datetimes.
parse_date = ["time"]
# First row holds the header, first column becomes the index.
buildingDB = pd.read_csv(
    "./buildingDB.csv",
    header=0,
    index_col=0,
    parse_dates=parse_date,
)

In [15]:
# One sub-frame per room, selected on the "room" column.
babbage, babyfoot, jacquard = (
    buildingDB.loc[buildingDB["room"] == room_name]
    for room_name in ("babbage", "babyfoot", "jacquard")
)

In [16]:
# Index labels present in jacquard but absent from babbage.
babbage_nodata = set(jacquard.index).difference(babbage.index)
# Remove those labels from the two other rooms.
jacquard = jacquard.drop(index=babbage_nodata)
babyfoot = babyfoot.drop(index=babbage_nodata)

In [17]:
# Sort every room by its index, then display the three shapes.
babbage, jacquard, babyfoot = (
    room_frame.sort_index() for room_frame in (babbage, jacquard, babyfoot)
)
babbage.shape, jacquard.shape, babyfoot.shape

((67104, 7), (67104, 7), (67104, 7))

In [19]:
def Missing_values(data):
    '''
    Report the columns of a dataframe that contain missing values.

    Parameters
    ----------
    data : DataFrame

    Returns
    -------
    None.
    Prints a table restricted to the columns holding at least one null:
    'Total' is the number of nulls and 'Pourcentage' is that number divided
    by the number of rows (a fraction between 0 and 1).

    '''
    null_counts = data.isnull().sum()
    row_counts = data.isnull().count()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / row_counts).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Pourcentage'])
    # Only show the variables that contain NA values.
    has_nulls = percent > 0
    print(missing_data[has_nulls], '\n')
    
class DataTransform:
    """Drop columns from a DataFrame and optionally interpolate missing values.

    Parameters
    ----------
    dataset : DataFrame
        Source data. It is left untouched: the instance works on the frame
        returned by ``dataset.drop``.
    drop : label or list of labels
        Column(s) removed from ``dataset``.
    interpol : bool, default True
        Whether ``transform`` interpolates the requested column(s).
    """

    def __init__(self, dataset, drop, interpol=True):
        self.datatmp = dataset.drop(drop, axis=1)
        # No "weekofyear"/"month" column is created here; dataByWeek and
        # dataByMonth derive those keys from the index when the column is absent.
        self.interpol = interpol

    def interpolate(self, column):
        """Fill the gaps of ``column`` by linear interpolation, in both directions.

        Prints "Column not in data" and leaves the frame unchanged when the
        column does not exist.
        """
        if column in self.datatmp.columns:
            # Assign the result back instead of calling ``inplace=True`` on
            # ``self.datatmp[column]``: under pandas Copy-on-Write the chained
            # in-place form updates a temporary object and leaves
            # ``self.datatmp`` unchanged.
            self.datatmp[column] = self.datatmp[column].interpolate(
                method="linear", limit_direction="both"
            )
        else:
            print("Column not in data")

    def _splitBy(self, column, derive_from_index):
        """Return ``{key: sub-frame}`` keyed by ``column``, or by index-derived values if it is absent."""
        if column in self.datatmp.columns:
            keys = self.datatmp[column].to_numpy()
        else:
            # Requires an index that can be converted to a DatetimeIndex.
            keys = np.asarray(derive_from_index(pd.DatetimeIndex(self.datatmp.index)))
        return {key: self.datatmp.loc[keys == key] for key in pd.unique(keys)}

    def dataByWeek(self):
        """Split the data by ISO week of year.

        Returns
        -------
        dict
            Maps each week number to the rows belonging to that week. The
            "weekofyear" column is used when present; otherwise the week is
            computed from the datetime index.
        """
        return self._splitBy(
            "weekofyear",
            lambda idx: np.asarray(idx.isocalendar().week, dtype="int64"),
        )

    def dataByMonth(self):
        """Split the data by month.

        Returns
        -------
        dict
            Maps each month number to the rows belonging to that month. The
            "month" column is used when present; otherwise the month is
            computed from the datetime index.
        """
        return self._splitBy("month", lambda idx: idx.month)

    def transform(self, interpol_columns):
        """Return the working frame, interpolated when ``interpol`` is True.

        Parameters
        ----------
        interpol_columns : label, list of labels or set of labels
            Column(s) to interpolate. A single label behaves as before; a
            list or set interpolates each of its columns in turn.

        Returns
        -------
        DataFrame
            The frame held by the instance (``self.datatmp``).
        """
        if self.interpol:
            if isinstance(interpol_columns, (list, set)):
                columns = list(interpol_columns)
            else:
                columns = [interpol_columns]
            for column in columns:
                self.interpolate(column)
        return self.datatmp

In [20]:
# Drop "presence" and "room" from each room's frame, then interpolate "light".
babbagenew, babyfootnew, jacquardnew = (
    DataTransform(room_frame, ["presence", "room"]).transform("light")
    for room_frame in (babbage, babyfoot, jacquard)
)

In [11]:
#babbagenew=babbagenew.dropna(axis=0)
#babyfootnew = babyfootnew.dropna(axis=0)
#jacquardnew=jacquardnew.dropna(axis=0)

In [21]:
# Print the missing-value report of each transformed room; every call returns
# None, so the cell itself evaluates to a tuple of three None values.
tuple(
    Missing_values(room_frame)
    for room_frame in (babbagenew, babyfootnew, jacquardnew)
)

Empty DataFrame
Columns: [Total, Pourcentage]
Index: [] 

Empty DataFrame
Columns: [Total, Pourcentage]
Index: [] 

Empty DataFrame
Columns: [Total, Pourcentage]
Index: [] 



(None, None, None)

In [15]:
def createChunk(n_freq, dataset, path_to_folder):
    """Write a time-indexed DataFrame to disk as consecutive CSV chunks.

    The data are cut into windows of ``n_freq`` weeks, starting at the
    smallest index value. Window ``i`` is written to
    ``<path_to_folder>/chunk_<i>.csv``. Windows are generated until the
    largest index value is covered, so rows are not lost when the data
    contain gaps; a window without any row still produces a (header-only)
    file, which keeps chunk numbers aligned with calendar windows.

    Parameters
    ----------
    n_freq : int
        Length of a chunk, in weeks. Must be positive.
    dataset : DataFrame
        Data whose index holds timestamps.
    path_to_folder : str
        Destination folder, created if it does not exist. A trailing
        separator is accepted but not required.

    Returns
    -------
    None.

    Raises
    ------
    ValueError
        If ``n_freq`` is not positive.
    """
    if n_freq <= 0:
        raise ValueError("n_freq must be a positive number of weeks")

    os.makedirs(path_to_folder, exist_ok=True)

    # Nothing to split: the folder exists but no chunk is written.
    if dataset.empty:
        return

    window = timedelta(weeks=n_freq)
    chunk_start = dataset.index.min()
    last_idx = dataset.index.max()
    i = 0
    # Iterate over time windows instead of estimating the number of chunks
    # from the row count of the first window, which under-counts when later
    # windows hold fewer rows.
    while chunk_start <= last_idx:
        chunk_end = chunk_start + window
        in_window = (dataset.index >= chunk_start) & (dataset.index < chunk_end)
        file_name = os.path.join(path_to_folder, "chunk_" + str(i) + ".csv")
        dataset[in_window].to_csv(file_name)
        chunk_start = chunk_end
        i += 1

In [16]:
# Output folder of each room.
path_jacquard = "./QarnotData30m/jacquard/"
path_babyfoot = "./QarnotData30m/babyfoot/"
path_babbage = "./QarnotData30m/babbage/"

# Write every room as chunks of 4 weeks.
for room_frame, room_path in (
    (babbagenew, path_babbage),
    (jacquardnew, path_jacquard),
    (babyfootnew, path_babyfoot),
):
    createChunk(n_freq=4, dataset=room_frame, path_to_folder=room_path)

In [17]:
# NOTE(review): sorted() orders the names lexicographically, so
# "chunk_10.csv" comes before "chunk_2.csv". The three lists share that order,
# so zip() pairs files with the same name as long as the three folders hold
# the same file names.
list_babbage = sorted(os.listdir(path_babbage))
list_jacquard = sorted(os.listdir(path_jacquard))
list_babyfoot = sorted(os.listdir(path_babyfoot))

# Report, chunk by chunk, the row count and date range of every room.
# The loop variables are named file_* so that they do not overwrite the
# `babbage`, `jacquard` and `babyfoot` DataFrames with file-name strings.
for file_babbage, file_jacquard, file_babyfoot in zip(list_babbage, list_jacquard, list_babyfoot):
    chunk_babbage = pd.read_csv(os.path.join(path_babbage, file_babbage), index_col=0, parse_dates=["time"])
    print("babbage:{} nb_data : {} min_date : {} max_date : {}".format(file_babbage, chunk_babbage.shape[0],
                                                                       chunk_babbage.index.min(),
                                                                       chunk_babbage.index.max()))

    chunk_jacquard = pd.read_csv(os.path.join(path_jacquard, file_jacquard), index_col=0, parse_dates=["time"])
    print("jacquard:{} nb_data : {} min_date : {} max_date : {}".format(file_jacquard, chunk_jacquard.shape[0],
                                                                        chunk_jacquard.index.min(),
                                                                        chunk_jacquard.index.max()))

    chunk_babyfoot = pd.read_csv(os.path.join(path_babyfoot, file_babyfoot), index_col=0, parse_dates=["time"])
    print("babyfoot:{} nb_data : {} min_date : {} max_date : {}".format(file_babyfoot, chunk_babyfoot.shape[0],
                                                                        chunk_babyfoot.index.min(),
                                                                        chunk_babyfoot.index.max()))

    print("--------------------------")

babbage:chunk_0.csv nb_data : 1344 min_date : 2020-01-05 00:00:00 max_date : 2020-02-01 23:30:00
jacquard:chunk_0.csv nb_data : 1344 min_date : 2020-01-05 00:00:00 max_date : 2020-02-01 23:30:00
babyfoot:chunk_0.csv nb_data : 1344 min_date : 2020-01-05 00:00:00 max_date : 2020-02-01 23:30:00
--------------------------
babbage:chunk_1.csv nb_data : 1296 min_date : 2020-02-02 00:00:00 max_date : 2020-02-28 23:30:00
jacquard:chunk_1.csv nb_data : 1296 min_date : 2020-02-02 00:00:00 max_date : 2020-02-28 23:30:00
babyfoot:chunk_1.csv nb_data : 1296 min_date : 2020-02-02 00:00:00 max_date : 2020-02-28 23:30:00
--------------------------
babbage:chunk_2.csv nb_data : 1344 min_date : 2020-03-01 00:00:00 max_date : 2020-03-28 23:30:00
jacquard:chunk_2.csv nb_data : 1344 min_date : 2020-03-01 00:00:00 max_date : 2020-03-28 23:30:00
babyfoot:chunk_2.csv nb_data : 1344 min_date : 2020-03-01 00:00:00 max_date : 2020-03-28 23:30:00
--------------------------
babbage:chunk_3.csv nb_data : 1344 min_d