# Introduction
### Pour explorer tout les chemins possible, on a crée et entrainer un modele ***LSTM*** afin de voir la performance du modèle si on exploite les valeurs de la série temporelle.

# Download Data

In [0]:
!rm /root/.kaggle
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle

rm: cannot remove '/root/.kaggle': No such file or directory


In [0]:
!kaggle competitions download -c ashrae-energy-prediction

Downloading test.csv.zip to /content
 96% 160M/167M [00:00<00:00, 183MB/s]
100% 167M/167M [00:01<00:00, 172MB/s]
Downloading train.csv.zip to /content
100% 120M/120M [00:00<00:00, 153MB/s] 

Downloading building_metadata.csv to /content
  0% 0.00/44.5k [00:00<?, ?B/s]
100% 44.5k/44.5k [00:00<00:00, 44.7MB/s]
Downloading weather_test.csv.zip to /content
  0% 0.00/2.53M [00:00<?, ?B/s]
100% 2.53M/2.53M [00:00<00:00, 170MB/s]
Downloading sample_submission.csv.zip to /content
 74% 65.0M/88.4M [00:00<00:00, 117MB/s]
100% 88.4M/88.4M [00:00<00:00, 121MB/s]
Downloading weather_train.csv.zip to /content
  0% 0.00/1.27M [00:00<?, ?B/s]
100% 1.27M/1.27M [00:00<00:00, 181MB/s]


In [0]:
!unzip sample_submission.csv.zip
!unzip test.csv.zip
!unzip train.csv.zip
!unzip weather_train.csv.zip
!unzip weather_test.csv.zip

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   
Archive:  test.csv.zip
  inflating: test.csv                
Archive:  train.csv.zip
  inflating: train.csv               
Archive:  weather_train.csv.zip
  inflating: weather_train.csv       
Archive:  weather_test.csv.zip
  inflating: weather_test.csv        


In [0]:
!rm sample_submission.csv.zip
!rm test.csv.zip
!rm train.csv.zip
!rm weather_train.csv.zip
!rm weather_test.csv.zip

In [0]:
!mkdir -p kaggle/input
!mv sample_submission.csv /content/kaggle/input
!mv test.csv /content/kaggle/input
!mv train.csv /content/kaggle/input
!mv weather_train.csv /content/kaggle/input
!mv weather_test.csv /content/kaggle/input
!mv building_metadata.csv /content/kaggle/input

# Load Packages

In [0]:
%matplotlib inline
import json
import os
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection  import train_test_split
import numpy as np
import gc
from scipy.stats import norm
from scipy import stats, integrate
import matplotlib.pyplot as plt

# Load Data

In [0]:
%%time
ASHRAE_train = pd.read_csv('kaggle/input/train.csv')
ASHRAE_test = pd.read_csv('kaggle/input/test.csv')
weather_train = pd.read_csv('kaggle/input/weather_train.csv')
weather_test = pd.read_csv('kaggle/input/weather_test.csv')
building_meta = pd.read_csv('kaggle/input/building_metadata.csv')

CPU times: user 20.2 s, sys: 2.96 s, total: 23.2 s
Wall time: 1min 3s


# Preprocess Data

### Reduce Memory Size
#### Afin de preserver la mémoire, on a utiliser cette fonction pour réduire la taille de mémoire allouer à nos datasets
***On a copier ce code d'un forum en internet***

In [0]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    #start_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    #end_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    #print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

### Merge Data
#### On a merger tout les données dans une seule table pour commencer le traitement

In [0]:
%%time
BuildingTrain = building_meta.merge(ASHRAE_train, left_on='building_id', right_on='building_id' , how='left')
BuildingTest = building_meta.merge(ASHRAE_test, left_on='building_id', right_on='building_id' , how='left')
BuildingTrain.shape, BuildingTest.shape
del ASHRAE_test
del ASHRAE_train
del building_meta
gc.collect()
BTW_train=BuildingTrain.merge(weather_train,left_on=['site_id','timestamp'],right_on=['site_id','timestamp'],how='left')
BTW_test = BuildingTest.merge(weather_test,left_on=['site_id','timestamp'],right_on=['site_id','timestamp'],how='left')
del BuildingTest
del BuildingTrain
del weather_test
del weather_train
gc.collect()

CPU times: user 29.2 s, sys: 2.84 s, total: 32 s
Wall time: 32.1 s


#### Reduce Memory usage

In [0]:
BTW_train = reduce_mem_usage(BTW_train)
BTW_test = reduce_mem_usage(BTW_test)

In [0]:
# Pourcentage des elements N/A
BTW_train.isna().sum() / len(BTW_train) *100

### Drop Columns
On supprime ces trois colonnes parce qu'ils ont plein de cases N/A

In [0]:
BTW_train = BTW_train.drop(columns=['year_built', 'floor_count', 'wind_direction'])
BTW_test = BTW_test.drop(columns=['year_built', 'floor_count','wind_direction'])

consommation d'électricité avant '2016-05-21 00:00:00' est nulle pour tout les building: On va les eliminer

In [0]:
BTW_train.loc[(BTW_train['meter']==0) & (BTW_train['site_id']==0) & (BTW_train['timestamp']<'2016-05-21 00:00:00'), 'drop'] = True
BTW_train = BTW_train[BTW_train['drop']!=True]
del BTW_train["drop"]

### Fill N/A
On va remplir les cases N/A par la valeur moyenne de la colonne correspondante

In [0]:
BTW_train['precip_depth_1_hr'].fillna(BTW_train['precip_depth_1_hr'].astype('float').mean(), inplace = True)
BTW_train['cloud_coverage'].fillna(BTW_train['cloud_coverage'].astype('float').mean(), inplace = True)
BTW_train['wind_speed'].fillna(BTW_train['wind_speed'].astype('float').mean(), inplace=True)
BTW_train['air_temperature'].fillna(BTW_train['air_temperature'].astype('float').mean(), inplace=True)
BTW_train['dew_temperature'].fillna(BTW_train['dew_temperature'].astype('float').mean(), inplace=True)
BTW_train['sea_level_pressure'].fillna(BTW_train['sea_level_pressure'].astype('float').mean(), inplace=True)

BTW_test['precip_depth_1_hr'].fillna(BTW_test['precip_depth_1_hr'].astype('float').mean(), inplace = True)
BTW_test['cloud_coverage'].fillna(BTW_test['cloud_coverage'].astype('float').mean(), inplace = True)
BTW_test['wind_speed'].fillna(BTW_test['wind_speed'].astype('float').mean(), inplace=True)
BTW_test['air_temperature'].fillna(BTW_test['air_temperature'].astype('float').mean(), inplace=True)
BTW_test['dew_temperature'].fillna(BTW_test['dew_temperature'].astype('float').mean(), inplace=True)
BTW_test['sea_level_pressure'].fillna(BTW_test['sea_level_pressure'].astype('float').mean(), inplace=True)

print(BTW_train.isnull().sum()/len(BTW_train) * 100)
print(BTW_test.isnull().sum()/len(BTW_test) * 100)

site_id               0.0
building_id           0.0
primary_use           0.0
square_feet           0.0
meter                 0.0
timestamp             0.0
meter_reading         0.0
air_temperature       0.0
cloud_coverage        0.0
dew_temperature       0.0
precip_depth_1_hr     0.0
sea_level_pressure    0.0
wind_speed            0.0
dtype: float64
site_id               0.0
building_id           0.0
primary_use           0.0
square_feet           0.0
row_id                0.0
meter                 0.0
timestamp             0.0
air_temperature       0.0
cloud_coverage        0.0
dew_temperature       0.0
precip_depth_1_hr     0.0
sea_level_pressure    0.0
wind_speed            0.0
dtype: float64


### Add Features
On va extraire les features suivants du timestamp: année, mois, jour, heure

In [0]:
BTW_train ['timestamp'] =  pd.to_datetime(BTW_train['timestamp'])
BTW_test ['timestamp'] =  pd.to_datetime(BTW_test['timestamp'])
BTW_test['Year']=pd.DatetimeIndex(BTW_test['timestamp']).year
BTW_train['Year']=pd.DatetimeIndex(BTW_train['timestamp']).year
BTW_train['Month']=pd.DatetimeIndex(BTW_train['timestamp']).month
BTW_test['Month']=pd.DatetimeIndex(BTW_test['timestamp']).month
BTW_train['Day']=pd.DatetimeIndex(BTW_train['timestamp']).day
BTW_test['Day']=pd.DatetimeIndex(BTW_test['timestamp']).day
BTW_test['Hour']=pd.DatetimeIndex(BTW_test['timestamp']).hour
BTW_train['Hour']=pd.DatetimeIndex(BTW_train['timestamp']).hour

#### Features month, day and hour
On a trouvé dans un article que pour présenter les features cycliques, on peut introduire la fonction **SINUS** et **COSINUS**

In [0]:
features_cyc = {'Month' : 12, 'Day' : 7, 'Hour' : 24}
for feature in features_cyc.keys():
    BTW_train[feature+'_sin'] = np.sin((2*np.pi*BTW_train[feature])/features_cyc[feature])
    BTW_train[feature+'_cos'] = np.cos((2*np.pi*BTW_train[feature])/features_cyc[feature])
    BTW_test[feature+'_sin'] = np.sin((2*np.pi*BTW_test[feature])/features_cyc[feature])
    BTW_test[feature+'_cos'] = np.cos((2*np.pi*BTW_test[feature])/features_cyc[feature])
BTW_train = BTW_train.drop(features_cyc.keys(), axis=1)
BTW_test = BTW_test.drop(features_cyc.keys(), axis=1)

In [0]:
BTW_train= BTW_train.groupby(['meter',BTW_train['building_id'],'primary_use',BTW_train['Month_sin'], BTW_train['Month_cos'], BTW_train['Day_sin'], BTW_train['Day_cos']]).agg({'meter_reading':'sum', 'air_temperature': 'mean', 'wind_speed': 'mean', 'precip_depth_1_hr': 'mean', 'cloud_coverage': 'mean', 'square_feet': 'mean'})
BTW_test_1= BTW_test.groupby(['row_id','meter',BTW_test['building_id'],'primary_use',BTW_test['Month_sin'], BTW_test['Month_cos'], BTW_test['Day_cos'], BTW_test['Day_sin']]).agg({ 'air_temperature': 'mean', 'wind_speed': 'mean', 'precip_depth_1_hr': 'mean', 'cloud_coverage': 'mean', 'square_feet': 'mean'})

In [0]:
BTW_train = BTW_train.reset_index()
BTW_test = BTW_test.reset_index()

In [0]:
BTW_train.head()

Unnamed: 0,meter,building_id,primary_use,Month_sin,Month_cos,Day_sin,Day_cos,meter_reading,air_temperature,wind_speed,precip_depth_1_hr,cloud_coverage,square_feet,beaufort_scale
0,0,0,Education,-1.0,-1.83697e-16,-0.974928,-0.222521,5835.206055,25.453125,1.770508,1.958008,3.015625,7432,2.0
1,0,0,Education,-1.0,-1.83697e-16,-0.974928,-0.222521,5901.419922,25.40625,3.675781,11.664062,2.75,7432,3.0
2,0,0,Education,-1.0,-1.83697e-16,-0.974928,-0.222521,5864.560059,27.625,2.279297,6.332031,2.945312,7432,2.0
3,0,0,Education,-1.0,-1.83697e-16,-0.974928,-0.222521,5754.666016,27.8125,3.708984,0.125,2.691406,7432,3.0
4,0,0,Education,-1.0,-1.83697e-16,-0.781831,0.62349,6003.80127,25.15625,1.483398,1.666992,3.029297,7432,1.0


#### Wind Speed categorization
On utilise le beaufort scale pour convertir le wind speed en une variable discrète 

In [0]:
beaufort = [(0, 0, 0.3), (1, 0.3, 1.6), (2, 1.6, 3.4), (3, 3.4, 5.5), (4, 5.5, 8), (5, 8, 10.8), (6, 10.8, 13.9), 
          (7, 13.9, 17.2), (8, 17.2, 20.8), (9, 20.8, 24.5), (10, 24.5, 28.5), (11, 28.5, 33), (12, 33, 200)]

for item in beaufort:
    BTW_train.loc[(BTW_train['wind_speed']>=item[1]) & (BTW_train['wind_speed']<item[2]), 'beaufort_scale'] = item[0]
    BTW_test.loc[(BTW_test['wind_speed']>=item[1]) & (BTW_test['wind_speed']<item[2]), 'beaufort_scale'] = item[0]
del BTW_train['wind_speed']
del BTW_test['wind_speed']
gc.collect()

# Build Model

### Encode Data

In [0]:
BTW_encoded = BTW_train[:]
BTW_test_encoded = BTW_test[:]

In [0]:
# label encoding 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
BTW_encoded["primary_use"] = le.fit_transform(BTW_encoded["primary_use"])
BTW_test_encoded["primary_use"] = le.fit_transform(BTW_test_encoded["primary_use"])

In [0]:
X = BTW_encoded[['meter', 'building_id', 'primary_use', 'Month_sin', 'Month_cos', 'Day_sin', 'Day_cos', 'air_temperature', 'wind_speed', 'precip_depth_1_hr', 'cloud_coverage']] #'square_feet'
y = BTW_encoded['meter_reading']

In [0]:
X_test = BTW_test_encoded[['meter', 'building_id', 'primary_use', 'Month_sin', 'Month_cos', 'Day_sin', 'Day_cos', 'air_temperature', 'wind_speed', 'precip_depth_1_hr', 'cloud_coverage']] #'square_feet'

In [0]:
BTW_train.describe()

Unnamed: 0,meter,building_id,Month_sin,Month_cos,Day_sin,Day_cos,meter_reading,air_temperature,precip_depth_1_hr,cloud_coverage,square_feet,beaufort_scale
count,831147.0,831147.0,831147.0,831147.0,831147.0,831147.0,831147.0,831147.0,831147.0,831147.0,831147.0,831147.0
mean,0.675285,813.188322,-0.03092074,-0.006601679,0.06489923,-0.002754,51479.02,,,,108081.638573,2.432565
std,0.935118,418.978502,0.700325,0.7131244,0.7121949,0.698971,3601055.0,0.0,0.0,0.0,117155.254144,0.827288
min,0.0,0.0,-1.0,-1.0,-0.9749279,-0.900969,0.0,-24.921875,-1.0,0.0,283.0,0.0
25%,0.0,426.0,-0.8660254,-0.8660254,-0.7818315,-0.900969,576.549,8.773438,0.0,1.108398,33012.0,2.0
50%,0.0,904.0,-2.449294e-16,-1.83697e-16,-2.449294e-16,-0.222521,2116.088,16.9375,0.0,1.882812,72958.0,2.0
75%,1.0,1186.0,0.5,0.8660254,0.7818315,0.62349,6784.482,23.6875,0.788086,2.65625,140092.0,3.0
max,3.0,1448.0,1.0,1.0,0.9749279,1.0,461146900.0,40.1875,49.84375,9.0,875000.0,7.0


### Model

In [0]:
from sklearn import preprocessing
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, LSTM, GRU, Dropout, BatchNormalization
from keras.models import Sequential
from keras.optimizers import RMSprop,Adam
from keras import regularizers

Using TensorFlow backend.


In [0]:
def root_mean_squared_error(y_true, y_pred):
  return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [0]:
def make_model(input_dim=10,metrics=root_mean_squared_error,loss='mse', optimizer="rmsprop",drop_rate=0.5):

  model = Sequential()
  model.add(LSTM(256,return_sequences=True, input_shape=(None,input_dim)))
  model.add(Dropout(drop_rate))
  model.add(BatchNormalization())
  model.add(LSTM(512,return_sequences=True))
  model.add(Dropout(drop_rate))
  model.add(BatchNormalization())
  model.add(LSTM(128,return_sequences=False))
  model.add(BatchNormalization())
  model.add(Dropout(drop_rate))
  model.add(Dense(1))
  model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])
  
  return model

In [0]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

def run_model(x_df,y_df,epochs=50,batch_size=500,verbose=1,callbacks =None):
  histories = []
  models = []
  folds = 4
  seed = 99
  es = callbacks
  kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
  for fold_n, (train_index, valid_index) in enumerate(kf.split(x_df, x_df['building_id'])):
    print('Fold:', fold_n)
    model_checkpoint = ModelCheckpoint("model_" + str(fold_n)+ ".hdf5",
                                       save_best_only=True, verbose=1, monitor='val_root_mean_squared_error', mode='min')
    callbacks = [es, model_checkpoint]
    model = make_model(input_dim=x_df.shape[-1],drop_rate=0.3)
    X_train, X_valid = x_df.iloc[train_index], x_df.iloc[valid_index]
    y_train, y_valid = y_df.iloc[train_index], y_df.iloc[valid_index]

    X_train = X_train.values[:]
    X_train= X_train.reshape((X_train.shape[0],1,X_train.shape[-1]))
    y_train = np.log1p(y_train)

    x_val = X_valid.values[:]
    x_val = x_val.reshape((x_val.shape[0],1,x_val.shape[-1]))
    y_val = np.log1p(y_valid)

    hist = model.fit(X_train,y_train,epochs=epochs,batch_size=batch_size,verbose=verbose,validation_data=(x_val,y_val),callbacks=callbacks)
    histories.append(hist)
    models.append(model)
    print('*'* 50)
  
  return (models, histories)

#### Callbacks
Une fonction de Callback pour stopper l'entrainement du modèle dès que le modèle arrete de faire un progrés par rapport à val_root_mean_squared_error

In [0]:
es = EarlyStopping(monitor='val_root_mean_squared_error', min_delta=0.0001, patience=5, verbose=True, mode='auto')

### Create and Train Model

In [0]:
batch_size = 1024
epochs = 50
(models, history) = run_model(X,y, epochs, batch_size,verbose=1, callbacks=es)

Fold: 0
Train on 623360 samples, validate on 207787 samples
Epoch 1/50

Epoch 00001: val_root_mean_squared_error improved from inf to 2.16810, saving model to model_0.hdf5
Epoch 2/50

Epoch 00002: val_root_mean_squared_error improved from 2.16810 to 2.12510, saving model to model_0.hdf5
Epoch 3/50

Epoch 00003: val_root_mean_squared_error improved from 2.12510 to 2.06839, saving model to model_0.hdf5
Epoch 4/50

Epoch 00004: val_root_mean_squared_error improved from 2.06839 to 2.05064, saving model to model_0.hdf5
Epoch 5/50

Epoch 00005: val_root_mean_squared_error did not improve from 2.05064
Epoch 6/50

Epoch 00006: val_root_mean_squared_error improved from 2.05064 to 1.98129, saving model to model_0.hdf5
Epoch 7/50

Epoch 00007: val_root_mean_squared_error did not improve from 1.98129
Epoch 8/50

Epoch 00008: val_root_mean_squared_error did not improve from 1.98129
Epoch 9/50

Epoch 00009: val_root_mean_squared_error did not improve from 1.98129
Epoch 10/50

Epoch 00010: val_root_m

### Load Model

In [0]:
del X
del y
gc.collect()

In [0]:
trained_model = make_model(input_dim=X_test.shape[-1],drop_rate=0.3)
trained_model.load_weights('model_2.hdf5')

In [0]:
submit = pd.read_csv('kaggle/input/sample_submission.csv')
submit = reduce_mem_usage(submit)

### Make Predictions

In [0]:
X_test = X_test.values[:]
X_test= X_test.reshape((X_test.shape[0],1,X_test.shape[-1]))
prediction = trained_model.predict(X_test, verbose=1)



### Save CSV

In [0]:
submit['meter_reading'] = prediction
submit.to_csv('submission.csv', index=False,float_format='%.4f')

### Submit Work

In [0]:
!kaggle competitions submit ashrae-energy-prediction -f submission.csv -m "LSTM with new features work"

100% 627M/627M [00:16<00:00, 39.3MB/s]
Successfully submitted to ASHRAE - Great Energy Predictor III