In [1137]:
pip install feature_engine

Note: you may need to restart the kernel to use updated packages.


In [1138]:
import pandas as pd
import numpy as np
from feature_engine.datetime import DatetimeFeatures
from feature_engine.imputation import DropMissingData
from feature_engine.creation import CyclicalFeatures
from feature_engine.selection import DropFeatures
from feature_engine.timeseries.forecasting import (LagFeatures,WindowFeatures,)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

In [1139]:
data=pd.read_csv('AirQualityUCI.csv', sep=';', parse_dates=[['Date', 'Time']]).iloc[:, :-2]
data.dropna(inplace=True)

In [1140]:
new_names = [
    'Date_Time',
    'CO_true',
    'CO_sensor',
    'NMHC_true',
    'C6H6_true',
    'NMHC_sensor',
    'NOX_true',
    'NOX_sensor',
    'NO2_true',
    'NO2_sensor',
    'O3_sensor',
    'T',
    'RH',
    'AH',    
]

In [1141]:
data.columns=new_names

In [1142]:
data.head()

Unnamed: 0,Date_Time,CO_true,CO_sensor,NMHC_true,C6H6_true,NMHC_sensor,NOX_true,NOX_sensor,NO2_true,NO2_sensor,O3_sensor,T,RH,AH
0,10/03/2004 18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,7578
1,10/03/2004 19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,7255
2,10/03/2004 20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,7502
3,10/03/2004 21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,7867
4,10/03/2004 22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,7888


In [1143]:
predictors=data.loc[1:]

In [1144]:
predictors

Unnamed: 0,Date_Time,CO_true,CO_sensor,NMHC_true,C6H6_true,NMHC_sensor,NOX_true,NOX_sensor,NO2_true,NO2_sensor,O3_sensor,T,RH,AH
1,10/03/2004 19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255
2,10/03/2004 20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502
3,10/03/2004 21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867
4,10/03/2004 22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888
5,10/03/2004 23.00.00,12,1197.0,38.0,47,750.0,89.0,1337.0,96.0,1393.0,949.0,112,592,07848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,04/04/2005 10.00.00,31,1314.0,-200.0,135,1101.0,472.0,539.0,190.0,1374.0,1729.0,219,293,07568
9353,04/04/2005 11.00.00,24,1163.0,-200.0,114,1027.0,353.0,604.0,179.0,1264.0,1269.0,243,237,07119
9354,04/04/2005 12.00.00,24,1142.0,-200.0,124,1063.0,293.0,603.0,175.0,1241.0,1092.0,269,183,06406
9355,04/04/2005 13.00.00,21,1003.0,-200.0,95,961.0,235.0,702.0,156.0,1041.0,770.0,283,135,05139


In [1145]:
data['Date_Time'] = data['Date_Time'].str.replace('.', ':', regex=False)

data['Date_Time'] = pd.to_datetime(data['Date_Time'])
# use dayfirst=True parameter if format is dd/mm/yyyy HH:mm:ss Eg: pd.to_datetime(data['Date_Time'], dayfirst=True)

data.head()

Unnamed: 0,Date_Time,CO_true,CO_sensor,NMHC_true,C6H6_true,NMHC_sensor,NOX_true,NOX_sensor,NO2_true,NO2_sensor,O3_sensor,T,RH,AH
0,2004-10-03 18:00:00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,7578
1,2004-10-03 19:00:00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,7255
2,2004-10-03 20:00:00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,7502
3,2004-10-03 21:00:00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,7867
4,2004-10-03 22:00:00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,7888


In [1146]:
for var in predictors:
    if data[var].dtype=="O":
        data[var]=data[var].str.replace(',','.')
        data[var]=pd.to_numeric(data[var])

In [1147]:
data=data.loc[:,["Date_Time", "CO_sensor", "RH"]]

In [1148]:
data.head()

Unnamed: 0,Date_Time,CO_sensor,RH
0,2004-10-03 18:00:00,1360.0,48.9
1,2004-10-03 19:00:00,1292.0,47.7
2,2004-10-03 20:00:00,1402.0,54.0
3,2004-10-03 21:00:00,1376.0,60.0
4,2004-10-03 22:00:00,1272.0,59.6


In [1149]:
data.set_index('Date_Time',inplace=True)
data.head()

Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-10-03 18:00:00,1360.0,48.9
2004-10-03 19:00:00,1292.0,47.7
2004-10-03 20:00:00,1402.0,54.0
2004-10-03 21:00:00,1376.0,60.0
2004-10-03 22:00:00,1272.0,59.6


In [1150]:
data1=data.copy()

In [1151]:
data.sort_index(inplace=True)
data.head()

Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-01-04 00:00:00,1143.0,61.6
2004-01-04 01:00:00,1044.0,63.9
2004-01-04 02:00:00,1034.0,67.2
2004-01-04 03:00:00,956.0,73.1
2004-01-04 04:00:00,909.0,66.6


In [1152]:
# Reducing data size.
data = data.loc["2004-04-01":"2005-04-30"]

In [1153]:
data.head()

Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-04-04 00:00:00,1224.0,56.5
2004-04-04 01:00:00,1215.0,59.2
2004-04-04 02:00:00,1115.0,62.4
2004-04-04 03:00:00,1124.0,65.0
2004-04-04 04:00:00,1028.0,65.3


In [1154]:
data=data.loc[(data['CO_sensor']>=0)& data['RH']>0] # removing outliers

# DateTime Features using FeatureEngine

In [1155]:
dtf=DatetimeFeatures(
variables='index',
features_to_extract=[
    "month",
    "week",
    "day_of_week",
    "day_of_month",
    'hour',
    "weekend"
],
)
data=dtf.fit_transform(data)# extracting the date time variable
data.head()

Unnamed: 0_level_0,CO_sensor,RH,month,week,day_of_week,day_of_month,hour,weekend
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-04-04 00:00:00,1224.0,56.5,4,14,6,4,0,1
2004-04-04 01:00:00,1215.0,59.2,4,14,6,4,1,1
2004-04-04 02:00:00,1115.0,62.4,4,14,6,4,2,1
2004-04-04 03:00:00,1124.0,65.0,4,14,6,4,3,1
2004-04-04 04:00:00,1028.0,65.3,4,14,6,4,4,1


# Lag Features

In [1156]:
lagfeature=LagFeatures(
    variables=["CO_sensor","RH"],
    freq=["1H","24H"], # moves 1 hour and 24 hour fordward
    missing_values="ignore"
)
data=lagfeature.fit_transform(data)#adding lag feature


In [1157]:
data.head()

Unnamed: 0_level_0,CO_sensor,RH,month,week,day_of_week,day_of_month,hour,weekend,CO_sensor_lag_1H,RH_lag_1H,CO_sensor_lag_24H,RH_lag_24H
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-04-04 00:00:00,1224.0,56.5,4,14,6,4,0,1,,,,
2004-04-04 01:00:00,1215.0,59.2,4,14,6,4,1,1,1224.0,56.5,,
2004-04-04 02:00:00,1115.0,62.4,4,14,6,4,2,1,1215.0,59.2,,
2004-04-04 03:00:00,1124.0,65.0,4,14,6,4,3,1,1115.0,62.4,,
2004-04-04 04:00:00,1028.0,65.3,4,14,6,4,4,1,1124.0,65.0,,


# Window feature

In [1158]:
windowfeature=WindowFeatures(
variables=["CO_sensor","RH"],
window="3H",
freq="1H",
missing_values="ignore"
)
data=windowfeature.fit_transform(data)

In [1159]:
data.head()

Unnamed: 0_level_0,CO_sensor,RH,month,week,day_of_week,day_of_month,hour,weekend,CO_sensor_lag_1H,RH_lag_1H,CO_sensor_lag_24H,RH_lag_24H,CO_sensor_window_3H_mean,RH_window_3H_mean
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-04-04 00:00:00,1224.0,56.5,4,14,6,4,0,1,,,,,,
2004-04-04 01:00:00,1215.0,59.2,4,14,6,4,1,1,1224.0,56.5,,,1224.0,56.5
2004-04-04 02:00:00,1115.0,62.4,4,14,6,4,2,1,1215.0,59.2,,,1219.5,57.85
2004-04-04 03:00:00,1124.0,65.0,4,14,6,4,3,1,1115.0,62.4,,,1184.666667,59.366667
2004-04-04 04:00:00,1028.0,65.3,4,14,6,4,4,1,1124.0,65.0,,,1151.333333,62.2


# Perodic Feature

In [1160]:
cyclicalfeature=CyclicalFeatures(
    variables=["CO_sensor","RH"],
    drop_original=False
)
data=cyclicalfeature.fit_transform(data)

In [1161]:
data.head()

Unnamed: 0_level_0,CO_sensor,RH,month,week,day_of_week,day_of_month,hour,weekend,CO_sensor_lag_1H,RH_lag_1H,CO_sensor_lag_24H,RH_lag_24H,CO_sensor_window_3H_mean,RH_window_3H_mean,CO_sensor_sin,CO_sensor_cos,RH_sin,RH_cos
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2004-04-04 00:00:00,1224.0,56.5,4,14,6,4,0,1,,,,,,,-0.6353,-0.772266,-0.758274,-0.651936
2004-04-04 01:00:00,1215.0,59.2,4,14,6,4,1,1,1224.0,56.5,,,1224.0,56.5,-0.613303,-0.789848,-0.868377,-0.495905
2004-04-04 02:00:00,1115.0,62.4,4,14,6,4,2,1,1215.0,59.2,,,1219.5,57.85,-0.340386,-0.940286,-0.957612,-0.28806
2004-04-04 03:00:00,1124.0,65.0,4,14,6,4,3,1,1115.0,62.4,,,1184.666667,59.366667,-0.366728,-0.930328,-0.994171,-0.107815
2004-04-04 04:00:00,1028.0,65.3,4,14,6,4,4,1,1124.0,65.0,,,1151.333333,62.2,-0.075027,-0.997181,-0.996237,-0.086666


# Missing values

In [1162]:
data.isnull().sum()

CO_sensor                     0
RH                            0
month                         0
week                          0
day_of_week                   0
day_of_month                  0
hour                          0
weekend                       0
CO_sensor_lag_1H             27
RH_lag_1H                    27
CO_sensor_lag_24H           461
RH_lag_24H                  461
CO_sensor_window_3H_mean     27
RH_window_3H_mean            27
CO_sensor_sin                 0
CO_sensor_cos                 0
RH_sin                        0
RH_cos                        0
dtype: int64

In [1163]:
#Droping the missing data
imputer=DropMissingData()
data=imputer.fit_transform(data)

In [1164]:
#imputer=DropMissingData()

In [1165]:
data.isnull().sum()

CO_sensor                   0
RH                          0
month                       0
week                        0
day_of_week                 0
day_of_month                0
hour                        0
weekend                     0
CO_sensor_lag_1H            0
RH_lag_1H                   0
CO_sensor_lag_24H           0
RH_lag_24H                  0
CO_sensor_window_3H_mean    0
RH_window_3H_mean           0
CO_sensor_sin               0
CO_sensor_cos               0
RH_sin                      0
RH_cos                      0
dtype: int64

Droping Orginal Time Series Data to avoid lookahead bais

In [1166]:
data2=data.copy()

In [1167]:
dropft=DropFeatures(features_to_drop=["CO_sensor","RH"])


In [1168]:
#imputer=DropMissingData()

# Pipeline

Extracting all the features in one step using feature engine pipline

In [1169]:
data=data1

In [1170]:
data.head()

Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-10-03 18:00:00,1360.0,48.9
2004-10-03 19:00:00,1292.0,47.7
2004-10-03 20:00:00,1402.0,54.0
2004-10-03 21:00:00,1376.0,60.0
2004-10-03 22:00:00,1272.0,59.6


In [1171]:
pipe=Pipeline([
    ("datetime_feature",dtf),
    ("lagfeatue",lagfeature),
    ("windowfeature",windowfeature),
    ("Perodic",cyclicalfeature),
    ("dropna", imputer),
    ("drop_ts",dropft),
    
])

# Spliting the data into Train and Test sets

last month data is 4 of march 2005 but we need 24 hours pervious data to create the feature

In [1172]:
data.head()

Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-10-03 18:00:00,1360.0,48.9
2004-10-03 19:00:00,1292.0,47.7
2004-10-03 20:00:00,1402.0,54.0
2004-10-03 21:00:00,1376.0,60.0
2004-10-03 22:00:00,1272.0,59.6


In [1173]:
# input data
X_train=data[data.index<"2005-03-04"]
X_test=data[data.index>=pd.Timestamp("2005-03-04")- pd.offsets.Hour(24)]

In [1174]:
#target
Y_train=data[data.index<"2005-03-04"][["CO_sensor","RH"]]
Y_test=data[data.index>=pd.Timestamp("2005-03-04")- pd.offsets.Hour(24)][["CO_sensor","RH"]]

In [1175]:
Y_train.head()

Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-10-03 18:00:00,1360.0,48.9
2004-10-03 19:00:00,1292.0,47.7
2004-10-03 20:00:00,1402.0,54.0
2004-10-03 21:00:00,1376.0,60.0
2004-10-03 22:00:00,1272.0,59.6


# Training pipeline and model


In [1176]:
# we will train the pipeline and create the features for the training set
X_train_1=pipe.fit_transform(X_train)

In [1177]:
#Data with input features
X_train_1.head()

Unnamed: 0_level_0,month,week,day_of_week,day_of_month,hour,weekend,CO_sensor_lag_1H,RH_lag_1H,CO_sensor_lag_24H,RH_lag_24H,CO_sensor_window_3H_mean,RH_window_3H_mean,CO_sensor_sin,CO_sensor_cos,RH_sin,RH_cos
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-05 00:00:00,1,2,0,5,0,0,1203.0,52.0,1143.0,61.6,1193.666667,48.0,-0.948161,-0.317791,-0.99981,0.019479
2004-01-05 01:00:00,1,2,0,5,1,0,1425.0,66.8,1044.0,63.9,1271.333333,55.6,-0.470378,-0.882465,-0.992106,0.125404
2004-01-05 02:00:00,1,2,0,5,2,0,1179.0,68.3,1034.0,67.2,1269.0,62.366667,-0.083064,-0.996544,-0.966307,0.257393
2004-01-05 03:00:00,1,2,0,5,3,0,1047.0,70.2,956.0,73.1,1217.0,68.433333,0.033873,-0.999426,-0.893485,0.449092
2004-01-05 04:00:00,1,2,0,5,4,0,1009.0,73.1,909.0,66.6,1078.333333,70.533333,0.412356,-0.911023,-0.971558,0.236802


our transformer removed the observation with missing data so we need to remove those observation from target variable  as well

In [1178]:
Y_train_1=Y_train.loc[X_train_1.index]

In [1179]:
print(X_train_1.shape)
print(Y_train_1.shape)

(7878, 16)
(7878, 2)


# lasso Regression

we need multiple regression as we need a prediction of CO concentration and prediction of RH

In [1180]:
lasso=MultiOutputRegressor(Lasso(random_state=0))

In [1181]:
lasso.fit(X_train_1,Y_train_1)

MultiOutputRegressor(estimator=Lasso(random_state=0))

# Forcasting 24 hour fordward fordward step by step

First we will obtain the 24 hours forcast in the 24 hours in the testset

In [1182]:
#first hour of forcast
forcast_point=pd.Timestamp("2005-03-04")

#forcast horizon 24 hours ahead
forcast_end=forcast_point + pd.offsets.Hour(23)

In [1183]:
forcast_point

Timestamp('2005-03-04 00:00:00')

In [1184]:
forcast_end

Timestamp('2005-03-04 23:00:00')

In [1185]:
# Time stamp for which we wanted to predict Carbon monixide concentration
index=pd.date_range(
start=forcast_point,
end=forcast_end,
freq="1H")

# Here we will be starting the forcasting from forcasting point upto forcast end during 1 hours of interval.

In [1186]:
index

DatetimeIndex(['2005-03-04 00:00:00', '2005-03-04 01:00:00',
               '2005-03-04 02:00:00', '2005-03-04 03:00:00',
               '2005-03-04 04:00:00', '2005-03-04 05:00:00',
               '2005-03-04 06:00:00', '2005-03-04 07:00:00',
               '2005-03-04 08:00:00', '2005-03-04 09:00:00',
               '2005-03-04 10:00:00', '2005-03-04 11:00:00',
               '2005-03-04 12:00:00', '2005-03-04 13:00:00',
               '2005-03-04 14:00:00', '2005-03-04 15:00:00',
               '2005-03-04 16:00:00', '2005-03-04 17:00:00',
               '2005-03-04 18:00:00', '2005-03-04 19:00:00',
               '2005-03-04 20:00:00', '2005-03-04 21:00:00',
               '2005-03-04 22:00:00', '2005-03-04 23:00:00'],
              dtype='datetime64[ns]', freq='H')

In [1187]:
# forcasting horizon
f_horizon=pd.DataFrame(columns=["CO_sensor","RH"],index=index)
f_horizon

Unnamed: 0,CO_sensor,RH
2005-03-04 00:00:00,,
2005-03-04 01:00:00,,
2005-03-04 02:00:00,,
2005-03-04 03:00:00,,
2005-03-04 04:00:00,,
2005-03-04 05:00:00,,
2005-03-04 06:00:00,,
2005-03-04 07:00:00,,
2005-03-04 08:00:00,,
2005-03-04 09:00:00,,


# Forcasting 1 hour fordward

In [1188]:
# 24 hours before the first forcasting point
start_point=forcast_point-pd.offsets.Hour(24)
start_point

Timestamp('2005-03-03 00:00:00')

In [1189]:
#input data for first prediction value
input_data=X_test[(X_test.index>= start_point)&(X_test.index<forcast_point)]
input_data.loc[forcast_point]=0 #puting nan value for forcasting point
input_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data.loc[forcast_point]=0 #puting nan value for forcasting point


Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2005-03-03 00:00:00,1047.0,41.7
2005-03-03 01:00:00,1030.0,43.3
2005-03-03 02:00:00,986.0,45.9
2005-03-03 03:00:00,992.0,49.7
2005-03-03 04:00:00,1076.0,56.3
2005-03-03 05:00:00,1104.0,67.9
2005-03-03 06:00:00,1160.0,74.1
2005-03-03 07:00:00,1217.0,75.6
2005-03-03 08:00:00,1457.0,72.4
2005-03-03 09:00:00,1337.0,50.2


In [1190]:
len(input_data)

25

In [1191]:
# test transformation
pipe.transform(input_data)

Unnamed: 0_level_0,month,week,day_of_week,day_of_month,hour,weekend,CO_sensor_lag_1H,RH_lag_1H,CO_sensor_lag_24H,RH_lag_24H,CO_sensor_window_3H_mean,RH_window_3H_mean,CO_sensor_sin,CO_sensor_cos,RH_sin,RH_cos
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2005-03-04,3,9,4,4,0,0,1179.0,82.0,1047.0,41.7,1223.333333,82.7,0.0,1.0,0.0,1.0


In [1192]:
#first prediction
prediction=lasso.predict(pipe.transform(input_data))

In [1193]:
prediction

array([[1208.07616546,   77.60414478]])

In [1194]:
f_horizon.loc[forcast_point]=prediction

In [1195]:
f_horizon

Unnamed: 0,CO_sensor,RH
2005-03-04 00:00:00,1208.076165,77.604145
2005-03-04 01:00:00,,
2005-03-04 02:00:00,,
2005-03-04 03:00:00,,
2005-03-04 04:00:00,,
2005-03-04 05:00:00,,
2005-03-04 06:00:00,,
2005-03-04 07:00:00,,
2005-03-04 08:00:00,,
2005-03-04 09:00:00,,


# Forcasting 24 hours altogether

In [1196]:
#crating the forcasting horizen
#first hour
forcast_point=pd.Timestamp("2005-03-04")
#forcasting horizen
forcast_end=forcast_point+pd.offsets.Hour(23)

# Time stamp for which we wanted to predict Carbon monixide concentration
index=pd.date_range(
start=forcast_point,
end=forcast_end,
freq="1H")

# the forecasting horizon
f_horizon = pd.DataFrame(columns=["CO_sensor", "RH"], index=index)
f_horizon

Unnamed: 0,CO_sensor,RH
2005-03-04 00:00:00,,
2005-03-04 01:00:00,,
2005-03-04 02:00:00,,
2005-03-04 03:00:00,,
2005-03-04 04:00:00,,
2005-03-04 05:00:00,,
2005-03-04 06:00:00,,
2005-03-04 07:00:00,,
2005-03-04 08:00:00,,
2005-03-04 09:00:00,,


In [1197]:
# 24 hours before the first forcasting point
start_point=forcast_point-pd.offsets.Hour(24)
start_point

Timestamp('2005-03-03 00:00:00')

In [1198]:
#input data for first prediction value
input_data=X_test[(X_test.index>= start_point)&(X_test.index<forcast_point)]
input_data.loc[forcast_point]=0 #puting nan value for forcasting point
input_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data.loc[forcast_point]=0 #puting nan value for forcasting point


Unnamed: 0_level_0,CO_sensor,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2005-03-03 00:00:00,1047.0,41.7
2005-03-03 01:00:00,1030.0,43.3
2005-03-03 02:00:00,986.0,45.9
2005-03-03 03:00:00,992.0,49.7
2005-03-03 04:00:00,1076.0,56.3
2005-03-03 05:00:00,1104.0,67.9
2005-03-03 06:00:00,1160.0,74.1
2005-03-03 07:00:00,1217.0,75.6
2005-03-03 08:00:00,1457.0,72.4
2005-03-03 09:00:00,1337.0,50.2


In [1199]:
prediction=lasso.predict(pipe.transform(input_data))[0]

In [1200]:
#Add the prediction to the horizon and the input data
f_horizon.loc[forcast_point]=prediction
input_data.loc[forcast_point] = prediction

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data.loc[forcast_point] = prediction


In [1201]:
# iteration it through the loop
for i in range(23):
    start_point = start_point + pd.offsets.Hour(1)
    forcast_point = forcast_point + pd.offsets.Hour(1)
    
    input_data = input_data[(input_data.index >= start_point)]
    input_data.loc[forcast_point] = 0
    
    prediction = lasso.predict(pipe.transform(input_data))
    
     # Add prediction to horizon.
    f_horizon.loc[forcast_point] = prediction
    input_data.loc[forcast_point] = prediction

f_horizon

Unnamed: 0,CO_sensor,RH
2005-03-04 00:00:00,1208.076165,77.604145
2005-03-04 01:00:00,1211.440412,73.058271
2005-03-04 02:00:00,1196.721269,68.906272
2005-03-04 03:00:00,1174.858442,65.266229
2005-03-04 04:00:00,1156.187547,61.966888
2005-03-04 05:00:00,1137.036933,58.959372
2005-03-04 06:00:00,1121.620697,56.149365
2005-03-04 07:00:00,1109.627052,53.467022
2005-03-04 08:00:00,1109.605995,50.742802
2005-03-04 09:00:00,1101.378621,47.967233
