In [53]:
# import the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [54]:
# read the energy CSV (path is relative to the notebook's working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='energydata_complete.csv')

In [55]:
# Preview the loaded DataFrame; the bare last expression is rendered as the cell output
data

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [56]:
# Drop the 'lights' and 'date' columns.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError because
# the columns were already removed by the first run.
data = data.drop(columns=['lights', 'date'], errors='ignore')

## Data Processing

In [57]:
# Bring every column onto a common scale with a min-max scaler, then separate
# the predictors from the 'Appliances' target.
# NOTE(review): the scaler is fitted on the complete frame, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalised_df = pd.DataFrame(
    data=scaler.fit_transform(data),
    columns=data.columns,
)
target_df = normalised_df['Appliances']
features_df = normalised_df.drop(columns='Appliances')

## Model Training

In [58]:
from sklearn.model_selection import train_test_split

# hold out 30% of the rows for testing; the remaining 70% train the models
# (fixed random_state so the partition is reproducible)
x_train, x_test, y_train, y_test = train_test_split(
    features_df,
    target_df,
    test_size=0.3,
    random_state=42,
)

In [59]:
# Preview the training-feature frame produced by the split; rendered as the cell output
x_train

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
9129,0.497360,0.236767,0.122850,0.565939,0.373878,0.303474,0.476577,0.264760,0.408027,0.159533,...,0.475893,0.376380,0.168810,0.862791,0.776316,0.142857,0.984615,0.192308,0.724554,0.724554
2453,0.286167,0.482616,0.188999,0.669978,0.217957,0.735317,0.270270,0.691421,0.178691,0.333576,...,0.240375,0.703504,0.262594,0.836434,0.807018,0.142857,0.600000,0.342383,0.864041,0.864041
9152,0.422386,0.230529,0.057427,0.606430,0.373878,0.338059,0.414414,0.236449,0.378404,0.151639,...,0.468262,0.409803,0.110397,0.853488,0.859649,0.095238,0.917949,0.158371,0.499502,0.499502
12694,0.560718,0.446840,0.280834,0.704002,0.514290,0.515189,0.540541,0.486556,0.509317,0.424604,...,0.561915,0.340784,0.444802,0.559690,0.750000,0.119048,0.384615,0.558069,0.323173,0.323173
16952,0.835269,0.422071,1.000000,0.318493,0.745383,0.459106,0.900901,0.516432,0.748845,0.455819,...,0.854318,0.633278,0.849946,0.530233,0.355263,0.142857,0.600000,0.787330,0.341060,0.341060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,0.524815,0.381892,0.239157,0.636236,0.481888,0.478268,0.576577,0.517712,0.455805,0.250877,...,0.568852,0.471578,0.468382,0.492248,0.706140,0.476190,0.600000,0.562594,0.910318,0.910318
11964,0.591341,0.458949,0.428883,0.607461,0.456962,0.582489,0.557658,0.702518,0.417582,0.285310,...,0.561915,0.625966,0.717042,0.469767,0.552632,0.142857,0.600000,0.791855,0.698711,0.698711
5390,0.369588,0.238785,0.152653,0.481582,0.240944,0.412370,0.279279,0.325651,0.302914,0.161743,...,0.312175,0.426325,0.126474,0.595349,0.828947,0.261905,0.948718,0.165913,0.381388,0.381388
860,0.527983,0.234933,0.315047,0.412016,0.332336,0.380122,0.540541,0.341585,0.259914,0.245539,...,0.375650,0.509106,0.227224,0.831008,0.530702,0.095238,0.600000,0.114630,0.020909,0.020909


In [60]:
# import the regression libraries
from sklearn.linear_model import Ridge, Lasso, LinearRegression

In [61]:
# single-feature regression: living room temp (T2) as predictor,
# outside building temp (T6) as response
x, y = normalised_df['T2'].values, normalised_df['T6'].values

# 70/30 train/test partition with a fixed seed
x_tr, x_tt, y_tr, y_tt = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [62]:
# turn each 1-D array into a single-column 2-D array so the estimator
# accepts a lone feature
x_tr, x_tt = x_tr.reshape(-1, 1), x_tt.reshape(-1, 1)
y_tr = y_tr.reshape(-1, 1)

# fit on the training part, predict on the held-out part
model_fit = LinearRegression()
model_fit.fit(x_tr, y_tr)
y_pred = model_fit.predict(x_tt)

In [63]:
# R^2 of the single-feature linear regression on the held-out split, shown to 2 d.p.
r_2 = metrics.r2_score(y_true=y_tt, y_pred=y_pred)
print(round(r_2, 2))

0.64


In [64]:
# candidate regressors, stored as [label, estimator] pairs
models = [
    ['Lasso: ', Lasso(alpha=0.001)],
    ['Ridge: ', Ridge(alpha=0.4)],
    ['Linear_Reg: ', LinearRegression()],
]

In [65]:
# Fit every candidate model on the training split and collect its test-set
# metrics as one record per model in the list `model_data`.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

model_data = []
for name, model in models:
    model.fit(x_train, y_train)

    # Predict once per model and reuse the result for all four metrics
    # (the prediction does not change between metric calls).
    y_hat = model.predict(x_test)

    model_data_dict = {
        "Name": name,
        "Mean_Absolute_Error": round(mean_absolute_error(y_test, y_hat), 2),
        "R2_Score": round(metrics.r2_score(y_test, y_hat), 2),
        "RMSE_Score": round(sqrt(mean_squared_error(y_test, y_hat)), 3),
        # residual sum of squares: sum of squared (actual - predicted)
        "Residual_Sum_Square": round(np.sum(np.square(y_test - y_hat)), 2),
    }
    model_data.append(model_data_dict)

In [66]:
# tabulate the per-model metric records (one row per model) and display them
model_df = pd.DataFrame(data=model_data)
model_df

Unnamed: 0,Name,Mean_Absolute_Error,R2_Score,RMSE_Score,Residual_Sum_Square
0,Lasso:,0.06,0.03,0.094,51.85
1,Ridge:,0.05,0.15,0.088,45.37
2,Linear_Reg:,0.05,0.15,0.088,45.35


In [67]:
#comparing the effects of regularisation
def get_weights_df(model, feat, col_name):
    """Return the coefficients of a fitted model as a two-column DataFrame.

    Parameters
    ----------
    model : object with a ``coef_`` attribute
        Fitted estimator whose coefficients are read. ``coef_`` may be 1-D,
        or 2-D with a single row (as produced when the estimator was fitted
        on a column-vector target).
    feat : object with a ``columns`` attribute
        Supplies the feature labels, one per coefficient.
    col_name : str
        Name given to the weight column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Features'`` and ``col_name``, rows sorted by weight in
        ascending order.
    """
    coefs = np.asarray(model.coef_)
    # A (1, n_features) coefficient matrix cannot be fed to pd.Series
    # directly; unwrap the single row. Any other shape is passed through
    # unchanged so pandas reports the mismatch itself.
    if coefs.ndim == 2 and coefs.shape[0] == 1:
        coefs = coefs[0]
    weights = pd.Series(coefs, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    return weights_df

# one weight table per model, each fitted afresh on the training split
lasso_weights_df = get_weights_df(
    Lasso(alpha=0.001).fit(x_train, y_train), x_train, 'Lasso_weight'
)
ridge_weights_df = get_weights_df(
    Ridge(alpha=0.4).fit(x_train, y_train), x_train, 'Ridge_Weight'
)
linear_model_weights = get_weights_df(
    LinearRegression().fit(x_train, y_train), x_train, 'Linear_Model_Weight'
)

# line the three tables up side by side, joined on the feature name
final_weights = (
    linear_model_weights
    .merge(ridge_weights_df, on='Features')
    .merge(lasso_weights_df, on='Features')
)

In [68]:
# Per-feature weights of the three models side by side; as the rendered
# output shows, rows follow the ascending order of Linear_Model_Weight
final_weights

Unnamed: 0,Features,Linear_Model_Weight,Ridge_Weight,Lasso_weight
0,RH_2,-0.456698,-0.411071,-0.0
1,T_out,-0.32186,-0.262172,0.0
2,T2,-0.236178,-0.201397,0.0
3,T9,-0.189941,-0.188916,-0.0
4,RH_8,-0.157595,-0.15683,-0.00011
5,RH_out,-0.077671,-0.054724,-0.049557
6,RH_7,-0.044614,-0.045977,-0.0
7,RH_9,-0.0398,-0.041367,-0.0
8,T5,-0.015657,-0.019853,-0.0
9,T1,-0.003281,-0.018406,0.0


In [69]:
# count the features whose Lasso (alpha = 0.001) weight is not exactly zero
final_weights['Lasso_weight'].ne(0).sum()

4