In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


### loading the dataset

In [72]:
pd.options.display.max_columns = 30 

In [73]:
dataset = pd.read_csv ('energydata_complete.csv')
dataset.sample(5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
17382,2016-05-11 10:00:00,60,0,24.79,50.36,27.39,44.38,25.12,46.86,25.725714,44.621429,23.79,53.76,21.12375,26.53,24.434,44.67,25.323333,49.4,23.79,49.27875,17.2,748.7,80.0,1.0,40.0,13.6,35.159299,35.159299
9230,2016-03-15 19:20:00,140,0,21.39,42.996667,19.39,43.53,20.89,37.46,20.79,35.963333,19.1,44.86,7.325,48.1175,21.365,28.365,22.963333,36.0,19.7,37.9,6.533333,763.933333,82.666667,6.333333,22.0,3.733333,9.911561,9.911561
14340,2016-04-20 07:00:00,60,0,21.0,38.163333,17.89,42.1,22.0,36.56,21.2,35.2,19.79,46.43,1.73,51.89,20.2,30.725714,21.6,41.6,19.89,39.86,1.9,767.9,93.0,1.0,27.0,0.8,31.756444,31.756444
10057,2016-03-21 13:10:00,50,0,21.29,36.4,19.5,37.95,21.79,36.23,20.29,35.73,19.29,42.433333,11.16,20.596667,19.2,30.79,21.79,37.7,19.5,37.326667,9.266667,761.433333,63.166667,3.0,40.0,2.516667,0.289422,0.289422
14963,2016-04-24 14:50:00,70,0,21.823333,36.4,20.1,37.925,22.6,34.2,21.7,34.4,20.5,46.326667,7.6,29.29,21.79,30.89,23.463333,36.363333,20.5,36.933333,6.45,757.366667,74.5,4.333333,40.0,2.083333,30.674511,30.674511


### dropping the date and lights features

In [74]:
data = dataset.drop(['date', 'lights'], axis  = 1)
data.sample(3)

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
630,40,22.5,40.29,21.6,39.56,20.79,41.933333,20.23,43.433333,19.666667,53.6,2.09,94.5,18.7,41.633333,20.2,48.89,17.89,47.09,2.1,761.8,95.0,5.0,40.0,1.4,42.875077,42.875077
18988,100,24.7,52.626667,23.79,52.266667,26.823333,44.966667,24.0,50.06,22.79,55.09,20.066667,40.9,23.6,47.554,25.29,53.384286,23.29,50.754286,16.133333,750.4,89.666667,3.0,36.333333,14.4,48.746243,48.746243
11573,60,22.0,39.09,19.39,41.59,22.39,38.9,20.26,39.663333,19.79,49.0,5.623333,61.526667,20.26,34.223333,23.856667,46.26,20.1,42.5,6.133333,757.633333,89.0,4.0,64.0,4.433333,44.004008,44.004008


### Data preprocessing

In [75]:
scaler = MinMaxScaler()
normalized_data = pd.DataFrame(scaler.fit_transform(data), columns = data.columns)

normalized_data.sample(4)

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
6646,0.074766,0.444562,0.289973,0.268234,0.47624,0.298272,0.356753,0.465465,0.34002,0.351648,0.348075,0.281036,0.533199,0.302545,0.266869,0.383888,0.284441,0.292404,0.253449,0.273312,0.515504,0.75,0.166667,0.297436,0.31825,0.009675,0.009675
12511,0.037383,0.564238,0.354371,0.436152,0.500422,0.431206,0.445085,0.585586,0.459809,0.477146,0.363648,0.544346,0.065116,0.629729,0.360588,0.545316,0.260452,0.613944,0.514211,0.513934,0.546512,0.440789,0.547619,0.566667,0.425339,0.539022,0.539022
9004,0.009346,0.369588,0.220347,0.050157,0.546537,0.265038,0.389001,0.386486,0.257789,0.282848,0.294482,0.164411,0.678716,0.553252,0.392199,0.576137,0.393763,0.457856,0.44426,0.1597,0.909302,0.872807,0.261905,0.928205,0.235294,0.066202,0.066202
2379,0.028037,0.221753,0.543987,0.15023,0.737745,0.191093,0.762891,0.251351,0.855314,0.187291,0.355211,0.387425,1.0,0.188501,0.747872,0.10589,0.721841,0.204648,0.798841,0.384244,0.730233,0.855263,0.25,0.515385,0.536199,0.393134,0.393134


In [76]:
normalized_data.describe()


Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,0.081958,0.517061,0.364271,0.308303,0.56117,0.421038,0.489601,0.518499,0.485143,0.407272,0.317777,0.406809,0.542053,0.459671,0.432206,0.523873,0.457031,0.478234,0.512655,0.399089,0.609828,0.733558,0.288554,0.574321,0.46881,0.499742,0.499742
std,0.095818,0.169595,0.109512,0.159412,0.114438,0.166676,0.152107,0.184044,0.185289,0.176266,0.135656,0.177277,0.314963,0.198868,0.181355,0.179081,0.179039,0.209647,0.171833,0.170978,0.17208,0.196067,0.175087,0.181457,0.189803,0.289984,0.289984
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.037383,0.419219,0.283735,0.195542,0.490299,0.298272,0.380122,0.399099,0.335894,0.281653,0.234337,0.282104,0.293478,0.31197,0.294326,0.410436,0.324423,0.323621,0.386313,0.278671,0.503101,0.609649,0.142857,0.430769,0.339367,0.249895,0.249895
50%,0.046729,0.50792,0.347675,0.283499,0.563408,0.407112,0.456302,0.501502,0.458387,0.38796,0.289821,0.389026,0.548938,0.437637,0.413593,0.530363,0.4378,0.468262,0.485651,0.383173,0.623256,0.785088,0.261905,0.6,0.453997,0.497934,0.497934
75%,0.084112,0.613516,0.441519,0.392537,0.641016,0.505982,0.60726,0.630631,0.618722,0.50546,0.358586,0.504177,0.831412,0.585297,0.560284,0.648459,0.580398,0.594173,0.627956,0.495445,0.735659,0.890351,0.392857,0.6,0.595777,0.751701,0.751701
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Select the features/ predictors and the predicted variable

In [77]:
features = normalized_data.drop('Appliances', axis = 1)
target = normalized_data['Appliances']

In [78]:
assert features.shape[0] == target.shape[0] == normalized_data.shape[0]

### split the dataset into train and test dataset

In [38]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.3, random_state = 42)

In [39]:
y_train.shape

(13814,)

In [79]:
assert x_train.shape[0] + x_test.shape[0] == normalized_data.shape[0]

### Run a multiple linear regression using the training set 

In [80]:
MlrModel = LinearRegression()
MlrModel.fit(x_train, y_train)

LinearRegression()

### Evaluate your model on the test set

In [41]:
MlrPred = MlrModel.predict(x_test)

#### From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the R^2 value in two D.P?

In [82]:
x = normalized_data[['T2']]
y = normalized_data[['T6']]
# split the dataset
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.3, random_state = 42)

#### Fit a linear model on the training dataset

In [85]:
SlrModel =LinearRegression()
SlrModel.fit(xtrain,ytrain)

LinearRegression()

#### Make predictions using the simple linear model using the test dataset

In [86]:
SlrModelPred = SlrModel.predict(xtest)

#### Calculate the value of r squared in two decimal places

In [89]:
R_square_slr = r2_score (ytest, SlrModelPred)
print("The Value of the simple model's R square is {}".format(round(R_square_slr, 2)))

The Value of the simple model's R square is 0.64


### calculate the Mean absolute error of the multi linear regression model

In [91]:
MAE_MLR_model = mean_absolute_error (y_test, MlrPred)

print("The mean absolute error of the multiple linear model is {}".format(round(MAE_MLR_model , 2)))

The mean absolute error of the multiple linear model is 0.05


### calculate the Residual sum of square of the multiple linear regression model

In [92]:
RSS_MLR_Model = np.sum(np.square(y_test - MlrPred))

print("The residual sum of squares of the multiple linear model is {}". format(round(RSS_MLR_Model, 2)))

The residual sum of squares of the multiple linear model is 45.35


### Calculate the Root mean squared error of the multiple linear regression model

In [104]:
RMSE_MLR_Model = np.sqrt(mean_squared_error(y_test, MlrPred))
print ('The Root mean squared error of the multiple linear regresssion model is {}'.format(round(RMSE_MLR_Model,3)))

The Root mean squared error of the multiple linear regresssion model is 0.088


### calculate the r squared value of the multiple linear regression model

In [95]:
R2_MLR_Model = r2_score(y_test, MlrPred)
print("The r sqaured value of the Multiple linear regression model is {}". format(round(R2_MLR_Model,2)))

The r sqaured value of the Multiple linear regression model is 0.15


### find the weights of the multiple linear regression model

In [100]:
MLR_model_weights = pd.Series(MlrModel.coef_, x_train.columns).sort_values()
MLR_model_weights

rv2           -2.276510e+10
RH_2          -4.567150e-01
T_out         -3.218479e-01
T2            -2.361946e-01
T9            -1.899264e-01
RH_8          -1.576019e-01
RH_out        -7.765200e-02
RH_7          -4.458875e-02
RH_9          -3.981379e-02
T5            -1.566300e-02
T1            -3.281051e-03
Press_mm_hg    6.845979e-03
T7             1.031356e-02
Visibility     1.230399e-02
RH_5           1.600706e-02
RH_4           2.638924e-02
T4             2.899615e-02
Windspeed      2.919410e-02
RH_6           3.805687e-02
RH_3           9.603389e-02
T8             1.019936e-01
Tdewpoint      1.177425e-01
T6             2.364314e-01
T3             2.906326e-01
RH_1           5.535574e-01
rv1            2.276510e+10
dtype: float64

### fit the Ridge model on the training data using alpha = 0.4

In [101]:
ridgeModel = Ridge(alpha = 0.4)
ridgeModel.fit(x_train, y_train)

Ridge(alpha=0.4)

In [102]:
ridgePred = ridgeModel.predict(x_test)

### Calculate the Root mean square error the ridge model


In [103]:
Ridge_model_RMSE = np.sqrt(mean_squared_error(y_test, ridgePred))
print("The root squared error of the ridge model is {}". format(round(Ridge_model_RMSE,3)))

The root squared error of the ridge model is 0.088


### Check to see if there is a change in the ridge's RMSE and the linear model

In [110]:
assert Ridge_model_RMSE == RMS_MLR_Model

AssertionError: 

In [111]:
change = Ridge_model_RMSE - RMS_MLR_Model
change

2.158555554863939e-05

### fit a lasso model on the training data

In [112]:
lassoModel = Lasso(alpha = 0.001)
lassoModel.fit(x_train,y_train)

Lasso(alpha=0.001)

### Get the weights of the lasso model

In [121]:
weights = pd.Series(lassoModel.coef_, x_train.columns).sort_values()
#  find the features with non zero weights
weights != 0 


RH_out          True
RH_8            True
T1             False
Tdewpoint      False
Visibility     False
Press_mm_hg    False
T_out          False
RH_9           False
T9             False
T8             False
RH_7           False
rv1            False
T7             False
T6             False
RH_5           False
T5             False
RH_4           False
T4             False
RH_3           False
T3             False
RH_2           False
T2             False
RH_6           False
rv2            False
Windspeed       True
RH_1            True
dtype: bool

### Make predictions using the Lasso model

In [123]:
lassoPred = lassoModel.predict(x_test)

### Calculate the Root mean squared error 

In [124]:
LassoRMSE = np.sqrt(mean_squared_error(y_test, lassoPred))
print("The Root mean squared error of the Lasso model is {}". format(round(LassoRMSE,3)))

The Root mean squared error of the Lasso model is 0.094
