In [1]:
# This code block imports all the necessary libraries needed

import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# read dataset

df = pd.read_csv("energydata.csv")
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [3]:
# dropping the ['date', 'lights'] columns 

df1 = df.drop(['date','lights'],axis=1)
df1.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) 
and the temperature outside the building (y = T6). 
What is the R^2 value in two D.P?

In [10]:
X = df1['T2'].values.reshape(-1,1)
y = df1['T6'].values.reshape(-1,1)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
linear_model = LinearRegression()
linear_model.fit(X_train,y_train)

y_predict = linear_model.predict(X_test)
r2 = r2_score(y_test, y_predict)
round(r2,2)

0.64

In [12]:
#F Normalise dataset to a common scale using the minmax scaler

scaler = MinMaxScaler()
normalised_df = pd.DataFrame(scaler.fit_transform(df1), columns=df1.columns)

features_df = normalised_df.drop(columns =['Appliances'], axis=1)
target = normalised_df['Appliances']

# Split the dataset into training and testing dataset

X_train,X_test,y_train,y_test = train_test_split(features_df, target, test_size=0.3, random_state=42)
linear_model = LinearRegression()
linear_model.fit(X_train,y_train)

#get predictions
predicted_values = linear_model.predict(X_test)
predicted_values

array([0.03322207, 0.24411599, 0.03400024, ..., 0.06844707, 0.10032325,
       0.05722198])

What is the mean absolute error (in two decimal places)

In [13]:
# mean_absolute_error

mean_absolute_error = mean_absolute_error(y_test, predicted_values)
round(mean_absolute_error, 2)

0.05

What is the Residual sum of square(in two decimal places)

In [14]:
# Residual sum of square

residual_sum_of_square = np.sum(np.square(y_test - predicted_values))
round(residual_sum_of_square, 2)

45.35

What is the root mean sqaure error (in three decimal places)

In [15]:
# root mean sqaure error  
root_mean_sqaure_error = np.sqrt(mean_squared_error(y_test, predicted_values))
round(root_mean_sqaure_error, 3)

0.088

What is the coefficient of determination (in two decimal places)

In [16]:
# coefficient of determination is the same as Rsquare

from sklearn.metrics import r2_score 
r2 = r2_score(y_test, predicted_values)
round(r2,2)

0.15

obtain the features weight from the model

In [19]:
# compare the weigts of the features after regularization

def get_weights_df(model, feat, col_name):
    
# returns the weight of every feature
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['features_df', col_name]
    weights_df[col_name].round(3)
    return weights_df
linear_model_weights = get_weights_df(linear_model, X_train, 'Linear_Model_Weight')
linear_model_weights

Unnamed: 0,features_df,Linear_Model_Weight
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


What is the new root mean square error with the Lasso Regression

In [27]:
# compute lasso regression and ridge
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)

ridge_reg = Ridge(alpha=0.5)
ridge_reg.fit(X_train, y_train)


def get_weights_df(model, feat, col_name):
    weights = pd.Series(model.coef_, feat.columns).sort_values()    
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['features_df', col_name]
    weights_df[col_name].round(3)
    return weights_df


lasso_weights_df = get_weights_df(lasso_reg, X_train, 'Lasso_weight')
ridge_weights_df = get_weights_df(ridge_reg, X_train, 'Ridge_Weight')

weights = pd.merge(ridge_weights_df, lasso_weights_df, on='features_df')
weights

Unnamed: 0,features_df,Ridge_Weight,Lasso_weight
0,RH_2,-0.401134,-0.0
1,T_out,-0.250765,0.0
2,T2,-0.19388,0.0
3,T9,-0.188584,-0.0
4,RH_8,-0.156596,-0.00011
5,RH_out,-0.050541,-0.049557
6,RH_7,-0.046291,-0.0
7,RH_9,-0.041701,-0.0
8,T1,-0.021549,0.0
9,T5,-0.020727,-0.0


Train a ridge regression model with an alpha value of 0.4. check if there is any change in root mean square error if evaluated on the test set

In [32]:
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(X_train, y_train)


def get_weights_df(model, feat, col_name):
    weights = pd.Series(model.coef_, feat.columns).sort_values()    
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['features_df', col_name]
    weights_df[col_name].round(3)
    return weights_df


ridge_weights_df = get_weights_df(ridge_reg, X_train, 'Ridge_Weight')
ridge_weights_df

Unnamed: 0,features_df,Ridge_Weight
0,RH_2,-0.411071
1,T_out,-0.262172
2,T2,-0.201397
3,T9,-0.188916
4,RH_8,-0.15683
5,RH_out,-0.054724
6,RH_7,-0.045977
7,RH_9,-0.041367
8,T5,-0.019853
9,T1,-0.018406


What is the new root mean square error with the lasso regression (in 3 decimal places)

In [34]:
#New Root Mean Square Error 
from sklearn.metrics import  mean_squared_error
root_mean_squared_error = np.sqrt(mean_squared_error(y_test, predicted_values))
round (root_mean_squared_error, 3)


# there is no effect on the rmse

0.088

What is the new root mean square eror(RMSE) with Lasso Regression

In [35]:
# root_mean_squared_error with lasso

lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)
lasso_predict = lasso_reg.predict(X_test)

rmse_lasso =np.sqrt(mean_squared_error(y_test, lasso_predict))
round(rmse_lasso,3)

0.094