In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge


In [12]:
df = pd.read_csv('energydata_complete.csv')
df.columns

Index(['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

## solving for the linear model between T2 and T6

In [28]:
predictor = df['T2']
target = df['T6']
#since there are no reason to scale the data set since the two data sets have similar datas
# i willl move on without scaling the data sets
x_test, x_train, y_test, y_train = train_test_split(predictor,
                                                    target,
                                                    test_size=0.3,
                                                    random_state=1)
# i used 70% of the available data for training and reserved 30% for testing the data and then reshaped my dataset
# to a one dimensional data since thats the only way it could work.
x_train = x_train.values.reshape(-1, 1)
x_test = x_test.values.reshape(-1, 1)
model = LinearRegression()
model.fit(x_train, y_train)
predicted_value = model.predict(x_test)

predicted_value

array([24.28662527,  4.21170699,  6.7991409 , ...,  5.99614417,
        6.07049572,  2.65032445])

In [29]:
from sklearn.metrics import r2_score
r2_score = r2_score(y_test, predicted_value)
round(r2_score, 2) 

0.64

## answering question 13

In [33]:
#firstly i need to remove the  [“date”, “lights”] columns.
df_to_scale = df.drop(columns= ['date', 'lights'])

#scaling the data using MinMaxScaler 
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df_to_scale), columns=df_to_scale.columns)

# assigning datasets to my predictor and target
predictor_df = scaled_df.drop(columns=['Appliances'])
target_df = scaled_df['Appliances']


### setting up my training and testing sets using the stated conditions 


In [35]:
x_test, x_train, y_test, y_train = train_test_split(predictor_df,
                                                    target_df,
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    random_state=42)
regression_model = LinearRegression()
regression_model.fit(x_train, y_train)
y = regression_model.predict(x_test)
y

array([0.03513553, 0.07433935, 0.0259384 , ..., 0.04622279, 0.08089308,
       0.08721625])

In [37]:
# for mean absolute error 
mae = mean_absolute_error(y_test, y)
round(mae, 2)

0.05

## answering question  14
## Residual Sum of Squares

In [39]:
rss = np.sum(np.square(y_test - y))
round(rss, 2)

110.13

## question 15 Root Mean Squared Error (in three decimal places)

In [45]:
rmse = np.sqrt(mean_squared_error(y_test, y))
round(rmse, 3) 

0.089

## question 16   Coefficient of Determination (in two decimal places)

In [51]:
from sklearn.metrics import r2_score
r2_score = r2_score(y_test, y)
round(r2_score, 2)

0.14

## question 17 Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?


In [56]:
def get_weights_df(model, feat, col_name):
    #this function returns the weight of every feature
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    weights_df[col_name].round(3)
    return weights_df


linear_model_weights = get_weights_df(regression_model, x_train,
                                      'Linear_Model_Weight')
linear_model_weights

Unnamed: 0,Features,Linear_Model_Weight
0,RH_2,-0.539055
1,T2,-0.326397
2,T_out,-0.286637
3,T9,-0.218335
4,RH_8,-0.169558
5,RH_7,-0.057397
6,RH_out,-0.036925
7,RH_9,-0.018717
8,rv1,-0.005266
9,rv2,-0.005266


## Question 18
### Train a ridge regression model with an alpha value of 0.4

In [57]:
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(x_train, y_train)
pred_ridge = ridge_reg.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, pred_ridge))
round(rmse, 3)

0.089

## Question 19
### Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it

In [61]:
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)
lasso_pred = lasso_reg.predict(x_test)

lasso_weights_df = get_weights_df(lasso_reg, x_train, 'Lasso_weight')
lasso_weights_df

Unnamed: 0,Features,Lasso_weight
0,RH_out,-0.051573
1,T1,0.0
2,Tdewpoint,0.0
3,Visibility,0.0
4,Press_mm_hg,-0.0
5,T_out,0.0
6,RH_9,-0.0
7,T9,-0.0
8,RH_8,-0.0
9,T8,0.0


## Question 20
### What is the new RMSE with the lasso regression? (Answer should be in three (3) decimal places)

In [62]:
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
round(rmse, 3)

0.095