In [1]:
## imports ##
import numpy as np
import pandas as pd

#Custom decision tree regressor
from statmodels.decisiontrees import DecisionTreeRegressor
#Custom random forest regressor
from statmodels.random_forest import RandomForestRegressor
#Custom lasso regressor
from statmodels.regression import LassoRegression
#Custom gradient boosting regressor
from statmodels.gradientboosting import GradientBoostTreeRegressor

#Find performance
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
data = pd.read_csv("energydata_complete.csv")
data = data.set_index("date")
from sklearn.model_selection import train_test_split
X = data.iloc[:, 1:].values
Y = data.iloc[:, 0].values
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.2, random_state=41)
print(f"Shape of X: {X.shape}, shape of y: {Y.shape}")
print(f"Shape of X_Train: {X_train.shape}, shape of y_Train: {Y_train.shape}")
print(f"Shape of X_Train: {X_test.shape}, shape of y_Train: {Y_test.shape}")

Shape of X: (19735, 27), shape of y: (19735,)
Shape of X_Train: (15788, 27), shape of y_Train: (15788,)
Shape of X_Train: (3947, 27), shape of y_Train: (3947,)


In [3]:
def rmse(Y, Y_Hat):
    return np.sqrt(mean_squared_error(Y, Y_Hat))

# a.1 Lasso regression model

In [6]:
X_train = pd.DataFrame(X_train, columns=data.columns[1:])
model_LassoRegression = LassoRegression(n_iter=1000, lr=1e-3, alpha=0.1)
model_LassoRegression.fit(X_train.values, Y_train)


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  errors = np.average((y_true - y_pred) ** 2, axis=0)


<statmodels.regression.regression.LassoRegression at 0x1f854a41340>

In [8]:

y_hat_LassoReg = model_LassoRegression.predict(X_train.values)
print('Lasso Regression - Root Mean Squared error:',
      round(rmse(Y_train, y_hat_LassoReg), 4))
print('Lasso Regression - R-Squared:',
      round(r2_score(Y_train, y_hat_LassoReg), 4))


ValueError: Input contains NaN.

The Root Mean Squared error of Lasso regression model is 97.543. As the lower the RMSE, the better the model, this model does not fit the train dataset well. This model has the R-Squared value of 0.0972, which means 9.72% of the variability observed in the target variable is explained by the regression model.

# a.2 Decision Tree model

In [None]:
model_tree = DecisionTreeRegressor(max_depth=3, min_samples_split=2)
model_tree.fit(X_train_Transformed, Y_train.reshape(-1, 1))


In [None]:
y_hat_tree = model_tree.predict(X_train_Transformed.values)
print('Decision tree Regression - Root Mean Squared error:', round(rmse(Y_train,y_hat_tree), 4))
print('Decision tree Regression - R-Squared:', round(r2_score(Y_train,y_hat_tree), 4))

Decision tree Regression - Root Mean Squared error: 101.1047
Decision tree Regression - R-Squared: 0.0301


The Root Mean Squared error of Decision tree model is 100.5505. This model does not fit the train dataset well. This model has the R-Squared value of 0.0407, which means 4.07% of the variability observed in the target variable is explained by the regression model.

# a.3 Random forest model

In [None]:
model_RF = RandomForestRegressor(n_trees = 5, max_depth = 3, min_samples_split = 5)
model_RF.fit(X_train_Transformed, Y_train)

In [None]:
y_hat_RF = model_RF.predict(X_train_Transformed.values)
print('Random Forest Regression - Root Mean Squared error:', round(rmse(Y_train,y_hat_RF), 4))
print('Random Forest Regression - R-Squared:', round(r2_score(Y_train,y_hat_RF), 4))

Random Forest Regression - Root Mean Squared error: 99.7805
Random Forest Regression - R-Squared: 0.0553


The Root Mean Squared error of Random Forest model is 99.6317. This model does not fit the train dataset well. This model has the R-Squared value of 0.0582, which means 5.82% of the variability observed in the target variable is explained by the regression model.

# a.4 GradientBoost model

In [None]:
model_GradiantBoost = GradientBoostTreeRegressor(n_elements=5, learning_rate=0.01)
    
#fit the model
model_GradiantBoost.fit(np.array(X_train_Transformed), Y_train)


In [None]:
y_hat_GB = model_GradiantBoost.predict(X_train)
print('Gradient Boost Regression - Root Mean Squared error:', round(rmse(Y_train,y_hat_GB),4))
print('Gradient Boost Regression - R-Squared:', round(r2_score(Y_train, y_hat_GB),4))

Gradient Boost Regression - Root Mean Squared error: 137.0879
Gradient Boost Regression - R-Squared: -0.7831


The Root Mean Squared error of GradientBoost model is 137.0879. This model does not fit the train dataset well. This model has the R-Squared value of -0.7831, which this model fits the dataset poorly.

## Comparison
The Lasso regression model has the lowest RMSE, which means it predicts the training data best. And it also has the highest R-square value which means it explains the most variability of the training dataset.  

# b.1 Lasso regression model

In [None]:
y_hat_test_LassoReg = model_LassoRegression.predict(X_test)
print('Lasso Regression - Root Mean Squared error:',
      round(rmse(Y_test, y_hat_test_LassoReg), 4))
print('Lasso Regression - R-Squared:',
      round(r2_score(Y_test, y_hat_test_LassoReg), 4))


Lasso Regression - Root Mean Squared error: 97.1061
Lasso Regression - R-Squared: 0.093


Lasso regression has RMSE of 195.5506 and R-Squared value of -2.6782. This means it does not predict the test dataset well.

# b.2 Decision Tree model

In [None]:
y_hat_test_tree = model_tree.predict(X_test)
print('Decision tree Regression - Root Mean Squared error:', round(rmse(Y_test, y_hat_test_tree), 4))
print('Decision tree Regression - R-Squared:', round(r2_score(Y_test, y_hat_test_tree), 4))

Decision tree Regression - Root Mean Squared error: 101.436
Decision tree Regression - R-Squared: 0.0103


Decision tree has RMSE of 151.6432 and R-Squared value of -1.2119. This means it does not predict the test dataset well.

# b.3 Random forest model

In [None]:
y_hat_test_RF = model_RF.predict(X_test)
print('Random Forest Regression - Root Mean Squared error:', round(rmse(Y_test, y_hat_test_RF.reshape(-1)), 4))
print('Random Forest Regression - R-Squared:', round(r2_score(Y_test, y_hat_test_RF.reshape(-1)), 4))

Random Forest Regression - Root Mean Squared error: 99.4681
Random Forest Regression - R-Squared: 0.0483


Decision tree has RMSE of 100.8583 and R-Squared value of 0.0215. This model predict well on the test dataset. And 2.15% of the variability observed in the target variable is explained by the regression model.

# b.4 GradientBoost model

In [None]:
y_hat_test_GB = model_GradiantBoost.predict(X_test)
print('Gradient Boost Regression - Root Mean Squared error:', round(rmse(Y_test, y_hat_test_GB),4))
print('Gradient Boost Regression - R-Squared:', round(r2_score(Y_test,  y_hat_test_GB),4))

Gradient Boost Regression - Root Mean Squared error: 137.1734
Gradient Boost Regression - R-Squared: -0.8099


GradientBoost model has RMSE of 16594.1495 and R-Squared value of -0.5962. This means it does not predict the test dataset well.

In [None]:
def get_score_after_permutation(model, X, y, col_idx):
    X_permuted = X.copy()
    X_permuted[:, col_idx] = np.random.permutation(
        X_permuted[:, col_idx])
    permuted_score = r2_score(model.predict(X_permuted), y.reshape(-1))
    return permuted_score


def get_feature_importance(model, X, y, col_idx):
    baseline_score_train = r2_score(model.predict(X), y.reshape(-1))
    permuted_score_train = get_score_after_permutation(model, X, y, col_idx)
    feature_importance = baseline_score_train - permuted_score_train
    return feature_importance


def calculate_All_Feature_Importance(model, X, y):
    list_Feature_Importance = []
    for col_Index in range(X.shape[1]):
        list_Feature_Importance.append(
            get_feature_importance(model, X, y, col_Index))
    return list_Feature_Importance

In [None]:
importances = calculate_All_Feature_Importance(model_LassoRegression, X_train_Transformed, Y_train)
for label, score in zip(data.columns, importances):
    print(f"{label} have the importance of {score}")

ValueError: Shape of passed values is (15788, 28), indices imply (15788, 27)

In [None]:
importances = calculate_All_Feature_Importance(model_tree, X_train_Transformed, Y_train)
for label, score in zip(data.columns, importances):
    print(f"{label} have the importance of {score}")

In [None]:
importances = calculate_All_Feature_Importance(model_RF, X_train_Transformed, Y_train)
for label, score in zip(data.columns, importances):
    print(f"{label} have the importance of {score}")

In [None]:
importances = calculate_All_Feature_Importance(model_GradiantBoost, X_train_Transformed, Y_train)
for label, score in zip(data.columns, importances):
    print(f"{label} have the importance of {score}")