In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt

## Load Dataset

In [3]:
dataset = pd.read_excel('/content/drive/MyDrive/dataset/energy_efficiency.xlsx')

In [4]:
dataset.head(5)

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


## Exploring Data

In [5]:
dataset.describe()

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.764167,671.708333,318.5,176.604167,5.25,3.5,0.234375,2.8125,22.307195,24.58776
std,0.105777,88.086116,43.626481,45.16595,1.75114,1.118763,0.133221,1.55096,10.090204,9.513306
min,0.62,514.5,245.0,110.25,3.5,2.0,0.0,0.0,6.01,10.9
25%,0.6825,606.375,294.0,140.875,3.5,2.75,0.1,1.75,12.9925,15.62
50%,0.75,673.75,318.5,183.75,5.25,3.5,0.25,3.0,18.95,22.08
75%,0.83,741.125,343.0,220.5,7.0,4.25,0.4,4.0,31.6675,33.1325
max,0.98,808.5,416.5,220.5,7.0,5.0,0.4,5.0,43.1,48.03


In [6]:
dataset.rename(columns={'Heating Load':'Heating_Load', 'Cooling Load':'Cooling_Load'}, inplace=True)

In [7]:
X = dataset[['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area', 'Overall Height', 'Glazing Area', 'Glazing Area Distribution']]
Y = dataset[['Heating_Load', 'Cooling_Load']]
Y_heating = dataset[['Heating_Load']]
Y_cooling = dataset[['Cooling_Load']]

In [8]:
# Checking the null values
print(X.isnull().sum())

Relative Compactness         0
Surface Area                 0
Wall Area                    0
Roof Area                    0
Overall Height               0
Glazing Area                 0
Glazing Area Distribution    0
dtype: int64


In [9]:
Y.head()

Unnamed: 0,Heating_Load,Cooling_Load
0,15.55,21.33
1,15.55,21.33
2,15.55,21.33
3,15.55,21.33
4,20.84,28.28


In [None]:
# Because we don't have any null value, We just skip the cleaning process
# We just jump to the preprocess for models later
temp = dataset[['Heating_Load', 'Cooling_Load']]
temp['Overall_Load'] = temp['Heating_Load'] + temp['Cooling_Load']

temp['class'] = 2
temp.loc[temp['Overall_Load'] < 42, ['class']] = 1
temp.loc[temp['Overall_Load'] > 70, ['class']] = 3

y_category = temp['class']

In [11]:
# Divide the dataset with category
from sklearn.model_selection import train_test_split

x_train_category, x_test_category, y_train_category, y_test_category = train_test_split(X, y_category, random_state=5)

Using scaler by calling MinMax Scaler

In [12]:
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler(feature_range=(0, 1))
x_train_category = minmax.fit_transform(x_train_category)
x_test_category = minmax.transform(x_test_category)

Split the whole dataset for model training and testing

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=5)

# Scaling using MinMax Scaler
minmax2 = MinMaxScaler(feature_range=(0, 1))
x_train = minmax2.fit_transform(x_train)
x_test = minmax2.transform(x_test)

In [14]:
print(x_test.shape)
print(y_test.shape)

(192, 7)
(192, 2)


## Regression Model

Linear Regression

In [15]:
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score

linear_reg = LinearRegression()
# Try to understand multi output regression -> from the github
multiOutput_reg = MultiOutputRegressor(linear_reg, n_jobs=-1)
multiOutput_reg.fit(x_train, y_train)

#
train_r2Score = r2_score(y_train, multiOutput_reg.predict(x_train))
test_r2Score = r2_score(y_test, multiOutput_reg.predict(x_test))
#output = pd.DataFrame(index=None, columns=['model','train_r2_score','test_r2_score'])
#output = pd.concat([output, pd.DataFrame(['Linear Regressor', train_r2Score, test_r2Score]).T], ignore_index=False)
output = pd.DataFrame(['Linear Regressor', train_r2Score, test_r2Score], index=['Model', 'Train R2 Score', 'Test R2 Score']).T

In [16]:
output

Unnamed: 0,Model,Train R2 Score,Test R2 Score
0,Linear Regressor,0.902229,0.899673


KNN Regressor

In [17]:
#KNN Regressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for KNN
param_grid = {
    'n_neighbors': range(3, 70),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

knn_reg = KNeighborsRegressor()
grid_search = GridSearchCV(knn_reg, param_grid, cv = 5, return_train_score = True)
multiOutput_knn = MultiOutputRegressor(grid_search)
multiOutput_knn.fit(x_train, y_train)

#
train_r2Score = r2_score(y_train, multiOutput_knn.predict(x_train))
test_r2Score = r2_score(y_test, multiOutput_knn.predict(x_test))

In [18]:
output_knn = pd.DataFrame(['KNN Regressor', train_r2Score, test_r2Score], index=['Model', 'Train R2 Score', 'Test R2 Score']).T
output_knn

Unnamed: 0,Model,Train R2 Score,Test R2 Score
0,KNN Regressor,0.986387,0.977898


Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Declare param grid
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_features': ['sqrt', 'log2', 'auto'],
    'max_depth': [10, 20, 30, 40, 50],
}

# Modelling
rf_reg = RandomForestRegressor(random_state=5, n_jobs=-1)
grid_search_rf = GridSearchCV(rf_reg, param_grid, cv = 5, return_train_score=True)
multiOutput_rf = MultiOutputRegressor(grid_search_rf)
multiOutput_rf.fit(x_train, y_train)

#
train_r2Score = r2_score(y_train, multiOutput_rf.predict(x_train))
test_r2Score = r2_score(y_test, multiOutput_rf.predict(x_test))

In [20]:
output_rf = pd.DataFrame(['Random Forest Regressor', train_r2Score, test_r2Score], index=['Model', 'Train R2 Score', 'Test R2 Score']).T
output_rf

Unnamed: 0,Model,Train R2 Score,Test R2 Score
0,Random Forest Regressor,0.987927,0.973393


## Classification

### Logistic

In [21]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression

# Initialize param grid
param_grid = {
    'C': [0.01, 0.1, 0.5, 1, 2, 5, 10]
}

log_reg = LogisticRegression()

grid_search_log = GridSearchCV(log_reg, param_grid, cv = 5)
grid_search_log.fit(x_train_category, y_train_category)

print('Best parameters for logistic classification: {}'.format(grid_search_log.best_params_))
print('The Train Accuracy score for Logistic Reression is',accuracy_score(y_train_category, grid_search_log.predict(x_train_category)))
print('The Test Accuracy score for Logistic Reression is',accuracy_score(y_test_category, grid_search_log.predict(x_test_category)))

Best parameters for logistic classification: {'C': 0.5}
The Train Accuracy score for Logistic Reression is 0.8819444444444444
The Test Accuracy score for Logistic Reression is 0.8802083333333334


### SVM

In [22]:
from sklearn.svm import SVC

# Initialize param grid
param_grid = {
    'C': [0.001, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
}

svm = SVC(random_state=5)
grid_search_svm = GridSearchCV(svm, param_grid, cv = 5)
grid_search_svm.fit(x_train_category, y_train_category)

print('The best parameter for SVM in classification: {}'.format(grid_search_svm.best_params_))
print('The Train Accuracy score for SVM is',accuracy_score(y_train_category, grid_search_svm.predict(x_train_category)))
print('The Test Accuracy score for SVM is',accuracy_score(y_test_category, grid_search_svm.predict(x_test_category)))

The best parameter for SVM in classification: {'C': 10, 'kernel': 'poly'}
The Train Accuracy score for SVM is 0.9809027777777778
The Test Accuracy score for SVM is 0.9895833333333334


### Random Forest Classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Initialize param grid
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50],
}

rfc = RandomForestClassifier(random_state=5)

grid_search_rfc = GridSearchCV(rfc, param_grid, cv=5)
grid_search_rfc.fit(x_train_category, y_train_category)

print('The best parameter for SVM in classification: {}'.format(grid_search_rfc.best_params_))
print('The Train Accuracy score for SVM is',accuracy_score(y_train_category, grid_search_rfc.predict(x_train_category)))
print('The Test Accuracy score for SVM is',accuracy_score(y_test_category, grid_search_rfc.predict(x_test_category)))

The best parameter for SVM in classification: {'max_depth': 10, 'n_estimators': 300}
The Train Accuracy score for SVM is 0.9947916666666666
The Test Accuracy score for SVM is 0.9895833333333334


### GradientBoost Classifier

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize param grid
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 1],
    'max_depth': [3, 5, 10],
    'max_features': ['sqrt', 'log2', None],
}

gradient_boost = GradientBoostingClassifier(random_state=5)

grid_search_gb = GridSearchCV(gradient_boost, param_grid, cv=5)
grid_search_gb.fit(x_train_category, y_train_category)

print('The best parameter for Gradient Boosting in classification: {}'.format(grid_search_gb.best_params_))
print('The Train Accuracy score for Gradient Boosting is',accuracy_score(y_train_category, grid_search_gb.predict(x_train_category)))
print('The Test Accuracy score for Gradient Boosting is',accuracy_score(y_test_category, grid_search_gb.predict(x_test_category)))

The best parameter for Gradient Boosting in classification: {'max_depth': 10, 'n_estimators': 300}
The Train Accuracy score for Gradient Boosting is 0.9947916666666666
The Test Accuracy score for Gradient Boosting is 0.9895833333333334
