In [80]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC

In [82]:
data = pd.read_csv("drive/MyDrive/data/Hydropower_Consumption.csv")
print(data.shape)
data = data.values[:,1:]
print(data)

(153, 21)
[[312 498 555 ... 105 105 107]
 [75246 80864 85181 ... 130388 132735 0]
 [4548 3519 3477 ... 448 448 4018]
 ...
 [14551 1821 18198 ... 88762 84485 65563]
 [7673 7814 8021 ... 12076 12076 11799]
 [3227 2968 3786 ... 3929 3929 3592]]


In [83]:
scaler = MinMaxScaler(feature_range=(0, 1))
data = scaler.fit_transform(data)

X = data[:, :-1]  # feature data
y = data[:, -1:]  # label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

y_train = y_train[:, 0]
y_test = y_test[:, 0]

In [84]:
# model selection
regressors = [
              LinearRegression(),
              Ridge(),
              RidgeCV(),
              KNeighborsRegressor(),
             ]


# Normalized data
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


# Model training and performance evaluation
for regressor in regressors:
  model = regressor
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print("%s" % (model))
  print("Root Mean Squared Error:", mean_squared_error(y_test, y_pred)**0.5)
  print("R2 Score:", r2_score(y_test, y_pred))
  print("")

LinearRegression()
Root Mean Squared Error: 0.009468112460766266
R2 Score: 0.23508442951595865

Ridge()
Root Mean Squared Error: 0.06004108799623161
R2 Score: -29.75982021399238

RidgeCV()
Root Mean Squared Error: 0.05909136061966126
R2 Score: -28.794401504221273

KNeighborsRegressor()
Root Mean Squared Error: 0.02542093455096791
R2 Score: -4.51403744351307



In [85]:
# Using Principal Component Regression (PCR) = Principal Component Analysis (PCA) + Linear Regression
# Plus: Trying other model after PCA transform
import numpy as np
from numpy.linalg import svd
from sklearn.decomposition import PCA

# Standardized and find tuning parameter k (number of Pricipal Component)
k = 10
scaler = MinMaxScaler(feature_range=(0, 1))
Xs = scaler.fit_transform(X).astype("float64")
cov_Xs = 1/(Xs.shape[0]-1) * Xs.T @ Xs
u, s, uT = svd(cov_Xs)
print("inflection point:", np.sum(s[:k])/np.sum(s))
print("")


# Transform the data
pca_ = PCA(n_components=10)
X_transformed = pca_.fit_transform(Xs)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.25, random_state=42, shuffle=True)
y_train = y_train[:, 0]
y_test = y_test[:, 0]


# Model training and performance evaluation (R2 score <= 1 (can be negative))
for regressor in regressors:
  model = regressor
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print("%s" % (model))
  print("Root Mean Squared Error:", mean_squared_error(y_test, y_pred)**0.5)
  print("R2 Score:", r2_score(y_test, y_pred))
  print("")

inflection point: 0.9988808976479174

LinearRegression()
Root Mean Squared Error: 0.00492038897811015
R2 Score: 0.7934212905869836

Ridge()
Root Mean Squared Error: 0.06003843695256825
R2 Score: -29.757103946552522

RidgeCV()
Root Mean Squared Error: 0.059047884805022884
R2 Score: -28.750575829910773

KNeighborsRegressor()
Root Mean Squared Error: 0.0256577188633421
R2 Score: -4.617237289963618

