In [110]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [111]:
data = pd.read_csv("drive/MyDrive/data/Hydropower_Consumption.csv")
print(data.shape)
data = data.values[:,1:]
print(data)

(153, 21)
[[312 498 555 ... 105 105 107]
 [75246 80864 85181 ... 130388 132735 0]
 [4548 3519 3477 ... 448 448 4018]
 ...
 [14551 1821 18198 ... 88762 84485 65563]
 [7673 7814 8021 ... 12076 12076 11799]
 [3227 2968 3786 ... 3929 3929 3592]]


In [112]:
X = data[:, :-1]  # feature data
y = data[:, -1:]  # label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

y_train = y_train[:, 0]
y_test = y_test[:, 0]

In [113]:
# model selection
regressors = [
              LinearRegression(),
              Ridge(),
              RidgeCV(),
             ]

# Normalized data
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Model training and performance evaluation
for regressor in regressors:
  model = regressor
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print("%s" % (model))
  print("Root Mean Squared Error:", mean_squared_error(y_test, y_pred)**0.5)
  print("")

LinearRegression()
Root Mean Squared Error: 12021.416220511568

Ridge()
Root Mean Squared Error: 12021.416178347119

RidgeCV()
Root Mean Squared Error: 11948.951359360191



In [114]:
# Using Principal Component Regression (PCR) = Principal Component Analysis (PCA) + Linear Regression
# Plus: Trying other model after PCA transform
import numpy as np
from numpy.linalg import svd
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline # Sequentially apply a list of transforms and a final estimator

# Standardized and find tuning parameter k (number of Pricipal Component)
scaler = StandardScaler()
Xs = scaler.fit_transform(X).astype("float64")
cov_Xs = 1/(Xs.shape[0]-1) * Xs.T @ Xs
u, s, uT = svd(cov_Xs)
k = 10
print("inflection point:", np.sum(s[:k])/np.sum(s))
print("")

# Transform the data
pca_ = PCA(n_components=k)
X_transformed = pca_.fit_transform(Xs)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.25, random_state=42, shuffle=True)
y_train = y_train[:, 0]
y_test = y_test[:, 0]


# Model training and performance evaluation
for regressor in regressors:
  model = regressor
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print("%s" % (model))
  print("Root Mean Squared Error:", mean_squared_error(y_test, y_pred)**0.5)
  print("")

inflection point: 0.9988244146750525

LinearRegression()
Root Mean Squared Error: 7482.575589445003

Ridge()
Root Mean Squared Error: 61485.34128757519

RidgeCV()
Root Mean Squared Error: 76637.9137590876

