In [2]:
# Switch the working directory to the parent folder.
# NOTE(review): presumably so the project-local imports in the next cell
# (run, project.utils, config) resolve from the repository root — confirm.
# The path is relative, so every re-run of this cell moves one level higher;
# run it only once per kernel session.
import os; os.chdir("..")

In [3]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

from run import read_data
import project.utils as utils
from config import Config

%load_ext autoreload
%autoreload 2

In [4]:
# --- data related section ---
# Read the raw data and pass it through utils.prepare_data together with the
# configured station and feature list.
data_raw = utils.prepare_data(
    read_data(Config.PATH),
    station=Config.STATION,
    features=Config.FEATURES_LIST,
)

# Work on a copy so that data_raw is not modified by the column assignment below.
train_data = data_raw.copy()

# utils.normalize_data receives a fresh StandardScaler plus the raw values of
# the FEATURES_2_SCALE columns; it returns a scaler and the values that are
# written back into those same columns.
train_scaler, train_data[Config.FEATURES_2_SCALE] = utils.normalize_data(
    StandardScaler(),
    train_data[Config.FEATURES_2_SCALE].values,
)

train_data = utils.make_features(train_data, features=Config.FEATURES_LIST)

# Split into model inputs and outputs, restricted by the configured date window.
x_features, y_features = utils.generate_train_data(
    train_data,
    features=Config.FEATURES_LIST,
    dt_from=Config.FILTER_DT_FROM,
    dt_till=Config.FILTER_DT_TILL,
)

In [None]:
# Pairwise correlation between the numeric columns of train_data;
# numeric_only=True leaves non-numeric columns out of the computation.
train_data.corr(numeric_only=True)

In [None]:
# Every column name of train_data except 'y', kept in the original column order.
cols = list(train_data.columns)
cols.remove('y')  # a ValueError here means train_data has no 'y' column
cols

In [None]:
# Fit a PCA with all components kept and project the feature matrix onto it.
pca = PCA()
x_new = pca.fit_transform(x_features)


def myplot(score, coeff, labels=None, colors=None):
    """Draw a PCA biplot on the current matplotlib axes.

    The samples are drawn as a scatter of the first two columns of ``score``,
    each divided by its own range. Every row of ``coeff`` is drawn as a red
    arrow from the origin, with a green text label placed beyond the tip.
    Axis limits, axis labels and the grid are set here so that every call
    yields a fully formatted figure.

    Parameters
    ----------
    score : 2-D array with at least two columns
        Projected samples; only columns 0 and 1 are used.
    coeff : 2-D array with at least two columns
        Row ``i`` holds the arrow end point (x, y) for variable ``i``.
    labels : sequence of str, optional
        Text for each arrow. Defaults to ``"Var1"``, ``"Var2"``, ...
    colors : array-like, optional
        Passed to ``plt.scatter(c=...)``. When omitted, the notebook-level
        ``y_features`` is used, which is what the function always did before
        this parameter existed.
    """
    if colors is None:
        # Backward-compatible fallback to the notebook global.
        colors = y_features

    xs = score[:, 0]
    ys = score[:, 1]

    # Rescale each axis by its range; a zero range (constant column) would
    # divide by zero, so fall back to a scale of 1 in that case.
    x_range = xs.max() - xs.min()
    y_range = ys.max() - ys.min()
    scalex = 1.0 / x_range if x_range else 1.0
    scaley = 1.0 / y_range if y_range else 1.0
    plt.scatter(xs * scalex, ys * scaley, c=colors)

    for i in range(coeff.shape[0]):
        tip_x, tip_y = coeff[i, 0], coeff[i, 1]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)
        label = "Var" + str(i + 1) if labels is None else labels[i]
        # 1.45 pushes the text past the arrow tip so the two do not overlap.
        plt.text(tip_x * 1.45, tip_y * 1.45, label, color='g', ha='center', va='center')

    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    # Explicit True: a bare plt.grid() toggles, which would switch the grid
    # off when it is already enabled (e.g. by a style sheet).
    plt.grid(True)


# Call the function. Use only the 2 PCs; points are coloured by y_features.
myplot(x_new[:, 0:2], np.transpose(pca.components_[0:2, :]), colors=y_features)
plt.show()

In [None]:
# Fit a second PCA that keeps only the first 6 components.
model = PCA(n_components=6).fit(x_features)
X_pc = model.transform(x_features)
print(model.components_.shape)

# number of components
n_pcs = model.components_.shape[0]

# The component indices below are translated to names through `cols`, so the
# list must have exactly one name per column that the PCA was fitted on.
# NOTE(review): this only checks the count — confirm that the column ORDER of
# x_features matches `cols` as well.
initial_feature_names = cols
assert len(initial_feature_names) == model.components_.shape[1], (
    "cols has {} names but the PCA was fitted on {} features".format(
        len(initial_feature_names), model.components_.shape[1]
    )
)

# index of the feature with the largest absolute loading on EACH component
most_important = np.abs(model.components_).argmax(axis=1).tolist()
print(most_important)

# get the names
most_important_names = [initial_feature_names[idx] for idx in most_important]

# Components are labelled from PC1, matching the axis labels of the biplot.
dic = {'PC{}'.format(i + 1): name for i, name in enumerate(most_important_names)}

# build the dataframe
df = pd.DataFrame(list(dic.items()), columns=['component', 'most_important_feature'])

In [None]:
# Fraction of the total variance attributed to each component of `pca`.
explained_variance = pca.explained_variance_ratio_
# Running total of those fractions, component by component.
cumulative_variance = explained_variance.cumsum()

# The cumulative curve is what guides the choice of how many components to keep.
print("Cumulative Variance Explained by Components: ", cumulative_variance)
print("Most important features:\n\n")
df