In [1]:
import pandas as pd
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='../data/data.csv')

In [None]:
# Preview every column except the last one (later cells copy the last column into 'CLASS').
feature_preview = df.iloc[:, :-1]
print(feature_preview)

In [None]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
# Project the unstandardized feature columns (all but the last column) onto 3 components.
pca = PCA(n_components=3)
pca_fit = pca.fit_transform(df.iloc[:, :-1])
df_pca = pd.DataFrame(data=pca_fit, columns=['PC1', 'PC2', 'PC3'])
df_pca.head()  # NOTE: value is discarded; only a cell's last expression is rendered

# One scatter panel per pair of components, filling the 2x2 grid row by row.
component_pairs = [('PC1', 'PC2'), ('PC1', 'PC3'), ('PC2', 'PC3')]
fig, ax = plt.subplots(2, 2, figsize=(15, 8))
for panel, (x_pc, y_pc) in zip(ax.flat, component_pairs):
    sns.scatterplot(data=df_pca, x=x_pc, y=y_pc, ax=panel)

# Share of variance captured by each component, then the running total.
print(pca.explained_variance_ratio_)
print(np.cumsum(pca.explained_variance_ratio_))

# Only three panels are used, so drop the empty bottom-right axes.
fig.delaxes(ax[1][1])

In [None]:
# Explained variance per component (bars) with the cumulative total overlaid (red line)
variance_ratios = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratios) + 1)  # 1-based component labels

plt.bar(component_numbers, variance_ratios)
plt.ylabel('Explained variance')
plt.xlabel('Components')
plt.plot(
    component_numbers,
    variance_ratios.cumsum(),
    c='red',
    label="Cumulative Explained Variance",
)
plt.legend(loc='upper left')

In [None]:
# Trial biplot, used to show why the data needs to be standardized first
def my_plot(score, coeff, labels=None):
    """Draw a biplot of the first two components on the current matplotlib axes.

    Parameters
    ----------
    score : 2-D array
        Projected observations; column 0 is plotted on x and column 1 on y,
        each rescaled by the reciprocal of its own range.
    coeff : 2-D array
        One row per original variable; columns 0 and 1 give the arrow tip
        for that variable.
    labels : sequence, optional
        Text placed next to each arrow. When omitted, arrows are tagged
        "Var1", "Var2", ...
    """
    pc1_scores, pc2_scores = score[:, 0], score[:, 1]

    # Rescale the scores by their range so they share the plot with the loading arrows.
    x_scale = 1.0 / (pc1_scores.max() - pc1_scores.min())
    y_scale = 1.0 / (pc2_scores.max() - pc2_scores.min())
    plt.scatter(pc1_scores * x_scale, pc2_scores * y_scale, s=5)

    for var_idx in range(coeff.shape[0]):
        tip_x, tip_y = coeff[var_idx, 0], coeff[var_idx, 1]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)

        if labels is not None:
            caption, caption_color = labels[var_idx], 'r'
        else:
            caption, caption_color = "Var" + str(var_idx + 1), 'red'
        # Place the caption slightly beyond the arrow tip.
        plt.text(tip_x * 1.15, tip_y * 1.15, caption,
                 color=caption_color, ha='center', va='center')

    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()

# Biplot of the first two components from the PCA fitted in the cells above.
first_two_scores = pca_fit[:, 0:2]
first_two_loadings = pca.components_[0:2, :].T  # rows = variables, columns = components
my_plot(first_two_scores, first_two_loadings, list(df.columns))
plt.show()

In [None]:
# Standardizing the feature matrix: zero mean and unit population (ddof=0) standard deviation
feature_names = df.columns[:-1]  # every column except the last one
df_norm = pd.DataFrame(columns=feature_names)
for name in feature_names:
    raw_values = df[name]
    centered = raw_values - raw_values.mean()
    df_norm[name] = centered / raw_values.std(ddof=0)

# Carry the last column over unchanged under the name 'CLASS'.
df_norm['CLASS'] = df.iloc[:, -1]
print(df_norm)

In [None]:
# Repeating PCA computation

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
# Refit the 3-component PCA, this time on the standardized feature columns of df_norm.
pca = PCA(n_components=3)
pca_fit = pca.fit_transform(df_norm.iloc[:, :-1])
df_pca = pd.DataFrame(data=pca_fit, columns=['PC1', 'PC2', 'PC3'])
df_pca.head()  # NOTE: value is discarded; only a cell's last expression is rendered

# One scatter panel per pair of components, filling the 2x2 grid row by row.
component_pairs = [('PC1', 'PC2'), ('PC1', 'PC3'), ('PC2', 'PC3')]
fig, ax = plt.subplots(2, 2, figsize=(15, 8))
for panel, (x_pc, y_pc) in zip(ax.flat, component_pairs):
    sns.scatterplot(data=df_pca, x=x_pc, y=y_pc, ax=panel)

# Share of variance captured by each component, then the running total.
print(pca.explained_variance_ratio_)
print(np.cumsum(pca.explained_variance_ratio_))

# Only three panels are used, so drop the empty bottom-right axes.
fig.delaxes(ax[1][1])

In [None]:
# Explained variance per component (bars) plus its cumulative sum (red line)
variance_ratios = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratios) + 1)  # 1-based component labels

plt.bar(component_numbers, variance_ratios)
plt.ylabel('Explained variance')
plt.xlabel('Components')
plt.plot(
    component_numbers,
    variance_ratios.cumsum(),
    c='red',
    label="Cumulative Explained Variance",
)
plt.legend(loc='upper left')

In [None]:
# Biplot of two selectable principal components (scores as points, loadings as arrows)
def my_plot(score, coeff, index1, index2, labels=None):
    """Draw a biplot for the components at positions ``index1`` and ``index2``.

    Parameters
    ----------
    score : 2-D array
        Projected observations; column ``index1`` is plotted on x and column
        ``index2`` on y, each rescaled by the reciprocal of its own range.
    coeff : 2-D array
        One row per original variable and one column per component; the
        entries in columns ``index1`` and ``index2`` give each arrow's tip.
    index1, index2 : int
        Zero-based column positions of the two components to plot. The axis
        labels show them one-based ("PC1", "PC2", ...).
    labels : sequence, optional
        Text placed next to each arrow. When omitted, arrows are tagged
        "Var1", "Var2", ...
    """
    xs = score[:, index1]
    ys = score[:, index2]

    # Rescale the scores by their range so they share the plot with the loading arrows.
    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())
    plt.scatter(xs * scalex, ys * scaley, s=5)

    for i in range(coeff.shape[0]):
        # Use the loadings of the *selected* components so the arrows match
        # the plotted axes (previously columns 0 and 1 were always used).
        tip_x = coeff[i, index1]
        tip_y = coeff[i, index2]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)

        caption = labels[i] if labels is not None else "Var" + str(i + 1)
        # Place the caption slightly beyond the arrow tip.
        plt.text(tip_x * 1.15, tip_y * 1.15, caption, color='r', ha='center', va='center')

    plt.xlabel("PC{}".format(index1 + 1))
    plt.ylabel("PC{}".format(index2 + 1))
    plt.grid()

# Biplot of PC1 vs PC2 (zero-based positions 0 and 1), labelled with the feature column names.
component_scores = pca_fit[:, 0:3]
component_loadings = pca.components_[0:3, :].T  # rows = variables, columns = components
feature_labels = list(df_norm.iloc[:, :-1].columns)
my_plot(component_scores, component_loadings, 0, 1, feature_labels)
plt.show()

In [None]:
# Loadings matrix: one row per original variable, one column per principal component
loadings = pca.components_  # shape (n_components, n_features)

# Name one column per fitted component. The removed/deprecated `n_features_`
# attribute counted input features, not components, and only produced the
# right column names because zip() truncated the surplus names.
num_pc = pca.n_components_
pc_list = ["PC" + str(i) for i in range(1, num_pc + 1)]
loadings_df = pd.DataFrame(loadings.T, columns=pc_list)

# Label the rows with exactly the feature columns the PCA was fitted on
# (every column of df_norm except the trailing 'CLASS'), so that each
# loading lines up with its own variable.
loadings_df['variable'] = df_norm.iloc[:, :-1].columns.values
loadings_df = loadings_df.set_index('variable')
loadings_df

In [None]:
# Varimax rotation of a 3-factor principal solution (another rotation can be chosen via `rotation`)
from factor_analyzer import FactorAnalyzer
fa = FactorAnalyzer(n_factors=3, method='principal', rotation="varimax")

# Fit on the same feature columns the PCA used (all of df_norm except the
# trailing 'CLASS' column) so that the rotated solution is comparable with
# the PCA results and the class label is never treated as a feature.
fa_fit = fa.fit_transform(df_norm.iloc[:, :-1])

# NOTE: rebinding `loadings` replaces the PCA loadings array from the earlier cell.
loadings = fa.loadings_

# get_factor_variance() returns (variance, proportional variance, cumulative variance);
# show the cumulative proportions.
fa.get_factor_variance()[2]

In [None]:
# Varimax loadings matrix, shown as a table with default integer row/column labels
# (presumably rows = variables and columns = rotated factors -- confirm against fa.loadings_)
pd.DataFrame.from_records(data=loadings)

In [None]:
# Scores on the three varimax-rotated components, plotted pairwise
df_fa = pd.DataFrame(data=fa_fit, columns=['PCV1', 'PCV2', 'PCV3'])
df_fa.head()  # NOTE: value is discarded; only a cell's last expression is rendered

# One scatter panel per pair of rotated components, filling the 2x2 grid row by row.
rotated_pairs = [('PCV1', 'PCV2'), ('PCV1', 'PCV3'), ('PCV2', 'PCV3')]
fig, ax = plt.subplots(2, 2, figsize=(15, 8))
for panel, (x_pcv, y_pcv) in zip(ax.flat, rotated_pairs):
    sns.scatterplot(data=df_fa, x=x_pcv, y=y_pcv, ax=panel)

# Only three panels are used, so drop the empty bottom-right axes.
fig.delaxes(ax[1][1])

In [None]:
# Persist the standardized features together with the 'CLASS' column, without the row index.
df_norm.to_csv(path_or_buf='../data/data_normalized.csv', index=False)