In [None]:
def elim_high_corr(X_df, corr_thresh):
  """Drop one column out of every pair of highly correlated columns.

  Every unordered pair of distinct columns whose absolute pairwise
  correlation (as given by ``X_df.corr()``) is strictly greater than
  ``corr_thresh`` is examined once. The column of the pair with the smaller
  standard deviation is marked for removal; when both standard deviations
  are equal, the column that appears first in ``X_df`` is marked and the
  other one is kept. A column is marked as soon as it loses one such
  comparison, whether or not the column it lost to is itself marked.

  The number and names of the dropped columns, and the number of columns
  left, are printed.

  Args:
    X_df: pandas DataFrame of features; column labels are looked up by name
      in the correlation matrix.
    corr_thresh: threshold on the absolute correlation above which two
      columns are treated as redundant.

  Returns:
    A two-element list ``[Xreduced, highcorr]`` where ``Xreduced`` is
    ``X_df`` without the dropped columns and ``highcorr`` is the list of
    dropped column labels in the order they were marked.
  """
  X_corr = X_df.corr()
  cols = list(X_df.columns)
  # Compute each column's standard deviation once instead of once per pair.
  col_std = {c: X_df[c].std() for c in cols}

  highcorr = []
  for pos, a in enumerate(cols):
    # Only look at columns after `a`: visiting (a, b) and (b, a) separately
    # would drop BOTH columns whenever their standard deviations tie.
    for b in cols[pos + 1:]:
      if abs(X_corr.loc[a, b]) > corr_thresh:
        weaker = a if col_std[a] <= col_std[b] else b
        if weaker not in highcorr:
          highcorr.append(weaker)

  print("columns to drop: ", len(highcorr),"\n", highcorr)
  print("reduced no. of columns: ", X_df.shape[1]-len(highcorr))
  Xreduced = X_df.drop(highcorr,axis=1)
  return [Xreduced, highcorr]

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

In [None]:
# Load the spreadsheet; its first column becomes the row index.
DF00 = pd.read_excel('DF_1170x50.xlsx', index_col=0)
print(DF00.shape)

# Keep rows whose 'Molecular Weight' lies strictly between 100 and 1000.
DF01 = DF00.loc[DF00['Molecular Weight'].gt(100) & DF00['Molecular Weight'].lt(1000)]
print(DF01.shape)

# Keep rows whose 'Hawkins_GBSA_Score' is strictly negative.
DF02 = DF01.loc[DF01['Hawkins_GBSA_Score'].lt(0)]
print(DF02.shape)

In [None]:
# Quantity used later to colour the PCA score plots.
y = DF02['Hawkins_GBSA_sa_energy']

# Feature matrix: the filtered table minus its last nine columns
# (presumably the score/energy columns, including y -- confirm against the sheet).
todrop = DF02.columns[-9:]
X0 = DF02.drop(columns=todrop)

# Low-variance filter (threshold 0.01); fit_transform returns a bare array,
# so the row index and the surviving column labels are re-attached.
vt = VarianceThreshold(threshold=0.01)
X1 = pd.DataFrame(
    vt.fit_transform(X0),
    index=X0.index,
    columns=X0.columns[vt.get_support()],
)
print(X1.shape)

# Correlation filter at |r| > 0.7; only the reduced frame (element 0) is kept.
X2 = elim_high_corr(X_df=X1, corr_thresh=0.7)[0]

In [None]:
# Fit a five-component PCA on the reduced features and keep the scores in a
# frame that shares X2's row index, with columns named PC1..PC5.
hbpca1 = PCA(n_components=5)
dfpca = pd.DataFrame(
    hbpca1.fit_transform(X2),
    index=X2.index,
    columns=['PC%d' % k for k in range(1, 6)],
)

In [None]:
# Explained-variance ratio of each fitted component, followed by their total.
exvr = hbpca1.explained_variance_ratio_
print(exvr, exvr.sum())

# One bar per principal component.
plt.figure(figsize=(8, 6))
plt.bar(x=dfpca.columns, height=exvr)

In [None]:
# Variance ratio covered by the first three components vs. all fitted components.
exvr[:3].sum(), exvr.sum()

In [None]:
# Score plot of PC2 against PC1, coloured by y; written to PC2_vs_PC1.png.
plt.figure(figsize=(12, 8))
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.scatter(
    dfpca['PC1'],
    dfpca['PC2'],
    c=y,
    cmap='hot',
    alpha=0.5,
    edgecolors='none',
    vmin=min(y),
    vmax=max(y),
)
plt.tick_params(labelsize=16)
plt.colorbar(label='Gsa')
plt.grid()

plt.xlabel('PC1', fontsize=20)
plt.ylabel('PC2', fontsize=20)
plt.savefig("PC2_vs_PC1.png")
plt.show()

In [None]:
# Score plot of PC3 against PC1, coloured by y; written to PC3_vs_PC1.png.
plt.figure(figsize=(12,8))
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.scatter(dfpca['PC1'], dfpca['PC3'], c=(y), alpha=0.5, cmap='cool', edgecolors='none', vmax=max(y), vmin=min(y))
# Same tick sizing and colorbar label as the PC1/PC2 figure, so the saved
# image states what the colour scale encodes.
plt.tick_params(labelsize=16)
plt.colorbar(label='Gsa')
plt.grid()
plt.xlabel('PC1',size=20)
plt.ylabel('PC3',size=20)
plt.savefig("PC3_vs_PC1.png")
plt.show()

In [None]:
# Score plot of PC3 against PC2, coloured by y; written to PC3_vs_PC2.png.
plt.figure(figsize=(12,8))
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.scatter(dfpca['PC2'], dfpca['PC3'], c=(y), alpha=0.5, cmap='bwr', edgecolors='none', vmax=max(y), vmin=min(y))
# Same tick sizing and colorbar label as the PC1/PC2 figure, so the saved
# image states what the colour scale encodes.
plt.tick_params(labelsize=16)
plt.colorbar(label='Gsa')
plt.grid()
plt.xlabel('PC2',size=20)
plt.ylabel('PC3',size=20)
plt.savefig("PC3_vs_PC2.png")
plt.show()

In [None]:
# Tabulate the fitted components: rows labelled PC1..PC5, columns labelled
# with the feature names of X2, then export the table to Excel.
pca_loadings_df = pd.DataFrame(
    data=hbpca1.components_,
    index=dfpca.columns,
    columns=X2.columns,
)
pca_loadings_df.to_excel('pca_loadings_df.xlsx')

In [None]:
# Export the explained-variance ratios to Excel (default integer row/column labels).
pd.DataFrame(data=exvr).to_excel('Explained_variance_ratio_PC1-5.xlsx')