<a href="https://colab.research.google.com/github/tacocat0200/Relevant-projects/blob/main/PCA_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random as rd
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Data Generation

* The data is stored in a pandas DataFrame called `data`.
* Columns are individual samples (or cells).
* Rows are genes; each row holds the measurements of that gene across all samples.


In [None]:
# Build a synthetic expression table: one row per gene, one column per sample.
# Every gene gets its own randomly chosen Poisson rate for the wt samples and
# another one for the ko samples.

# Seed both random sources used below so that Restart & Run All reproduces
# exactly the same table (and therefore the same PCA results).
SEED = 42
rd.seed(SEED)
np.random.seed(SEED)

genes = ['gene' + str(i) for i in range(1, 101)]

wt = ['wt' + str(i) for i in range(1, 6)]
ko = ['ko' + str(i) for i in range(1, 6)]

# Collect the rows first and construct the frame once. Creating an empty
# DataFrame and filling it via .loc leaves every column with dtype `object`;
# building from the integer arrays gives proper integer columns.
rows = []
for _ in genes:
    wt_counts = np.random.poisson(lam=rd.randrange(10, 1000), size=len(wt))
    ko_counts = np.random.poisson(lam=rd.randrange(10, 1000), size=len(ko))
    rows.append(np.concatenate([wt_counts, ko_counts]))

data = pd.DataFrame(rows, index=genes, columns=[*wt, *ko])

# Sanity check: one row per gene, one column per sample.
assert data.shape == (len(genes), len(wt) + len(ko))

print(data.head())
print(data.shape)

# PCA

In [3]:
# Center and scale the data before PCA.
# The frame is transposed first so that samples become rows and genes become
# columns; the scaling is then applied to that samples-by-genes matrix.
scaled_data = preprocessing.scale(data.transpose())

In [4]:
# Create the PCA object and fit it to the scaled data in one step
# (fit() returns the fitted estimator itself).
pca = PCA().fit(scaled_data)

# Project the scaled data onto the fitted principal components to get the
# PCA coordinates of every sample.
pca_data = pca.transform(scaled_data)

In [None]:
# Scree plot: percentage of the total variance explained by each component.

# Convert the explained-variance ratios to percentages, rounded to one decimal.
per_var = np.round(100 * pca.explained_variance_ratio_, decimals=1)

# One label per component: PC1, PC2, ...
labels = ['PC' + str(rank) for rank, _ in enumerate(per_var, start=1)]

plt.bar(x=np.arange(1, per_var.size + 1), height=per_var, tick_label=labels)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.show()

In [None]:
# Tabulate the PCA coordinates: one row per sample, one column per component.
pca_df = pd.DataFrame(pca_data, columns=labels, index=[*wt, *ko])

# Scatter the samples in the plane of the first two principal components;
# the axis labels report the percentage of variance each component explains.
plt.scatter(pca_df['PC1'], pca_df['PC2'])
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))
plt.title('My PCA Graph')

# Tag every point with the name of its sample.
for sample, pc1, pc2 in zip(pca_df.index, pca_df['PC1'], pca_df['PC2']):
    plt.annotate(sample, (pc1, pc2))

plt.show()

In [None]:
# Determine which genes had the biggest influence on PC1.

# The first row of pca.components_ holds one loading score per gene for PC1.
loading_scores = pd.Series(pca.components_[0], index=genes)

# Rank genes by the magnitude of their loading score, largest first; the sign
# is ignored for ranking only.
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)

# Names of the ten highest-ranked genes.
top_10_genes = sorted_loading_scores.head(10).index.values

# Show the original (signed) loading scores for those genes.
print(loading_scores.loc[top_10_genes])