# Correlation Coefficient Matrix of GSE75748 scRNA-seq

The GSE75748 cohort includes scRNA-seq data of undifferentiated hES (human embryonic stem) cells and differentiated progenitor cells, as well as temporal data. We selected only the single-time-point undifferentiated H9 hES cells for analysis, giving a sample size of 162.

The read counts of the study were prepared by [SCPortalen by RIKEN](http://single-cell.clst.riken.jp/). They were then imputed with [DrImpute](https://github.com/gongx030/DrImpute), reducing the zero rate from 55.0% to 6.0%.

Then we compute the Spearman correlation matrix of the study, which serves as the starting point for logic gate inference.

In [1]:
%matplotlib inline
import pandas as pd
from matplotlib import pyplot as plt
from correlation_matrix import *

In [2]:
# Load the imputed H9 expression table (tab-separated file) and preview
# its first rows as the cell output.
h9_tpm_imp = pd.read_csv("../data/GSE75748/h9_imputed.tsv", sep="\t")
h9_tpm_imp.head()

Unnamed: 0,SRR2977655,SRR2977656,SRR2977657,SRR2977658,SRR2977659,SRR2977660,SRR2977661,SRR2977662,SRR2977663,SRR2977664,...,SRR2977807,SRR2977808,SRR2977809,SRR2977810,SRR2977811,SRR2977812,SRR2977813,SRR2977814,SRR2977815,SRR2977816
ENSG00000000003.14,5.333434,5.401781,5.770235,4.661787,6.163233,5.538727,5.245607,5.342846,5.332342,4.015143,...,5.051099,5.530516,3.392994,5.732665,5.679862,5.603962,5.248733,5.265112,5.444956,3.658732
ENSG00000000005.5,0.405174,1.249968,0.382527,0.368489,0.392334,0.368489,1.645401,0.405174,0.504082,3.131879,...,0.535048,0.719265,0.152943,3.743526,0.602072,3.538463,0.656286,0.690559,3.039113,0.749957
ENSG00000000419.12,4.204045,3.092342,4.310163,4.536766,4.014042,3.381379,4.406172,4.118315,3.951724,3.695306,...,4.643275,3.100543,5.333091,3.417602,4.386976,4.655274,3.856929,4.388999,2.25616,3.578839
ENSG00000000457.13,1.80395,1.249416,0.319675,2.135601,0.757774,2.427578,1.309815,1.590746,3.693367,3.055467,...,3.639198,1.283812,0.285568,0.113242,0.137654,0.535571,3.72945,0.213661,5.374394,0.170811
ENSG00000000460.16,3.753187,0.244273,0.172557,4.943733,4.321864,4.40967,3.152386,3.976263,2.295498,4.773156,...,3.844763,0.964174,4.893142,4.96176,2.728839,4.073061,2.13818,4.200864,3.579143,2.512359


In [4]:
# Build the Pearson correlation matrix of the imputed expression table.
# `pearson_corr_mat` is not defined in this notebook; presumably it comes
# from the `correlation_matrix` star import.
# NOTE(review): the preview has gene IDs on both axes, so the helper
# presumably correlates rows (genes) across samples — confirm in the helper.
h9_pearson_corr = pearson_corr_mat(h9_tpm_imp)
h9_pearson_corr.head()

Unnamed: 0,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000001036.13,ENSG00000001084.10,ENSG00000001167.14,ENSG00000001460.17,ENSG00000001461.16,...,ENSG00000282988.1,ENSG00000283013.1,ENSG00000283036.1,ENSG00000283041.1,ENSG00000283045.1,ENSG00000283050.1,ENSG00000283051.1,ENSG00000283064.1,ENSG00000283098.1,ENSG00000283103.1
ENSG00000000003.14,1.0,0.064436,0.041577,0.078306,-0.02206,-0.096646,0.051492,-0.019811,0.173703,0.147588,...,0.086652,-0.1346,0.055213,0.021363,-0.01078,-0.038101,-0.041881,0.001606,0.096077,0.19343
ENSG00000000005.5,0.064436,1.0,0.024849,-0.058639,0.058202,0.026413,0.088073,-0.069091,0.008483,0.046041,...,-0.009181,-0.098209,-0.117932,0.016383,-0.102055,0.141859,-0.085127,-0.080212,0.056878,-0.109973
ENSG00000000419.12,0.041577,0.024849,1.0,-0.115936,-0.022008,0.032582,0.10306,0.054477,0.108908,-0.056045,...,-0.161078,0.132186,0.042207,0.122212,0.069267,0.10967,0.033243,-0.115613,-0.096184,-0.039914
ENSG00000000457.13,0.078306,-0.058639,-0.115936,1.0,-0.010841,-0.00826,0.026272,-0.032167,0.114359,-0.065949,...,0.138236,0.076793,0.020185,-0.182577,-0.126809,-0.025268,0.038938,0.194618,0.311537,-0.007464
ENSG00000000460.16,-0.02206,0.058202,-0.022008,-0.010841,1.0,0.011817,0.042116,0.087367,0.050621,-0.059431,...,-0.019805,-0.007496,0.046613,0.009486,0.009387,-0.039315,0.099705,-0.011047,-0.010918,0.027518


We now have the Pearson correlation matrix. Next, we compare its values with those computed by SciPy for the first 10 genes.

In [6]:
from scipy.stats import pearsonr
from scipy.stats import spearmanr

# Number of leading rows (genes) of the expression table to cross-check.
TEST_SIZE = 10


def _scipy_corr_submatrix(expr, size, corr_fn):
    """Return the size x size reference correlation matrix of the first rows.

    Parameters
    ----------
    expr : pandas.DataFrame
        Table whose rows are correlated pairwise (selected by position).
    size : int
        Number of leading rows of ``expr`` to include.
    corr_fn : callable
        Function such as ``scipy.stats.pearsonr`` or ``scipy.stats.spearmanr``
        that takes two 1-D samples and returns ``(coefficient, p_value)``.

    Returns
    -------
    pandas.DataFrame
        Symmetric float matrix with 1.0 on the diagonal, labelled 0..size-1.
    """
    # Start from a float identity matrix: the diagonal is 1.0 by definition,
    # and float storage avoids writing floats into integer columns.
    ref = pd.DataFrame(
        [[1.0 if i == j else 0.0 for j in range(size)] for i in range(size)]
    )
    # Compute only the lower-left triangle and mirror it, since the
    # correlation coefficient is symmetric in its two arguments.
    for i in range(size):
        row_i = expr.iloc[i]
        for j in range(i):
            coef, _ = corr_fn(row_i, expr.iloc[j])  # p-value is not needed
            ref.iloc[i, j] = coef
            ref.iloc[j, i] = coef
    return ref


test_pearson_corr = _scipy_corr_submatrix(h9_tpm_imp, TEST_SIZE, pearsonr)
test_spearman_corr = _scipy_corr_submatrix(h9_tpm_imp, TEST_SIZE, spearmanr)

display(test_pearson_corr)
display(test_spearman_corr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.064436,0.041577,0.078306,-0.02206,-0.096646,0.051492,-0.019811,0.173703,0.147588
1,0.064436,1.0,0.024849,-0.058639,0.058202,0.026413,0.088073,-0.069091,0.008483,0.046041
2,0.041577,0.024849,1.0,-0.115936,-0.022008,0.032582,0.10306,0.054477,0.108908,-0.056045
3,0.078306,-0.058639,-0.115936,1.0,-0.010841,-0.00826,0.026272,-0.032167,0.114359,-0.065949
4,-0.02206,0.058202,-0.022008,-0.010841,1.0,0.011817,0.042116,0.087367,0.050621,-0.059431
5,-0.096646,0.026413,0.032582,-0.00826,0.011817,1.0,-0.15204,-0.07347,0.040177,0.112993
6,0.051492,0.088073,0.10306,0.026272,0.042116,-0.15204,1.0,0.017989,0.061562,0.037128
7,-0.019811,-0.069091,0.054477,-0.032167,0.087367,-0.07347,0.017989,1.0,0.06358,0.097078
8,0.173703,0.008483,0.108908,0.114359,0.050621,0.040177,0.061562,0.06358,1.0,0.207533
9,0.147588,0.046041,-0.056045,-0.065949,-0.059431,0.112993,0.037128,0.097078,0.207533,1.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.066396,0.070206,0.011653,-0.005721,0.018786,0.00044,-0.052862,0.126178,0.055717
1,0.066396,1.0,-0.001656,-0.048184,0.011303,-0.016304,0.102233,-0.018085,-0.094103,0.152605
2,0.070206,-0.001656,1.0,-0.124372,-0.013447,-0.063406,0.110461,0.05704,0.102308,-0.052131
3,0.011653,-0.048184,-0.124372,1.0,0.026235,-0.042422,0.055772,0.023712,0.093345,-0.052375
4,-0.005721,0.011303,-0.013447,0.026235,1.0,0.033001,0.004608,0.060428,0.035537,-0.041458
5,0.018786,-0.016304,-0.063406,-0.042422,0.033001,1.0,-0.155663,-0.061269,0.077632,0.122266
6,0.00044,0.102233,0.110461,0.055772,0.004608,-0.155663,1.0,0.012007,0.076663,0.055923
7,-0.052862,-0.018085,0.05704,0.023712,0.060428,-0.061269,0.012007,1.0,0.050253,0.090846
8,0.126178,-0.094103,0.102308,0.093345,0.035537,0.077632,0.076663,0.050253,1.0,0.192295
9,0.055717,0.152605,-0.052131,-0.052375,-0.041458,0.122266,0.055923,0.090846,0.192295,1.0


We can see that our results agree with the values computed by SciPy. Next, we also generate the Spearman correlation matrix.

In [3]:
# Build the Spearman correlation matrix of the imputed expression table.
# `spearman_corr_mat` is not defined in this notebook; presumably it comes
# from the `correlation_matrix` star import.
# NOTE(review): the preview has gene IDs on both axes, so the helper
# presumably correlates rows (genes) across samples — confirm in the helper.
h9_spearman_corr = spearman_corr_mat(h9_tpm_imp)
h9_spearman_corr.head()

Unnamed: 0,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000001036.13,ENSG00000001084.10,ENSG00000001167.14,ENSG00000001460.17,ENSG00000001461.16,...,ENSG00000282988.1,ENSG00000283013.1,ENSG00000283036.1,ENSG00000283041.1,ENSG00000283045.1,ENSG00000283050.1,ENSG00000283051.1,ENSG00000283064.1,ENSG00000283098.1,ENSG00000283103.1
ENSG00000000003.14,1.0,0.066396,0.070206,0.011653,-0.005721,0.018786,0.00044,-0.052862,0.126178,0.055717,...,0.18236,-0.024983,-0.021043,0.197593,0.046657,0.075704,-0.029964,-0.04683,0.088553,0.210005
ENSG00000000005.5,0.066396,1.0,-0.001656,-0.048184,0.011303,-0.016304,0.102233,-0.018085,-0.094103,0.152605,...,-0.102707,-0.185696,-0.137568,0.03444,-0.040859,0.069326,-0.183974,0.209683,-0.101369,-0.018577
ENSG00000000419.12,0.070206,-0.001656,1.0,-0.124372,-0.013447,-0.063406,0.110461,0.05704,0.102308,-0.052131,...,-0.126147,0.176552,-0.024802,0.05839,0.166198,0.085874,-0.047075,-0.08553,-0.028783,-0.051301
ENSG00000000457.13,0.011653,-0.048184,-0.124372,1.0,0.026235,-0.042422,0.055772,0.023712,0.093345,-0.052375,...,0.099081,0.119142,0.160668,-0.154965,-0.045885,0.094309,0.189544,0.109012,0.142999,-0.009824
ENSG00000000460.16,-0.005721,0.011303,-0.013447,0.026235,1.0,0.033001,0.004608,0.060428,0.035537,-0.041458,...,-0.021442,-0.031752,0.073332,0.005484,0.098675,-0.02743,0.06982,-0.03734,0.014165,0.024618


In [7]:
# Write the Pearson correlation matrix to a tab-separated file, formatting
# floats with 8 decimal places.
# The Spearman export is currently disabled; uncomment it to write that
# matrix as well:
# h9_spearman_corr.to_csv("../data/GSE75748/h9_spearman_corr.tsv", sep='\t', float_format='%.8f')
h9_pearson_corr.to_csv(
    "../data/GSE75748/h9_pearson_corr.tsv",
    sep='\t',
    float_format='%.8f',
)