# Pearson correlation

In [8]:
# import lib
import numpy as np
import pandas as pd
from scipy.special import betainc

# import cupy as xp
# from cupyx.scipy.special import betainc

In [9]:
rng = np.random.default_rng(4582345683546)

m = 120
n = 330
nx = 160
ny = n - nx

data = rng.random((m, n))
column_names = ([f'_PC{i}' for i in range(nx)] 
                + [f'_lncRNAs{i}' for i in range(ny)])
lncRNA_PC_T = pd.DataFrame(data, columns=column_names)

In [10]:
lncRNA_PC_T

Unnamed: 0,_PC0,_PC1,_PC2,_PC3,_PC4,_PC5,_PC6,_PC7,_PC8,_PC9,...,_lncRNAs160,_lncRNAs161,_lncRNAs162,_lncRNAs163,_lncRNAs164,_lncRNAs165,_lncRNAs166,_lncRNAs167,_lncRNAs168,_lncRNAs169
0,0.280900,0.001448,0.027928,0.337749,0.682488,0.600051,0.526520,0.084916,0.833719,0.371976,...,0.983245,0.956690,0.112283,0.416037,0.847787,0.200446,0.985431,0.861864,0.359156,0.127048
1,0.675633,0.598406,0.835905,0.852677,0.882882,0.288680,0.893689,0.545906,0.819584,0.460990,...,0.024071,0.341727,0.523100,0.572442,0.031956,0.245020,0.289259,0.730080,0.642509,0.440972
2,0.791960,0.151214,0.426073,0.491463,0.993709,0.965380,0.525757,0.089036,0.002369,0.589183,...,0.487754,0.306396,0.475312,0.978334,0.022493,0.511281,0.095478,0.622551,0.770534,0.027047
3,0.345313,0.059499,0.420587,0.271361,0.882440,0.648232,0.507514,0.126179,0.628951,0.700430,...,0.193933,0.934100,0.558114,0.083293,0.080865,0.934729,0.277268,0.897894,0.903043,0.468979
4,0.275378,0.254446,0.848173,0.520497,0.951239,0.503223,0.961480,0.187399,0.946201,0.586077,...,0.190908,0.112905,0.459866,0.106023,0.415205,0.779767,0.899239,0.822693,0.827596,0.424855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.122689,0.384899,0.770796,0.341601,0.621454,0.070699,0.556214,0.427497,0.579688,0.591669,...,0.163702,0.868754,0.869039,0.560948,0.407106,0.084038,0.334843,0.850289,0.539801,0.811079
116,0.586044,0.995512,0.375057,0.526370,0.946351,0.422344,0.469472,0.187648,0.830351,0.311740,...,0.715733,0.759228,0.455834,0.390192,0.573611,0.503640,0.098833,0.824145,0.333262,0.678541
117,0.954292,0.935925,0.880770,0.766003,0.626403,0.574287,0.315012,0.216482,0.641772,0.861348,...,0.005379,0.589101,0.973884,0.290060,0.895838,0.111087,0.418049,0.804445,0.440579,0.370669
118,0.570509,0.878175,0.310905,0.457772,0.486288,0.160115,0.436907,0.598360,0.376603,0.722815,...,0.768415,0.456061,0.399188,0.278927,0.921090,0.679190,0.244323,0.861309,0.855939,0.419645


In [18]:
from scipy.stats import pearsonr


def correlation_analysis(lncRNA_PC_T):
    """Function for correlation analysis"""
    correlations = []
    for PC in [column for column in lncRNA_PC_T.columns if "_PC" in column]:
        for lncRNA in [column for column in lncRNA_PC_T.columns if "_lncRNAs" in column]:
            correlations.append(
                pd.Series(
                    pearsonr(lncRNA_PC_T[PC], lncRNA_PC_T[lncRNA]),
                    index=["PCC", "p-value"],
                    name=PC + "_" + lncRNA,
                )
            )

    return correlations

In [19]:
import numpy as xp
from scipy.special import betainc

def pearsonr2(x, y):
    # Assumes inputs are DataFrames and computation is to be performed
    # pairwise between columns. We convert to arrays and reshape so calculation
    # is performed according to normal broadcasting rules along the last axis.
    x = xp.asarray(x).T[:, xp.newaxis, :]
    y = xp.asarray(y).T
    n = x.shape[-1]

    # Compute Pearson correlation coefficient. We can't use `cov` or `corrcoef`
    # because they want to compute everything pairwise between rows of a
    # stacked x and y.
    xm = x.mean(axis=-1, keepdims=True)
    ym = y.mean(axis=-1, keepdims=True)
    cov = xp.sum((x - xm) * (y - ym), axis=-1)/(n-1)
    sx = xp.std(x, ddof=1, axis=-1)
    sy = xp.std(y, ddof=1, axis=-1)
    rho = cov/(sx * sy)

    # Compute the two-sided p-values. See documentation of scipy.stats.pearsonr.
    ab = n/2 - 1
    x = (abs(rho) + 1)/2
    p = 2*(1-betainc(ab, ab, x))
    return rho, p

#df_corr = correlation_analysis(lncRNA_PC_T)
#%timeit correlation_analysis(lncRNA_PC_T)
# 1min 14s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

x = lncRNA_PC_T.iloc[:, :nx]
y = lncRNA_PC_T.iloc[:, nx:]
corr2, p2 = pearsonr2(x, y)
%timeit pearsonr2(x, y)
# 21.9 ms ± 190 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

# Check results
np.testing.assert_allclose(corr2.ravel(), df_corr.iloc[:, 0])
np.testing.assert_allclose(p2.ravel(), df_corr.iloc[:, 1])


KeyboardInterrupt



In [20]:
x = lncRNA_PC_T.iloc[:, :nx]
y = lncRNA_PC_T.iloc[:, nx:]
corr2, p2 = pearsonr2(x, y)
%timeit pearsonr2(x, y)

18.6 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [51]:
x = lncRNA_PC_T.iloc[:, :nx]
x

Unnamed: 0,_PC0,_PC1,_PC2,_PC3,_PC4,_PC5,_PC6,_PC7,_PC8,_PC9,...,_PC150,_PC151,_PC152,_PC153,_PC154,_PC155,_PC156,_PC157,_PC158,_PC159
0,0.280900,0.001448,0.027928,0.337749,0.682488,0.600051,0.526520,0.084916,0.833719,0.371976,...,0.917573,0.743953,0.584834,0.911307,0.876242,0.853886,0.628875,0.139491,0.452216,0.504159
1,0.675633,0.598406,0.835905,0.852677,0.882882,0.288680,0.893689,0.545906,0.819584,0.460990,...,0.175340,0.341878,0.678735,0.177962,0.087303,0.735064,0.710624,0.047160,0.958539,0.130303
2,0.791960,0.151214,0.426073,0.491463,0.993709,0.965380,0.525757,0.089036,0.002369,0.589183,...,0.066412,0.932558,0.084120,0.333515,0.381932,0.338181,0.717279,0.864041,0.718377,0.947395
3,0.345313,0.059499,0.420587,0.271361,0.882440,0.648232,0.507514,0.126179,0.628951,0.700430,...,0.905909,0.399036,0.240701,0.141320,0.181853,0.310369,0.219957,0.563083,0.460125,0.615624
4,0.275378,0.254446,0.848173,0.520497,0.951239,0.503223,0.961480,0.187399,0.946201,0.586077,...,0.768146,0.631334,0.323654,0.118137,0.458695,0.468922,0.557437,0.549200,0.395492,0.696041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.122689,0.384899,0.770796,0.341601,0.621454,0.070699,0.556214,0.427497,0.579688,0.591669,...,0.824567,0.233877,0.470325,0.332221,0.092467,0.758470,0.397464,0.942344,0.953470,0.994446
116,0.586044,0.995512,0.375057,0.526370,0.946351,0.422344,0.469472,0.187648,0.830351,0.311740,...,0.314401,0.103251,0.229742,0.929791,0.169221,0.560133,0.901405,0.239978,0.784998,0.973438
117,0.954292,0.935925,0.880770,0.766003,0.626403,0.574287,0.315012,0.216482,0.641772,0.861348,...,0.295255,0.349857,0.354120,0.705755,0.935035,0.285987,0.101165,0.076028,0.876937,0.219235
118,0.570509,0.878175,0.310905,0.457772,0.486288,0.160115,0.436907,0.598360,0.376603,0.722815,...,0.192272,0.464618,0.287626,0.819947,0.072080,0.900190,0.060257,0.039706,0.631473,0.687233


In [34]:
print(np.asarray(x).shape)
print(np.asarray(x).T.shape)

(120, 160)
(160, 120)


In [39]:
np.asarray(x).T[:, np.newaxis, :].shape

(160, 1, 120)

In [40]:
y = np.asarray(y).T

In [41]:
y.shape

(170, 120)

In [48]:
n = x.shape[-1]

In [53]:
corr2.shape

(160, 170)

In [55]:
y.shape

(170, 120)