In [1]:
from SDP_interaction_inference.constraints import Constraint
from SDP_interaction_inference import optimization
from SDP_interaction_inference.dataset import Dataset
from SDP_interaction_inference.correlation import Correlation
from SDP_interaction_inference import utils
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import scipy
import tqdm
from copy import deepcopy
import matplotlib.patches
import corner
import plotly
import plotly.graph_objects as go

# Correlation Investigation

## Data loading

In [11]:
# read fibroblast transcript counts (first CSV column is used as the row index)
data_FIB = pd.read_csv("./Data/GSE151334_FIB_counts_thresh.csv", index_col=0)

# load capture
beta = np.loadtxt("./Capture/beta_FIB.txt")

# load RNA types; the context manager closes the file handle as soon as parsing is done
with open("./Biotypes/biotypes_FIB.json") as biotypes_file:
    biotypes_dict = json.load(biotypes_file)

# select indices of protein coding mRNA and non-coding miRNA
# NOTE(review): the position of each entry in biotypes_dict is used as a row position in
# data_FIB below, which assumes the JSON entries are in the same order as the rows — confirm
pcRNA_indices = [idx for idx, btype in enumerate(biotypes_dict.values()) if btype == "protein_coding"]
miRNA_indices = [idx for idx, btype in enumerate(biotypes_dict.values()) if btype == "miRNA"]

# separate data (positional row selection)
data_FIB_pcRNA = data_FIB.iloc[pcRNA_indices]
data_FIB_miRNA = data_FIB.iloc[miRNA_indices]

# amounts: see './Biotypes/updated-biotypes.ipynb' for full pie chart of types
print(f"Total of {data_FIB.shape[0]} genes with mean > 1 present \nSelected {data_FIB_miRNA.shape[0]} non-coding miRNA \nSelected {data_FIB_pcRNA.shape[0]} protein coding (mRNA)")

Total of 11747 genes with mean > 1 present 
Selected 19 non-coding miRNA 
Selected 9296 protein coding (mRNA)


## Setup functions

In [3]:
def construct_dataset(mirna_sample, mrna_dataset, beta, resamples=1000):
    """Build a Dataset pairing one miRNA sample with every row of an mRNA table.

    Entry (i, j) of the resulting count table is the tuple
    (mirna_sample[j], mrna_dataset.iloc[i][j]).

    Parameters
    ----------
    mirna_sample : iterable
        Values of the miRNA; presumably one entry per column of
        `mrna_dataset` — verify against caller.
    mrna_dataset : pandas.DataFrame
        Two-dimensional table; each row is paired with `mirna_sample`.
    beta : object
        Stored unchanged on the returned object as `beta`.
    resamples : int, optional
        Stored unchanged on the returned object as `resamples`.

    Returns
    -------
    Dataset
        Object with `count_dataset`, `cells`, `gene_pairs`, `resamples`
        and `beta` attributes set.
    """
    n_pairs, n_cells = mrna_dataset.shape

    # empty table with one row per gene pair and one column per cell
    row_labels = [f"Gene-pair-{row}" for row in range(n_pairs)]
    col_labels = [f"Cell-{col}" for col in range(n_cells)]
    paired_counts = pd.DataFrame(index=row_labels, columns=col_labels)

    # each row holds the (miRNA, mRNA) tuples for one mRNA row
    for row in range(n_pairs):
        paired_counts.iloc[row] = list(zip(mirna_sample, mrna_dataset.iloc[row]))

    # populate the dataset object: counts, sizes, settings, capture
    dataset = Dataset()
    dataset.count_dataset = paired_counts
    dataset.cells = n_cells
    dataset.gene_pairs = n_pairs
    dataset.resamples = resamples
    dataset.beta = beta

    return dataset

In [4]:
def direct_correlation_bootstrap_sample(x1_sample, x2_sample, confidence=None, resamples=None, rng=None):
    """Percentile-bootstrap interval for the Pearson correlation of paired samples.

    Observation pairs (x1[k], x2[k]) are resampled with replacement, the
    correlation is computed for every resample, and the empirical
    alpha/2 and 1 - alpha/2 quantiles of those estimates are returned,
    where alpha = 1 - confidence.

    Parameters
    ----------
    x1_sample, x2_sample : iterable of numbers
        Paired observations of equal, non-zero length.
    confidence : float, optional
        Confidence level of the interval; defaults to 0.95.
    resamples : int, optional
        Number of bootstrap resamples; defaults to the sample size.
    rng : None, int, or numpy.random.Generator, optional
        Seed or generator passed to `numpy.random.default_rng`. The default
        (None) draws fresh entropy, so results differ between calls; pass an
        integer or a Generator for reproducible bounds.

    Returns
    -------
    numpy.ndarray
        Array `[lower, upper]` of shape (2,).

    Raises
    ------
    ValueError
        If the samples have different lengths or are empty.

    Notes
    -----
    A resample in which either variable has zero variance has an undefined
    correlation and is recorded as NaN. `np.quantile` does not skip NaN, so
    a single degenerate resample makes both returned bounds NaN.
    NOTE(review): switch to `np.nanquantile` if degenerate resamples should
    be ignored instead.
    """
    # materialise the inputs so that generators and pandas Series both work
    x1_values = list(x1_sample)
    x2_values = list(x2_sample)

    # get sample size
    n = len(x1_values)
    if n != len(x2_values):
        raise ValueError(
            f"paired samples must have equal length, got {n} and {len(x2_values)}"
        )
    if n == 0:
        raise ValueError("cannot bootstrap an empty sample")

    # get bootstrap size: default to sample size
    if resamples is None:
        resamples = n
    # confidence level: default to 95%
    if confidence is None:
        confidence = 0.95

    # initialize random generator (accepts None, a seed, or an existing Generator)
    rng = np.random.default_rng(rng)

    # convert sample to n x 2 array; float64 so that the products and squares
    # below cannot overflow when the inputs use a narrow integer dtype
    sample = np.array([x1_values, x2_values], dtype=float).T

    # bootstrap to N x n x 2 array (rows, i.e. pairs, are drawn with replacement)
    boot = rng.choice(sample, size=(resamples, n))
    b1 = boot[:, :, 0]
    b2 = boot[:, :, 1]

    # moments of every resample at once (axis 1 runs over the n drawn pairs)
    mean_x = b1.mean(axis=1)
    mean_y = b2.mean(axis=1)
    cov_xy = (b1 * b2).mean(axis=1) - mean_x * mean_y
    var_x = (b1 ** 2).mean(axis=1) - mean_x ** 2
    var_y = (b2 ** 2).mean(axis=1) - mean_y ** 2

    # correlation is undefined (NaN) where a variance is zero, or marginally
    # negative through floating point rounding
    estimates = np.full(resamples, np.nan)
    defined = (var_x > 0) & (var_y > 0)
    estimates[defined] = cov_xy[defined] / (np.sqrt(var_x[defined]) * np.sqrt(var_y[defined]))

    # take quantiles
    alpha = 1 - confidence
    interval = np.quantile(estimates, [(alpha / 2), 1 - (alpha / 2)])

    return interval

In [5]:
def direct_correlation_bootstrap_dataset(mirna_sample, mrna_dataset, confidence=None, resamples=None):
    """Bootstrap correlation bounds between one miRNA sample and each mRNA row.

    Calls `direct_correlation_bootstrap_sample` once per row of
    `mrna_dataset`, showing a tqdm progress bar while iterating.

    Parameters
    ----------
    mirna_sample : iterable
        Sample paired with every row of `mrna_dataset`.
    mrna_dataset : pandas.DataFrame
        Two-dimensional table; one interval is computed per row.
    confidence, resamples : optional
        Forwarded unchanged to `direct_correlation_bootstrap_sample`.

    Returns
    -------
    numpy.ndarray
        Array of shape (rows, 2); row i holds the two values returned for
        row i of `mrna_dataset`.
    """
    # only the row count is needed; unpacking also requires a 2-D shape
    n_pairs, _ = mrna_dataset.shape

    # one pair of bounds per row
    intervals = np.empty((n_pairs, 2))

    for row in tqdm.tqdm(range(n_pairs)):
        mrna_sample = mrna_dataset.iloc[row]
        intervals[row, :] = direct_correlation_bootstrap_sample(
            mirna_sample, mrna_sample, confidence, resamples
        )

    return intervals

## Setup results

### Construct

In [32]:
# select mRNA: row labels of the protein-coding subset of the count table
mRNA = data_FIB_pcRNA.index

In [33]:
# results table for the correlation analysis: one row per mRNA, no columns yet
correlation_df = pd.DataFrame(index=mRNA)

## Running

In [None]:
# set to True to checkpoint the results table to disk after every miRNA
SAVE_RESULTS = False

# for each miRNA
miRNA_names = data_FIB_miRNA.index
for miRNA in miRNA_names:

    # display miRNA
    print(f"\nRunning {miRNA}")

    # row of the current miRNA (looked up once, used for both analyses below)
    miRNA_sample = data_FIB_miRNA.loc[miRNA]

    # construct dataset of miRNA paired with mRNA
    dataset = construct_dataset(miRNA_sample, data_FIB_pcRNA, beta)

    # run correlation test: 1-sided & 2-sided
    correlations_1 = Correlation(dataset, printing=False, alternative="less")
    correlations_2 = Correlation(dataset, printing=False, alternative="two-sided")

    # compute direct bootstrap bounds
    correlation_bounds = direct_correlation_bootstrap_dataset(miRNA_sample, data_FIB_pcRNA)

    # extract results (rho same for 1 or 2 sided test)
    rho = [res['statistic'] for res in correlations_1.result_dict.values()]
    p1 = [res['pvalue'] for res in correlations_1.result_dict.values()]
    p2 = [res['pvalue'] for res in correlations_2.result_dict.values()]
    lb = correlation_bounds[:, 0]
    ub = correlation_bounds[:, 1]

    # store on dataframe: five columns per miRNA, prefixed with its name
    correlation_df[f'{miRNA}_rho'] = rho
    correlation_df[f'{miRNA}_p1'] = p1
    correlation_df[f'{miRNA}_p2'] = p2
    correlation_df[f'{miRNA}_lb'] = lb
    correlation_df[f'{miRNA}_ub'] = ub

    # save dataframe as safety: rewritten after every miRNA so that partial
    # results survive an interrupted run
    if SAVE_RESULTS:
        correlation_df.to_csv("Results/correlations.csv")


Running MIR100


100%|██████████| 9296/9296 [00:10<00:00, 893.61it/s]
100%|██████████| 9296/9296 [00:10<00:00, 924.19it/s]
100%|██████████| 9296/9296 [01:44<00:00, 89.16it/s]



Running MIR103A2


100%|██████████| 9296/9296 [00:10<00:00, 923.69it/s]
100%|██████████| 9296/9296 [00:10<00:00, 904.68it/s]
100%|██████████| 9296/9296 [01:46<00:00, 86.93it/s]



Running MIR16-2


100%|██████████| 9296/9296 [00:10<00:00, 900.60it/s]
100%|██████████| 9296/9296 [00:10<00:00, 898.35it/s]
100%|██████████| 9296/9296 [01:52<00:00, 82.63it/s]



Running MIR199A1


100%|██████████| 9296/9296 [00:10<00:00, 885.40it/s]
100%|██████████| 9296/9296 [00:10<00:00, 876.00it/s]
100%|██████████| 9296/9296 [01:45<00:00, 88.49it/s]



Running MIR199A2


100%|██████████| 9296/9296 [00:10<00:00, 917.47it/s]
100%|██████████| 9296/9296 [00:10<00:00, 904.83it/s]
100%|██████████| 9296/9296 [01:44<00:00, 88.67it/s]



Running MIR221


100%|██████████| 9296/9296 [00:10<00:00, 904.88it/s]
100%|██████████| 9296/9296 [00:10<00:00, 915.61it/s]
100%|██████████| 9296/9296 [01:45<00:00, 88.25it/s]



Running MIR222


100%|██████████| 9296/9296 [00:10<00:00, 915.34it/s]
100%|██████████| 9296/9296 [00:10<00:00, 914.24it/s]
100%|██████████| 9296/9296 [01:44<00:00, 89.15it/s]



Running MIR23A


100%|██████████| 9296/9296 [00:10<00:00, 876.34it/s]
100%|██████████| 9296/9296 [00:11<00:00, 780.35it/s]
100%|██████████| 9296/9296 [01:45<00:00, 87.79it/s]



Running MIR24-2


100%|██████████| 9296/9296 [00:10<00:00, 908.74it/s]
100%|██████████| 9296/9296 [00:10<00:00, 889.14it/s]
100%|██████████| 9296/9296 [01:47<00:00, 86.71it/s]



Running MIR27A


100%|██████████| 9296/9296 [00:10<00:00, 902.49it/s]
100%|██████████| 9296/9296 [00:10<00:00, 915.66it/s]
100%|██████████| 9296/9296 [01:46<00:00, 87.44it/s]



Running MIR29A


100%|██████████| 9296/9296 [00:10<00:00, 890.06it/s]
100%|██████████| 9296/9296 [00:10<00:00, 914.72it/s]
100%|██████████| 9296/9296 [01:47<00:00, 86.88it/s]



Running MIR31


100%|██████████| 9296/9296 [00:10<00:00, 919.49it/s]
100%|██████████| 9296/9296 [00:10<00:00, 907.11it/s]
100%|██████████| 9296/9296 [01:47<00:00, 86.63it/s]



Running MIR3609


100%|██████████| 9296/9296 [00:10<00:00, 910.01it/s]
100%|██████████| 9296/9296 [00:10<00:00, 927.55it/s]
100%|██████████| 9296/9296 [01:46<00:00, 87.57it/s]



Running MIR4449


100%|██████████| 9296/9296 [00:10<00:00, 925.39it/s]
100%|██████████| 9296/9296 [00:10<00:00, 916.58it/s]
100%|██████████| 9296/9296 [01:44<00:00, 88.77it/s]



Running MIR6087


100%|██████████| 9296/9296 [00:10<00:00, 909.71it/s]
100%|██████████| 9296/9296 [00:10<00:00, 919.19it/s]
100%|██████████| 9296/9296 [01:44<00:00, 88.93it/s]



Running MIR877


100%|██████████| 9296/9296 [00:10<00:00, 915.70it/s]
100%|██████████| 9296/9296 [00:10<00:00, 906.56it/s]
100%|██████████| 9296/9296 [01:45<00:00, 88.11it/s]



Running MIRLET7A1


100%|██████████| 9296/9296 [00:10<00:00, 903.01it/s]
100%|██████████| 9296/9296 [00:10<00:00, 898.62it/s]
100%|██████████| 9296/9296 [01:49<00:00, 85.08it/s]



Running MIRLET7A2


100%|██████████| 9296/9296 [00:10<00:00, 891.40it/s]
100%|██████████| 9296/9296 [00:10<00:00, 912.64it/s]
100%|██████████| 9296/9296 [01:44<00:00, 88.70it/s]



Running MIRLET7D


100%|██████████| 9296/9296 [00:09<00:00, 930.40it/s] 
100%|██████████| 9296/9296 [00:10<00:00, 919.80it/s] 
100%|██████████| 9296/9296 [01:43<00:00, 89.75it/s]
