In [1]:
"""
Compute the Spearman correlation between gene expression and sigma signatures.
Outputs the correlation coefficients and their p-values (pv).

input files:
expr_file = "data/common_brca_expr_pid.tsv"
sigma_file = "data/common_sigma_count.tsv"

output files:
corr_file = "results/corr_expr_sigma_count.tsv"
pv_file = "results/corr_pv_expr_sigma_count.tsv"

See corr_analysis.ipynb for subsequent analysis
"""

'\ncompute correlation between expression and sigma signatures\ncreate discretize matrix \n    1: posivitely correlated (pv <th)\n    0: no correlation\n    -1: negatively correlated (pv <th)\nfor sig1, 2, 3, 5, 8, 13 (level and count)\n\n'

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss

In [67]:
# Tab-separated input tables, given relative to the working directory.
# Signature table:
sigma_file = "data/common_sigma_count.tsv"
# Expression table; its first two columns are a sequential id and the gene name:
expr_file = "data/common_brca_expr_pid.tsv"

In [68]:
# Gene expression table: the first file column (sequential id) becomes the row
# index, which leaves the gene name ("Name") as the first remaining column.
brca_expr = pd.read_csv(expr_file, index_col=0, sep="\t")
brca_expr_genes = brca_expr["Name"]
# Everything after that first remaining column is kept as the expression matrix.
brca_expr_df = brca_expr.iloc[:, 1:]

# Signature table, with its first file column used as the row index.
sigma_count_df = pd.read_csv(sigma_file, index_col=0, sep="\t")

In [69]:
def comp_corr(x_col, y_col, nan_th=30):
    """
    Spearman correlation between two equally long vectors, tolerating NaNs.

    Observations are paired by position. A pair counts as valid only when
    both of its values are non-null, because ``nan_policy='omit'`` discards a
    pair as soon as either side is missing.

    :param x_col: expression values (pandas Series or any 1-D array-like)
    :param y_col: signature values, same length as ``x_col``
    :param nan_th: minimum number of valid pairs (inclusive) required to
        compute the correlation
    :return: ``(corr, pv)``; ``(0.0, 1.0)`` when fewer than ``nan_th`` valid
        pairs are available
    """
    # np.asarray drops any pandas index so the two masks are combined
    # positionally instead of being re-aligned on labels.
    x_valid = np.asarray(pd.notnull(x_col))
    y_valid = np.asarray(pd.notnull(y_col))
    n_valid = int(np.sum(x_valid & y_valid))

    if n_valid < nan_th:
        # Not enough samples: report "no correlation, not significant".
        return 0.0, 1.0

    corr, pv = ss.spearmanr(x_col, y_col, nan_policy='omit')
    return corr, pv

def comp_corr_all(x_df, y_df):
    """
    Spearman correlation between every gene and every signature.

    Each column of ``x_df`` is correlated with each column of ``y_df`` through
    ``comp_corr``; columns are addressed by position, so duplicated column
    labels are handled. The name of each signature column is printed as its
    pass starts.

    :param x_df: gene expression DataFrame: samples (rows) X genes (columns)
    :param y_df: signature DataFrame: samples (rows) X signatures (columns)
    :return: ``(corr_df, pv_df)``, two float DataFrames indexed by the
        columns of ``x_df`` with the columns of ``y_df`` as columns, holding
        the correlation coefficients and the p-values respectively
    """
    n_genes = x_df.shape[1]
    n_sigs = y_df.shape[1]

    # Collect results in float arrays so the returned frames are numeric
    # rather than object dtype, and so empty inputs yield empty frames.
    corr_mat = np.full((n_genes, n_sigs), np.nan, dtype=float)
    pv_mat = np.full((n_genes, n_sigs), np.nan, dtype=float)

    for j in range(n_sigs):
        print(y_df.columns[j])
        y_col = y_df.iloc[:, j]  # sliced once per signature, reused for all genes
        for i in range(n_genes):
            corr_mat[i, j], pv_mat[i, j] = comp_corr(x_df.iloc[:, i], y_col)

    corr_df = pd.DataFrame(corr_mat, index=x_df.columns, columns=y_df.columns)
    pv_df = pd.DataFrame(pv_mat, index=x_df.columns, columns=y_df.columns)
    return corr_df, pv_df

In [70]:
# Correlate every gene with signatures 1, 2, 3, 5, 8 and 13 only
# (the "C" and "D" column of each).
sigs_of_interest = [
    "1C", "1D",
    "2C", "2D",
    "3C", "3D",
    "5C", "5D",
    "8C", "8D",
    "13C", "13D",
]

# comp_corr_all expects samples as rows, so both tables are transposed first.
# NOTE(review): values look to be paired by row position, not by sample label;
# confirm both input files list the samples in the same order.
expr_by_sample = brca_expr_df.T
sigma_by_sample = sigma_count_df.T[sigs_of_interest]
corr_df, pv_df = comp_corr_all(expr_by_sample, sigma_by_sample)

# Label the result rows with the gene names.
pv_df.index = brca_expr_genes
corr_df.index = brca_expr_genes

1C
1D
2C
2D
3C
3D
5C
5D
8C
8D
13C
13D


In [62]:
# Save the gene x signature correlation coefficients and p-values as TSV.
# These are the frames produced by comp_corr_all (corr_df / pv_df); the names
# corr_count / pv_count are not defined anywhere in this notebook.
corr_df.to_csv("results/corr_expr_sigma_count.tsv", sep="\t")
pv_df.to_csv("results/corr_pv_expr_sigma_count.tsv", sep="\t")