In [24]:
import pandas as pd
df = pd.DataFrame({ 'gene':["1 // foo // blabla",
                                   "2 // bar // lalala",
                                   "3 // qux // trilil",
                                   "4 // woz // hohoho",
                                   "nopeynope"], 
                   'cnv':['a','b','c','d','e'],
                   'cell1':[5,9,1,7,9], 
                   'cell2':[12,90,13,87,9],
                   'location':['1', '13.4', '1.1-2', '1.2-p22', '43-q44'],
                  'np_t_w': ['1','13.4','1.1','2','3']})
df

Unnamed: 0,cell1,cell2,cnv,gene,location,np_t_w
0,5,12,a,1 // foo // blabla,1,1.0
1,9,90,b,2 // bar // lalala,13.4,13.4
2,1,13,c,3 // qux // trilil,1.1-2,1.1
3,7,87,d,4 // woz // hohoho,1.2-p22,2.0
4,9,9,e,nopeynope,43-q44,3.0


In [10]:
df['location'].str.extract('(-)', expand=True)

Unnamed: 0,0
0,
1,
2,-
3,-
4,-


In [7]:
df = pd.DataFrame([['a','b',1],
                  ['a','c',1.9],
                  ['a','d',45],
                  ['a','e',3.9],
                  ['b','c', 2.1],
                  ['b','d',-2],
                  ['b','e',4.1]],
                 columns = ['cnv','exp','np_t_w'])
df

Unnamed: 0,cnv,exp,np_t_w
0,a,b,1.0
1,a,c,1.9
2,a,d,45.0
3,a,e,3.9
4,b,c,2.1
5,b,d,-2.0
6,b,e,4.1


In [8]:
### OLD
from scipy.stats import pearsonr
def rolling_pearson(df, group='cnv', data='np_t_w'):
    """
    computes pearson between array of data for one gene partners and each successive chromosomal neighbor
    
    :param df: pd df, clove results, sorted in ascending order of chromosome locus
    :param group: str, df column label on which to form gene neigbors, default 'cnv'
    :param data: str, df column label of source data to populate arrays, default 'np_t_w' (clove t-stats)
    """
    
    unique_genes = df[group].unique()
    results = []
    for idx, g0 in enumerate(unique_genes):
        g0_arr = df[df[group] == g0][data].values
        if idx < len(unique_genes) - 1:
            for g1 in unique_genes[idx+1:]:
                g1_arr = df[df[group] == g1][data].values
                print(g1_arr)
                pearson, pval = pearsonr(g0_arr, g1_arr)
                results.append([g0, g1, pearson, pval])

    return pd.DataFrame(results, columns=[group, group+'_k', 'pearson', 'pval'])

In [95]:
### NEW
from scipy.stats import pearsonr
from scipy.spatial import distance
def rolling_similarity(df, group='cnv', data='np_t_w', how='pearson'):
    """
    computes similarity between array of data for one gene partners and each successive chromosomal neighbor
    
    :param df: pd df, clove results, sorted in ascending order of chromosome locus
    :param group: str, df column label on which to form gene neigbors, default 'cnv'
    :param data: str, df column label of source data to populate arrays, default 'np_t_w' (clove t-stats)
    :param how: str, choice of: {pearson, euclidian, cosine}
    """
    
    unique_genes = df[group].unique()
    results = []
    for idx, g0 in enumerate(unique_genes):
        if idx < len(unique_genes) - 1:
            g1 = unique_genes[idx+1]
            merged = pd.merge(df[df[group] == g0], df[df[group] == g1], how='inner', on='exp')
            merged.columns = ['cnv_g0','exp','clove_g0','cnv_g1','clove_g1']
            if how == 'pearson':
                cols = [g0, g1, how ,'pval']
                pear, pval = pearsonr(merged['clove_g0'], merged['clove_g1'])
                results.append([g0,g1,pear,pval])
            elif how == 'euclidian':
                cols = [g0, g1, how]
                results.append([g0, g1, distance.euclidean(merged['clove_g0'], merged['clove_g1'])])
            elif how == 'cosine':
                cols = [g0, g1, how]
                results.append([g0, g1, distance.cosine(merged['clove_g0'], merged['clove_g1'])])

    return pd.DataFrame(results, columns=cols)

In [94]:
rolling_similarity(df, group='cnv',data='np_t_w')

q


Unnamed: 0,a,b,pearson,pval
0,a,b,-0.932864,0.234603


In [27]:
df

Unnamed: 0,cell1,cell2,cnv,gene,location,np_t_w
0,5,12,a,1 // foo // blabla,1,1.0
1,9,90,b,2 // bar // lalala,13.4,13.4
2,1,13,c,3 // qux // trilil,1.1-2,1.1
3,7,87,d,4 // woz // hohoho,1.2-p22,2.0
4,9,9,e,nopeynope,43-q44,3.0


In [13]:
pearsonr?

In [22]:
import numpy as np
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
x[:2]

array([0, 1])

In [23]:
np.random.shuffle(x)


TypeError: 'NoneType' object is not subscriptable

develop explicit way to define certain combinations of genes on which to compute CLOvEs

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from cloveFunctions import *
import itertools

In [2]:
edf = pd.read_csv('data/breast_mrna.tab.gz', compression='gzip', sep='\t', index_col=0)
cdf = pd.read_csv('data/breast_CCLE_DEL_calls.csv.gz', compression='gzip', index_col=0)

In [3]:
e_samp = edf.index[1:3].values
c_samp = cdf.index[3:5].values

In [4]:
c_samp

array(['A2LD1', 'A2M'], dtype=object)

In [33]:
def explicitPairContextStat(expdf, cnvdf, exp_lis=False, cnv_lis=False, cat_df=False, nan_style='omit', permute=False):
    """
    takes exp and cnv genes (either all or explicitand returns pair summary statistics
    
    :param n_samp: int, number of random samples to take
    :param expdf: pandas dataframe, expression by sample 
                    (hopefully filtered with mainFilter, tissue specific, with matching samples in cnv)
    :param cnvdf: pandas dataframe, binarized mask 5(1=del, 0=nodel) deletion by sample 
                    (hopefully filtered with mainFilter, tissue specific, with matching samples in exp)
    :param exp_lis: list of str, HUGO gene names in expdf to restrict to, default is False (use all genes in expdf)
    :param exp_lis: list of str, HUGO gene names in cnvdf to restrict to, default is False (use all genes in cnvdf)
    :param in_df: pandas dataframe, previous calculations to concat new results to, used in while loop to get n_samp
    :param nan_style: str, how the stats.ttest_ind treats NANs, {‘propagate’, ‘raise’, ‘omit’}
    :param permute: bool, True will calculate pairs with randomly permuted expression matrix as null model
    
    returns df[['exp', 'cnv', 'cntxt_pos_mu', 'cntxt_neg_mu', 
                'cntxt_pos_var', 'cntxt_neg_var', 
                'cntxt_pos_n', 'cntxt_neg_n']]
    """
    
    cells = list(set(cnvdf.columns).intersection(expdf.columns))
    expdf = expdf[cells]
    cmask = cnvdf[cells] == 1
    
    if type(exp_lis) != bool:
        exp_samp = set(expdf.index).intersection(exp_lis)
        for gene in exp_lis:
            if gene not in expdf.index:
                print('{} not found in expdf.index.  Omitted'.format(gene))
    else:
        exp_samp = expdf.index
    
    if type(cnv_lis) != bool:
        cnv_samp = set(cnvdf.index).intersection(cnv_lis)
        for gene in cnv_lis:
            if gene not in cnvdf.index:
                print('{} not found in cnvdf.index.  Omitted'.format(gene))
    else:
        cnv_samp = cnvdf.index
    print('attempting {} comparisons with current parameters'.format(len(exp_samp) * len(cnv_samp)))
    r={'exp':[],'cnv':[]}
    for pair in itertools.product(exp_samp, cnv_samp):
        r['exp'].append(pair[0])
        r['cnv'].append(pair[1])
    df = pd.DataFrame(r)
    
    pos_n, neg_n, = [], []
    pos_mu, neg_mu = [], []
    pos_var, neg_var = [], []
    cohens_d = []
    np_t_s, np_p_s = [], []
    np_t_w, np_p_w = [], []
    
    df
    
    for row in df.itertuples():
        # mask cnv contexts onto expression data
        pos = np.array(expdf.loc[row.exp][cmask.loc[row.cnv]])
        neg = np.array(expdf.loc[row.exp][~cmask.loc[row.cnv]])
        
        # calculate n
        pos_n.append(len(pos))
        neg_n.append(len(neg))
        
        # calculate mu
        pos_mu.append(pos.mean())
        neg_mu.append(neg.mean())
        
        # calculate var
        pos_var.append(pos.var())
        neg_var.append(neg.var())
        
        # calculate cohen's d
        cohens_d.append(cohenD(pos, neg))
        
        # calculate t_stat, welch
        t, p = stats.ttest_ind(pos, neg, nan_policy=nan_style, equal_var=True)
        np_t_w.append(t)
        np_p_w.append(p)
        
        if permute:
            pos = np.array(expdf.loc[row.exp][cmask_n.loc[row.cnv]])
            neg = np.array(expdf.loc[row.exp][~cmask_n.loc[row.cnv]])
            t, p = stats.ttest_ind(pos, neg, nan_policy=nan_style, equal_var=True)
            np_t_w_null.append(t)
            np_p_w_null.append(p)
            
    df['pos_n'] = pos_n
    df['neg_n'] = neg_n
    df['pos_mu'] = pos_mu
    df['neg_mu'] = neg_mu
    df['pos_var'] = pos_var
    df['neg_var'] = neg_var
    df['cohens_d'] = cohens_d
    df['np_t_w'] = np_t_w
    df['np_p_w'] = np_p_w
    
    if permute:
        df['np_t_w_null'] = np_t_w_null
        df['np_t_w_null'] = np_t_w_null
    
    df.dropna(inplace=True)

    right = expdf.rename_axis('exp', axis=0) 
    right['gene_var_exp'] = right.var(axis=1)
    right = right.reset_index()
    
    df = pd.merge(df, right[['exp','gene_var_exp']], on='exp')
    
    if cat_df:
        return pd.concat([cat_df, df])
        
    return df

In [36]:
edf_f, cdf_f = mainFitler(edf, cdf, var=2, n=6, amp_fh=False, dele_fh=False, mut_fh=False, save=False)
df = explicitPairContextStat(edf_f, cdf_f, cnv_lis=c_samp)

filtering done


In [37]:
df['exp'].unique()

array(['AP1S2', 'CD9', 'CEBPD', 'COTL1', 'CREB3L4', 'CXXC5', 'DNAJB4',
       'DYNLT3', 'ELL3', 'EXT1', 'FAM46A', 'FAM69A', 'FRMD6', 'HES1',
       'HIST1H2BD', 'HIST2H2BE', 'ID1', 'IKBIP', 'IL6ST', 'KDELR3',
       'LOC100128822', 'MAFF', 'MIEN1', 'MLPH', 'MREG', 'NAT1', 'OPN3',
       'PDCD4', 'PLEKHF2', 'PLK2', 'RAB11FIP1', 'RHOBTB3', 'RHPN2', 'RND3',
       'SFN', 'SH3YL1', 'SLC7A11', 'SNHG5', 'TFAP2A', 'TRPS1', 'TSPAN13'], dtype=object)

In [39]:
df['exp'].value_counts()

SH3YL1          2
RHPN2           2
HIST1H2BD       2
OPN3            2
HIST2H2BE       2
PLEKHF2         2
LOC100128822    2
MLPH            2
PLK2            2
SNHG5           2
NAT1            2
IL6ST           2
TSPAN13         2
CXXC5           2
RND3            2
DYNLT3          2
TRPS1           2
DNAJB4          2
RAB11FIP1       2
COTL1           2
FAM69A          2
MREG            2
IKBIP           2
AP1S2           2
SFN             2
KDELR3          2
RHOBTB3         2
MIEN1           2
CD9             2
FRMD6           2
HES1            2
TFAP2A          2
CEBPD           2
CREB3L4         2
SLC7A11         2
FAM46A          2
ID1             2
EXT1            2
MAFF            2
PDCD4           2
ELL3            2
Name: exp, dtype: int64