In [1]:
import os

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [3]:
# Folder that receives every table written by this notebook.
resultsdir = '/users/genomics/xoel/codebases/co_new/results_foxg1/'
os.makedirs(resultsdir, exist_ok=True)
# Switch the working directory so that relative output paths also land here.
os.chdir(resultsdir)

In [4]:
# Sample names to analyse; they are matched against the sub-folders of the data directory.
selected_samples = (
    ['RGCmaturation']
    + [f'NeuralPCW{week}' for week in (16, 20, 21, 24)]
    + ['Gliogenesis']
)

In [5]:
datadir = '/users/genomics/xoel/codebases/co_new/data_foxg1/'
# Map every selected sample that exists as a visible sub-folder of `datadir`
# to its folder path (with trailing slash), in alphabetical order.
dirlist = {
    item: f'{datadir}{item}/'
    for item in sorted(os.listdir(datadir))
    if item in selected_samples
    and not item.startswith('.')
    and os.path.isdir(os.path.join(datadir, item))
}
dirlist

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data_foxg1/Gliogenesis/',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW16/',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW20/',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW21/',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW24/'}

In [6]:
# SLURM reports this variable as `count[(xnodes)][,count...]` (e.g. "16(x2)"),
# which `int()` cannot parse; keep only the leading count of the first entry.
_slurm_cpus = (os.getenv('SLURM_JOB_CPUS_PER_NODE') or '').split(',')[0].split('(')[0].strip()
# Fall back to 16 workers when the variable is unset or is not a plain number.
n_cores = int(_slurm_cpus) if _slurm_cpus.isdigit() else 16
n_cores

96

# Load genes of interest

## Gene lists

### Diseases

In [7]:
# Gene x disorder association table, indexed by gene.
diseases = pd.read_csv(
    '/users/genomics/xoel/codebases/cortical_disorders2/data/gene_disorder_associations.intersectionExpData.csv',
    index_col=0,
)
# Keep only the rows whose row sum is non-zero.
diseases = diseases.loc[diseases.sum(1).astype(bool)]

disease_genes = diseases.index.drop_duplicates().tolist()

### TFs

In [8]:
# Transcription-factor catalogue; only rows flagged 'Yes' in the 'Is TF?' column are kept.
tfs = pd.read_csv(
    '/users/genomics/xoel/codebases/cortical_disorders2/raw/HumanTFs/DatabaseExtract_v_1.01.csv',
    index_col=0,
)
tfs = tfs.loc[tfs['Is TF?'].eq('Yes')]
# Unique, whitespace-stripped HGNC symbols in order of first appearance.
tf_genes = tfs['HGNC symbol'].str.strip().drop_duplicates().tolist()

______________________

In [9]:
# Registry of named gene lists; later cells add more entries to it.
glists = {}
glists['Diseases'] = disease_genes
glists['TFs'] = tf_genes

### Regulons

#### Disease

In [10]:
# Regulon table for the disease gene sets (one row per Core -> Target pair).
disreg = pd.read_csv('/users/genomics/xoel/codebases/cortical_disorders2/results/RCT_diseases/cisTarget_regulons.csv')

disreg_cores = disreg['Core'].unique().tolist()
disreg_targets = disreg['Target'].unique().tolist()
# Sorted union of every gene appearing as core or target.
disreg_genes = np.unique(disreg[['Core', 'Target']].to_numpy().ravel()).tolist()

glists.update({
    'Disease regulon': disreg_genes,
    'Disease core': disreg_cores,
    'Disease target': disreg_targets,
})

In [11]:
# Per-geneSet core and target lists, keyed '<geneSet> core' / '<geneSet> target'.
disset_cores = {
    gene_set + ' core': regs['Core'].unique().tolist()
    for gene_set, regs in disreg.groupby('geneSet')
}
disset_targets = {
    gene_set + ' target': regs['Target'].unique().tolist()
    for gene_set, regs in disreg.groupby('geneSet')
}

# Cores are registered before targets.
for per_set_lists in (disset_cores, disset_targets):
    glists.update(per_set_lists)

#### Peaks

In [12]:
# Regulon table for the peak gene sets (one row per Core -> Target pair).
peakreg = pd.read_csv('/users/genomics/xoel/codebases/cortical_disorders2/results/RCT_peaks/cisTarget_regulons.csv')

# Convert to plain lists, as done for the disease regulons, so that every
# entry of `glists` has the same type.
peakreg_genes = np.unique(peakreg[['Core', 'Target']].to_numpy().flatten()).tolist()
peakreg_cores = peakreg['Core'].unique().tolist()
peakreg_targets = peakreg['Target'].unique().tolist()


glists['Peak regulon'] = peakreg_genes
glists['Peak core'] = peakreg_cores
glists['Peak target'] = peakreg_targets

In [13]:
# Per-geneSet core and target lists, keyed '<geneSet> core' / '<geneSet> target'.
peakset_cores = {
    gene_set + ' core': regs['Core'].unique().tolist()
    for gene_set, regs in peakreg.groupby('geneSet')
}
peakset_targets = {
    gene_set + ' target': regs['Target'].unique().tolist()
    for gene_set, regs in peakreg.groupby('geneSet')
}

# Cores are registered before targets.
for per_set_lists in (peakset_cores, peakset_targets):
    glists.update(per_set_lists)

In [14]:
# Sorted unions of the disease and peak regulon gene lists.
regulon_genes = sorted(set(disreg_genes) | set(peakreg_genes))
regulon_cores = sorted(set(disreg_cores) | set(peakreg_cores))
regulon_targets = sorted(set(disreg_targets) | set(peakreg_targets))

glists.update({
    'Regulon gene': regulon_genes,
    'Regulon core': regulon_cores,
    'Regulon target': regulon_targets,
})

In [15]:
# Named gene subsets, each a sorted array of unique symbols.
g_subsets = {
    'Cores': np.unique(glists['Regulon core']),
    'Cores+Dis': np.unique([*glists['Regulon core'], *glists['Diseases']]),
    'DiseaseRisk': np.unique(glists['Diseases']),
    'PeakCores+DisReg': np.unique([*glists['Regulon core'], *glists['Disease regulon']]),
}

# Links filtering

In [34]:
# Per-sample folder with the per-cluster GRN tables.
links_folders = {sample: f'{folder}cluster_GRN/' for sample, folder in dirlist.items()}
links_folders

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data_foxg1/Gliogenesis/cluster_GRN/',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW16/cluster_GRN/',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW20/cluster_GRN/',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW21/cluster_GRN/',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW24/cluster_GRN/'}

In [35]:
# Collect the raw GRN links of every cluster of every sample and flag the
# links that are also present in the filtered GRN of the same cluster.
grn_links = []

for sample, link_folder in tqdm(links_folders.items()):
    # Ignore hidden entries (names starting with a dot).
    link_files = [f for f in os.listdir(link_folder) if not f.startswith('.')]
    # A file belongs to the cluster named by the text before its first dot.
    # Sorted so that the row order of the output is the same on every run.
    clusters = sorted({f.split('.')[0] for f in link_files})

    grns_df = []

    for cl in tqdm(clusters, colour='yellow'):
        # Compare the cluster name exactly: a substring test (`cl in f`) lets a
        # cluster pick up the file of another cluster whose name contains it.
        cl_files = [f for f in link_files if f.split('.')[0] == cl]

        raw_grn = pd.read_csv(
            os.path.join(link_folder, [f for f in cl_files if 'raw_GRN' in f][0]),
            index_col=[0]).sort_values(by='p', ascending=True)

        # Only the index of the filtered table is used, so it is not sorted.
        filt_grn = pd.read_csv(
            os.path.join(link_folder, [f for f in cl_files if 'filtered_GRN' in f][0]),
            index_col=[0])

        raw_grn['cell.type'] = cl
        # A raw link is "kept" when its index label also occurs in the filtered table.
        raw_grn['kept'] = raw_grn.index.isin(filt_grn.index)

        grns_df.append(raw_grn)

    grns_df = pd.concat(grns_df, ignore_index=True)
    grns_df['Sample'] = sample

    grn_links.append(grns_df)

grn_links = pd.concat(grn_links, ignore_index=True)
print(grn_links.shape)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

(5980706, 9)


In [36]:
# Drop links that have a zero coefficient, no p-value and were not kept.
# The comparison needs its own parentheses: `&` binds tighter than `==`, so
# without them `coef_mean` is compared against the result of the `&` chain
# and the `p` / `kept` conditions are not applied as intended.
grn_links = grn_links[~((grn_links['coef_mean'] == 0) & grn_links['p'].isna() & ~grn_links['kept'])]

In [37]:
# Save the filtered link table as a tab-separated text file.
grn_links.to_csv(resultsdir+'LinkFiltering.txt', index=False, header=True, sep='\t')

In [38]:
# An Excel sheet holds at most 1,048,576 rows, and the header uses one of them.
max_excel_rows = 1048576
max_data_rows = max_excel_rows - 1

with pd.ExcelWriter('LinkFiltering.xlsx') as writer:
    for sample, sdf in grn_links.groupby('Sample'):
        print(f'Writing excel sheet: {sample}')
        if sdf.shape[0] <= max_data_rows:
            sdf.to_excel(writer, sheet_name=sample, index=False, header=True)

        else:
            print('Dividing...')
            # Write as many numbered sheets (<sample>_1, <sample>_2, ...) as
            # needed, so that no chunk plus its header exceeds the sheet limit.
            for part, start in enumerate(range(0, sdf.shape[0], max_data_rows), start=1):
                sdf.iloc[start:start + max_data_rows, :].to_excel(
                    writer, sheet_name=f'{sample}_{part}', index=False, header=True)


Writing excel sheet: Gliogenesis
Dividing...
Writing excel sheet: NeuralPCW16
Dividing...
Writing excel sheet: NeuralPCW20
Dividing...
Writing excel sheet: NeuralPCW21
Writing excel sheet: NeuralPCW24
Dividing...


# Merged scores

In [39]:
import celloracle as co

Matplotlib is building the font cache; this may take a moment.


In [40]:
# Show the (sample, folder) pairs used below.
dirlist.items()

dict_items([('Gliogenesis', '/users/genomics/xoel/codebases/co_new/data_foxg1/Gliogenesis/'), ('NeuralPCW16', '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW16/'), ('NeuralPCW20', '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW20/'), ('NeuralPCW21', '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW21/'), ('NeuralPCW24', '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW24/')])

In [41]:
# For each sample, the first file in its folder whose name contains
# 'celloracle.links' and does not contain 'Perturbation'.
links_files = {
    sample: folder + [
        fname for fname in os.listdir(folder)
        if 'celloracle.links' in fname and 'Perturbation' not in fname
    ][0]
    for sample, folder in dirlist.items()
}
links_files

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data_foxg1/Gliogenesis/Links.celloracle.links',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW16/Links.celloracle.links',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW20/Links.celloracle.links',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW21/Links.celloracle.links',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW24/Links.celloracle.links'}

In [42]:
# Stack the merged network scores of every sample into one long table.
scores_df = []
for tag, fname in links_files.items():
    print(tag)
    scores = (
        co.load_hdf5(fname)
        .merged_score
        .reset_index()
        .rename(columns={'name':'Gene', 'cluster': 'cell.type'})
        .assign(Sample=tag)
    )
    scores_df.append(scores)
scores_df = pd.concat(scores_df, axis=0, ignore_index=True)
# Flag genes that occur as a source in the link table.
known_sources = grn_links['source'].unique()
scores_df['is.source'] = scores_df['Gene'].isin(known_sources)

Gliogenesis
NeuralPCW16
NeuralPCW20
NeuralPCW21
NeuralPCW24


In [43]:
# Preview the first rows of the merged score table.
scores_df.head()

Unnamed: 0,Gene,degree_all,degree_centrality_all,degree_in,degree_centrality_in,degree_out,degree_centrality_out,betweenness_centrality,eigenvector_centrality,cell.type,module,connectivity,participation,role,Role name,Sample,is.source
0,FOS,88,0.15331,9,0.015679,79,0.137631,3170.0,1.0,Astro,5,4.311375,0.649135,Connector Hub,Connector Hub,Gliogenesis,True
1,MALAT1,21,0.036585,21,0.036585,0,0.0,0.0,0.472218,Astro,3,1.034229,0.680272,Connector,Connector,Gliogenesis,False
2,E2F1,97,0.16899,1,0.001742,96,0.167247,109.0,0.370791,Astro,4,8.938748,0.409939,Connector Hub,Connector Hub,Gliogenesis,True
3,CCNB2,8,0.013937,8,0.013937,0,0.0,0.0,0.226789,Astro,3,-0.326599,0.71875,Connector,Connector,Gliogenesis,False
4,FOXM1,32,0.055749,1,0.001742,31,0.054007,31.0,0.074852,Astro,4,2.618966,0.408203,Connector Hub,Connector Hub,Gliogenesis,True


In [44]:
# Save the network scores both as tab-separated text and as an Excel workbook.
scores_df.to_csv(resultsdir+'NetworkScores.txt', index=False, header=True, sep='\t')
scores_df.to_excel(resultsdir+'NetworkScores.xlsx', index=False, header=True)

In [63]:
# Independent copy of the FOXG1 rows, taken before `scores_df` is rebuilt below.
scores_foxg1 = scores_df.loc[scores_df['Gene'].eq('FOXG1')].copy()

### Get scores from full model


In [68]:
# Links files of the full model: same sample folders, under `data` instead of
# `data_foxg1`. The file name is looked up in the full-model folder itself,
# not in the foxg1 folder, so the path is valid even if the names differ.
full_dirs = {k: v.replace('data_foxg1', 'data') for k, v in dirlist.items()}
links_files = {
    k: d + [f for f in os.listdir(d) if 'celloracle.links' in f and 'Perturbation' not in f][0]
    for k, d in full_dirs.items()
}
links_files

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data/Gliogenesis/Links.celloracle.links',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW16/Links.celloracle.links',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW20/Links.celloracle.links',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW21/Links.celloracle.links',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW24/Links.celloracle.links'}

In [69]:
# Rebuild `scores_df` from the links files currently listed in `links_files`.
scores_df = []
for tag, fname in links_files.items():
    print(tag)
    merged = co.load_hdf5(fname).merged_score.reset_index()
    scores = merged.rename(columns={'name':'Gene', 'cluster': 'cell.type'})
    scores = scores.assign(Sample=tag)
    scores_df.append(scores)
scores_df = pd.concat(scores_df, axis=0, ignore_index=True)
# Flag genes that occur as a source in the link table.
scores_df['is.source'] = scores_df['Gene'].isin(grn_links['source'].unique())

Gliogenesis
NeuralPCW16
NeuralPCW20
NeuralPCW21
NeuralPCW24


In [70]:
# Append the FOXG1 rows saved earlier. Re-running this cell appends them again.
scores_df = pd.concat([scores_df, scores_foxg1], ignore_index=True)

In [71]:
# Save the combined scores both as tab-separated text and as an Excel workbook.
scores_df.to_csv(resultsdir+'NetworkScoresFull.txt', index=False, header=True, sep='\t')
scores_df.to_excel(resultsdir+'NetworkScoresFull.xlsx', index=False, header=True)

# Perturbation per gene

### Helper functions

In [16]:
import numpy as np

In [17]:
from tqdm.contrib.concurrent import process_map  # or thread_map

In [18]:
def _summarize_pert(df):
    max_sim = max(df['Sim.step'])
    n_replicates = len(df['Replicate'].unique())
    n_cells = len(df['Cell.ID'].unique())
    
    seriess = []
    for (ct, rep), d in df[df['Sim.step'].isin([0,max_sim])].groupby(['cell.type','Replicate']):

        n_initial = (d['Sim.step'] == 0).sum()
        n_final = (d['Sim.step'] == max_sim).sum()
        n_change = n_final - n_initial
                
        ratio = n_final/n_initial
        log2_ratio = np.log2(ratio)

        seriess.append(pd.Series({
            'cell.type':ct, 
            'Replicate': rep,
            'n_initial':n_initial, 
            'n_final': n_final, 
            'n_change': n_change,
            'ratio': ratio, 
            'log2_ratio': log2_ratio}))

    summary = pd.concat(seriess, axis=1).T.groupby('cell.type').mean().reset_index().drop('Replicate', axis=1)
    
    summary['pct_initial'] = summary['n_initial']/n_cells * 100
    summary['pct_final'] = summary['n_final']/n_cells * 100
    summary['pct_change'] = summary['pct_final'] - summary['pct_initial']
    
    summary['n_steps'] = max_sim
    summary['n_replicates'] = n_replicates
    
    return(summary)

In [19]:
def _load_perturbation(pertdir, gp, summary=True, ct2ct=True, ct_common_labels=None):
    gene, exp = gp.split('_') 
    gene = '.'.join(gene.split('.')[1:])
    exp = '.'.join(exp.split('.')[:-1])
    # print(gene, exp)

    # pert_df = pd.read_csv(f'{pertdir}{gp}/markov_simulation_cell_types.csv', index_col=0).reset_index().rename(columns={'index':'Cell.ID'})
    pert_df = pd.read_csv(f'{pertdir}{gp}',
                          index_col=0).reset_index().rename(columns={'index':'Cell.ID'})
    
    if ct_common_labels:
        pert_df = pert_df.replace(ct_common_labels)
    
    pert_df['Replicate'] = pert_df.groupby('Cell.ID').apply(lambda x: pd.Series(list(range(len(x))), index=x.index)).values
    pert_df = pert_df.melt(id_vars=['Cell.ID', 'Replicate'], var_name='Sim.step', value_name='cell.type')
    pert_df['Sim.step'] = pert_df['Sim.step'].astype(int)
    
    pert_df['Gene'] = gene
    pert_df['Exp'] = exp
    pert_df['Pert'] = 'KO' if float(exp) == 0 else 'OE'
    
    results = {'df': pert_df}

    if summary:
        summary = _summarize_pert(pert_df)
        summary['Gene'] = gene
        summary['Exp'] = exp
        summary['Pert'] = 'KO' if float(exp) == 0 else 'OE'
    
        results['summary'] = summary
        
    if ct2ct:
        max_sim = max(pert_df['Sim.step'])

        inits = pert_df[pert_df['Sim.step']==0].drop('Replicate', axis=1)
        inits = inits[~inits.duplicated()].value_counts('cell.type')
        
        trans = pert_df[pert_df['Sim.step'].isin([0,max_sim])].groupby(['Replicate', 'Cell.ID'])['cell.type'].apply(lambda x: {'CTO': x.iloc[0], 'CTF': x.iloc[1]}).unstack()
        trans = trans.reset_index().value_counts(['Replicate', 'CTO', 'CTF']).reset_index().groupby(['CTO', 'CTF'])[0].mean().reset_index().rename({0: 'trans.cells'}, axis=1)

        for cto in trans['CTO'].unique():
            for ctf in trans['CTO'].unique():
                if ~((trans['CTO'] == cto ) & (trans['CTF']==ctf)).any():
                    trans = pd.concat([
                        trans,
                        pd.DataFrame({'CTO':cto, 'CTF':ctf, 'trans.cells':0}, index=[0])], 
                        axis=0, ignore_index=True)
        
        trans['init.cells'] = inits[trans.CTO].tolist()
        trans['trans.pct'] = trans['trans.cells']/trans['init.cells']*100
        
        trans['Gene'] = gene
        trans['Exp'] = exp
        trans['Pert'] = 'KO' if float(exp) == 0 else 'OE'

        # We want to show negative values for self-transitions
        # trans['trans.pct'] = np.where(
        #     trans['CTO']==trans['CTF'],
        #     trans['trans.pct']-100,
        #     trans['trans.pct'])
        
        results['ct2ct'] = trans
    
    return results

### For main figure

In [20]:
# Knock-out transition folders under `pert_KO/`.
# NOTE: the next cell rebinds `pert_folders_ko` to the `pert_KO_full/` folders,
# so this value is not the one used by the loading loop when cells run in order.
pert_folders_ko = {k: f+'pert_KO/perturbation_transitions/' for k, f in dirlist.items()}
# pert_folders['Gliogenesis'] = pert_folders['Gliogenesis'].replace('pert_KO', 'perturbations')
pert_folders_ko

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data_foxg1/Gliogenesis/pert_KO/perturbation_transitions/',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW16/pert_KO/perturbation_transitions/',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW20/pert_KO/perturbation_transitions/',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW21/pert_KO/perturbation_transitions/',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW24/pert_KO/perturbation_transitions/'}

In [21]:
# Knock-out transition folder of every sample (`pert_KO_full` runs).
pert_folders_ko = {
    sample: f'{folder}pert_KO_full/perturbation_transitions/'
    for sample, folder in dirlist.items()
}
pert_folders_ko

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data_foxg1/Gliogenesis/pert_KO_full/perturbation_transitions/',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW16/pert_KO_full/perturbation_transitions/',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW20/pert_KO_full/perturbation_transitions/',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW21/pert_KO_full/perturbation_transitions/',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW24/pert_KO_full/perturbation_transitions/'}

In [22]:
# Per-sample summary tables and cell-type transition tables of the KO runs.
perturbations_ko = []
perts_ct2ct_ko = []

# Load every knock-out simulation file of every sample in parallel.
for sample, pertdir in pert_folders_ko.items():
    
    print(sample)
    # if not sample in ['Gliogenesis', 'NeuralPCW20']:
    #     continue
    
    # Group all RGC cell types in neurogenesis
    # (applies only to samples whose name contains 'Neural').
    if 'Neural' in sample:
        ct_common_labels = {
            'vRG E': 'RGC',
            'vRG L': 'RGC',
            'tRG': 'RGC',
            'oRG E': 'RGC',
            'oRG L': 'RGC'
        }
    else:
        ct_common_labels = None
    # Single-argument wrapper for `process_map`. `pertdir` is bound as a
    # default at definition time; `ct_common_labels` is looked up when the
    # wrapper is called.
    def __load_perturbation(gp, pertdir=pertdir):
        return(_load_perturbation(pertdir=pertdir,
                                  ct_common_labels=ct_common_labels,
                                  gp=gp))

    # raise
    # Every entry of the folder except notebook checkpoints is treated as a simulation file.
    folders = [x for x in os.listdir(pertdir) if '.ipynb' not in x]
    results = process_map(__load_perturbation, 
                          folders, 
                          max_workers=n_cores)
    
    
    # Stack the per-file tables and tag them with the sample name.
    pert_dfs = pd.concat([x['summary'] for x in results], ignore_index=True)
    pert_dfs['Sample'] = sample
    perturbations_ko.append(pert_dfs)
    
    ct2ct = pd.concat([x['ct2ct'] for x in results], ignore_index=True)
    ct2ct['Sample'] = sample
    perts_ct2ct_ko.append(ct2ct)

    # break
    
    

Gliogenesis


  0%|          | 0/174 [00:00<?, ?it/s]

  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)


NeuralPCW16


  0%|          | 0/152 [00:00<?, ?it/s]

NeuralPCW20


  0%|          | 0/177 [00:00<?, ?it/s]

  log2_ratio = np.log2(ratio)


NeuralPCW21


  0%|          | 0/150 [00:00<?, ?it/s]

NeuralPCW24


  0%|          | 0/178 [00:00<?, ?it/s]

  log2_ratio = np.log2(ratio)


In [23]:
# Over-expression transition folder of every sample (`pert_OE_full` runs).
pert_folders_oe = {
    sample: f'{folder}pert_OE_full/perturbation_transitions/'
    for sample, folder in dirlist.items()
}
pert_folders_oe

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data_foxg1/Gliogenesis/pert_OE_full/perturbation_transitions/',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW16/pert_OE_full/perturbation_transitions/',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW20/pert_OE_full/perturbation_transitions/',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW21/pert_OE_full/perturbation_transitions/',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW24/pert_OE_full/perturbation_transitions/'}

In [24]:
# Per-sample summary tables and cell-type transition tables of the OE runs.
perturbations_oe = []
perts_ct2ct_oe = []
# Load every over-expression simulation file of every sample in parallel.
for sample, pertdir in pert_folders_oe.items():
    
    print(sample)
    # if not sample in ['Gliogenesis', 'NeuralPCW20']:
    #     continue

    # Group all RGC cell types in neurogenesis
    # (applies only to samples whose name contains 'Neural').
    # if sample =='NeuralPCW21':
    #     continue
        
    if 'Neural' in sample:
        ct_common_labels = {
            'vRG E': 'RGC',
            'vRG L': 'RGC',
            'tRG': 'RGC',
            'oRG E': 'RGC',
            'oRG L': 'RGC'
        }
    else:
        ct_common_labels = None
    # Single-argument wrapper for `process_map`. `pertdir` is bound as a
    # default at definition time; `ct_common_labels` is looked up when the
    # wrapper is called.
    def __load_perturbation(gp, pertdir=pertdir):
        return(_load_perturbation(pertdir=pertdir,
                                  ct_common_labels=ct_common_labels,
                                  gp=gp))

    # raise
    # Every entry of the folder except notebook checkpoints is treated as a simulation file.
    folders = [x for x in os.listdir(pertdir) if '.ipynb' not in x]
    results = process_map(__load_perturbation, 
                          folders, 
                          max_workers=n_cores)
    
    
    # Stack the per-file tables and tag them with the sample name.
    pert_dfs = pd.concat([x['summary'] for x in results], ignore_index=True)
    pert_dfs['Sample'] = sample
    perturbations_oe.append(pert_dfs)
    
    ct2ct = pd.concat([x['ct2ct'] for x in results], ignore_index=True)
    ct2ct['Sample'] = sample
    perts_ct2ct_oe.append(ct2ct)

    # break
    

Gliogenesis


  0%|          | 0/174 [00:00<?, ?it/s]

  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)
  log2_ratio = np.log2(ratio)


NeuralPCW16


  0%|          | 0/152 [00:00<?, ?it/s]

NeuralPCW20


  0%|          | 0/177 [00:00<?, ?it/s]

NeuralPCW21


  0%|          | 0/150 [00:00<?, ?it/s]

NeuralPCW24


  0%|          | 0/178 [00:00<?, ?it/s]

In [25]:
# One table each for summaries and transitions: KO rows first, then OE rows.
perturbations = pd.concat([*perturbations_ko, *perturbations_oe], ignore_index=True)
perts_ct2ct = pd.concat([*perts_ct2ct_ko, *perts_ct2ct_oe], ignore_index=True)

In [26]:
# perturbations = pd.concat(perturbations_ko, ignore_index=True)
# perts_ct2ct = pd.concat(perts_ct2ct_ko, ignore_index=True)

In [128]:
# Check which cell-type labels the Gliogenesis summary contains.
perturbations.groupby('Sample')['cell.type'].unique()['Gliogenesis']

array(['Astro', 'OPC', 'mGPC', 'oRG E', 'oRG L', 'tRG', 'vRG E', 'vRG L'],
      dtype=object)

In [27]:
# Save the perturbation summaries as tab-separated text and as an Excel workbook.
perturbations.to_csv(resultsdir+'PerturbationSummary.txt', index=False, header=True, sep='\t')
perturbations.to_excel(resultsdir+'PerturbationSummary.xlsx', index=False, header=True)

In [28]:
# Save the cell-type transition tables as tab-separated text and as an Excel workbook.
perts_ct2ct.to_csv(resultsdir+'PerturbationCT2CT.txt', index=False, header=True, sep='\t')
perts_ct2ct.to_excel(resultsdir+'PerturbationCT2CT.xlsx', index=False, header=True)

In [131]:
# Keep independent copies of the current tables under foxg1-specific names.
perturbations_foxg1 = perturbations.copy()
perts_ct2ct_foxg1 = perts_ct2ct.copy()

### Split RGCs

## Load results from full perturbation model

# Perturbation Score Sums (PSSs)

In [29]:
# Folder with the perturbation-score CSV files of every sample (`pert_KO_full` runs).
pss_folders = {
    sample: f'{folder}pert_KO_full/perturbation_hdf5/'
    for sample, folder in dirlist.items()
}
pss_folders

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data_foxg1/Gliogenesis/pert_KO_full/perturbation_hdf5/',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW16/pert_KO_full/perturbation_hdf5/',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW20/pert_KO_full/perturbation_hdf5/',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW21/pert_KO_full/perturbation_hdf5/',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data_foxg1/NeuralPCW24/pert_KO_full/perturbation_hdf5/'}

In [30]:
# Read every CSV of every sample folder, tag the rows with the sample name
# and stack everything into one table.
psss = pd.concat(
    [
        pd.concat(
            [pd.read_csv(folder + fname, index_col=0)
             for fname in os.listdir(folder) if '.csv' in fname],
            ignore_index=True,
        ).assign(sample=sample_name)
        for sample_name, folder in pss_folders.items()
    ],
    ignore_index=True,
)

In [31]:
# Display the stacked perturbation-score table.
psss

Unnamed: 0,group,score,gene,sample
0,vRG,0.055545,TFAP2C,Gliogenesis
1,vtRG,0.054022,TFAP2C,Gliogenesis
2,oRG,0.092332,TFAP2C,Gliogenesis
3,RG E,0.042577,TFAP2C,Gliogenesis
4,RG L,0.055545,TFAP2C,Gliogenesis
...,...,...,...,...
28382,not_nIPC,1.240779,SOX8,NeuralPCW24
28383,not_Neu E,1.894556,SOX8,NeuralPCW24
28384,not_GluN5,1.441386,SOX8,NeuralPCW24
28385,not_GluN6,1.560752,SOX8,NeuralPCW24


In [32]:
# Make `group` categorical, with categories in order of first appearance.
group_order = psss['group'].unique()
psss['group'] = pd.Categorical(psss['group'], categories=group_order)

In [33]:
# Save the perturbation scores as a tab-separated text file.
psss.to_csv(resultsdir+'PSSs.txt', index=False, header=True, sep='\t')

In [79]:
# psss_wide = psss.pivot(index='gene', columns=['sample', 'group'], values='log1p')
# psss_wide.to_excel(resultsdir+'PSSs.xlsx', index=False, header=True)

In [80]:
# Keep an independent copy of the current scores before `psss` is rebuilt below.
psss_foxg1 = psss.copy()

### Add full model

In [81]:
# Perturbation-score folders of the full model: same samples, under `data`
# instead of `data_foxg1`.
pss_folders = {
    sample: folder.replace('data_foxg1', 'data') + 'perturbations/perturbation_hdf5/'
    for sample, folder in dirlist.items()
}
pss_folders

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data/Gliogenesis/perturbations/perturbation_hdf5/',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW16/perturbations/perturbation_hdf5/',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW20/perturbations/perturbation_hdf5/',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW21/perturbations/perturbation_hdf5/',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW24/perturbations/perturbation_hdf5/'}

In [82]:
# Display the folders that the next cell reads.
pss_folders

{'Gliogenesis': '/users/genomics/xoel/codebases/co_new/data/Gliogenesis/perturbations/perturbation_hdf5/',
 'NeuralPCW16': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW16/perturbations/perturbation_hdf5/',
 'NeuralPCW20': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW20/perturbations/perturbation_hdf5/',
 'NeuralPCW21': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW21/perturbations/perturbation_hdf5/',
 'NeuralPCW24': '/users/genomics/xoel/codebases/co_new/data/NeuralPCW24/perturbations/perturbation_hdf5/'}

In [83]:
# Read every CSV of every sample folder (with a progress bar over samples),
# tag the rows with the sample name and stack everything into one table.
psss = pd.concat(
    [
        pd.concat(
            [pd.read_csv(folder + fname, index_col=0)
             for fname in os.listdir(folder) if '.csv' in fname],
            ignore_index=True,
        ).assign(sample=sample_name)
        for sample_name, folder in tqdm(pss_folders.items())
    ],
    ignore_index=True,
)

  0%|          | 0/5 [00:00<?, ?it/s]

In [84]:
# Display the stacked perturbation-score table.
psss

Unnamed: 0,group,score,gene,sample
0,vRG,0.009213,MEF2C,Gliogenesis
1,vtRG,0.009381,MEF2C,Gliogenesis
2,oRG,0.158319,MEF2C,Gliogenesis
3,RG E,0.004935,MEF2C,Gliogenesis
4,RG L,0.009213,MEF2C,Gliogenesis
...,...,...,...,...
26806,not_nIPC,19.159503,CREB5,NeuralPCW24
26807,not_Neu E,20.262669,CREB5,NeuralPCW24
26808,not_GluN5,7.495030,CREB5,NeuralPCW24
26809,not_GluN6,17.094604,CREB5,NeuralPCW24


In [85]:
# Make `group` categorical, with categories in order of first appearance.
group_order = psss['group'].unique()
psss['group'] = pd.Categorical(psss['group'], categories=group_order)

In [86]:
# Stack the current scores and the saved foxg1 scores, labelling their origin in `model`.
psssfull = pd.concat(
    [table.assign(model=label)
     for label, table in (('full', psss), ('foxg1', psss_foxg1))],
    axis=0,
    ignore_index=True,
)

In [87]:
# Number of rows contributed by each model.
psssfull.model.value_counts()

full     26811
foxg1     3993
Name: model, dtype: int64

In [88]:
# Save the combined scores as a tab-separated text file.
psssfull.to_csv(resultsdir+'PSSsFull.txt', index=False, header=True, sep='\t')

# Gene presence

In [45]:
import pandas as pd
import numpy as np

In [46]:
## Files

In [47]:
from tqdm.auto import tqdm

In [48]:
import scanpy as sc
import pandas as pd
import numpy as np
from tqdm.contrib.concurrent import process_map  # or thread_map

In [51]:
# Inputs searched for gene presence, as paths relative to a sample folder.
paths = pd.DataFrame({'path': {
    'RNA dataset': 'RNA.unprocessed.h5ad',
    'RNA HV': 'RNA.processed.h5ad',
    'ATAC GRN': 'base_grn.coaccessibility=0.8.score=8.base_grn.parquet',
    'CT GRN dir': 'cluster_GRN/',
    'Perturbation dir': 'pert_KO_full/perturbation_transitions/'
}})

# Entries ending in '/' are folders; otherwise the type is the file extension.
paths['type'] = [
    'folder' if entry.endswith('/') else entry.rsplit('.', 1)[-1]
    for entry in paths['path']
]

paths

Unnamed: 0,path,type
ATAC GRN,base_grn.coaccessibility=0.8.score=8.base_grn....,parquet
CT GRN dir,cluster_GRN/,folder
Perturbation dir,pert_KO_full/perturbation_transitions/,folder
RNA HV,RNA.processed.h5ad,h5ad
RNA dataset,RNA.unprocessed.h5ad,h5ad


In [52]:
def search_genes_in_pertdir(x, genetable):
    """Flag the genes of `genetable` that have an entry in a perturbation folder.

    Parameters
    ----------
    x : mapping
        Must provide ``x['path']``, the folder to list.
    genetable : pandas.DataFrame
        Table indexed by gene; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        `genetable` with an added boolean column ``'Perturbed'``.
    """
    pert_genes = set()
    for entry in os.listdir(x['path']):
        # Skip notebook checkpoints, which are not perturbation results.
        if '.ipynb' in entry:
            continue
        # The gene is the text between the first dot and the last underscore.
        # Joining the remaining dot-separated parts keeps gene names that
        # contain dots intact, and never raises on unexpected names.
        gene_part = entry.rsplit('_', 1)[0]
        pert_genes.add('.'.join(gene_part.split('.')[1:]))

    genetable['Perturbed'] = genetable.index.isin(list(pert_genes))

    return(genetable)

def search_genes_in_ct_grns(x, genetable):
    """Flag the genes of `genetable` found as source or target in each cluster GRN.

    Every entry of ``x['path']`` whose name contains neither 'raw_GRN' nor
    '.ipynb' is read as a CSV. For the cluster named by the text before the
    first dot, two boolean columns are added to `genetable` (in place):
    '<cluster> GRN source' and '<cluster> GRN target'.
    """
    folder = x['path']

    # cluster name -> CSV path
    cluster_files = {}
    for fname in os.listdir(folder):
        if 'raw_GRN' in fname or '.ipynb' in fname:
            continue
        cluster_files[fname.split('.')[0]] = folder + fname

    # Read every table before touching `genetable`.
    cluster_tables = {cluster: pd.read_csv(csv_path, index_col=0)
                      for cluster, csv_path in cluster_files.items()}

    for cluster, links in cluster_tables.items():
        for role in ('source', 'target'):
            role_genes = links[role].unique().tolist()
            genetable[f'{cluster} GRN {role}'] = genetable.index.isin(role_genes)

    return genetable

def search_genes_in_parquet(x, genetable):
    """Flag the genes of `genetable` present in a parquet table.

    Column names from the third column onwards are matched for the
    '<x.name> SOURCE' flag, and the values of ``gene_short_name`` for the
    '<x.name> TARGET' flag. `genetable` is modified in place and returned.
    """
    table = pd.read_parquet(x['path'])

    source_names = table.columns[2:].unique().tolist()
    target_names = table['gene_short_name'].unique().tolist()

    for suffix, names in ((' SOURCE', source_names), (' TARGET', target_names)):
        genetable[x.name + suffix] = genetable.index.isin(names)

    return genetable

def search_genes_in_h5ad(x, genetable):
    """Flag the genes of `genetable` present in the variables of an h5ad file.

    The file at ``x['path']`` is opened in backed mode and only its ``var``
    index is read. A boolean column named ``x.name`` is added to `genetable`
    (in place), which is then returned.
    """
    data = sc.read_h5ad(x['path'], backed=True)
    try:
        gset = data.var.index.unique().tolist()
    finally:
        # Backed mode keeps the file open; release the handle once the
        # variable names have been copied.
        data.file.close()

    genetable[x.name] = genetable.index.isin(gset)

    return(genetable)

def search_genes(x, genelists):
    """Build a gene-presence table for one input described by `x`.

    Parameters
    ----------
    x : pandas.Series
        One row of the paths table; ``x['path']`` and ``x['type']`` select
        which search is run, and ``x.name`` labels the resulting columns.
    genelists : dict
        Gene-list name -> iterable of genes.

    Returns
    -------
    pandas.DataFrame
        Indexed by every gene found in `genelists`, with one boolean column
        per gene list followed by the columns added by the matching search.
    """
    # Use the `genelists` argument (not a notebook global), and sort the
    # genes so that the row order is the same on every run.
    genes = pd.Series(sorted({g for gs in genelists.values() for g in gs}, key=str))
    genetable = pd.DataFrame({k: genes.isin(gs) for k, gs in genelists.items()})
    genetable.index = genes.tolist()

    path = x['path']

    if x['type'] == 'parquet':
        genetable = search_genes_in_parquet(x, genetable=genetable)

    if x['type'] == 'h5ad':
        genetable = search_genes_in_h5ad(x, genetable=genetable)

    if 'cluster_GRN/' in path:
        genetable = search_genes_in_ct_grns(x, genetable=genetable)

    # Perturbation results live in `.../perturbation_transitions/`; the older
    # `perturbations/` test alone never matched the configured
    # `pert_KO_full/perturbation_transitions/` folder, so 'Perturbed' was
    # never computed.
    if 'perturbation_transitions/' in path or 'perturbations/' in path:
        genetable = search_genes_in_pertdir(x, genetable=genetable)

    return(genetable)

def search_path_prefix(prefix, paths, genelists):
    """Build the gene-presence table of one sample folder.

    Parameters
    ----------
    prefix : str
        Sample folder, prepended to every relative path in `paths`.
    paths : pandas.DataFrame
        Table with the columns ``path`` and ``type``; it is not modified.
    genelists : dict
        Gene-list name -> iterable of genes.

    Returns
    -------
    pandas.DataFrame
        The gene-list membership columns merged (on the gene index) with the
        presence columns produced for every row of `paths`.
    """
    # Work on a new frame so the caller's `paths` is left untouched.
    paths = paths.assign(path=prefix + paths['path'])

    list_columns = list(genelists.keys())

    genetable = None
    dfs = list()
    for i in tqdm(range(paths.shape[0])):
        df = search_genes(x=paths.iloc[i,:], genelists=genelists)
        # The gene-list columns are identical for every input; keep them once.
        if genetable is None:
            genetable = df[list_columns]
        dfs.append(df.drop(list_columns, axis=1))

    genetable = pd.merge(
        genetable,
        pd.concat(dfs, axis=1, ignore_index=False),
        left_index=True, right_index=True)

    return(genetable)

def _search_path_prefix(prefix):
    """Single-argument form of `search_path_prefix` using the notebook's `paths` and `glists`."""
    fresh_paths = paths.copy()
    return search_path_prefix(prefix=prefix, paths=fresh_paths, genelists=glists)


In [53]:
# presence_dfs = {k: search_path_prefix(prefix=v, paths=paths.copy(), genelists=glists) for k, v in dirlist.items()}
# presence_dfs

In [54]:
# Split `dirlist` into parallel tuples of sample names and folders.
ks, vs = zip(*dirlist.items())

In [55]:
# Build the presence table of every sample in parallel (one worker per
# sample) and key the results by sample name.
sample_tables = process_map(_search_path_prefix, vs, max_workers=len(vs))
presence_dfs = dict(zip(ks, sample_tables))

  0%|          | 0/5 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: '/users/genomics/xoel/codebases/co_new/data_foxg1/Gliogenesis/base_grn.coaccessibility=0.8.score=8.base_grn.parquet'

In [None]:
# Check, per sample, that the whitespace-stripped gene index has no duplicates.
[df.index.str.strip().is_unique for k, df in presence_dfs.items()]

In [None]:
# Output folder for the gene-presence tables.
presencedir =  resultsdir+'GenePresence/'
os.makedirs(presencedir, exist_ok=True)

In [None]:
# One Excel sheet per sample, plus a plain-text copy of each table.
with pd.ExcelWriter(presencedir+'GenePresence.xlsx') as writer:
    for sample, sdf in presence_dfs.items():
        print(f'Writing excel sheet: {sample}')
        sdf.to_excel(writer, sheet_name=sample, index=True, header=True)
        # `to_csv` writes text, so the file needs a .csv extension; with
        # .xlsx the file would not be a valid Excel workbook.
        sdf.to_csv(presencedir+sample+'.csv', index=True, header=True)
        

In [None]:
### Long format

In [None]:
# Melt every presence table to long format (one row per gene and presence
# column), keeping the gene-list membership columns as identifiers.
pres_long = []
for sample, df in presence_dfs.items():
    df = (
        df.reset_index()
        .melt(id_vars=['index'] + list(glists.keys()))
        .rename(columns={'index': 'Gene'})
    )
    df['Sample'] = sample
    pres_long.append(df)
pres_long = pd.concat(pres_long, axis=0)

In [None]:
# Step number assigned to each presence column.
steps = {
    'ATAC GRN SOURCE': 1,
    'ATAC GRN TARGET': 2,
    'RNA dataset': 3,
    'RNA HV': 4,
    'Perturbed': 6,
}

pres_long['step'] = pres_long.variable.replace(steps)
# Columns mentioning 'GRN' but not 'ATAC' are assigned step 5.
pres_long.loc[
    pres_long.variable.apply(lambda name: ('GRN' in name) and not ('ATAC' in name)),
    'step',
] = 5


In [None]:
# For step-5 rows the cell type is the text before ' GRN'; other rows get NaN.
pres_long['Cell type'] = np.where(
    pres_long['step'].eq(5),
    pres_long['variable'].apply(lambda name: name.split(' GRN')[0]),
    np.nan,
)

In [None]:
# List the distinct presence columns found across samples.
pres_long.variable.unique()

In [None]:
# Save the long-format presence table as an Excel workbook and as a CSV file.
pres_long.to_excel(presencedir+'GenePresenceLong.xlsx', sheet_name='GenePresence', index=False, header=True)
pres_long.to_csv(presencedir+'GenePresenceLong.csv', index=False, header=True)

In [None]:
# Show the FOXG1 rows for the presence columns whose name starts with 'RNA'.
pres_long[(pres_long['Gene']=='FOXG1') & (pres_long['variable'].str.startswith('RNA'))]