### Target filtering procedure

In [1]:
pwd

'/home/share/huadjyin/home/zhoumin3/zhoumin/benchmark_data/01A_total_re/07grn/02graph_filter'

In [1]:
import pandas as pd
import glob
import scanpy as sc
import numpy as np
import os

In [None]:
def get_topk(df, tf, k):
    """Return the `k` highest-'importance' rows of `df` whose 'TF' equals `tf`.

    The result is ordered by descending 'importance'. When fewer than `k`
    rows match, all of them are returned; when none match, the result is
    empty.
    """
    tf_rows = df[df['TF'] == tf]
    ranked = tf_rows.sort_values(by='importance', ascending=False)
    return ranked[:k]

def filter_topk(grnboost_out, k=50):
    """Keep, for every distinct 'TF' value, its `k` highest-'importance' rows.

    Parameters
    ----------
    grnboost_out : pandas.DataFrame
        Table with at least the columns 'TF' and 'importance'.
    k : int, default 50
        Maximum number of rows retained per distinct 'TF' value.

    Returns
    -------
    pandas.DataFrame
        The retained rows. TF blocks appear in the order in which each TF
        first occurs in `grnboost_out`; rows inside a block are sorted by
        descending 'importance'. If the input contains no TF at all, an
        empty frame with the input's columns is returned.
    """
    # A single groupby pass avoids re-scanning the whole table once per TF.
    # sort=False keeps the groups in first-appearance order.
    per_tf = [
        tf_rows.sort_values('importance', ascending=False)[:k]
        for _, tf_rows in grnboost_out.groupby('TF', sort=False)
    ]
    if not per_tf:
        # pd.concat raises ValueError when handed an empty list.
        return grnboost_out.iloc[0:0]
    return pd.concat(per_tf)

def get_pc(grnboost_out, pc=95):
    """Return the rows of `grnboost_out` above the `pc`-th importance percentile.

    Parameters
    ----------
    grnboost_out : pandas.DataFrame
        Table with at least an 'importance' column.
    pc : int or float, default 95
        Percentile cut-off; the top ``(100 - pc)`` percent of rows are kept.

    Returns
    -------
    pandas.DataFrame
        The first ``floor(len(grnboost_out) * (100 - pc) / 100)`` rows after
        sorting by descending 'importance'. A `pc` of 100 or more yields an
        empty frame.
    """
    n_rows = len(grnboost_out)
    # Work with (100 - pc) / 100 rather than 1 - 0.01 * pc: the latter is
    # 0.04999999999999993 for pc=95, so truncating dropped one row whenever
    # the row count was a multiple of 20. Rounding absorbs residual float noise
    # for non-integer pc values.
    n_keep = int(round(n_rows * (100 - pc) / 100, 6))
    # A negative bound would slice from the end instead of keeping nothing.
    n_keep = max(n_keep, 0)
    return grnboost_out.sort_values('importance', ascending=False)[:n_keep]

def get_filtered_adj_list(grnboost_out):
    """Build both filtered views of an adjacency table.

    Returns a dict with two entries:
    'top50' holds `filter_topk(grnboost_out, k=50)` and
    '95pc' holds `get_pc(grnboost_out, pc=95)`.
    """
    return {
        'top50': filter_topk(grnboost_out, k=50),
        '95pc': get_pc(grnboost_out, pc=95),
    }

In [6]:
# Write the 'top50' and '95pc' filtered edge lists for every dataset and split.
# Each output CSV is written without header and without index column.

names = [
    'adamsonweissman2016_gsm2406675_1',
    'adamsonweissman2016_gsm2406677_2',
    'datlingerbock2017_stimulated',
    'datlingerbock2017_unstimulated',
    'datlingerbock2021_stimulated',
    'datlingerbock2021_unstimulated',
    'dixit_combined',
    'dixit_gsm2396858',
    'dixit_gsm2396861',
    'papalexisatija2021_eccite_arrayed_rna',
    'papalexisatija2021_eccite_rna',
    'tiankampmann2019_ipsc',
    'tiankampmann2019_day7neuron',
    'tiankampmann2021_crispra',
    'xucao2023',
]

for name in names:
    for split in range(1, 6):
        # Load the GRNBoost adjacencies of this split (first CSV column is the index)
        grnboost_out = pd.read_csv(f'../01scenic_adj/{name}_adjacencies_{split}.csv', index_col=0)
        filtered = get_filtered_adj_list(grnboost_out)

        # One output folder per dataset, created on demand
        directory = f'./{name}'
        os.makedirs(directory, exist_ok=True)

        # 'spilt' is the spelling used in the existing output file names
        for tag in ('top50', '95pc'):
            filtered[tag].to_csv(f'./{name}/{name}_spilt{split}_{tag}.csv',
                                 index=False, header=False)
    print(f'{name} finished')


adamsonweissman2016_gsm2406675_1 finished
adamsonweissman2016_gsm2406677_2 finished
datlingerbock2017_stimulated finished
datlingerbock2017_unstimulated finished
datlingerbock2021_stimulated finished
datlingerbock2021_unstimulated finished
dixit_combined finished
dixit_gsm2396858 finished
dixit_gsm2396861 finished
papalexisatija2021_eccite_arrayed_rna finished
papalexisatija2021_eccite_rna finished
tiankampmann2019_ipsc finished
tiankampmann2019_day7neuron finished
tiankampmann2021_crispra finished
xucao2023 finished


In [4]:
# Regenerate the filtered edge lists for the three dixit datasets only.
names = ['dixit_combined', 'dixit_gsm2396858', 'dixit_gsm2396861']

for name in names:
    for split in range(1, 6):
        # Load the GRNBoost adjacencies of this split (first CSV column is the index)
        grnboost_out = pd.read_csv(f'../01scenic_adj/{name}_adjacencies_{split}.csv', index_col=0)
        filtered = get_filtered_adj_list(grnboost_out)

        # One output folder per dataset, created on demand
        directory = f'./{name}'
        os.makedirs(directory, exist_ok=True)

        # Both filtered tables are written without header and index
        for tag in ('top50', '95pc'):
            filtered[tag].to_csv(f'./{name}/{name}_spilt{split}_{tag}.csv',
                                 index=False, header=False)
    print(f'{name} finished')


dixit_combined finished
dixit_gsm2396858 finished
dixit_gsm2396861 finished


In [10]:
# Filter all five splits of a single dataset; `names` holds one dataset name.
names = 'adamsonweissman2016_gsm2406681_3'

for split in range(1, 6):
    # First CSV column is used as the index
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Output folder is created on demand
    directory = f'./{names}'
    os.makedirs(directory, exist_ok=True)

    # Both filtered tables are written without header and index
    for tag in ('top50', '95pc'):
        filtered[tag].to_csv(f'./{names}/{names}_spilt{split}_{tag}.csv',
                             index=False, header=False)
print(f'{names} finished')

adamsonweissman2016_gsm2406681_3 finished


In [11]:
# Filter all five splits of a single dataset; `names` holds one dataset name.
names = 'normanweissman2019'

for split in range(1, 6):
    # First CSV column is used as the index
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Output folder is created on demand
    directory = f'./{names}'
    os.makedirs(directory, exist_ok=True)

    # Both filtered tables are written without header and index
    for tag in ('top50', '95pc'):
        filtered[tag].to_csv(f'./{names}/{names}_spilt{split}_{tag}.csv',
                             index=False, header=False)
print(f'{names} finished')

normanweissman2019 finished


In [3]:
# Filter split 1 of a single dataset; `names` holds one dataset name.
names = 'replogle_k562_essential'

# Create the output folder up front: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
directory = f'./{names}'
os.makedirs(directory, exist_ok=True)

for split in range(1, 2):
    # Read GRNboost output (first CSV column is the index)
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Save filtered graphs without header and index.
    # 'spilt' is the spelling used in the existing output file names.
    filtered['top50'].to_csv(f'./{names}/{names}_spilt{split}_top50.csv',
                             index=False, header=False)
    filtered['95pc'].to_csv(f'./{names}/{names}_spilt{split}_95pc.csv',
                            index=False, header=False)

In [3]:
# Filter split 5 of a single dataset; `names` holds one dataset name.
names = 'replogle_rpe1_essential'

# Create the output folder up front: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
directory = f'./{names}'
os.makedirs(directory, exist_ok=True)

for split in range(5, 6):
    # Read GRNboost output (first CSV column is the index)
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Save filtered graphs without header and index.
    # 'spilt' is the spelling used in the existing output file names.
    filtered['top50'].to_csv(f'./{names}/{names}_spilt{split}_top50.csv',
                             index=False, header=False)
    filtered['95pc'].to_csv(f'./{names}/{names}_spilt{split}_95pc.csv',
                            index=False, header=False)

## 1

In [None]:
# Generate filtered adjacency files for GRNboost graph (splits 2-5)

#names = ['norman'] 
#names = ['tian2019_neuron_hvg', 'tian2019_ipsc_hvg', 'jost2020_hvg', 'replogle2020_hvg']
# `names` must be a list: iterating over the bare string 'datlingerbock'
# ran the loop body once per character and redid every split 13 times.
names = ['datlingerbock']

for name in names:
    # Create the output folder up front: to_csv does not create missing
    # parent directories.
    directory = f'./{name}'
    os.makedirs(directory, exist_ok=True)

    for split in range(2, 6):
        # Read GRNboost output (first CSV column is the index)
        grnboost_out = pd.read_csv(f'../01scenic_adj/{name}_adjacencies_{split}.csv', index_col=0)
        filtered = get_filtered_adj_list(grnboost_out)

        # Save filtered graphs without header and index.
        # 'spilt' is the spelling used in the existing output file names.
        filtered['top50'].to_csv(f'./{name}/{name}_spilt{split}_top50.csv',
                                 index=False, header=False)
        filtered['95pc'].to_csv(f'./{name}/{name}_spilt{split}_95pc.csv',
                                index=False, header=False)

## 2

In [11]:
# Generate filtered adjacency files for GRNboost graph (splits 1-5)

#names = ['norman'] 
#names = ['tian2019_neuron_hvg', 'tian2019_ipsc_hvg', 'jost2020_hvg', 'replogle2020_hvg']
names = 'ps_arrayed_rna_5000'

# Create the output folder up front: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
directory = f'./{names}'
os.makedirs(directory, exist_ok=True)

for split in range(1, 6):
    # Read GRNboost output (first CSV column is the index)
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Save filtered graphs without header and index.
    # 'spilt' is the spelling used in the existing output file names.
    filtered['top50'].to_csv(f'./{names}/{names}_spilt{split}_top50.csv',
                             index=False, header=False)
    filtered['95pc'].to_csv(f'./{names}/{names}_spilt{split}_95pc.csv',
                            index=False, header=False)


KeyboardInterrupt



## 3

In [3]:
# Generate filtered adjacency files for GRNboost graph (split 5 only)

#names = ['norman'] 
#names = ['tian2019_neuron_hvg', 'tian2019_ipsc_hvg', 'jost2020_hvg', 'replogle2020_hvg']
names = 'ps_rna_5015'

# Create the output folder up front: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
directory = f'./{names}'
os.makedirs(directory, exist_ok=True)

for split in range(5, 6):
    # Read GRNboost output (first CSV column is the index)
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Save filtered graphs without header and index.
    # 'spilt' is the spelling used in the existing output file names.
    filtered['top50'].to_csv(f'./{names}/{names}_spilt{split}_top50.csv',
                             index=False, header=False)
    filtered['95pc'].to_csv(f'./{names}/{names}_spilt{split}_95pc.csv',
                            index=False, header=False)

## 4

In [13]:
# Filter all five splits of a single dataset; `names` holds one dataset name.
names = 'tk_a'

# Create the output folder up front: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
directory = f'./{names}'
os.makedirs(directory, exist_ok=True)

for split in range(1, 6):
    # Read GRNboost output (first CSV column is the index)
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Save filtered graphs without header and index.
    # 'spilt' is the spelling used in the existing output file names.
    filtered['top50'].to_csv(f'./{names}/{names}_spilt{split}_top50.csv',
                             index=False, header=False)
    filtered['95pc'].to_csv(f'./{names}/{names}_spilt{split}_95pc.csv',
                            index=False, header=False)

## 5

In [14]:
# Filter all five splits of a single dataset; `names` holds one dataset name.
names = 'tk_i'

# Create the output folder up front: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
directory = f'./{names}'
os.makedirs(directory, exist_ok=True)

for split in range(1, 6):
    # Read GRNboost output (first CSV column is the index)
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Save filtered graphs without header and index.
    # 'spilt' is the spelling used in the existing output file names.
    filtered['top50'].to_csv(f'./{names}/{names}_spilt{split}_top50.csv',
                             index=False, header=False)
    filtered['95pc'].to_csv(f'./{names}/{names}_spilt{split}_95pc.csv',
                            index=False, header=False)

## 6

In [3]:
# Filter splits 4 and 5 of a single dataset; `names` holds one dataset name.
names = 'xucao'

# Create the output folder up front: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
directory = f'./{names}'
os.makedirs(directory, exist_ok=True)

for split in range(4, 6):
    # Read GRNboost output (first CSV column is the index)
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Save filtered graphs without header and index.
    # 'spilt' is the spelling used in the existing output file names.
    filtered['top50'].to_csv(f'./{names}/{names}_spilt{split}_top50.csv',
                             index=False, header=False)
    filtered['95pc'].to_csv(f'./{names}/{names}_spilt{split}_95pc.csv',
                            index=False, header=False)

In [None]:
# Filter splits 1 to 3 of a single dataset; `names` holds one dataset name.
names = 'xucao'

# Create the output folder up front: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
directory = f'./{names}'
os.makedirs(directory, exist_ok=True)

for split in range(1, 4):
    # Read GRNboost output (first CSV column is the index)
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Save filtered graphs without header and index.
    # 'spilt' is the spelling used in the existing output file names.
    filtered['top50'].to_csv(f'./{names}/{names}_spilt{split}_top50.csv',
                             index=False, header=False)
    filtered['95pc'].to_csv(f'./{names}/{names}_spilt{split}_95pc.csv',
                            index=False, header=False)

## 7

In [4]:
# Filter all five splits of a single dataset; `names` holds one dataset name.
names = 'frangiehlzar'

# Create the output folder up front: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
directory = f'./{names}'
os.makedirs(directory, exist_ok=True)

for split in range(1, 6):
    # Read GRNboost output (first CSV column is the index)
    grnboost_out = pd.read_csv(f'../01scenic_adj/{names}_adjacencies_{split}.csv', index_col=0)
    filtered = get_filtered_adj_list(grnboost_out)

    # Save filtered graphs without header and index.
    # 'spilt' is the spelling used in the existing output file names.
    filtered['top50'].to_csv(f'./{names}/{names}_spilt{split}_top50.csv',
                             index=False, header=False)
    filtered['95pc'].to_csv(f'./{names}/{names}_spilt{split}_95pc.csv',
                            index=False, header=False)