### Target filtering procedure

In [1]:
import pandas as pd
import glob

import scanpy as sc
import numpy as np

In [2]:
# Top 50 targets for each TF (targets with highest weight)
def get_topk(df, tf, k):
    """Return the `k` highest-importance rows of `df` whose 'TF' equals `tf`.

    Rows are ordered by the 'importance' column, largest first. Fewer than
    `k` rows come back when `tf` has fewer matching rows, and an empty frame
    when it has none.
    """
    tf_edges = df.loc[df['TF'] == tf]
    ranked = tf_edges.sort_values(by='importance', ascending=False)
    return ranked.iloc[:k]

def filter_topk(grnboost_out, k=50):
    """Keep the `k` highest-importance rows for every distinct 'TF' value.

    Args:
        grnboost_out: DataFrame with at least a 'TF' and an 'importance'
            column (one row per edge).
        k: Maximum number of rows to keep per TF.

    Returns:
        A DataFrame with the same columns as `grnboost_out`. TFs appear in
        order of first occurrence in the input; within a TF, rows are sorted
        by 'importance' in descending order. An input with no rows yields an
        empty frame rather than an error.
    """
    # One grouping pass instead of re-masking the whole frame once per TF.
    # sort=False keeps the groups in first-appearance order.
    tf_dfs = [
        tf_edges.sort_values('importance', ascending=False).iloc[:k]
        for _, tf_edges in grnboost_out.groupby('TF', sort=False)
    ]

    # pd.concat raises ValueError on an empty list, so handle "no TFs" here.
    if not tf_dfs:
        return grnboost_out.iloc[0:0]

    return pd.concat(tf_dfs)

# Top (100 - pc)% of edges ranked by importance (pc=95: roughly those above the 95th percentile)
def get_pc(grnboost_out, pc=95):
    """Keep the top `(100 - pc)` percent of rows ranked by 'importance'.

    Args:
        grnboost_out: DataFrame with an 'importance' column.
        pc: Percentile cutoff; with the default 95 the top 5% of rows
            are kept.

    Returns:
        The first `floor(len(grnboost_out) * (100 - pc) / 100)` rows of
        `grnboost_out` sorted by 'importance' in descending order.
    """
    n_rows = len(grnboost_out)
    # `1 - 0.01 * pc` is not exact in binary floating point (for pc=95 it is
    # 0.04999999999999993), so truncating the product dropped one row whenever
    # the exact count was a whole number. Compute the count from
    # `(100 - pc) / 100` and round away float noise before truncating.
    n_keep = int(round(n_rows * (100 - pc) / 100, 8))
    # A negative count (pc > 100) would make the slice wrap around and return
    # almost every row, so clamp it to zero.
    n_keep = max(n_keep, 0)

    ranked = grnboost_out.sort_values('importance', ascending=False)
    return ranked.iloc[:n_keep]

# Get filtered adjacency lists
def get_filtered_adj_list(grnboost_out):
    """Build both filtered views of a GRNBoost adjacency list.

    Returns a dict with two entries:
        'top50': result of `filter_topk(grnboost_out, k=50)`
        '95pc':  result of `get_pc(grnboost_out, pc=95)`
    """
    return {
        'top50': filter_topk(grnboost_out, k=50),
        '95pc': get_pc(grnboost_out, pc=95),
    }

In [None]:
# Generate filtered adjacency files for GRNboost graph

# Alternative dataset name lists (kept commented out); only one is active.
#names = ['norman']
#names = ['tian2019_neuron_hvg', 'tian2019_ipsc_hvg', 'jost2020_hvg', 'replogle2020_hvg']
names = ['adamson']

for name in names:
    for split in range(5, 6):  # currently covers split 5 only
        # Read the GRNboost output for this dataset/split
        in_path = f'./adjacencies_{name}_{split}_grnboost.csv'
        grnboost_out = pd.read_csv(in_path, index_col=0)
        filtered = get_filtered_adj_list(grnboost_out)

        # Save each filtered graph as a headerless, index-free CSV
        out_prefix = f'/dfs/project/perturb-gnn/graphs/linear/grnboost/{name}_{split}'
        for key in ('top50', '95pc'):
            filtered[key].to_csv(f'{out_prefix}_{key}.csv', index=False, header=False)