In [None]:
import pandas as pd
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

from distributed import LocalCluster, Client

import os

# README

This notebook contains the code used to create GRNBoost2 networks from GTEx data.

The files `GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz` and `GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt` can be downloaded from https://www.gtexportal.org/home/datasets.

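Set `PATH` below to the directory that contains these two files. The notebook changes into that directory, and the resulting networks are written there as well.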

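The Python [arboreto](https://github.com/aertslab/arboreto) package needs to be installed. The notebook also uses `distributed` (for the local Dask cluster), `openpyxl` (to read the transcription factor list from an Excel file) and feather support for pandas (`pyarrow`).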


In [None]:
# Directory holding the GTEx input files. The notebook changes into this
# directory and also uses it as the prefix for the per-tissue output files.
PATH = "./"

In [None]:
# Make PATH the working directory so the relative input file names used in
# the cells below resolve against it.
%cd {PATH}

In [None]:
# Start a local Dask cluster (12 workers x 2 threads each) and connect a
# client to it; the client is handed to grnboost2 further down.
# NOTE(review): memory_limit is presumably applied per worker, i.e. up to
# 12 x 8e9 bytes in total -- confirm against the machine's available RAM.
local_cluster = LocalCluster(
    n_workers=12,
    threads_per_worker=2,
    memory_limit=8e9,
)
custom_client = Client(local_cluster)


In [None]:
def download_tf_names():
    """Download the transcription factor table and return its gene symbols.

    Reads the sheet at position 1 of the Excel supplement hosted on bioRxiv
    and keeps only the rows whose "Pseudogene" column is empty.

    Returns:
        list: values of the "HGNC approved gene symbol" column for the
        retained rows, in table order.
    """
    source_url = "https://www.biorxiv.org/content/biorxiv/early/2020/12/07/2020.10.28.359232/DC1/embed/media-1.xlsx"
    tf_table = pd.read_excel(source_url, engine='openpyxl', sheet_name=1)

    # Rows with a value in "Pseudogene" are excluded from the TF list.
    not_pseudogene = tf_table["Pseudogene"].isnull()
    symbols = tf_table.loc[not_pseudogene, "HGNC approved gene symbol"]
    return list(symbols.values)

In [None]:
# Load the GTEx TPM table. The gzipped GCT file is slow to parse, so a
# feather copy is written next to it on first use and read on later runs.
feather_file = "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.feather"
if os.path.exists(feather_file):
    print("reading feather file")
    ex_matrix = pd.read_feather(feather_file)
else:
    print("reading txt file and creating feather for fast access")
    GTEX = "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"
    # Skip the two header lines that precede the column header row and drop
    # the "Name" column; rows are keyed on "Description" below.
    ex_matrix = pd.read_csv(GTEX, sep='\t', skiprows=2).drop(columns=["Name"])
    ex_matrix.to_feather(feather_file)

# Average rows that share the same "Description" value, then transpose so
# that rows are samples and columns are genes.
ex_matrix = (
    ex_matrix
    .set_index("Description")
    .groupby(level=0)
    .mean()
    .T
)

gene_names = ex_matrix.columns.tolist()
assert len(gene_names) == ex_matrix.shape[1]

In [None]:
# Tissue names for which a network is built; each entry is compared against
# the "SMTS" column of the sample annotation table.
ananse_tissues = [
    "Adrenal Gland",
    "Bone Marrow",
    "Brain",
    "Cervix Uteri",
    "Colon",
    "Esophagus",
    "Heart",
    "Liver",
    "Lung",
    "Ovary",
    "Pancreas",
    "Prostate",
    "Muscle",
    "Skin",
    "Small Intestine",
    "Spleen",
    "Stomach",
]

In [None]:
# Sample annotation table (tab-separated); its "SMTS" and "SAMPID" columns
# are used below to select the samples belonging to each tissue.
metadata = pd.read_csv(
    "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
    sep="\t",
)

In [None]:
# Per-tissue ("SMTS") count of non-null entries in every annotation column,
# displayed as a quick overview of how many samples each tissue has.
metacount = metadata.groupby(by="SMTS").count()
metacount

In [None]:
# Fetch the transcription factor symbols (requires network access); the
# list is passed to grnboost2 as `tf_names`.
tf_names = download_tf_names()

In [None]:
# Infer one GRNBoost2 network per tissue and store each as a feather file.
# Tissues whose output file already exists are skipped, so the cell can be
# re-run after an interruption or a partial failure.
for tissue in ananse_tissues:
    # Build the output path once so that the "already done" check and the
    # write below always refer to the same file.
    out_file = f"{PATH}/gtex.{tissue}.grnboost2.feather"
    if os.path.exists(out_file):
        print("skipping", tissue, "already done")
        continue
    print(tissue)
    try:
        # Sample IDs annotated with this tissue that are also present as
        # rows of the expression matrix.
        samples = metadata.loc[metadata["SMTS"] == tissue, "SAMPID"].values
        samples = [s for s in samples if s in ex_matrix.index]
        network = grnboost2(expression_data=ex_matrix.loc[samples].values,
                            gene_names=gene_names,  # names for the matrix columns
                            tf_names=tf_names,
                            client_or_address=custom_client)
        # reset_index() before writing: presumably the returned frame does
        # not carry a default index, which feather output requires.
        network.reset_index().to_feather(out_file)
    except Exception as err:
        # Catch Exception rather than everything so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop, and report why
        # the tissue failed instead of discarding the error.
        print(f"{tissue} failed")
        print(f"  reason: {err!r}")

In [None]:
# Release the Dask resources: close the client first, then the local
# cluster it was connected to.
custom_client.close()
local_cluster.close()