# LINGER ANALYSIS

## Preparation steps

In [9]:
%matplotlib inline
import os
os.chdir('/home/kl467102/thesis/')

In [10]:
import scanpy as sc
import scipy
import pandas as pd
import time
import os

In [11]:
os.chdir('/home/kl467102/thesis/')

In [12]:
input_dir = '10x_output_hESC' # path to 10x output files
label_file = '10x_output_hESC/label.txt' # path to cell-type annotations

The input consists of `matrix.mtx`, `features.tsv` and `barcodes.tsv` and an annotation file.

In [13]:
matrix=scipy.io.mmread(os.path.join(input_dir, 'matrix.mtx'))
features=pd.read_csv(os.path.join(input_dir, 'features.tsv'), sep='\t', header=None)
barcodes=pd.read_csv(os.path.join(input_dir, 'barcodes.tsv'), sep='\t', header=None)
label=pd.read_csv(label_file, sep='\t', header=0)

We have loaded the data, now we can inspect them.

In [14]:
matrix.shape

(17736, 758)

In [15]:
dense_matrix = matrix.toarray()
subset = dense_matrix[:5, :5]
subset

array([[2.5101334 , 0.        , 1.70225118, 4.25937163, 2.85114345],
       [1.42489576, 1.20869914, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [2.14110522, 2.93625025, 0.22070073, 0.77659877, 1.37593925]])

In [16]:
pd.concat([features.head(2), features.tail(2)])

Unnamed: 0,0,1,2
0,A1BG,A1BG,Gene Expression
1,A1CF,A1CF,Gene Expression
17734,ZZZ3,ZZZ3,Gene Expression
17735,KI270713.1:31340-32243,KI270713.1:31340-32243,Peaks


In [17]:
barcodes.head(3)

Unnamed: 0,0
0,TCTCGCTTCATCTTCA-1
1,CGGGGTGGGCGTTAAC-1
2,TTCGTATAAGTATCAT-1


In [18]:
label.head(3)

Unnamed: 0,barcode_use,label
0,CGGGGTGGGCGTTAAC-1,test cell type
1,TTCGTATAAGTATCAT-1,test cell type
2,CTACGTGGCGTGACCC-1,test cell type


## INPUT MANIPULATION - we need to make it work

In this part, we test LINGER performance on dataset provided in the BEELINE, namely hESC data (scRNA-Seq on human embrionic stem cells).

We need to convert provided data in `ExpressionData.csv` to format accepted by LINGER, i.e. generate `matrix.mtx`, `features.csv` and `barcodes.tsv`. We also create mock `label.txt`. Chromosome location needs to be updated based on annotation file for `features.tsv`.

In [6]:
import os
os.chdir('/home/kl467102/thesis/')

In [71]:
import numpy as np
import pandas as pd

def generate_atac_for_rna(input_csv, features, num_regions = 500, out_csv = None):
    # List of all chromosomes (Human Genome)
    df = pd.read_csv(input_csv, index_col=0)
    if not out_csv:
        # Split the file path into directory, base name, and extension
        directory, file_name = os.path.split(input_csv)
        base_name, ext = os.path.splitext(file_name)
        new_file_name = f"{base_name}_modified{ext}"
        out_csv = os.path.join(directory, new_file_name)

    chromosomes = [
        'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 
        'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 
        'chr20', 'chr21', 'chr22', 'chrX', 'chrY'
    ]

    # Chromosome lengths (based on the human genome assembly GRCh38)
    max_chrom_length = {
        'chr1': 249250621, 'chr2': 243199373, 'chr3': 198022430, 'chr4': 191154276, 
        'chr5': 180915260, 'chr6': 171115067, 'chr7': 159138663, 'chr8': 146364022, 
        'chr9': 141213431, 'chr10': 135534747, 'chr11': 135006516, 'chr12': 133851895, 
        'chr13': 115169878, 'chr14': 107349540, 'chr15': 102531392, 'chr16': 90354753, 
        'chr17': 81195210, 'chr18': 78077248, 'chr19': 59128983, 'chr20': 63025520, 
        'chr21': 48129895, 'chr22': 50818468, 'chrX': 156040895, 'chrY': 57227415
    }
    new_df = df.copy()
    new_features = features.copy()
    num_samples = new_df.shape[1]
    # Generate random genomic regions
    regions = []
    region_index = []
    
    for _ in range(num_regions):
        chrom = np.random.choice(chromosomes)
        start_pos = np.random.randint(0, max_chrom_length[chrom] - 1000)  # random start position
        end_pos = start_pos + np.random.randint(500, 1000)  # random length between 500 and 1000
        feature = f'{chrom}:{start_pos}-{end_pos}'
        region_index.append(feature)
        regions.append([feature, chrom, start_pos, end_pos])
        #new_features.loc[len(new_features)] = []
    # Generate random values for ATAC-seq signal (e.g., counts)
    #random_values = np.random.poisson(lam=5, size=(num_regions, num_samples))  # Poisson distribution for count data
    theta = 1
    mu = 0.5
    random_values = np.random.negative_binomial(n=theta, p=theta / (theta + mu), size=(num_regions, num_samples))

    # Create a DataFrame for the random values
    random_atac_df = pd.DataFrame(random_values, columns=new_df.columns)
    random_atac_df.index = region_index
    # Concatenate the random ATAC-seq data with the existing features DataFrame
    #updated_features_df = pd.concat([features_df, random_atac_df], axis=1)

    # Create a DataFrame
    #regions_df = pd.DataFrame(regions, columns=["feature", "Chromosome", "Start", "End"])

    # Combine the data into a final DataFrame
    #final_df = pd.concat([regions_df, random_data_df], axis=1)
    new_expr = pd.concat([df, random_atac_df])
    # Save to CSV (optional)
    new_expr.to_csv(out_csv, index=True)
    return new_expr

In [72]:
input_csv = "data/hESC_ExpressionData.csv"  # Input CSV file

In [73]:
new_df = generate_atac_for_rna(input_csv, features, num_regions = 500)

In [74]:
new_df

Unnamed: 0,H9_00hb4s_001,H9_00hb4s_002,H9_00hb4s_003,H9_00hb4s_004,H9_00hb4s_005,H9_00hb4s_006,H9_00hb4s_007,H9_00hb4s_008,H9_00hb4s_009,H9_00hb4s_010,...,H9_96h_182,H9_96h_183,H9_96h_184,H9_96h_185,H9_96h_186,H9_96h_187,H9_96h_188,H9_96h_189,H9_96h_190,H9_96h_192
A1BG,2.510133,0.000000,1.702251,4.259372,2.851143,0.000000,2.357210,0.925952,0.000000,1.473057,...,0.0,1.946191,3.524386,0.000000,0.0,3.516034,0.0,0.000000,1.510372,1.418294
A1CF,1.424896,1.208699,0.000000,0.000000,0.000000,0.000000,1.130935,1.880416,0.000000,0.000000,...,0.0,0.000000,0.000000,1.910541,0.0,0.000000,0.0,0.804444,0.000000,0.883394
A2LD1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.295367,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000
A2M,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000
A2ML1,2.141105,2.936250,0.220701,0.776599,1.375939,1.473409,0.000000,1.359460,2.312497,1.584052,...,0.0,1.244709,1.886233,0.632573,0.0,1.195413,0.0,1.539408,3.034209,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrY:23615416-23616071,3.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,1.000000,0.000000,1.0,0.000000,1.0,3.000000,0.000000,0.000000
chr8:113571844-113572367,1.000000,2.000000,0.000000,2.000000,1.000000,3.000000,0.000000,1.000000,0.000000,1.000000,...,0.0,0.000000,1.000000,0.000000,1.0,0.000000,1.0,2.000000,2.000000,2.000000
chr14:86401053-86401586,3.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,2.000000,1.000000,...,0.0,1.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000
chr21:2429270-2430038,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.0,1.000000,0.000000,1.000000,2.0,0.000000,0.0,0.000000,1.000000,0.000000


In [None]:
import csv
import numpy as np
import pandas as pd
from scipy.io import mmwrite
from scipy.sparse import csr_matrix
import random


def generate_sample_barcodes(num_barcodes, length=16):
    """Generate unique barcodes for a single sample, all prefixed with '1-'."""
    barcodes = set()
    nucleotides = "ATCG"
    while len(barcodes) < num_barcodes:
        random_sequence = "".join(random.choices(nucleotides, k=length))
        barcode = f"{random_sequence}-1"
        barcodes.add(barcode)
    return list(barcodes)

In [76]:
import pandas as pd
import csv
import os
import re
from scipy.sparse import csr_matrix
from scipy.io import mmwrite

def process_csv_to_10x(input_csv, output_dir, barcode_length=16):
    """
    Processes input CSV to 10x-compatible output files, handling gene and chromatin accessibility data.

    Args:
        input_csv (str): Path to input CSV file.
        output_dir (str): Directory to save 10x output files.
        barcode_length (int): Length of barcodes to generate for samples.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read the input CSV file
    data = pd.read_csv(input_csv, index_col=0)

    # Extract row identifiers
    row_ids = data.index.tolist()
    num_samples = data.shape[1]  # Number of columns = number of samples

    # Classify rows: genes vs chromatin accessibility
    gene_rows = []
    chromatin_rows = []

    chromatin_pattern = re.compile(r"^(chr|KI|GL)\d+:\d+-\d+$")  # Matches chromatin-like rows (e.g., chr1:1000-2000)
    for row_id in row_ids:
        if chromatin_pattern.match(row_id):
            chromatin_rows.append(row_id)
        else:
            gene_rows.append(row_id)

    # Generate barcodes for all samples
    barcodes = generate_sample_barcodes(num_samples, length=barcode_length)

    # Write features.tsv
    features_file = f"{output_dir}/features.tsv"
    with open(features_file, "w", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        for gene in gene_rows:
            writer.writerow([gene, gene, "Gene Expression"])
        for chromatin in chromatin_rows:
            writer.writerow([chromatin, chromatin, "Peaks"])

    # Write barcodes.tsv
    barcodes_file = f"{output_dir}/barcodes.tsv"
    with open(barcodes_file, "w", newline="") as f:
        f.write("\n".join(barcodes) + "\n")
    
    print(f"Matrix shape before adding row: {data.shape}")
    # Add a new row with 0s for all columns
    new_row = {column_name: 0 for column_name in data.columns}
    data.loc['KI270713.1:31340-32243'] = new_row
    print(f"Matrix shape after adding row: {data.shape}")

    # Create a sparse matrix and write matrix.mtx
    matrix_data = csr_matrix(data.values)
    matrix_file = f"{output_dir}/matrix.mtx"
    mmwrite(matrix_file, matrix_data)

    print(
        f"Files created:\n"
        f"- {features_file}, number of features: {len(row_ids)} "
        f"(Genes: {len(gene_rows)}, Accessibility: {len(chromatin_rows)})\n"
        f"- {barcodes_file}\n"
        f"- {matrix_file}, matrix shape: {matrix_data.shape}"
    )



In [75]:
### OLD VERSION KEPT JUST IN CASE, NEED TO WYPIERDOLIC SOON
'''
def process_csv_to_10x(input_csv, output_dir, barcode_length=16):
    # Read the input CSV file
    data = pd.read_csv(input_csv, index_col=0)

    # Extract gene names and sample names
    gene_names = data.index.tolist()	
    num_samples = data.shape[1]  # Number of columns = number of samples

    # Generate barcodes for all samples
    barcodes = generate_sample_barcodes(num_samples, length=barcode_length)

    # Write features.tsv (gene names)
    features_file = f"{output_dir}/features.tsv"
    with open(features_file, "w", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        for gene in gene_names:
            writer.writerow([gene, gene, "Gene Expression"])  # Example: gene_id, gene_name, feature_type
    # Write barcodes.tsv
    barcodes_file = f"{output_dir}/barcodes.tsv"
    with open(barcodes_file, "w", newline="") as f:
        f.write("\n".join(barcodes) + "\n")
   
    print(f'teraz mtx ma kształ {data.shape}')
    # New row to add with 0s for all columns
    new_row = {column_name: 0 for column_name in data.columns}
    
    # Add row using loc with a string index
    data.loc['KI270713.1:31340-32243'] = new_row
    print(f'po dodaniu ma ksztalt{data.shape}')
    
    # Create a sparse matrix and write matrix.mtx
    matrix_data = csr_matrix(data.values)
    matrix_file = f"{output_dir}/matrix.mtx"
    mmwrite(matrix_file, matrix_data)

    print(f"Files created:\n- {features_file}, number of features: {len(gene_names)}\n- {barcodes_file}\n- {matrix_file}, matrix shape: {matrix_data.shape}")
'''

In [82]:
# Example usage
input_csv = "data/hESC_ExpressionData.csv"  # Input CSV file
output_dir = "10x_output_hESC"  # Directory to save output files

import os
os.makedirs(output_dir, exist_ok=True)

process_csv_to_10x(input_csv, output_dir, barcode_length=16)


teraz mtx ma kształ (17735, 758)
po dodaniu ma ksztalt(17736, 758)
Files created:
- 10x_output_hESC/features.tsv, number of features: 17735
- 10x_output_hESC/barcodes.tsv
- 10x_output_hESC/matrix.mtx, matrix shape: (17736, 758)


In [83]:
import pandas as pd

def create_label_file(input_file, output_file, label="test cell type"):
    """
    Create a label file with the specified label for all barcodes.
    
    Args:
        input_file (str): Path to the input CSV file with barcode information.
        output_file (str): Path to save the generated label.txt file.
        label (str): The label to assign to all barcodes in the output file.
    """
    # Read the input CSV file
    data = pd.read_csv(input_file, sep="\t")
    data.columns = ['barcode_use']
    # Replace all labels with the provided label
    data['label'] = label
    
    # Save the modified dataframe to the output file
    data.to_csv(output_file, sep="\t", index=False, header=True, columns=["barcode_use", "label"])
    print(f"Label file created: {output_file}")


# Example usage
input_file = "10x_output_hESC/barcodes.tsv"  # Replace with your input file name
output_file = "10x_output_hESC/label.txt"  # Output file name
label = "test cell type"  # Desired label for all barcodes

create_label_file(input_file, output_file, label)


Label file created: 10x_output_hESC/label.txt


At this step we have converted `ExpressionData.csv` to pseudo 10x output with no accesibility data in it).

We also need (or do we? - to be determined) to provide chromoseome locations for the genes in expression matrix.

In [84]:
import pandas as pd
import gzip


def extract_gene_locations_from_encode_gtf(gtf_file):
    """
    Extract gene locations (chromosome, start, end) from an ENCODE GTF file.
    
    Args:
        gtf_file (str): Path to the ENCODE GTF file.
    
    Returns:
        dict: A dictionary mapping gene names to (chromosome, start, end).
    """
    gene_locations = {}
    with gzip.open(gtf_file, 'rt') as f:
        for line in f:
            if line.startswith("#"):
                continue  # Skip header lines
            fields = line.strip().split("\t")
            if fields[2] == "gene":  # Only process gene entries
                info = {kv.split(" ")[0]: kv.split(" ")[1].strip('";') for kv in fields[8].split("; ") if kv}
                gene_name = info.get("gene_name", None)
                if gene_name:
                    # Extract chromosome, start, and end
                    chromosome = fields[0]
                    start = int(fields[3])
                    end = int(fields[4])
                    gene_locations[gene_name] = (chromosome, start, end)
    return gene_locations


def update_features_with_locations(features_file, gtf_file, output_file, unmatched_file=None):
    """
    Update features.tsv with chromosome, start, and end locations using an ENCODE GTF file.
    
    Args:
        features_file (str): Path to the input features.tsv file.
        gtf_file (str): Path to the ENCODE GTF file.
        output_file (str): Path to save the updated features.tsv file.
        unmatched_file (str, optional): Path to save unmatched genes for review.
    """
    # Load the existing features file
    features = pd.read_csv(features_file, sep="\t", header=None, names=["gene_id", "gene_name", "feature_type"])

    # Extract gene locations from the ENCODE GTF file
    gtf_gene_locations = extract_gene_locations_from_encode_gtf(gtf_file)

    # Merge the features data with gene locations from the GTF file
    updated_features = features.copy()
    updated_features["chromosome"] = updated_features["gene_name"].map(lambda gene_name: gtf_gene_locations.get(gene_name, (None, None, None))[0])
    updated_features["start"] = updated_features["gene_name"].map(lambda gene_name: gtf_gene_locations.get(gene_name, (None, None, None))[1])
    updated_features["end"] = updated_features["gene_name"].map(lambda gene_name: gtf_gene_locations.get(gene_name, (None, None, None))[2])

    # Separate unmatched genes for logging if needed
    unmatched_genes = updated_features[updated_features["chromosome"].isna()]
    if unmatched_file and not unmatched_genes.empty:
        unmatched_genes[["gene_id", "gene_name"]].to_csv(unmatched_file, sep="\t", index=False, header=False)
        print(f"Unmatched genes saved to: {unmatched_file}")

    # Save the updated features file (including unmatched rows)
    updated_features.to_csv(output_file, sep="\t", index=False, header=False)
    print(f"Updated features file created: {output_file}")
    return gtf_gene_locations

# Example usage
unmatched_file = "unmatched_genes.tsv"  # Optional file to save unmatched genes
features_file = "10x_output_hESC/features.tsv"  # Input features.tsv file
gtf_file = "gencode.v7.annotation.gtf.gz"  # ENCODE GTF file
output_file = "10x_output_hESC/updated_features_encode.tsv"  # Output file

gene_locations = update_features_with_locations(features_file, gtf_file, output_file, unmatched_file)


Unmatched genes saved to: unmatched_genes.tsv
Updated features file created: 10x_output_hESC/updated_features_encode.tsv


In [85]:
from itertools import islice
subset = dict(islice(gene_locations.items(), 10))
subset

{'DDX11L1': ('chr1', 11869, 14409),
 'AL627309.2': ('chr1', 11872, 14412),
 'DDX11L11': ('chr9', 11987, 14522),
 'WASH7P': ('chr1', 14363, 29806),
 'MIR1302-10': ('chr19', 71973, 72110),
 'FAM138A': ('chr1', 34554, 36081),
 'OR4G11P': ('chr1', 62948, 63887),
 'AL627309.1': ('chr1', 65882, 70008),
 'RP11-34P13.7': ('chr1', 89295, 133566),
 'RP11-34P13.8': ('chr1', 89551, 91105)}

In [86]:
list_of_genes_in_annot = list(gene_locations.keys())

In [87]:
list_of_genes_in_annot.sort()


In [88]:
from pathlib import Path
Path("sorted_genes.txt").write_text('\n'.join(list_of_genes_in_annot))


481062

In [89]:
original_cell_type = pd.read_csv('GSE75748_sc_cell_type_ec.csv', index_col=0)
original_cell_type.index.difference(list_of_genes_in_annot)

Index(['AAED1', 'ABRACL', 'ACTL10', 'ADGB', 'ADTRP', 'AJUBA', 'AKIP1', 'ALG9',
       'ALYREF', 'ANKHD1-EIF4EBP3',
       ...
       'ZNF322', 'ZNF559-ZNF177', 'ZNF587B', 'ZNF664-FAM101A', 'ZNF721',
       'ZNF735', 'ZNF816-ZNF321P', 'ZNF850', 'ZNF853', 'ZNF865'],
      dtype='object', length=737)

## Step 1 - preprocessing

### Extraction of matrices for RNASeq and ATACSeq

In [100]:
from LingerGRN.preprocess import *
adata_RNA, adata_ATAC = get_adata(matrix, features, barcodes, label)

  adata_RNA.obs['label']=label.loc[adata_RNA.obs['barcode']]['label'].values
  adata_ATAC.obs['label']=label.loc[adata_ATAC.obs['barcode']]['label'].values


In [101]:
adata_RNA

View of AnnData object with n_obs × n_vars = 757 × 17735
    obs: 'barcode', 'sample', 'label', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

### Removing low count cells

In [102]:
sc.pp.filter_cells(adata_RNA, min_genes=200)
sc.pp.filter_genes(adata_RNA, min_cells=3)
#sc.pp.filter_cells(adata_ATAC, min_genes=200)
#sc.pp.filter_genes(adata_ATAC, min_cells=3)

  adata.obs['n_genes'] = number


In [103]:
# Find common barcodes between RNA and ATAC datasets
selected_barcode = list(set(adata_RNA.obs['barcode']) & set(adata_ATAC.obs['barcode']))

# Filter both datasets to keep only the shared barcodes
adata_RNA = adata_RNA[adata_RNA.obs['barcode'].isin(selected_barcode)]
adata_ATAC = adata_ATAC[adata_ATAC.obs['barcode'].isin(selected_barcode)]

### Generating metacells

Metacells are generated to create more stable, representative profiles of groups of cells. 

In [104]:
from LingerGRN.pseudo_bulk import *

In [105]:
samplelist=list(set(adata_ATAC.obs['sample'].values)) 
tempsample=samplelist[0]
TG_pseudobulk=pd.DataFrame([])
RE_pseudobulk=pd.DataFrame([])

In [106]:
singlepseudobulk = (adata_RNA.obs['sample'].unique().shape[0]*adata_RNA.obs['sample'].unique().shape[0]>100)
for tempsample in samplelist:
    adata_RNAtemp=adata_RNA[adata_RNA.obs['sample']==tempsample]
    adata_ATACtemp=adata_ATAC[adata_ATAC.obs['sample']==tempsample]
    TG_pseudobulk_temp,RE_pseudobulk_temp=pseudo_bulk(adata_RNAtemp,adata_ATACtemp,singlepseudobulk)                
    TG_pseudobulk=pd.concat([TG_pseudobulk, TG_pseudobulk_temp], axis=1)
    RE_pseudobulk=pd.concat([RE_pseudobulk, RE_pseudobulk_temp], axis=1)
    RE_pseudobulk[RE_pseudobulk > 100] = 100

  view_to_actual(adata)
  view_to_actual(adata)
  view_to_actual(adata)


ValueError: Cannot cut empty array

Write preprocessed data to a file:

In [17]:
linger_storage = 'results/linger_storage'
os.makedirs(linger_storage, exist_ok = True)

adata_ATAC.write(os.path.join(linger_storage,'adata_ATAC.h5ad'))
adata_RNA.write(os.path.join(linger_storage,'adata_RNA.h5ad'))
TG_pseudobulk=TG_pseudobulk.fillna(0)
RE_pseudobulk=RE_pseudobulk.fillna(0)
pd.DataFrame(adata_ATAC.var['gene_ids']).to_csv('data/Peaks.txt',header=None,index=None) #hard coded path to Peaks in preprocess
TG_pseudobulk.to_csv(os.path.join(linger_storage,'TG_pseudobulk.tsv'))
RE_pseudobulk.to_csv(os.path.join(linger_storage,'RE_pseudobulk.tsv'))

  df[key] = c
  df[key] = c


## Step 2 - training

In [4]:
method = 'LINGER'
Datadir = '/home/kl467102/proj_v0/downloads/' # Directory for the downloaded general gene regulatory network for LINGER
GRNdir = Datadir+'data_bulk/'
genome = 'hg38'
outdir = os.path.join(os.getcwd(),'results/LINGER_output/')
os.makedirs(outdir, exist_ok = True)

NOTE TO SELF: Because of `pybedtools` dependency `outdir` needs to be specified as an absolute path, otherwise resulting in an error

In [22]:
from LingerGRN.preprocess import *

In [24]:
preprocess(TG_pseudobulk,RE_pseudobulk,GRNdir,genome,method,outdir)

Mapping gene expression...
Generate TF expression...
Generate RE chromatin accessibility...
Generate TF binding...


100%|████████████████████████████████████████████████████████████████████████████████| 23/23 [1:12:17<00:00, 188.60s/it]


Generate Index...


100%|█████████████████████████████████████████████████████████████████████████████| 14907/14907 [05:31<00:00, 44.97it/s]


In [5]:
import LingerGRN.LINGER_tr as LINGER_tr

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [7]:
activef='ReLU'
LINGER_tr.training(GRNdir,method,outdir,activef,'Human')

chr1


  1%|▌                                                                             | 12/1520 [09:21<19:35:47, 46.78s/it]


KeyboardInterrupt: 

## Step 3 - population GNR inference

In [None]:
import LingerGRN.LL_net as LL_net
LL_net.TF_RE_binding(GRNdir,adata_RNA,adata_ATAC,genome,method,outdir)

In [None]:
LL_net.cis_reg(GRNdir,adata_RNA,adata_ATAC,genome,method,outdir)

In [None]:
LL_net.trans_reg(GRNdir,method,outdir,genome)

## Step 4 - cell-type specific GNR

In [None]:
celltype='all'

In [None]:
LL_net.cell_type_specific_TF_RE_binding(GRNdir,adata_RNA,adata_ATAC,genome,celltype,outdir,method)

In [None]:
LL_net.cell_type_specific_cis_reg(GRNdir,adata_RNA,adata_ATAC,genome,celltype,outdir)

In [None]:
LL_net.cell_type_specific_trans_reg(GRNdir,adata_RNA,celltype,outdir)

# Session information

In [27]:
import session_info
session_info.show()