#### Loads each of the count matrix directories from the ENCODE snRNA-seq project together into an overall project

In [1]:
import numpy as np
import scanpy as sc 
import pandas as pd 
import matplotlib.pyplot as plt
import re
import os

In [2]:
metadata_df = pd.read_csv("filtered_metadata.tsv", index_col = 0)
metadata_df = metadata_df[["Accession", "age", "sex"]]
metadata_df.head()

Unnamed: 0,Accession,age,sex
0,ENCSR814LMX,31,male
1,ENCSR630LZS,41,male
2,ENCSR012APQ,66,male
3,ENCSR763BII,42,female
4,ENCSR002SMQ,68,male


In [3]:
# Directory path
directory = 'tarballs/'

# List to store directories starting with "ENC"
enc_directories = []

# Iterate through all entries in the directory
for entry in os.listdir(directory):
    # Check if entry is a directory and starts with "ENC"
    if os.path.isdir(os.path.join(directory, entry)) and entry.startswith('ENC'):
        enc_directories.append(entry)

# Print the list of directories
print("Directories starting with 'ENC':")
print(enc_directories)

Directories starting with 'ENC':
['ENCFF201LRF', 'ENCFF768IJQ', 'ENCFF507PDS', 'ENCFF351UZO', 'ENCFF220HCP', 'ENCFF718XQH', 'ENCFF828HKS', 'ENCFF332PIV', 'ENCFF861LPM', 'ENCFF750OIR', 'ENCFF944XQK', 'ENCFF861QBB', 'ENCFF552HTP', 'ENCFF612FUI', 'ENCFF392PCC', 'ENCFF845YIP', 'ENCFF616DEQ', 'ENCFF431AQJ', 'ENCFF238ACZ', 'ENCFF858WYU', 'ENCFF356UPT', 'ENCFF243GIK', 'ENCFF141HGZ', 'ENCFF775IKH', 'ENCFF484HHL', 'ENCFF518WAA', 'ENCFF459HJH', 'ENCFF091JBK', 'ENCFF538RCL', 'ENCFF474ZVW', 'ENCFF568KOS', 'ENCFF474XJS', 'ENCFF802HTG', 'ENCFF669JZO', 'ENCFF500LDR', 'ENCFF323OMW', 'ENCFF907UBD', 'ENCFF169FQE', 'ENCFF544USL', 'ENCFF760MXT', 'ENCFF594SCL', 'ENCFF588EOL', 'ENCFF286MIT', 'ENCFF568GHX', 'ENCFF046HGL', 'ENCFF381ZJP', 'ENCFF936XYF', 'ENCFF625DIE', 'ENCFF128XVA', 'ENCFF672RPX', 'ENCFF696DZL', 'ENCFF435HFL', 'ENCFF379FXA', 'ENCFF004QRO']


In [4]:
len(enc_directories)

54

Identify the experimental accessions (which is linked to age and sex of each donor) with the tarball names

In [5]:
def determine_corresponding_accession(directory):
    sample_name_path = "tarballs/" + directory + "/filtered/ENCODE_Accession.txt"
    with open(sample_name_path, 'r') as file:
        line = file.readline().strip()
    return(line)

In [6]:
corresponding_accessions = [determine_corresponding_accession(directory) for directory in enc_directories]

In [7]:
ENCODE_df = pd.DataFrame({'directory': enc_directories, 'accession': corresponding_accessions})
ENCODE_df.head()

Unnamed: 0,directory,accession
0,ENCFF201LRF,ENCSR906MRL
1,ENCFF768IJQ,ENCSR084XKX
2,ENCFF507PDS,ENCSR352DXB
3,ENCFF351UZO,ENCSR237HWJ
4,ENCFF220HCP,ENCSR485GOL


In [8]:
merged_ENCODE_df = ENCODE_df.merge(metadata_df, left_on = "accession", right_on = "Accession", how = "inner")
merged_ENCODE_df = merged_ENCODE_df[["directory", "accession", "age", "sex"]]
merged_ENCODE_df.head()

Unnamed: 0,directory,accession,age,sex
0,ENCFF201LRF,ENCSR906MRL,44,male
1,ENCFF768IJQ,ENCSR084XKX,30,male
2,ENCFF507PDS,ENCSR352DXB,57,male
3,ENCFF351UZO,ENCSR237HWJ,64,male
4,ENCFF220HCP,ENCSR485GOL,51,male


#### Iterate through the directories, adding to adata_list

In [14]:
def create_adata(directory):
    count_directory_path = "tarballs/" + directory + "/filtered/" 
    count_matrix_path = count_directory_path + "matrix.mtx.gz"
    # read in count matrix and take transpose so that cells are the rows and the genes are the columns
    adata = sc.read_mtx(count_matrix_path)
    adata = adata.T

    # add the features to adata.var
    features_path = count_directory_path + "features.tsv.gz"
    features = pd.read_csv(features_path, delimiter = "\t", index_col = 0, header = None,
                        names = ['gene_id', 'gene_name', 'feature_type'])

    adata.var = features

    # add the barcodes to adata.obs
    barcodes_path = count_directory_path + "barcodes.tsv.gz"
    barcodes = pd.read_csv(barcodes_path, delimiter = "\t", index_col = 0, header = None, 
                       names = ["barcode"])

    adata.obs = barcodes

    return(adata)

Create a list of individual adata files and combine them

In [15]:
adata_list = list()

In [16]:
directory_names = merged_ENCODE_df['directory']
accession_names = merged_ENCODE_df['accession']
corresponding_ages = merged_ENCODE_df['age']
corresponding_sexes = merged_ENCODE_df['sex']

In [17]:
%%time 

for i in np.arange(len(directory_names)):
    directory_name = directory_names[i]
    accession_name = accession_names[i]
    age = corresponding_ages[i]
    sex = corresponding_sexes[i]

    adata = create_adata(directory_name)
    adata.obs['age'] = age
    adata.obs['sex'] = sex
    adata.obs['sample'] = accession_name
    adata.obs['directory'] = directory_name

    adata_list.append(adata)
    print(directory_name)

ENCFF201LRF
ENCFF768IJQ
ENCFF507PDS
ENCFF351UZO
ENCFF220HCP
ENCFF718XQH
ENCFF828HKS
ENCFF332PIV
ENCFF861LPM
ENCFF750OIR
ENCFF944XQK
ENCFF861QBB
ENCFF552HTP
ENCFF612FUI
ENCFF392PCC
ENCFF845YIP
ENCFF616DEQ
ENCFF431AQJ
ENCFF238ACZ
ENCFF858WYU
ENCFF356UPT
ENCFF243GIK
ENCFF141HGZ
ENCFF775IKH
ENCFF484HHL
ENCFF518WAA
ENCFF459HJH
ENCFF091JBK
ENCFF538RCL
ENCFF474ZVW
ENCFF568KOS
ENCFF474XJS
ENCFF802HTG
ENCFF669JZO
ENCFF500LDR
ENCFF323OMW
ENCFF907UBD
ENCFF169FQE
ENCFF544USL
ENCFF760MXT
ENCFF594SCL
ENCFF588EOL
ENCFF286MIT
ENCFF568GHX
ENCFF046HGL
ENCFF381ZJP
ENCFF936XYF
ENCFF625DIE
ENCFF128XVA
ENCFF672RPX
ENCFF696DZL
ENCFF435HFL
ENCFF379FXA
ENCFF004QRO
CPU times: user 12min 29s, sys: 1min 21s, total: 13min 50s
Wall time: 8min 57s


In [18]:
combined_adata = sc.concat(adata_list)

  utils.warn_names_duplicates("obs")


In [30]:
# add the sample to the obs_names to ensure uniqueness 
combined_adata.obs_names = combined_adata.obs['sample'].astype(str) + ":" + combined_adata.obs_names

In [31]:
adata_var_with_more_info = adata_list[0].var.reset_index()

Retain the information in adata.var lost after concatenation: 

In [32]:
combined_adata.var = combined_adata.var.reset_index().merge(adata_var_with_more_info, on = "gene_id")
combined_adata.var_names = combined_adata.var['gene_name']
combined_adata.var = combined_adata.var.drop(columns = ["gene_name"])

# remove the spikeins 
combined_adata.var["spikein"] = combined_adata.var_names.str.startswith("gSpikein")
combined_adata = combined_adata[:, combined_adata.var["spikein"] == False]
combined_adata.var_names_make_unique()

AnnData expects .var.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)
  utils.warn_names_duplicates("var")


In [33]:
combined_adata.var = combined_adata.var.drop(columns = ["spikein"])

In [23]:
combined_adata.write("02_combined_ENCODE_LV_snRNA.h5ad")