### INTERVAL RNA-seq sample and gene QC for misexpression

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import pysam
from collections import Counter

In [2]:
# Root working directory; every derived reference/output path below hangs off it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)

# --- inputs ---
# tab-separated gene count matrix (rows keyed by ENSEMBL_ID, one column per RNA sample ID)
count_matrix_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq_n5188_v97/results/combined/5591-star-fc-genecounts.txt"
# header-less list of RNA sample IDs that passed sample QC (first column is read)
rna_id_pass_qc_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression/1_express_quant/sample_qc/pass_qc/rna_id_pass_qc.tsv"
# GENCODE annotation: must be collapsed, sorted and tabix indexed
gencode_gtf_path = wkdir_path / "reference/gencode/gencode.v31.annotation.collapsed.sorted.gtf.gz"
# directory holding the per-tissue GTEx "*normalized_expression.bed.gz" matrices
gtex_eqtl_dir = wkdir_path / "reference/gtex/eqtl/GTEx_Analysis_v8_eQTL_expression_matrices/"
# root of everything this notebook writes
out_dir = wkdir_path / "1_rna_seq_qc"

# --- tunable thresholds ---
# a sample fails QC when its number of top-expression events exceeds
# aberrant_multiple x the expected number per sample
aberrant_multiple = 5
# TPM value a gene must exceed in a sample to be counted as expressed in that sample
tpm_fract_cutoff = 0.1
# a gene is kept as inactive when the fraction of samples above tpm_fract_cutoff is below this
smpl_fraction = 0.05

In [3]:
# constants
# contig names chr1 .. chr22, in numeric order
AUTOSOMES = [f"chr{number}" for number in range(1, 23)]
# gene biotypes retained in the final gene set
GENE_TYPES = ["lncRNA", "protein_coding"]

In [4]:
# make sure the output directory exists (parents are created if needed)
out_dir_path = Path(out_dir)
out_dir_path.mkdir(parents=True, exist_ok=True)
# read in the count matrix: rows indexed by ENSEMBL_ID, one column per RNA sample ID
count_matrix_df = pd.read_csv(count_matrix_path, sep="\t").set_index("ENSEMBL_ID")

### sample QC

# keep samples with 2x reads and remove 1x samples from batch 1 and batch 15;
# mechanically: drop every column whose ID ends in "_1" or "_2"
rna_id_list = count_matrix_df.columns.tolist()
print(f"Number of samples in feature count matrix: {len(rna_id_list)}")
rna_id_to_keep = [rna_id for rna_id in rna_id_list if not rna_id.endswith(("_1", "_2"))]

# remove columns with run1 and run2 sample IDs
count_matrix_dplcts_rmvd_df = count_matrix_df[rna_id_to_keep]
print(f"Number of samples after removing samples ending in '_1' and '_2': {count_matrix_dplcts_rmvd_df.shape[1]}")

# swap samples: consecutive entries form a pair whose count columns are exchanged
swap_samples = ["INT_RNA7879032", "INT_RNA7879033", "INT_RNA7960192",
                "INT_RNA7960193", "INT_RNA7709692", "INT_RNA7709693",
                "INT_RNA7710161", "INT_RNA7710162", "INT_RNA7710163",
                "INT_RNA7710164"]
swap_pairs = list(zip(swap_samples[::2], swap_samples[1::2]))

# always read from the un-swapped frame and write into the copy, so one swap
# can never feed into another
count_matrix_dplcts_rmvd_swapped_df = count_matrix_dplcts_rmvd_df.copy()
for swap_1, swap_2 in swap_pairs:
    a, b = count_matrix_dplcts_rmvd_df.loc[:, swap_1], count_matrix_dplcts_rmvd_df.loc[:, swap_2]
    count_matrix_dplcts_rmvd_swapped_df.loc[:, swap_2], count_matrix_dplcts_rmvd_swapped_df.loc[:, swap_1] = a, b

# check swaps: each line printed is True when the swapped column equals its partner's original counts
print("Checking swaps ...")
for swap_1, swap_2 in swap_pairs:
    print(count_matrix_dplcts_rmvd_swapped_df[swap_1].tolist() == count_matrix_dplcts_rmvd_df[swap_2].tolist())
print("Done.")
# subset to samples passing QC (header-less file, IDs in the first column)
rna_id_pass_qc = pd.read_csv(rna_id_pass_qc_path, sep="\t", header=None)[0].tolist()
smpl_number = len(rna_id_pass_qc)
print(f"Number of samples passing sample QC: {smpl_number}")

# verify every passing sample is present BEFORE subsetting: selecting columns with a
# list that contains unknown labels raises KeyError, so a check placed after the
# subset would never get the chance to report missing samples
missing_rna_ids = set(rna_id_pass_qc) - set(count_matrix_dplcts_rmvd_swapped_df.columns)
if missing_rna_ids:
    raise ValueError("Samples passing QC missing from count matrix.")

# subset count matrix
count_matrix_dplcts_rmvd_swapped_pass_qc_df = count_matrix_dplcts_rmvd_swapped_df[rna_id_pass_qc]

# check: the column count can still differ from the ID list if the matrix carries
# duplicated column labels
if count_matrix_dplcts_rmvd_swapped_pass_qc_df.shape[1] != len(rna_id_pass_qc):
    raise ValueError("Samples passing QC missing from count matrix.")

# expose the gene IDs as a regular "gene_id" column for the gene QC step
fc_mtx_pass_smpl_qc_df = count_matrix_dplcts_rmvd_swapped_pass_qc_df.reset_index().rename(columns={"ENSEMBL_ID": "gene_id"})

Number of samples in feature count matrix: 5488
Number of samples after removing samples ending in '_1' and '_2': 4778
Checking swaps ...
True
True
True
True
True
Done.
Number of samples passing sample QC: 4731


In [5]:
### Gene QC
print("QCing genes ...")
gene_ids_in_fx_mtx = set(fc_mtx_pass_smpl_qc_df.gene_id)
print(f"- Number of gene ids in count matrix: {len(gene_ids_in_fx_mtx)}")

# globin gene symbols whose gene IDs are excluded from the analysis
globin_gene_symbols = ["HBA1","HBA2", "HBB", "HBD", "HBE1", "HBG1", "HBG2", "HBM", "HBQ1", "HBZ", "MB"]

# Single pass over the GENCODE "gene" records collecting, with the version suffix
# stripped from each gene ID:
#   - every gene ID
#   - IDs of the globin genes listed above
#   - IDs of genes whose gene_type contains "rRNA"
# The tabix handle is closed when the with-block exits.
gencode_gene_ids = []
globin_gene_ids = []
rRNA_gene_ids = []
with pysam.TabixFile(str(gencode_gtf_path)) as gencode_tbx:
    for gtf in gencode_tbx.fetch(parser=pysam.asGTF()):
        if gtf.feature != "gene":
            continue
        gene_id_no_vrsn = gtf.gene_id.split(".")[0]
        gencode_gene_ids.append(gene_id_no_vrsn)
        if gtf.gene_name in globin_gene_symbols:
            globin_gene_ids.append(gene_id_no_vrsn)
        if "rRNA" in gtf.gene_type:
            rRNA_gene_ids.append(gene_id_no_vrsn)

print(f"- Total gene IDs in gencode file: {len(gencode_gene_ids)}")
# remove duplicate gene IDs (stripping the version can make distinct records collide)
gencode_gene_ids_uniq = set(gencode_gene_ids)
print(f"- Total unique gene IDs in gencode file: {len(gencode_gene_ids_uniq)}")
# every globin symbol must have produced exactly one gene record
if len(globin_gene_symbols) != len(globin_gene_ids):
    raise ValueError("Missing globin gene IDs in gencode file.")
rRNA_globin_gene_ids = set(globin_gene_ids).union(set(rRNA_gene_ids))
# subset gene expression matrix to GENCODE genes that are neither globin nor rRNA
genes_to_keep = gencode_gene_ids_uniq - rRNA_globin_gene_ids
gene_number = len(genes_to_keep)
fc_mtx_pass_smpl_gene_qc_df = fc_mtx_pass_smpl_qc_df[fc_mtx_pass_smpl_qc_df.gene_id.isin(genes_to_keep)].set_index("gene_id")
# check: every gene to keep must have a row in the count matrix
if fc_mtx_pass_smpl_gene_qc_df.shape[0] != len(genes_to_keep):
    raise ValueError("Genes passing QC missing from count matrix.")
print(f"- Total genes remaining: {gene_number}")

QCing genes ...
- Number of gene ids in count matrix: 60617
- Total gene IDs in gencode file: 59752
- Total unique gene IDs in gencode file: 59708
- Total genes remaining: 59144


In [6]:
# persist the count matrix after sample and gene QC; the file name records both dimensions
fc_out_dir = out_dir_path / "fc_mtx"
fc_out_dir.mkdir(parents=True, exist_ok=True)
fc_mtx_path = fc_out_dir / f"fc_mtx_{smpl_number}samples_{gene_number}genes.csv"
fc_mtx_pass_smpl_gene_qc_df.to_csv(fc_mtx_path)

In [7]:
### calculate TPM
print("Calculating TPM ...")
# calculate total exon length per (versioned) gene ID;
# the tabix handle is closed when the with-block exits
gene_id_exon_len = {}
with pysam.TabixFile(str(gencode_gtf_path)) as gencode_tbx:
    for gtf in gencode_tbx.fetch(parser=pysam.asGTF()):
        if gtf.feature != "exon":
            continue
        gene_id = gtf.gene_id
        # NOTE(review): relies on pysam reporting GTF intervals as 0-based half-open,
        # so end - start is the exon length in bp - confirm against the pysam docs
        exon_len = int(gtf.end) - int(gtf.start)
        # check if there are zero length exons
        if exon_len == 0:
            raise ValueError(f"{gene_id} has zero length exon.")
        gene_id_exon_len[gene_id] = gene_id_exon_len.get(gene_id, 0) + exon_len

# convert exon length dictionary to dataframe and clean
gene_id_exon_len_df = pd.DataFrame.from_dict(gene_id_exon_len, orient="index", columns=["length_exons"]).reset_index(drop=False).rename(columns={"index": "gene_id"})
# drop the "PAR_Y" entries, then strip the version suffix from the gene ID;
# .assign builds a new frame instead of assigning into a filtered slice
gene_id_exon_len_no_vrsn_no_pary_df = (
    gene_id_exon_len_df
    .loc[~gene_id_exon_len_df.gene_id.str.endswith("PAR_Y")]
    .assign(gene_id=lambda exon_len_df: exon_len_df.gene_id.str.split(".").str[0])
)
# subset exon lengths to genes passing filters
gene_id_exon_len_gene_pass_qc_df = gene_id_exon_len_no_vrsn_no_pary_df[gene_id_exon_len_no_vrsn_no_pary_df.gene_id.isin(genes_to_keep)].set_index("gene_id").sort_index()

# check gene order is identical
if gene_id_exon_len_gene_pass_qc_df.index.tolist() != fc_mtx_pass_smpl_gene_qc_df.index.tolist():
    raise ValueError("Gene lists in count matrix and exon length matrix do not match.")

# compute RPK (reads per kilobase) - divide read count by exon length/1000;
# transposed so that rows are samples and columns are genes
rpk_df = fc_mtx_pass_smpl_gene_qc_df.div((gene_id_exon_len_gene_pass_qc_df.length_exons/1000), axis=0).T
# divide by each sample's RPK sum (per million) to calculate TPM
tpm_df = rpk_df.div(rpk_df.sum(axis=1)/1000000, axis=0)
# back to genes x samples
tpm_tp_df = tpm_df.T
# check every sample's TPM sums to 1 million (after rounding)
if not tpm_tp_df.sum(axis=0).round().unique().tolist() == [1000000.0]:
    raise ValueError("TPMs do not sum to 1,000,000.")
print("Done.")

Calculating TPM ...
Done.


In [8]:
# persist the genes x samples TPM matrix; the file name records both dimensions
tpm_out_dir = out_dir_path / "tpm_mtx"
tpm_out_dir.mkdir(parents=True, exist_ok=True)
tpm_mtx_path = tpm_out_dir / f"tpm_{smpl_number}samples_{gene_number}genes.csv"
tpm_tp_df.to_csv(tpm_mtx_path)

In [9]:
### aberrant sample QC
def _count_top_expression_events(tpm_df, sample_ids, count_col):
    """Count, per sample, the genes for which that sample has the highest TPM.

    Parameters
    ----------
    tpm_df : pandas.DataFrame
        Expression matrix with one row per gene and one column per sample.
    sample_ids : iterable of str
        Samples that must appear in the result; those that are never the top
        sample for any gene get a count of 0.
    count_col : str
        Name of the count column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``rna_id`` and ``count_col``: first the samples with at least
        one top-expression event, then the zero-count samples.
    """
    # idxmax returns, for each gene, the first column holding the row maximum
    top_smpl_counts = Counter(tpm_df.idxmax(axis=1).tolist())
    no_event_counts = {rna_id: 0 for rna_id in sample_ids if rna_id not in top_smpl_counts}
    count_dfs = [
        pd.DataFrame.from_dict(counts, orient="index", columns=[count_col])
        .reset_index(drop=False)
        .rename(columns={"index": "rna_id"})
        for counts in (top_smpl_counts, no_event_counts)
    ]
    return pd.concat(count_dfs).reset_index(drop=True)


print("Removing global overexpression outlier samples ...")
# remove genes that have TPM = 0 for all samples
tpm_no_zeroes_df = tpm_tp_df.loc[~(tpm_tp_df==0).all(axis=1)]
genes_with_max_express = tpm_no_zeroes_df.shape[0]
print(f"Number of genes to count top expression events: {genes_with_max_express}")
# count, for each sample, the genes where it is the top-expressing sample
max_gene_per_smpl_genes_all_df = _count_top_expression_events(tpm_no_zeroes_df, rna_id_pass_qc, "count")
print(f"Total top expression events: {max_gene_per_smpl_genes_all_df['count'].sum()}")

# if multiple columns share the same max value, idxmax picks the first such column;
# check that this tie-breaking does not change the number of top expression events
# per sample by repeating the count with the column order reversed
tpm_no_zeroes_df_rev = tpm_no_zeroes_df.iloc[:, ::-1]
max_gene_per_smpl_genes_all_rev_df = _count_top_expression_events(tpm_no_zeroes_df_rev, rna_id_pass_qc, "count_rev")
max_gene_per_smpl_genes_all_rev_merged_df = pd.merge(max_gene_per_smpl_genes_all_df, max_gene_per_smpl_genes_all_rev_df, how="inner", on="rna_id")
max_gene_per_smpl_genes_all_rev_merged_df["identical"] = (
    max_gene_per_smpl_genes_all_rev_merged_df["count"] == max_gene_per_smpl_genes_all_rev_merged_df["count_rev"]
)
not_identical = max_gene_per_smpl_genes_all_rev_merged_df[~max_gene_per_smpl_genes_all_rev_merged_df["identical"]].shape[0]
print(f"Differences in top expression events: {not_identical}")
# expected top-expression events per sample if they were spread evenly across samples
exp_max_express = genes_with_max_express/smpl_number

Removing global overexpression outlier samples ...
Number of genes to count top expression events: 57555
Total top expression events: 57555
Differences in top expression events: 0


In [10]:
# aberrant sample QC metrics
print(f"Expected number of top gene expression outliers: {exp_max_express}")
# a sample fails when its top-event count exceeds aberrant_multiple x the expectation
max_express_cutoff = aberrant_multiple * exp_max_express
print(f"Top gene expression outliers cutoff: {max_express_cutoff}")
exceeds_cutoff = max_gene_per_smpl_genes_all_df["count"] > max_express_cutoff
rna_id_fail_qc = max_gene_per_smpl_genes_all_df.loc[exceeds_cutoff, "rna_id"].tolist()
num_smpls_fail_qc = len(rna_id_fail_qc)
print(f"Number of samples failing QC: {num_smpls_fail_qc}")
print(f"Percentage of samples failing QC: {(num_smpls_fail_qc/smpl_number)*100}")
# keep samples passing QC, i.e. every sample ID not on the fail list
in_fail_list = max_gene_per_smpl_genes_all_df["rna_id"].isin(rna_id_fail_qc)
rna_ids_pass_qc = max_gene_per_smpl_genes_all_df.loc[~in_fail_list, "rna_id"].unique()
num_rna_id_pass_qc = len(rna_ids_pass_qc)
print(f"Number of samples passing QC: {num_rna_id_pass_qc}")
# label each sample; a sample is "Pass" exactly when it is not on the fail list
max_gene_per_smpl_genes_all_df["pass_qc"] = np.where(in_fail_list, "Fail", "Pass")
# subset expression matrix to the passing samples
tpm_smpl_qc_df = tpm_tp_df.loc[:, rna_ids_pass_qc]
print("Done.")

Expected number of top gene expression outliers: 12.165504121750159
Top gene expression outliers cutoff: 60.827520608750795
Number of samples failing QC: 163
Percentage of samples failing QC: 3.4453603889241173
Number of samples passing QC: 4568
Done.


In [11]:
# write the per-sample top-expression counts, highest count first
aberrant_smpl_qc_dir = out_dir_path / "aberrant_smpl_qc"
aberrant_smpl_qc_dir.mkdir(parents=True, exist_ok=True)
aberrant_smpl_qc_path = aberrant_smpl_qc_dir / "aberrant_gene_count.csv"
max_gene_per_smpl_genes_all_srtd_df = max_gene_per_smpl_genes_all_df.sort_values(by="count", ascending=False)
max_gene_per_smpl_genes_all_srtd_df.to_csv(aberrant_smpl_qc_path, index=False)

In [12]:
# write the IDs of samples passing QC, one per line; the file name records how many
smpls_pass_qc_path = aberrant_smpl_qc_dir / f"smpls_pass_qc_{num_rna_id_pass_qc}.csv"
with open(smpls_pass_qc_path, 'w') as f_out:
    f_out.writelines(f"{rna_id}\n" for rna_id in rna_ids_pass_qc)

In [13]:
# write the TPM matrix restricted to samples that passed the aberrant-sample QC
tpm_post_smpl_qc_path = tpm_out_dir / f"tpm_{num_rna_id_pass_qc}samples_{gene_number}genes_smpl_qc.csv"
tpm_smpl_qc_df.to_csv(tpm_post_smpl_qc_path)

In [14]:
# write the count matrix restricted to samples that passed the aberrant-sample QC
fc_mtx_gene_qc_smpl_qc_pass_df = fc_mtx_pass_smpl_gene_qc_df.loc[:, rna_ids_pass_qc]
fc_mtx_smpl_qc_path = fc_out_dir / f"fc_mtx_{num_rna_id_pass_qc}samples_{gene_number}genes.csv"
fc_mtx_gene_qc_smpl_qc_pass_df.to_csv(fc_mtx_smpl_qc_path)

In [15]:
### inactive gene identification

# subset to genes passing eQTL QC in at least one GTEx tissue
gtex_eqtl_path = Path(gtex_eqtl_dir)
# sorted so the tissue order - and with it the row order of the per-gene tissue
# counts built below - does not depend on filesystem listing order
normalised_gene_exp_path_list = sorted(gtex_eqtl_path.glob("*normalized_expression.bed.gz"))
if not normalised_gene_exp_path_list:
    raise FileNotFoundError(f"No '*normalized_expression.bed.gz' files found in {gtex_eqtl_path}")
# collect genes passing eQTL QC in at least one tissue; only the gene_id column is
# needed, so the remaining columns of each matrix are not parsed
gene_ids_passed_eqtl_qc_list = []
for tissue_normalised_exp_path in normalised_gene_exp_path_list:
    gene_ids_passed_eqtl_qc_list += pd.read_csv(tissue_normalised_exp_path, sep="\t", usecols=["gene_id"]).gene_id.tolist()

# number of tissue files each (versioned) gene ID appears in
gene_pass_eqtl_count_df = pd.DataFrame.from_dict(Counter(gene_ids_passed_eqtl_qc_list), orient="index", columns=["tissues_pass_eqtl_qc"])
gene_pass_eqtl_count_idx_df = gene_pass_eqtl_count_df.reset_index().rename(columns={"index":"gene_id_vrsn"})
gene_pass_eqtl_count_idx_df["gene_id"] = gene_pass_eqtl_count_idx_df.gene_id_vrsn.str.split(".").str[0]

# set keeping gene version
gene_ids_passed_eqtl_set = set(gene_ids_passed_eqtl_qc_list)
# set without gene version
gene_ids_passed_eqtl_no_vrsn_set = {gene_id.split(".")[0] for gene_id in gene_ids_passed_eqtl_qc_list}
print(f"Number of genes passing eQTL QC in at least one tissue in GTEx: {len(gene_ids_passed_eqtl_no_vrsn_set)}")
# genes that passed gene QC in this dataset AND appear in at least one GTEx tissue file
genes_to_keep_pass_eqtl = genes_to_keep.intersection(gene_ids_passed_eqtl_no_vrsn_set)
print(f"Number of genes passing gene QC with eQTL in at least one tissue: {len(genes_to_keep_pass_eqtl)}")

Number of genes passing eQTL QC in at least one tissue in GTEx: 39832
Number of genes passing gene QC with eQTL in at least one tissue: 39402


In [16]:
# write, per gene, the number of GTEx tissue files it appears in
gtex_pass_eqtl_dir = out_dir_path.joinpath("gtex_pass_eqtl_qc")
Path(gtex_pass_eqtl_dir).mkdir(parents=True, exist_ok=True)
gtex_pass_eqtl_count_path = gtex_pass_eqtl_dir.joinpath("gene_pass_eqtl_count.csv")
gene_pass_eqtl_count_idx_df.to_csv(gtex_pass_eqtl_count_path, sep=",", index=False)

# subset to protein-coding and lncRNA genes on autosomes;
# the tabix handle is closed when the with-block exits
genes_pass_filters = []
with pysam.TabixFile(str(gencode_gtf_path)) as gencode_tbx:
    for gtf in gencode_tbx.fetch(parser=pysam.asGTF()):
        if gtf.feature != "gene":
            continue
        gene_id_no_vrsn = gtf.gene_id.split(".")[0]
        if gene_id_no_vrsn not in genes_to_keep_pass_eqtl:
            continue
        if gtf.contig in AUTOSOMES and gtf.gene_type in GENE_TYPES:
            genes_pass_filters.append(gene_id_no_vrsn)
if len(genes_pass_filters) != len(set(genes_pass_filters)):
    raise ValueError("Duplicates in final gene set.")
print(f"Number of genes passing all filters: {len(genes_pass_filters)}")
# subset expression matrix
tpm_smpl_gene_qc_df = tpm_smpl_qc_df[tpm_smpl_qc_df.index.isin(genes_pass_filters)]
# per gene: fraction of QC-passing samples whose TPM is ABOVE tpm_fract_cutoff
tpm_fract_df = tpm_smpl_gene_qc_df.reset_index(drop=False)[["gene_id"]].copy()
tpm_fract_df[f"tpm{tpm_fract_cutoff}_fract"] = ((tpm_smpl_gene_qc_df > tpm_fract_cutoff).sum(axis=1)/num_rna_id_pass_qc).tolist()
# inactive genes: TPM exceeds the cutoff in FEWER than smpl_fraction of samples
# (with the defaults: TPM > 0.1 in less than 5% of samples)
genes_tpm01_fract_grtr95 = set(tpm_fract_df[tpm_fract_df[f"tpm{tpm_fract_cutoff}_fract"] < smpl_fraction].gene_id)
num_inactive_genes = len(genes_tpm01_fract_grtr95)
# the message states the direction of the filter explicitly ("in < X% samples")
print(f"Genes with TPM > {tpm_fract_cutoff} in < {smpl_fraction*100}% samples: {num_inactive_genes}")
# subset expression matrix to the inactive genes
tpm_inactive_df = tpm_smpl_gene_qc_df[tpm_smpl_gene_qc_df.index.isin(genes_tpm01_fract_grtr95)]

Number of genes passing all filters: 29614
Genes with TPM > 0.1 in 5.0% samples: 8779


In [17]:
# write, per gene, the fraction of samples with TPM above the cutoff
gene_set_dir = out_dir_path.joinpath("gene_sets")
Path(gene_set_dir).mkdir(parents=True, exist_ok=True)
gene_tpm_fract_path = gene_set_dir.joinpath(f"gene_tpm{tpm_fract_cutoff}_fract.csv")
tpm_fract_df.to_csv(gene_tpm_fract_path, index=False)
# write inactive gene set to file, one gene ID per line;
# sorted because iterating a set of strings directly gives an order that can
# change from one interpreter run to the next, making the file non-reproducible
inactive_gene_path = gene_set_dir.joinpath(f"inactive_genes_{num_inactive_genes}.txt")
with open(inactive_gene_path, "w") as f_out:
    for gene_id in sorted(genes_tpm01_fract_grtr95):
        f_out.write(f"{gene_id}\n")

In [18]:
# write the TPM matrix of the inactive genes as a tab-separated file
inactive_tpm_out_dir = out_dir_path / "tpm_mtx_inactive"
inactive_tpm_out_dir.mkdir(parents=True, exist_ok=True)
inactive_tpm_mtx_path = inactive_tpm_out_dir / f"tpm_{num_rna_id_pass_qc}samples_{num_inactive_genes}genes_inactive.tsv"
tpm_inactive_df.to_csv(inactive_tpm_mtx_path, sep="\t")

In [19]:
### Gene expression z-score calculation

# drop genes whose TPM is 0 in every sample
is_all_zero_gene = (tpm_inactive_df == 0).all(axis=1)
tpm_inactive_rmv_all_zeroes_df = tpm_inactive_df.loc[~is_all_zero_gene]
inactive_genes_rmv_all_zeroes = tpm_inactive_rmv_all_zeroes_df.index.unique()
num_inactive_genes_rmv_all_zeroes = len(inactive_genes_rmv_all_zeroes)
print(f"Number of inactive genes with at least one non-zero TPM: {num_inactive_genes_rmv_all_zeroes}")
# calculate gene z-scores: transpose so each gene is a column, standardise every
# column with its own mean and standard deviation, then transpose back
tpm_inactive_tp_df = tpm_inactive_rmv_all_zeroes_df.T
gene_means = tpm_inactive_tp_df.mean()
gene_stds = tpm_inactive_tp_df.std()
zscore_tp_df = tpm_inactive_tp_df.sub(gene_means).div(gene_stds)
zscore_df = zscore_tp_df.T

Number of inactive genes with at least one non-zero TPM: 8739


In [20]:
# reshape both matrices to long form: one row per (gene_id, rna_id) pair
z_score_flat_df = (
    zscore_df
    .melt(ignore_index=False)
    .reset_index(drop=False)
    .rename(columns={"variable":"rna_id", "value":"z-score"})
)
tpm_inactive_flat_df = (
    tpm_inactive_rmv_all_zeroes_df
    .melt(ignore_index=False)
    .reset_index(drop=False)
    .rename(columns={"variable":"rna_id", "value":"TPM"})
)
# put TPM and z-score side by side for every gene/sample pair
tpm_zscore_inactive_flat_df = tpm_inactive_flat_df.merge(z_score_flat_df, on=["gene_id", "rna_id"], how="inner")
# write the combined long-form table; the file name records the dimensions and thresholds used
z_tpm_flat_out_dir = out_dir_path / "zscore_tpm_flat"
z_tpm_flat_out_dir.mkdir(parents=True, exist_ok=True)
tpm_zscore_path_out = z_tpm_flat_out_dir / f"tpm_zscore_{num_rna_id_pass_qc}smpls_{num_inactive_genes_rmv_all_zeroes}genes_tpm{tpm_fract_cutoff}_frac_{smpl_fraction*100}perc_flat.csv"
tpm_zscore_inactive_flat_df.to_csv(tpm_zscore_path_out, index=False)