### Features of misexpression-associated variants 

* Curate a set of misexpression-associated variant features: 
    * Variant-level VEP most severe consequence (MSC)
    * Gene-level VEP most severe consequence (MSC)
    * Position 
    * Genomic scores 
    * Functional scores 
    * Gene-sample information 
    * SV allele frequency (AF) and allele count (AC)
    * Gene information

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Working directory for this analysis; every input below except the SV info
# table is resolved relative to it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)

misexp_cntrl_sv_dir = wkdir_path / "5_misexp_vrnts/test_cntrl_sets"

# Gene-level VEP MSC for each misexpression variant-gene pair.
misexp_vrnt_gene_msc_path = misexp_cntrl_sv_dir / "misexp_vrnt_gene_msc_consq.tsv"
# VEP MSC for all SVs.
vep_msc_path = wkdir_path / "4_vrnt_enrich/sv_vep/msc/SV_vep_hg38_msc_parsed.tsv"
# Variant position relative to the gene.
misexp_vrnt_position_path = misexp_cntrl_sv_dir / "misexp_vrnt_gene_position.tsv"
# Genomic scores per variant.
vrnt_scores_path = wkdir_path / "5_misexp_vrnts/scores/features/vrnt_features_scores.csv"
# Functional (regulatory annotation) scores per variant.
vrnt_functional_scores_path = wkdir_path / "5_misexp_vrnts/functional/features/vrnt_features_reg_annot.csv"
# SV site info table; the AF and AC columns are taken from it downstream.
# Kept as a plain string because it lives outside the working directory.
sv_info_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt"
# Gene-level features.
inactive_gene_features_path = wkdir_path / "3_misexp_genes/inactive_gene_features_8650.csv"
# Gene-sample information for each misexpression variant-gene pair.
misexp_vrnt_gene_smpl_path = misexp_cntrl_sv_dir / "misexp_vrnt_gene_smpl.tsv"

In [3]:
# Output directory for this notebook; created (with parents) if missing and
# left untouched if it already exists.
out_dir = wkdir_path / "6_misexp_dissect/vrnt_features"
out_dir.mkdir(exist_ok=True, parents=True)

In [4]:
# Read every input table. Columns are renamed on load so the variant key is
# always "vrnt_id" and the two consequence columns ("gene_msc", "msc") do not
# collide when the tables are joined.
misexp_vrnt_gene_msc_df = (
    pd.read_csv(misexp_vrnt_gene_msc_path, sep="\t")
    .rename(columns={"consequence": "gene_msc"})
)
vep_msc_df = (
    pd.read_csv(vep_msc_path, sep="\t")
    .rename(columns={"Uploaded_variation": "vrnt_id", "Consequence": "msc"})
)
misexp_vrnt_position_df = pd.read_csv(misexp_vrnt_position_path, sep="\t")
vrnt_scores_df = pd.read_csv(vrnt_scores_path, sep=",")
vrnt_functional_scores_df = pd.read_csv(vrnt_functional_scores_path, sep=",")
# plinkID is forced to string before being renamed to the shared variant key.
sv_info_df = (
    pd.read_csv(sv_info_path, sep="\t", dtype={"plinkID": str})
    .rename(columns={"plinkID":"vrnt_id"})
)
inactive_gene_features_df = pd.read_csv(inactive_gene_features_path, sep=",")
misexp_vrnt_gene_smpl_df = pd.read_csv(misexp_vrnt_gene_smpl_path, sep="\t")

In [5]:
# Column subsets pulled from each annotation table when building the
# feature matrix. Each list includes the key column(s) used for the join.

# Columns taken from the genomic scores table.
vrnt_scores_to_include = [
    "vrnt_id", "CADD_sv_raw_score", "phylop_max",
    "gnomad_constraint_max_zscore", "gwrvis_min", "intersect_har",
]

# Columns taken from the functional scores table.
functional_scores_to_include = [
    "vrnt_id",
    "gm12878_shared_intersect_tad_boundary", "A_overlap", "B_overlap",
    "CTCFonlyCTCFbound_all", "CTCFonlyCTCFbound_CD14_monocyte",
    "HighCTCF_B_cell", "HighCTCF_Neutrophil",
    "intersect_cpg_isl",
    "TssA", "TssAFlnk", "TxFlnk", "Tx", "TxWk", "EnhG", "Enh", "ZNFRpts",
    "Het", "TssBiv", "BivFlnk", "EnhBiv", "ReprPC", "ReprPCWk", "Quies",
]

# Columns taken from the gene-level features table.
gene_features_to_include = [
    "gene_id", "oncogene", "Episcore", "pHaplo", "pTriplo", "pLI", "pNull",
    "EDS", "oe_lof_upper", "approved_target", "decipher_gene", "omim_gene",
]

# Columns taken from the variant-gene-sample table.
misexp_smpl_info_to_keep = [
    "vrnt_id", "gene_id", "egan_id", "rna_id", "TPM", "z-score", "genotype",
]

In [6]:
# Build the feature table through a sequence of inner joins, starting from the
# gene-level MSC table and attaching one annotation source per step.
# NOTE(review): every join is an inner join and only the final row count is
# checked, so rows dropped in one step and duplicated in another could cancel
# out unnoticed - consider pandas' merge `validate=` / `indicator=` options.

# 1. variant-level VEP MSC
misexp_vrnt_vep_msc_df = misexp_vrnt_gene_msc_df.merge(
    vep_msc_df[["vrnt_id", "msc"]],
    how="inner",
    on="vrnt_id",
)

# 2. variant position relative to the misexpressed gene; SVTYPE and
#    consequence are dropped from the position table before joining
misexp_vrnt_vep_msc_pos_df = misexp_vrnt_vep_msc_df.merge(
    misexp_vrnt_position_df.drop(columns=["SVTYPE", "consequence"]),
    how="inner",
    on=["vrnt_id", "gene_id"],
)

# 3. variant genomic scores
misexp_vrnt_vep_msc_pos_scores_df = misexp_vrnt_vep_msc_pos_df.merge(
    vrnt_scores_df[vrnt_scores_to_include],
    how="inner",
    on="vrnt_id",
)

# 4. variant functional scores
misexp_vrnt_func_added_df = misexp_vrnt_vep_msc_pos_scores_df.merge(
    vrnt_functional_scores_df[functional_scores_to_include],
    how="inner",
    on="vrnt_id",
)

# 5. SV allele frequency and allele count
misexp_vrnt_maf_ac_added_df = misexp_vrnt_func_added_df.merge(
    sv_info_df[["vrnt_id", "AF", "AC"]],
    how="inner",
    on="vrnt_id",
)

# 6. properties of the misexpressed gene
misexp_vrnt_gene_features_added_df = misexp_vrnt_maf_ac_added_df.merge(
    inactive_gene_features_df[gene_features_to_include],
    how="inner",
    on="gene_id",
)

# 7. per-sample information for each variant-gene pair
misexp_vrnt_features_final_df = misexp_vrnt_gene_features_added_df.merge(
    misexp_vrnt_gene_smpl_df[misexp_smpl_info_to_keep],
    how="inner",
    on=["vrnt_id", "gene_id"],
)

# Sanity check: the final table must have exactly as many rows as the
# variant-gene-sample input table.
n_input_rows = misexp_vrnt_gene_smpl_df.shape[0]
n_final_rows = misexp_vrnt_features_final_df.shape[0]
if n_final_rows != n_input_rows:
    raise ValueError("Input dataframe has different number of vrnt-gene-samples")

In [7]:
# Write the combined feature table to the output directory as a tab-separated
# file, without the DataFrame index.
# The file name is a plain string literal: it has no placeholders, so an
# f-string prefix would be redundant.
misexp_vrnt_features_final_path = out_dir.joinpath("misexp_vrnt_features.tsv")
misexp_vrnt_features_final_df.to_csv(misexp_vrnt_features_final_path, sep="\t", index=False)

**Check no sample swaps for bams**

In [8]:
# RNA sample IDs treated as swapped (the source of this list is not shown in
# this notebook); report whether any of them appear in the final feature table.
swap_samples = {
    "INT_RNA7879032", "INT_RNA7879033",
    "INT_RNA7960192", "INT_RNA7960193",
    "INT_RNA7709692", "INT_RNA7709693",
    "INT_RNA7710161", "INT_RNA7710162",
    "INT_RNA7710163", "INT_RNA7710164",
}
misexp_rna_ids = set(misexp_vrnt_features_final_df["rna_id"].unique())
swapped_in_df = misexp_rna_ids & swap_samples
if swapped_in_df:
    print(f"Swapped samples in dataframe: {swapped_in_df}")
else:
    print("No swapped samples in dataframe.")

No swapped samples in dataframe.


**EGAN IDs for DUP cram files**

In [9]:
# Unique EGAN IDs among rows of the final feature table whose SVTYPE is "DUP".
is_dup = misexp_vrnt_features_final_df["SVTYPE"] == "DUP"
dup_misexp_egan_ids = misexp_vrnt_features_final_df.loc[is_dup, "egan_id"].unique()
print(f"Number of misexpression-associated DUP carriers: {len(dup_misexp_egan_ids)}")

Number of misexpression-associated DUP carriers: 22


In [10]:
# Write the DUP carrier EGAN IDs to a text file, one ID per line.
dup_misexp_egan_id_path = out_dir / "misexp_dup_carriers_egan_ids.txt"
with open(dup_misexp_egan_id_path, "w") as f_out:
    f_out.writelines(f"{egan_id}\n" for egan_id in dup_misexp_egan_ids)