## Encoding feature names for the XGBoost Classifier.

In [2]:
import pandas as pd
import json, csv, sys, os

### Encoding the X_train dataset's features.

In [3]:
# Load the 500-column training matrix.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk (which can yield mixed-type columns), and matches
# how the X_test file is loaded later in this notebook.
df = pd.read_csv('../data/X_train_500.csv', low_memory=False)

In [4]:
# Sanity check: show the frame's dimensions as (rows, columns).
print(df.shape)

(25546, 500)


In [5]:
# List the original column names in file order; the encoding step below
# numbers the columns in exactly this order.
print(df.columns.tolist())

['ID', 'IMPACT', 'BIOTYPE', 'CADD_raw', 'BLOSUM62', 'Conservation', 'ProteinLengthChange', 'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref', 'TSSDistance', 'ada_score', 'rf_score', '1000Gp3_AF', 'FATHMM_score', 'GERPplus_plus_NR', 'GERPplus_plus_RS', 'GM12878_fitCons_score', 'GenoCanyon_score', 'H1_hESC_fitCons_score', 'HUVEC_fitCons_score', 'LINSIGHT', 'LIST_S2_score', 'LRT_score', 'M_CAP_score', 'MPC_score', 'MVP_score', 'MutationAssessor_score', 'MutationTaster_score', 'PROVEAN_score', 'SiPhy_29way_logOdds', 'UK10K_AF', 'VEST4_score', 'fathmm_MKL_coding_score', 'fathmm_XF_coding_score', 'gnomAD_exomes_AF', 'gnomAD_genomes_AF', 'integrated_fitCons_score', 'phastCons100way_vertebrate', 'phastCons17way_primate', 'phastCons30way_mammalian', 'phyloP100way_vertebrate', 'phyloP17way_primate', 'phyloP30way_mammalian', 'NMD', 'GDI', 'MSC_95CI', 'rel_cDNA_pos', 'rel_CDS_pos', 'rel_prot_pos', 'Selective_pressure', 'Clarks_distance', 'CDS_len', 'Number_of_paralogs', 'denovo_Zscore', 'RVIS'

In [6]:
def encode_names(df, mapping_file='feature_name_mapping.json'):
    """Rename every column of ``df`` to a positional ``feature<i>`` alias.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be encoded. It is not modified.
    mapping_file : str, optional
        Path of the JSON file that receives the alias -> original-name
        mapping. Defaults to ``'feature_name_mapping.json'`` (relative to the
        current working directory). An existing file is overwritten.

    Returns
    -------
    df_encoded : pandas.DataFrame
        Copy of ``df`` whose i-th column is named ``feature<i>``.
    name_dict : dict
        Maps each alias (``'feature0'``, ``'feature1'``, ...) to the original
        name of the column at that position.
    """
    original_names = df.columns.tolist()
    name_dict = {f"feature{i}": name for i, name in enumerate(original_names)}
    # Assign the aliases by position rather than through a name -> alias
    # lookup: a reverse lookup keeps only one alias per original name, so
    # duplicated column names would all end up with the same alias.
    df_encoded = df.copy()
    df_encoded.columns = list(name_dict)
    with open(mapping_file, 'w') as f:
        json.dump(name_dict, f, indent=2)
    return df_encoded, name_dict
# Encode the X_train columns; as a side effect this writes
# 'feature_name_mapping.json' in the current working directory.
encoded_df, feature_dict = encode_names(df)
# Show the alias -> original-name mapping.
print(feature_dict)

{'feature0': 'ID', 'feature1': 'IMPACT', 'feature2': 'BIOTYPE', 'feature3': 'CADD_raw', 'feature4': 'BLOSUM62', 'feature5': 'Conservation', 'feature6': 'ProteinLengthChange', 'feature7': 'MaxEntScan_alt', 'feature8': 'MaxEntScan_diff', 'feature9': 'MaxEntScan_ref', 'feature10': 'TSSDistance', 'feature11': 'ada_score', 'feature12': 'rf_score', 'feature13': '1000Gp3_AF', 'feature14': 'FATHMM_score', 'feature15': 'GERPplus_plus_NR', 'feature16': 'GERPplus_plus_RS', 'feature17': 'GM12878_fitCons_score', 'feature18': 'GenoCanyon_score', 'feature19': 'H1_hESC_fitCons_score', 'feature20': 'HUVEC_fitCons_score', 'feature21': 'LINSIGHT', 'feature22': 'LIST_S2_score', 'feature23': 'LRT_score', 'feature24': 'M_CAP_score', 'feature25': 'MPC_score', 'feature26': 'MVP_score', 'feature27': 'MutationAssessor_score', 'feature28': 'MutationTaster_score', 'feature29': 'PROVEAN_score', 'feature30': 'SiPhy_29way_logOdds', 'feature31': 'UK10K_AF', 'feature32': 'VEST4_score', 'feature33': 'fathmm_MKL_coding_

In [7]:
# Export to CSV.
encoded_df.to_csv('../data/X_train_encoded.csv', index=False)

print("Exported to '../data/X_train_encoded.csv'")

Exported to '../data/X_train_encoded.csv'


### Encoding the NEGONE and MEDIAN feature lists for the preprocessing pipeline.

In [8]:
NEGONE_FEATURES = ['MOD_RES','REGION','INTERACTION_REGION','REQUIRED_FOR_INTER','ATP_binding_gbind','Ca2+_binding_gbind','DNA_binding_gbind','HEME_binding_gbind','Mg2+_binding_gbind','Mn2+_binding_gbind','RNA_binding_gbind','Dist2Mutation','BLOSUM62','ProteinLengthChange','TSSDistance','1000Gp3_AF','UK10K_AF','gnomAD_exomes_AF','gnomAD_genomes_AF','MSC_95CI','rel_cDNA_pos','rel_CDS_pos','rel_prot_pos','GDI','Selective_pressure','Clarks_distance','CDS_len','Number_of_paralogs','denovo_Zscore','RVIS','Indispensability_score','RSA','ASA','RSA_Zfit','before_RSA_3','before_RSA_8','before_RSA_15','after_RSA_3','after_RSA_8','after_RSA_15','before_ASA_3','before_ASA_8','before_ASA_15','after_ASA_3','after_ASA_8','after_ASA_15','Phosphorylation','Acetylation','Methylation','Ubiquitination','Glycosylation','PTM','AF_Relative_ASA','IUPRED2','ANCHOR2','before_IUPRED_3','before_IUPRED_8','before_IUPRED_15','after_IUPRED_3','after_IUPRED_8','after_IUPRED_15','before_ANCHOR_3','before_ANCHOR_8','before_ANCHOR_15','after_ANCHOR_3','after_ANCHOR_8','after_ANCHOR_15','A3D_SCORE','n_contacts','distance_com','concavity_score','S_DDG[SEQ]','S_DDG[3D]','hgmd_mutcount','gnomsingle_mutcount','gnom_mutcount','AF_confidence','isHomomultimer','num_interactions','ppi_combined_0','ppi_combined_1','ppi_combined_2','ppi_combined_3','ppi_combined_4','ppi_combined_5','ppi_combined_6','ppi_combined_7','ppi_combined_8','ppi_combined_9','ppi_combined_10','ppi_combined_11','ppi_combined_12','ppi_combined_13','ppi_combined_14','ppi_combined_15','ppi_combined_16','ppi_combined_17','ppi_combined_18','ppi_combined_19','ppi_combined_20','ppi_combined_21','ppi_combined_22','ppi_combined_23','ppi_combined_24','ppi_combined_25','ppi_combined_26','ppi_combined_27','ppi_combined_28','ppi_combined_29','ppi_combined_30','ppi_combined_31','ppi_combined_32','ppi_combined_33','ppi_combined_34','ppi_combined_35','ppi_combined_36','ppi_combined_37','ppi_combined_38','ppi_combined_39','ppi_combined_40','ppi_combined_41
','ppi_combined_42','ppi_combined_43','ppi_combined_44','ppi_combined_45','ppi_combined_46','ppi_combined_47','ppi_combined_48','ppi_combined_49','ppi_combined_50','ppi_combined_51','ppi_combined_52','ppi_combined_53','ppi_combined_54','ppi_combined_55','ppi_combined_56','ppi_combined_57','ppi_combined_58','ppi_combined_59','ppi_combined_60','ppi_combined_61','ppi_combined_62','ppi_combined_63','s_het','DRNApredDNAscore_aa_window_3_prev','DRNApredDNAscore_aa_window_8_prev','DRNApredDNAscore_aa_window_15_prev','DRNApredDNAscore_aa_window_3_next','DRNApredDNAscore_aa_window_8_next','DRNApredDNAscore_aa_window_15_next','DRNApredDNAscore_aa','ASAquick_normscore_aa_window_3_prev','ASAquick_normscore_aa_window_8_prev','ASAquick_normscore_aa_window_15_prev','ASAquick_normscore_aa_window_3_next','ASAquick_normscore_aa_window_8_next','ASAquick_normscore_aa_window_15_next','ASAquick_normscore_aa','ASAquick_rawscore_aa_window_3_prev','ASAquick_rawscore_aa_window_8_prev','ASAquick_rawscore_aa_window_15_prev','ASAquick_rawscore_aa_window_3_next','ASAquick_rawscore_aa_window_8_next','ASAquick_rawscore_aa_window_15_next','ASAquick_rawscore_aa','DFLpredScore_aa_window_3_prev','DFLpredScore_aa_window_8_prev','DFLpredScore_aa_window_15_prev','DFLpredScore_aa_window_3_next','DFLpredScore_aa_window_8_next','DFLpredScore_aa_window_15_next','DFLpredScore_aa','DRNApredRNAscore_aa_window_3_prev','DRNApredRNAscore_aa_window_8_prev','DRNApredRNAscore_aa_window_15_prev','DRNApredRNAscore_aa_window_3_next','DRNApredRNAscore_aa_window_8_next','DRNApredRNAscore_aa_window_15_next','DRNApredRNAscore_aa','DisoDNAscore_aa_window_3_prev','DisoDNAscore_aa_window_8_prev','DisoDNAscore_aa_window_15_prev','DisoDNAscore_aa_window_3_next','DisoDNAscore_aa_window_8_next','DisoDNAscore_aa_window_15_next','DisoDNAscore_aa','DisoPROscore_aa_window_3_prev','DisoPROscore_aa_window_8_prev','DisoPROscore_aa_window_15_prev','DisoPROscore_aa_window_3_next','DisoPROscore_aa_window_8_next','DisoPROscore_aa_window_15_n
ext','DisoPROscore_aa','DisoRNAscore_aa_window_3_prev','DisoRNAscore_aa_window_8_prev','DisoRNAscore_aa_window_15_prev','DisoRNAscore_aa_window_3_next','DisoRNAscore_aa_window_8_next','DisoRNAscore_aa_window_15_next','DisoRNAscore_aa','MMseq2_conservation_level_aa_window_3_prev','MMseq2_conservation_level_aa_window_8_prev','MMseq2_conservation_level_aa_window_15_prev','MMseq2_conservation_level_aa_window_3_next','MMseq2_conservation_level_aa_window_8_next','MMseq2_conservation_level_aa_window_15_next','MMseq2_conservation_level_aa','MMseq2_conservation_score_aa_window_3_prev','MMseq2_conservation_score_aa_window_8_prev','MMseq2_conservation_score_aa_window_15_prev','MMseq2_conservation_score_aa_window_3_next','MMseq2_conservation_score_aa_window_8_next','MMseq2_conservation_score_aa_window_15_next','MMseq2_conservation_score_aa','MoRFchibiScore_aa_window_3_prev','MoRFchibiScore_aa_window_8_prev','MoRFchibiScore_aa_window_15_prev','MoRFchibiScore_aa_window_3_next','MoRFchibiScore_aa_window_8_next','MoRFchibiScore_aa_window_15_next','MoRFchibiScore_aa','PSIPRED_helix_aa_window_3_prev','PSIPRED_helix_aa_window_8_prev','PSIPRED_helix_aa_window_15_prev','PSIPRED_helix_aa_window_3_next','PSIPRED_helix_aa_window_8_next','PSIPRED_helix_aa_window_15_next','PSIPRED_helix_aa','PSIPRED_strand_aa_window_3_prev','PSIPRED_strand_aa_window_8_prev','PSIPRED_strand_aa_window_15_prev','PSIPRED_strand_aa_window_3_next','PSIPRED_strand_aa_window_8_next','PSIPRED_strand_aa_window_15_next','PSIPRED_strand_aa','SCRIBERscore_aa_window_3_prev','SCRIBERscore_aa_window_8_prev','SCRIBERscore_aa_window_15_prev','SCRIBERscore_aa_window_3_next','SCRIBERscore_aa_window_8_next','SCRIBERscore_aa_window_15_next','SCRIBERscore_aa','SignalP_score_aa_window_3_prev','SignalP_score_aa_window_8_prev','SignalP_score_aa_window_15_prev','SignalP_score_aa_window_3_next','SignalP_score_aa_window_8_next','SignalP_score_aa_window_15_next','SignalP_score_aa','gtex_Adipose_-_Subcutaneous','gtex_Adipose_-_Visceral_(O
mentum)','gtex_Adrenal_Gland','gtex_Artery_-_Aorta','gtex_Artery_-_Coronary','gtex_Artery_-_Tibial','gtex_Bladder','gtex_Brain_-_Amygdala','gtex_Brain_-_Anterior_cingulate_cortex_(BA24)','gtex_Brain_-_Caudate_(basal_ganglia)','gtex_Brain_-_Cerebellar_Hemisphere','gtex_Brain_-_Cerebellum','gtex_Brain_-_Cortex','gtex_Brain_-_Frontal_Cortex_(BA9)','gtex_Brain_-_Hippocampus','gtex_Brain_-_Hypothalamus','gtex_Brain_-_Nucleus_accumbens_(basal_ganglia)','gtex_Brain_-_Putamen_(basal_ganglia)','gtex_Brain_-_Spinal_cord_(cervical_c-1)','gtex_Brain_-_Substantia_nigra','gtex_Breast_-_Mammary_Tissue','gtex_Cells_-_Cultured_fibroblasts','gtex_Cells_-_EBV-transformed_lymphocytes','gtex_Cervix_-_Ectocervix','gtex_Cervix_-_Endocervix','gtex_Colon_-_Sigmoid','gtex_Colon_-_Transverse','gtex_Esophagus_-_Gastroesophageal_Junction','gtex_Esophagus_-_Mucosa','gtex_Esophagus_-_Muscularis','gtex_Fallopian_Tube','gtex_Heart_-_Atrial_Appendage','gtex_Heart_-_Left_Ventricle','gtex_Kidney_-_Cortex','gtex_Kidney_-_Medulla','gtex_Liver','gtex_Lung','gtex_Minor_Salivary_Gland','gtex_Muscle_-_Skeletal','gtex_Nerve_-_Tibial','gtex_Ovary','gtex_Pancreas','gtex_Pituitary','gtex_Prostate','gtex_Skin_-_Not_Sun_Exposed_(Suprapubic)','gtex_Skin_-_Sun_Exposed_(Lower_leg)','gtex_Small_Intestine_-_Terminal_Ileum','gtex_Spleen','gtex_Stomach','gtex_Testis','gtex_Thyroid','gtex_Uterus','gtex_Vagina','gtex_Whole_Blood','haplo','haplo_imputed','PHOSPHORYLATION','ACETYLATION','UBIQUITINATION','S-NITROSYLATION','N-GLYCOSYLATION','METHYLATION','O-GLYCOSYLATION','MYRISTOYLATION','C-GLYCOSYLATION','SUMOYLATION','S-GLYCOSYLATION','polyphen_nobs','polyphen_normasa','polyphen_dvol','polyphen_dprop','polyphen_bfact','polyphen_hbonds','polyphen_avenhet','polyphen_mindhet','polyphen_avenint','polyphen_mindint','polyphen_avensit','polyphen_mindsit','polyphen_idpmax','polyphen_idpsnp','polyphen_idqmin','motifECount','motifEHIPos','motifEScoreChng','Dst2Splice','motifDist','EncodeH3K4me1-sum','EncodeH3K4me1-max','EncodeH3K4me
2-sum','EncodeH3K4me2-max','EncodeH3K4me3-sum','EncodeH3K4me3-max','EncodeH3K9ac-sum','EncodeH3K9ac-max','EncodeH3K9me3-sum','EncodeH3K9me3-max','EncodeH3K27ac-sum','EncodeH3K27ac-max','EncodeH3K27me3-sum','EncodeH3K27me3-max','EncodeH3K36me3-sum','EncodeH3K36me3-max','EncodeH3K79me2-sum','EncodeH3K79me2-max','EncodeH4K20me1-sum','EncodeH4K20me1-max','EncodeH2AFZ-sum','EncodeH2AFZ-max','EncodeDNase-sum','EncodeDNase-max','EncodetotalRNA-sum','EncodetotalRNA-max','Grantham_x','Freq100bp','Rare100bp','Sngl100bp','Freq1000bp','Rare1000bp','Sngl1000bp','Freq10000bp','Rare10000bp','Sngl10000bp','RemapOverlapTF','RemapOverlapCL','Charge','Volume','Hydrophobicity','Polarity','Ex','PAM250','JM','HGMD2003','VB','Transition','COSMIC','COSMICvsSWISSPROT','HAPMAP','COSMICvsHAPMAP',]
MEDIAN_FEATURES = ['CADD_raw','Conservation','MaxEntScan_alt','MaxEntScan_diff','MaxEntScan_ref','ada_score','rf_score','FATHMM_score','GERPplus_plus_NR','GERPplus_plus_RS','GM12878_fitCons_score','GenoCanyon_score','H1_hESC_fitCons_score','HUVEC_fitCons_score','LINSIGHT','LIST_S2_score','LRT_score','M_CAP_score','MPC_score','MVP_score','MutationAssessor_score','MutationTaster_score','PROVEAN_score','SiPhy_29way_logOdds','VEST4_score','fathmm_MKL_coding_score','fathmm_XF_coding_score','integrated_fitCons_score','phastCons100way_vertebrate','phastCons17way_primate','phastCons30way_mammalian','phyloP100way_vertebrate','phyloP17way_primate','phyloP30way_mammalian','Condel_score','SIFT_score','NearestExonJB_distance','NearestExonJB_len','Dominant_probability','Recessive_probability','polyphen_dscore','polyphen_score1','polyphen_score2','ConsScore','GC','CpG','minDistTSS','minDistTSE','priPhCons','mamPhCons','verPhCons','priPhyloP','mamPhyloP','verPhyloP','bStatistic_y','targetScan','mirSVR-Score','mirSVR-E','mirSVR-Aln','cHmm_E1','cHmm_E2','cHmm_E3','cHmm_E4','cHmm_E5','cHmm_E6','cHmm_E7','cHmm_E8','cHmm_E9','cHmm_E10','cHmm_E11','cHmm_E12','cHmm_E13','cHmm_E14','cHmm_E15','cHmm_E16','cHmm_E17','cHmm_E18','cHmm_E19','cHmm_E20','cHmm_E21','cHmm_E22','cHmm_E23','cHmm_E24','cHmm_E25','GerpRS','GerpRSpval','GerpN','GerpS','tOverlapMotifs','SpliceAI-acc-gain','SpliceAI-acc-loss','SpliceAI-don-gain','SpliceAI-don-loss','MMSp_acceptorIntron','MMSp_acceptor','MMSp_exon','MMSp_donor','MMSp_donorIntron','dbscSNV-ada_score','dbscSNV-rf_score',]

In [9]:
def encode_lists(feature_lists, mapping_file='feature_name_mapping.json'):
    """Translate lists of original feature names into ``feature<i>`` aliases.

    The alias -> original-name mapping is read from the JSON file
    ``mapping_file``. Every list in ``feature_lists`` produces one output
    list, in the same order; names that have no entry in the mapping are
    left out of the output.

    Returns a list containing one encoded list per input list.
    """
    with open(mapping_file, 'r') as handle:
        alias_to_name = json.load(handle)

    # Invert the mapping so an original name can be looked up directly.
    name_to_alias = {}
    for alias, name in alias_to_name.items():
        name_to_alias[name] = alias

    return [
        [name_to_alias[name] for name in names if name in name_to_alias]
        for names in feature_lists
    ]
# Encode both feature lists using the mapping read from the default
# 'feature_name_mapping.json' file.
encoded_negone, encoded_median = encode_lists([NEGONE_FEATURES, MEDIAN_FEATURES])
# Print the encoded lists in the form of Python assignments.
print(f'NEGONE_FEATURES = {encoded_negone}')
print(f'MEDIAN_FEATURES = {encoded_median}')

NEGONE_FEATURES = ['feature487', 'feature489', 'feature491', 'feature492', 'feature493', 'feature494', 'feature495', 'feature496', 'feature497', 'feature498', 'feature499', 'feature458', 'feature4', 'feature6', 'feature10', 'feature13', 'feature31', 'feature35', 'feature36', 'feature46', 'feature47', 'feature48', 'feature49', 'feature45', 'feature50', 'feature51', 'feature52', 'feature53', 'feature54', 'feature55', 'feature56', 'feature57', 'feature58', 'feature59', 'feature60', 'feature61', 'feature62', 'feature63', 'feature64', 'feature65', 'feature66', 'feature67', 'feature68', 'feature69', 'feature70', 'feature71', 'feature72', 'feature73', 'feature74', 'feature75', 'feature76', 'feature77', 'feature84', 'feature85', 'feature86', 'feature87', 'feature88', 'feature89', 'feature90', 'feature91', 'feature92', 'feature93', 'feature94', 'feature95', 'feature96', 'feature97', 'feature98', 'feature99', 'feature100', 'feature101', 'feature102', 'feature103', 'feature105', 'feature107', 'fe

### Encoding the X_test dataset's features.

In [10]:
# Load the test matrix. This rebinds `df`, which referred to the X_train
# frame up to this point.
# low_memory=False: infer column dtypes from the whole file rather than chunk by chunk.
df = pd.read_csv('../data/X_test_500.csv', low_memory=False)

In [11]:
# NOTE(review): this repeats the `encode_names` definition from the X_train
# section; the later definition is the one in effect from here on. Consider
# keeping a single copy.
def encode_names(df, mapping_file='feature_name_mapping.json'):
    """Rename every column of ``df`` to a positional ``feature<i>`` alias.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be encoded. It is not modified.
    mapping_file : str, optional
        Path of the JSON file that receives the alias -> original-name
        mapping. Defaults to ``'feature_name_mapping.json'`` (relative to the
        current working directory). An existing file is overwritten.

    Returns
    -------
    df_encoded : pandas.DataFrame
        Copy of ``df`` whose i-th column is named ``feature<i>``.
    name_dict : dict
        Maps each alias (``'feature0'``, ``'feature1'``, ...) to the original
        name of the column at that position.
    """
    original_names = df.columns.tolist()
    name_dict = {f"feature{i}": name for i, name in enumerate(original_names)}
    # Assign the aliases by position rather than through a name -> alias
    # lookup: a reverse lookup keeps only one alias per original name, so
    # duplicated column names would all end up with the same alias.
    df_encoded = df.copy()
    df_encoded.columns = list(name_dict)
    with open(mapping_file, 'w') as f:
        json.dump(name_dict, f, indent=2)
    return df_encoded, name_dict
# Encode the X_test columns. This rebinds `encoded_df` and `feature_dict` and
# overwrites 'feature_name_mapping.json' with the mapping derived from X_test.
# NOTE(review): aliases are assigned by column position, so they agree with the
# X_train aliases only if both files list their columns in the same order — verify.
encoded_df, feature_dict = encode_names(df)
# Show the alias -> original-name mapping.
print(feature_dict)

{'feature0': 'ID', 'feature1': 'IMPACT', 'feature2': 'BIOTYPE', 'feature3': 'CADD_raw', 'feature4': 'BLOSUM62', 'feature5': 'Conservation', 'feature6': 'ProteinLengthChange', 'feature7': 'MaxEntScan_alt', 'feature8': 'MaxEntScan_diff', 'feature9': 'MaxEntScan_ref', 'feature10': 'TSSDistance', 'feature11': 'ada_score', 'feature12': 'rf_score', 'feature13': '1000Gp3_AF', 'feature14': 'FATHMM_score', 'feature15': 'GERPplus_plus_NR', 'feature16': 'GERPplus_plus_RS', 'feature17': 'GM12878_fitCons_score', 'feature18': 'GenoCanyon_score', 'feature19': 'H1_hESC_fitCons_score', 'feature20': 'HUVEC_fitCons_score', 'feature21': 'LINSIGHT', 'feature22': 'LIST_S2_score', 'feature23': 'LRT_score', 'feature24': 'M_CAP_score', 'feature25': 'MPC_score', 'feature26': 'MVP_score', 'feature27': 'MutationAssessor_score', 'feature28': 'MutationTaster_score', 'feature29': 'PROVEAN_score', 'feature30': 'SiPhy_29way_logOdds', 'feature31': 'UK10K_AF', 'feature32': 'VEST4_score', 'feature33': 'fathmm_MKL_coding_

In [12]:
# Export to CSV.
encoded_df.to_csv('../data/X_test_encoded.csv', index=False)

print("Exported to '../data/X_test_encoded.csv'")

Exported to '../data/X_test_encoded.csv'


### Retrieve the encoded names of the features listed in the `RemoveBeforeAfterTransformer` class of `Utils.py`.

###### After obtaining the list of matching features below, I manually added the feature numbers into the `RemoveBeforeAfterTransformer` class that I pasted into `classifiers/XGBoost.ipynb`.

In [13]:
# Substrings specified in `utils.py`; a feature matches when its original name
# contains at least one of them.
patterns = ['after', 'before', 'gnomAD_exomes_AF', 'gnomAD_genomes_AF']

# Collect the encoded name (key) of every feature whose original name (value)
# contains one of the patterns, preserving the order of feature_dict.
keys = [
    encoded_name
    for encoded_name, original_name in feature_dict.items()
    if any(fragment in original_name for fragment in patterns)
]

# Display the matches as "<encoded name>: <original name>".
print("Matches:")
for key in keys:
    print(f"{key}: {feature_dict[key]}")

Matches:
feature35: gnomAD_exomes_AF
feature36: gnomAD_genomes_AF
feature60: before_RSA_3
feature61: before_RSA_8
feature62: before_RSA_15
feature63: after_RSA_3
feature64: after_RSA_8
feature65: after_RSA_15
feature66: before_ASA_3
feature67: before_ASA_8
feature68: before_ASA_15
feature69: after_ASA_3
feature70: after_ASA_8
feature71: after_ASA_15
feature87: before_IUPRED_3
feature88: before_IUPRED_8
feature89: before_IUPRED_15
feature90: after_IUPRED_3
feature91: after_IUPRED_8
feature92: after_IUPRED_15
feature93: before_ANCHOR_3
feature94: before_ANCHOR_8
feature95: before_ANCHOR_15
feature96: after_ANCHOR_3
feature97: after_ANCHOR_8
feature98: after_ANCHOR_15


### Locate the ID and IMPACT columns because they require special treatment during preprocessing.

###### After obtaining the list of matching features below, I manually replaced all mentions of `ID` or `IMPACT` in `classifiers/XGBoost.ipynb` with `feature0` or `feature1`, respectively.

In [14]:
# Original column names to locate (exact names, not substrings).
patterns = ['ID', 'IMPACT']

# Find the keys in feature_dict whose value is exactly one of these names.
# An exact comparison is used because a substring test would also pick up any
# other column whose name merely contains "ID" or "IMPACT".
keys = [key for key, value in feature_dict.items() if value in patterns]

# Display the matches as "<encoded name>: <original name>".
print("Matches:")
for key in keys:
    print(f"{key}: {feature_dict[key]}")

Matches:
feature0: ID
feature1: IMPACT
