In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys, csv, json, re
from category_encoders import BinaryEncoder
from collections import defaultdict
import joblib

### Decoding Features after 12 Algorithms

In [2]:
df = pd.read_csv('../data/X_train_499.csv')
original_columns = df.columns.tolist()
print(len(original_columns))
print(sorted(original_columns))

500
['1000Gp3_AF', 'A3D_SCORE', 'ACETYLATION', 'AF_Relative_ASA', 'AF_confidence', 'AF_dssp_secondary_structure', 'ANCHOR2', 'ASA', 'ASAquick_normscore_aa', 'ASAquick_normscore_aa_window_15_next', 'ASAquick_normscore_aa_window_15_prev', 'ASAquick_normscore_aa_window_3_next', 'ASAquick_normscore_aa_window_3_prev', 'ASAquick_normscore_aa_window_8_next', 'ASAquick_normscore_aa_window_8_prev', 'ASAquick_rawscore_aa', 'ASAquick_rawscore_aa_window_15_next', 'ASAquick_rawscore_aa_window_15_prev', 'ASAquick_rawscore_aa_window_3_next', 'ASAquick_rawscore_aa_window_3_prev', 'ASAquick_rawscore_aa_window_8_next', 'ASAquick_rawscore_aa_window_8_prev', 'ATP_binding_gbind', 'Acetylation', 'BIOTYPE', 'BLOSUM62', 'C-GLYCOSYLATION', 'CADD_raw', 'CDS_len', 'COSMIC', 'COSMICvsHAPMAP', 'COSMICvsSWISSPROT', 'Ca2+_binding_gbind', 'Charge', 'Clarks_distance', 'Condel_score', 'ConsScore', 'Consequence_first', 'Conservation', 'CpG', 'DFLpredScore_aa', 'DFLpredScore_aa_window_15_next', 'DFLpredScore_aa_window_15

In [3]:
first_ranks = pd.read_csv('feature_ranks.csv')

In [4]:
encoded_dict = {rank + 1: feature for rank, feature in enumerate(first_ranks['Feature'])}
print(encoded_dict)

{1: 'Consequence_first_0', 2: 'IMPACT_HIGH', 3: 'ConsScore', 4: 'VEST4_score', 5: 'verPhyloP', 6: 'gnomAD_exomes_AF', 7: 'IMPACT_LOW', 8: 'verPhCons', 9: 'IMPACT_MODIFIER', 10: 'mamPhyloP', 11: 'mamPhCons', 12: 'Dominant_probability', 13: 'phyloP100way_vertebrate', 14: 'M_CAP_score', 15: 'Consequence_first_4', 16: 'fathmm_MKL_coding_score', 17: 'GerpS', 18: 'MPC_score', 19: 'cHmm_E15', 20: 'cHmm_E4', 21: 'IMPACT_MODERATE', 22: 'after_IUPRED_8', 23: 'priPhCons', 24: 'denovo_Zscore', 25: 'Condel_score', 26: 'MVP_score', 27: 'after_IUPRED_15', 28: 'phastCons100way_vertebrate', 29: 'Recessive_probability', 30: 'MutationTaster_score', 31: 'Number_of_paralogs', 32: 'dbscSNV-rf_score', 33: 'BIOTYPE_protein_coding', 34: 'cHmm_E7', 35: 'UK10K_AF', 36: 'before_ANCHOR_15', 37: 'Rare10000bp', 38: 'AF_confidence', 39: 'ANCHOR2', 40: 'SpliceAI-don-loss', 41: 'EncodeH3K36me3-sum', 42: 'FATHMM_score', 43: 'EncodeDNase-sum', 44: 'CADD_raw', 45: 'GerpN', 46: 'before_IUPRED_3', 47: 'BIOTYPE_nonsense_medi

In [5]:
encoded_dict = {value: key for key, value in encoded_dict.items()}
print(encoded_dict)

{'Consequence_first_0': 1, 'IMPACT_HIGH': 2, 'ConsScore': 3, 'VEST4_score': 4, 'verPhyloP': 5, 'gnomAD_exomes_AF': 6, 'IMPACT_LOW': 7, 'verPhCons': 8, 'IMPACT_MODIFIER': 9, 'mamPhyloP': 10, 'mamPhCons': 11, 'Dominant_probability': 12, 'phyloP100way_vertebrate': 13, 'M_CAP_score': 14, 'Consequence_first_4': 15, 'fathmm_MKL_coding_score': 16, 'GerpS': 17, 'MPC_score': 18, 'cHmm_E15': 19, 'cHmm_E4': 20, 'IMPACT_MODERATE': 21, 'after_IUPRED_8': 22, 'priPhCons': 23, 'denovo_Zscore': 24, 'Condel_score': 25, 'MVP_score': 26, 'after_IUPRED_15': 27, 'phastCons100way_vertebrate': 28, 'Recessive_probability': 29, 'MutationTaster_score': 30, 'Number_of_paralogs': 31, 'dbscSNV-rf_score': 32, 'BIOTYPE_protein_coding': 33, 'cHmm_E7': 34, 'UK10K_AF': 35, 'before_ANCHOR_15': 36, 'Rare10000bp': 37, 'AF_confidence': 38, 'ANCHOR2': 39, 'SpliceAI-don-loss': 40, 'EncodeH3K36me3-sum': 41, 'FATHMM_score': 42, 'EncodeDNase-sum': 43, 'CADD_raw': 44, 'GerpN': 45, 'before_IUPRED_3': 46, 'BIOTYPE_nonsense_mediated

In [6]:
with open('decoding_template.json', 'r') as file:
    loo_dict = json.load(file)
print(loo_dict)

{'1': 'gnomAD_exomes_AF', '2': 'before_ANCHOR_15', '3': 'Recessive_probability', '4': 'MMSp_acceptorIntron', '5': 'Dist2Mutation', '6': 'TSSDistance', '7': 'before_ANCHOR_3', '8': 'REQUIRED_FOR_INTER', '9': 'after_ANCHOR_8', '10': 'before_RSA_3', '11': 'cHmm_E14', '12': 'before_ASA_3', '13': 'before_ASA_8', '14': 'Mn2+_binding_gbind', '15': 'distance_com', '16': 'after_ANCHOR_3', '17': 'after_IUPRED_15', '18': 'before_IUPRED_8', '19': 'gnomAD_genomes_AF', '20': 'before_ASA_15', '21': 'RNA_binding_gbind', '22': 'A3D_SCORE', '23': 'Rare100bp', '24': 'REGION', '25': 'ATP_binding_gbind', '26': 'PTM', '27': 'DNA_binding_gbind', '28': 'after_RSA_15', '29': 'after_RSA_8', '30': 'UK10K_AF', '31': 'VEST4_score', '32': 'MMSp_exon', '33': 'MMSp_donor', '34': 'IUPRED2', '35': 'before_IUPRED_3', '36': 'SpliceAI-don-gain', '37': 'ppi_combined_22', '38': 'RVIS', '39': 'RSA_Zfit', '40': 'DisoRNAscore_aa_window_8_next', '41': 'BLOSUM62', '42': 'after_ASA_15', '43': 'Consequence_first', '44': 'SpliceAI-

In [7]:
inverted_dict = {value: key for key, value in loo_dict.items()}
print(inverted_dict)

{'gnomAD_exomes_AF': '1', 'before_ANCHOR_15': '2', 'Recessive_probability': '3', 'MMSp_acceptorIntron': '4', 'Dist2Mutation': '5', 'TSSDistance': '6', 'before_ANCHOR_3': '7', 'REQUIRED_FOR_INTER': '8', 'after_ANCHOR_8': '9', 'before_RSA_3': '10', 'cHmm_E14': '11', 'before_ASA_3': '12', 'before_ASA_8': '13', 'Mn2+_binding_gbind': '14', 'distance_com': '15', 'after_ANCHOR_3': '16', 'after_IUPRED_15': '17', 'before_IUPRED_8': '18', 'gnomAD_genomes_AF': '19', 'before_ASA_15': '20', 'RNA_binding_gbind': '21', 'A3D_SCORE': '22', 'Rare100bp': '23', 'REGION': '24', 'ATP_binding_gbind': '25', 'PTM': '26', 'DNA_binding_gbind': '27', 'after_RSA_15': '28', 'after_RSA_8': '29', 'UK10K_AF': '30', 'VEST4_score': '31', 'MMSp_exon': '32', 'MMSp_donor': '33', 'IUPRED2': '34', 'before_IUPRED_3': '35', 'SpliceAI-don-gain': '36', 'ppi_combined_22': '37', 'RVIS': '38', 'RSA_Zfit': '39', 'DisoRNAscore_aa_window_8_next': '40', 'BLOSUM62': '41', 'after_ASA_15': '42', 'Consequence_first': '43', 'SpliceAI-don-lo

In [8]:
inverted_dict = dict(sorted(inverted_dict.items()))
print(inverted_dict)

{'1000Gp3_AF': '58', 'A3D_SCORE': '22', 'ACETYLATION': '456', 'AF_Relative_ASA': '124', 'AF_confidence': '76', 'AF_dssp_secondary_structure': '372', 'ANCHOR2': '194', 'ASA': '123', 'ASAquick_normscore_aa': '424', 'ASAquick_normscore_aa_window_15_next': '351', 'ASAquick_normscore_aa_window_15_prev': '277', 'ASAquick_normscore_aa_window_3_next': '147', 'ASAquick_normscore_aa_window_3_prev': '198', 'ASAquick_normscore_aa_window_8_next': '195', 'ASAquick_normscore_aa_window_8_prev': '279', 'ASAquick_rawscore_aa': '297', 'ASAquick_rawscore_aa_window_15_next': '430', 'ASAquick_rawscore_aa_window_15_prev': '236', 'ASAquick_rawscore_aa_window_3_next': '169', 'ASAquick_rawscore_aa_window_3_prev': '113', 'ASAquick_rawscore_aa_window_8_next': '237', 'ASAquick_rawscore_aa_window_8_prev': '394', 'ATP_binding_gbind': '25', 'Acetylation': '80', 'BIOTYPE': '477', 'BLOSUM62': '41', 'C-GLYCOSYLATION': '322', 'CADD_raw': '440', 'CDS_len': '117', 'COSMIC': '221', 'COSMICvsHAPMAP': '321', 'COSMICvsSWISSPRO

In [9]:
encoded_dict = dict(sorted(encoded_dict.items()))
print(encoded_dict)

{'1000Gp3_AF': 51, 'A3D_SCORE': 418, 'ACETYLATION': 523, 'AF_Relative_ASA': 140, 'AF_confidence': 38, 'AF_dssp_secondary_structure_-': 464, 'AF_dssp_secondary_structure_B': 528, 'AF_dssp_secondary_structure_E': 439, 'AF_dssp_secondary_structure_G': 527, 'AF_dssp_secondary_structure_H': 453, 'AF_dssp_secondary_structure_I': 533, 'AF_dssp_secondary_structure_S': 505, 'AF_dssp_secondary_structure_T': 497, 'ANCHOR2': 39, 'ASA': 78, 'ASAquick_normscore_aa': 279, 'ASAquick_normscore_aa_window_15_next': 271, 'ASAquick_normscore_aa_window_15_prev': 99, 'ASAquick_normscore_aa_window_3_next': 373, 'ASAquick_normscore_aa_window_3_prev': 291, 'ASAquick_normscore_aa_window_8_next': 362, 'ASAquick_normscore_aa_window_8_prev': 252, 'ASAquick_rawscore_aa': 158, 'ASAquick_rawscore_aa_window_15_next': 238, 'ASAquick_rawscore_aa_window_15_prev': 92, 'ASAquick_rawscore_aa_window_3_next': 310, 'ASAquick_rawscore_aa_window_3_prev': 343, 'ASAquick_rawscore_aa_window_8_next': 451, 'ASAquick_rawscore_aa_window

##### Compare the length of the original dataset with the encoded dataset.

In [10]:
original = inverted_dict
encoded = encoded_dict
print(len(original))
print(len(encoded))

499
539


In [11]:
# Display the keys (feature names) not found in the original.
not_in_original = set(encoded.keys()) - set(original.keys())
encoded_keys = list(not_in_original)
print(sorted(encoded_keys))
print(len(encoded_keys))

['AF_dssp_secondary_structure_-', 'AF_dssp_secondary_structure_B', 'AF_dssp_secondary_structure_E', 'AF_dssp_secondary_structure_G', 'AF_dssp_secondary_structure_H', 'AF_dssp_secondary_structure_I', 'AF_dssp_secondary_structure_S', 'AF_dssp_secondary_structure_T', 'BIOTYPE_lncRNA', 'BIOTYPE_nonsense_mediated_decay', 'BIOTYPE_promoter_flanking_region', 'BIOTYPE_protein_coding', 'BIOTYPE_unprocessed_pseudogene', 'Consequence_first_0', 'Consequence_first_1', 'Consequence_first_2', 'Consequence_first_3', 'Consequence_first_4', 'Dst2SplType_ACCEPTOR', 'Dst2SplType_DONOR', 'EnsembleRegulatoryFeature_CTCF Binding Site', 'EnsembleRegulatoryFeature_Enhancer', 'EnsembleRegulatoryFeature_Open chromatin', 'EnsembleRegulatoryFeature_Promoter', 'EnsembleRegulatoryFeature_Promoter Flanking Region', 'EnsembleRegulatoryFeature_TF binding site', 'IMPACT_HIGH', 'IMPACT_LOW', 'IMPACT_MODERATE', 'IMPACT_MODIFIER', 'MOD_RES_DESCRIPTION_0', 'MOD_RES_DESCRIPTION_1', 'MOD_RES_DESCRIPTION_2', 'MOD_RES_DESCRIPTI

In [12]:
# Display the keys (feature names) not found in the encoded set.
not_in_encoded = set(original.keys()) - set(encoded.keys())
original_keys = list(not_in_encoded)
print(sorted(original_keys))
print(len(original_keys))

['AF_dssp_secondary_structure', 'BIOTYPE', 'Consequence_first', 'Dst2SplType', 'EnsembleRegulatoryFeature', 'IMPACT', 'MOD_RES_DESCRIPTION', 'NMD', 'REGION_DESCRIPTION', 'STABILITY[3D]', 'STABILITY[SEQ]']
11


In [13]:
not_in_original = set(encoded.keys()) - set(original.keys())
combi_dict = {}

for orig_key in original.keys():
    combi_val = []
    for encoded_key in not_in_original:
        if encoded_key.startswith(orig_key):
            combi_val.append(encoded[encoded_key])
    if combi_val:
        combi_dict[orig_key] = combi_val

for key in combi_dict:
    combi_dict[key] = ', '.join(map(str, combi_dict[key]))

df_combined = pd.DataFrame(list(combi_dict.items()), columns=['Original Key', 'Combined Values'])

# drop rows with the keys 'MOD_RES' or 'REGION' due to dupliate names.
df_combined = df_combined[~df_combined['Original Key'].isin(['MOD_RES', 'REGION'])]

df_combined

Unnamed: 0,Original Key,Combined Values
0,AF_dssp_secondary_structure,"439, 453, 505, 528, 527, 497, 464, 533"
1,BIOTYPE,"33, 447, 446, 342, 47"
2,Consequence_first,"1, 15, 54, 286, 159"
3,Dst2SplType,"499, 429"
4,EnsembleRegulatoryFeature,"459, 438, 433, 493, 531, 503"
5,IMPACT,"21, 7, 2, 9"
7,MOD_RES_DESCRIPTION,"484, 458, 473, 495, 381"
8,NMD,415
10,REGION_DESCRIPTION,"272, 325, 368, 409, 423, 397, 284, 462, 428"
11,STABILITY[3D],"411, 452, 309"


In [14]:
not_in_original = set(encoded.keys()) - set(original.keys())
combi_dict = {}

for orig_key in original.keys():
    combi_val = []
    for encoded_key in not_in_original:
        if encoded_key.startswith(orig_key):
            combi_val.append(encoded[encoded_key])
    if combi_val:
        combi_dict[orig_key] = combi_val

for key in combi_dict:
    combi_dict[key] = ', '.join(map(str, combi_dict[key]))
for key in not_in_original:
    if any(key.startswith(original_key) for original_key in original.keys()):
        del encoded[key]

encoded.update(combi_dict)
encoded

{'1000Gp3_AF': 51,
 'A3D_SCORE': 418,
 'ACETYLATION': 523,
 'AF_Relative_ASA': 140,
 'AF_confidence': 38,
 'ANCHOR2': 39,
 'ASA': 78,
 'ASAquick_normscore_aa': 279,
 'ASAquick_normscore_aa_window_15_next': 271,
 'ASAquick_normscore_aa_window_15_prev': 99,
 'ASAquick_normscore_aa_window_3_next': 373,
 'ASAquick_normscore_aa_window_3_prev': 291,
 'ASAquick_normscore_aa_window_8_next': 362,
 'ASAquick_normscore_aa_window_8_prev': 252,
 'ASAquick_rawscore_aa': 158,
 'ASAquick_rawscore_aa_window_15_next': 238,
 'ASAquick_rawscore_aa_window_15_prev': 92,
 'ASAquick_rawscore_aa_window_3_next': 310,
 'ASAquick_rawscore_aa_window_3_prev': 343,
 'ASAquick_rawscore_aa_window_8_next': 451,
 'ASAquick_rawscore_aa_window_8_prev': 398,
 'ATP_binding_gbind': 490,
 'Acetylation': 508,
 'BLOSUM62': 479,
 'C-GLYCOSYLATION': 536,
 'CADD_raw': 44,
 'CDS_len': 199,
 'COSMIC': 274,
 'COSMICvsHAPMAP': 240,
 'COSMICvsSWISSPROT': 436,
 'Ca2+_binding_gbind': 434,
 'Charge': 340,
 'Clarks_distance': 97,
 'Condel_

In [15]:
def process_encoded(encoded):
    def lowest_number(value):
        if isinstance(value, str):
            numbers = [int(num.strip()) for num in value.split(',')]
            return min(numbers)
        return value

    processed = {}
    mod_res_value = None
    region_value = None

    # Ignore MOD_RES_DESCRIPTION and REGION_DESCRIPTION due to duplicates.
    for key, value in encoded.items():
        if key == 'MOD_RES':
            mod_res_value = int(lowest_number(value))
            processed[key] = mod_res_value
        elif key == 'REGION':
            region_value = int(lowest_number(value))
            processed[key] = region_value
        elif key not in ['MOD_RES_DESCRIPTION', 'REGION_DESCRIPTION']:
            processed[key] = int(lowest_number(value))

    # Handle MOD_RES_DESCRIPTION and REGION_DESCRIPTION.
    if 'MOD_RES_DESCRIPTION' in encoded:
        numbers = [int(num.strip()) for num in encoded['MOD_RES_DESCRIPTION'].split(',')]
        numbers = [num for num in numbers if num != mod_res_value]
        if numbers:
            processed['MOD_RES_DESCRIPTION'] = min(numbers)

    if 'REGION_DESCRIPTION' in encoded:
        numbers = [int(num.strip()) for num in encoded['REGION_DESCRIPTION'].split(',')]
        numbers = [num for num in numbers if num != region_value]
        if numbers:
            processed['REGION_DESCRIPTION'] = min(numbers)

    return processed

processed_encoded = process_encoded(encoded)

print("MOD_RES:", processed_encoded.get('MOD_RES'))
print("MOD_RES_DESCRIPTION:", processed_encoded.get('MOD_RES_DESCRIPTION'))
print("REGION:", processed_encoded.get('REGION'))
print("REGION_DESCRIPTION:", processed_encoded.get('REGION_DESCRIPTION'))

def invert_dict(dictionary):
    return {value: key for key, value in dictionary.items()}

inverted_encoded = invert_dict(processed_encoded)

def sort_by_keys(dictionary):
    return dict(sorted(dictionary.items()))

sorted_encoded = sort_by_keys(inverted_encoded)

# Ensure they are the correct length (same as original dataset).
print(f'processed keys: {len(processed_encoded)}')
print(f'inverted keys: {len(inverted_encoded)}')
print(f'sorted keys: {len(sorted_encoded)}')

MOD_RES: 381
MOD_RES_DESCRIPTION: 458
REGION: 272
REGION_DESCRIPTION: 284
processed keys: 499
inverted keys: 499
sorted keys: 499


In [None]:
orig_dict = sorted_encoded

def renumber(original_dict):
    items = list(original_dict.items())
    renumbered_dict = {str(i+1): value for i, (_, value) in enumerate(items)}
    return renumbered_dict
renumbered_dict = renumber(orig_dict)

for i, (key, value) in enumerate(renumbered_dict.items()):
    if i < 5:
        print(f"{key}: {value}")
    elif i > len(renumbered_dict) - 6:
        if i == 5:
            print("...")
        print(f"{key}: {value}")

In [17]:
# Deposit the final rankings into a JSON file.
file_path = "decoded_ranks.json"
with open(file_path, 'w') as file:
    json.dump(renumbered_dict, file, indent=4)