In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
import pickle
def save_data(data, outfile):
    """Pickle ``data`` into the file at ``outfile``.

    Parameters
    ----------
    data : object
        Any picklable Python object.
    outfile : str or path-like
        Destination path; opened in binary write mode, so an existing file
        is overwritten.
    """
    with open(outfile, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)

def load_data(infile):
    """Return the object unpickled from the file at ``infile``.

    Unpickling can execute arbitrary code, so this should only be pointed
    at files from a trusted source.

    Parameters
    ----------
    infile : str or path-like
        Path of the pickle file; opened in binary read mode.
    """
    with open(infile, mode='rb') as source:
        return pickle.Unpickler(source).load()

In [3]:
import time
# Run-date stamp (local time, YYYY-MM-DD); interpolated into the names of
# the pickle files written by the functions in this notebook.
loca = time.strftime('%Y-%m-%d')

# Load max-info data

In [13]:
# Read the five max-info tables in one pass. CHR is forced to str so the
# chromosome labels stay text (e.g. "1", "X") for the string-keyed merges.
ClinVar, VKGL, HGMD, HGMDsg, real_world = (
    pd.read_csv(csv_path, dtype={'CHR': str})
    for csv_path in (
        "ClinVar_max_info_2024-01-12.csv",
        "VKGL_max_info_2024-01-12.csv",
        "HGMD_max_info_2024-01-12.csv",
        "HGMDsg_max_info_2024-01-12.csv",
        "real_world_max_info_2024-01-12.csv",
    )
)

# INDELpred

In [8]:
# Load the pickled model object; load_and_predict() calls clf.predict_proba on it.
# NOTE(review): load_data() uses pickle.load, which can execute arbitrary code --
# only load model files from a trusted source.
clf = load_data('../data2023/INDELpred_model_2024-01-12.pkl')

In [9]:
def load_and_predict(data_name):
    """Score one dataset with the loaded model and append the scores to its table.

    Reads the pickled ``(x_data, y_data)`` pair from
    ``{data_name}_test_2024-01-12.pkl``, takes column 1 of
    ``clf.predict_proba(x_data)`` (presumably the positive-class probability --
    confirm against ``clf.classes_``) and attaches it as the
    ``'INDELpred score'`` column of the module-level DataFrame whose variable
    name is ``data_name``. The result is pickled to
    ``INDELpred_{data_name}_score_{loca}.pkl`` and returned.

    Parameters
    ----------
    data_name : str
        Name of a module-level DataFrame (e.g. ``"ClinVar"``); also used to
        build the input and output file names.

    Returns
    -------
    pandas.DataFrame
        A copy of the dataset table with the extra ``'INDELpred score'``
        column; the module-level table itself is left untouched.

    Raises
    ------
    NameError
        If no module-level variable called ``data_name`` exists.
    ValueError
        If the number of predictions differs from the number of table rows.
    """
    data_path = f"{data_name}_test_2024-01-12.pkl"

    # The pickle must hold exactly two items; the second one is not needed
    # for scoring.
    x_data, _ = load_data(data_path)
    y_pred_prob = clf.predict_proba(x_data)[:, 1]

    # Explicit namespace lookup instead of eval(): identical for a plain
    # variable name, but an arbitrary expression can no longer be executed.
    try:
        original_df = globals()[data_name]
    except KeyError:
        raise NameError(f"name '{data_name}' is not defined") from None

    # Scores are paired with table rows by position (assumes the feature
    # matrix rows are in table order -- TODO confirm). Concatenating on the
    # index instead would silently misalign or NaN-pad the rows whenever the
    # table index is not 0..n-1 or the lengths differ, so fail loudly.
    if len(original_df) != len(y_pred_prob):
        raise ValueError(
            f"{data_name}: {len(y_pred_prob)} predictions for "
            f"{len(original_df)} table rows"
        )
    result_df = original_df.assign(**{'INDELpred score': y_pred_prob})

    save_data(result_df, f"INDELpred_{data_name}_score_{loca}.pkl")
    return result_df

# Example usage:
# INDELpred_ClinVar = load_and_predict("ClinVar")


In [14]:
# Score every dataset with the INDELpred model; each call also pickles its
# result table (the return value is not kept here).
datasets = ["ClinVar", "VKGL", "HGMD", "HGMDsg", "real_world"]

for data_name in datasets:
    load_and_predict(data_name)


# MetaRNN

In [69]:
def process_and_merge_metaRNN(data_name):
    """Attach the MetaRNN annotation to one dataset table and pickle the matches.

    Reads ``{data_name}.indel.annotated`` (tab-separated), strips the ``chr``
    prefix from ``#CHROM``, keeps only the text after the last ``;`` in the
    final column, and right-merges the result onto the module-level DataFrame
    named ``data_name`` using (chromosome, POS, REF, ALT) as the key. Dataset
    rows without a MetaRNN match are dropped. The merged table is pickled to
    ``../final_intermediate_file/MetaRNN_{data_name}_score_{loca}.pkl``.

    Parameters
    ----------
    data_name : str
        Name of a module-level DataFrame (e.g. ``"ClinVar"``); also selects
        the annotation file and names the output file.

    Returns
    -------
    pandas.DataFrame
        The dataset rows that have a MetaRNN annotation, with the MetaRNN
        columns first.

    Raises
    ------
    NameError
        If no module-level variable called ``data_name`` exists.
    """
    # The merge key column is 'POS'; declaring its dtype under that exact
    # name makes the int declaration take effect (a key that matches no
    # column is ignored by read_csv).
    MetaRNN = pd.read_csv(
        f"/hwfssz1/ST_HEALTH/P20Z10200N0170/weiyilin/project/INDELpred/other_software/MetaRNN/{data_name}.indel.annotated",
        sep="\t",
        dtype={'#CHROM': str, 'POS': int},
    )
    MetaRNN['#CHROM'] = MetaRNN['#CHROM'].str.replace('chr', '', regex=False)

    # Keep the last ';'-separated field of the final column. Non-string cells
    # (e.g. NaN) are passed through unchanged instead of raising TypeError.
    last_column = MetaRNN.columns[-1]
    MetaRNN[last_column] = MetaRNN[last_column].map(
        lambda value: value.rsplit(';', 1)[-1] if isinstance(value, str) else value
    )

    # Explicit namespace lookup instead of eval(): identical for a plain
    # variable name, but an arbitrary expression can no longer be executed.
    try:
        original_df = globals()[data_name]
    except KeyError:
        raise NameError(f"name '{data_name}' is not defined") from None

    print(MetaRNN.shape,original_df.shape)
    merged_df = pd.merge(MetaRNN, original_df,
                         left_on=['#CHROM', 'POS', 'REF', 'ALT'],
                         right_on=['CHR', 'POS', 'REF', 'ALT'], how='right')
    # Only '#CHROM' can be missing after the right join: POS/REF/ALT share
    # their name on both sides, so they are always filled from the dataset.
    print(merged_df[['#CHROM', 'POS']].isna().sum())
    merged_df = merged_df.dropna(subset=['#CHROM', 'POS'])
    save_data(merged_df, f"../final_intermediate_file/MetaRNN_{data_name}_score_{loca}.pkl")
    return merged_df

# Example usage:
# result_real_world = process_and_merge_metaRNN("real_world")


In [70]:
# Merge the MetaRNN annotation into every dataset; each call prints the
# frame shapes and the per-column missing-value counts, then pickles its result.
datasets = ["ClinVar", "VKGL", "HGMD", "HGMDsg", "real_world"]

for data_name in datasets:
    process_and_merge_metaRNN(data_name)


(9946, 6) (49385, 33)
#CHROM    47667
POS           0
dtype: int64
(1738, 6) (7985, 31)
#CHROM    7330
POS          0
dtype: int64
(2035, 6) (46729, 30)
#CHROM    44692
POS           0
dtype: int64
(1531, 6) (35191, 30)
#CHROM    33660
POS           0
dtype: int64
(1009, 6) (3329183, 30)
#CHROM    3328288
POS             0
dtype: int64


# CADD

In [7]:
def process_and_merge_CADD(data_name):
    """Attach CADD PHRED scores to one dataset table and pickle the matches.

    Reads ``{data_name}.tsv`` (tab-separated, first line skipped), keeps the
    variant key columns plus ``PHRED``, strips ``chr`` from ``#Chrom``, drops
    duplicate variants, and right-merges onto the module-level DataFrame named
    ``data_name``. Dataset rows without a CADD match are dropped. The merged
    table is pickled to
    ``../final_intermediate_file/CADD_{data_name}_score_{loca}.pkl``.

    Parameters
    ----------
    data_name : str
        Name of a module-level DataFrame (e.g. ``"HGMD"``); also selects the
        CADD file and names the output file.

    Returns
    -------
    pandas.DataFrame
        The dataset rows that have a CADD score, with the CADD columns first.
    """
    cadd_keys = ["#Chrom", "Pos", "Ref", "Alt"]
    cadd_scores = (
        pd.read_csv(
            f"/hwfssz1/ST_HEALTH/P20Z10200N0170/weiyilin/project/INDELpred/other_software/CADD/{data_name}.tsv",
            sep="\t",
            skiprows=1,
            dtype={'#Chrom': str, 'Pos': int},
            usecols=["#Chrom", "Pos", "Ref", "Alt", 'PHRED'],
        )
        .assign(**{'#Chrom': lambda frame: frame['#Chrom'].str.replace('chr', '')})
        .drop_duplicates(subset=cadd_keys)
    )

    # NOTE(review): eval() resolves the dataset by variable name; only pass
    # trusted, hard-coded names.
    dataset_df = eval(data_name)

    annotated = cadd_scores.merge(
        dataset_df,
        how='right',
        left_on=cadd_keys,
        right_on=['CHR', 'POS', 'REF', 'ALT'],
    )
    print(cadd_scores.shape, dataset_df.shape)

    # Rows that found no CADD partner have missing CADD key columns.
    annotated = annotated.dropna(subset=['#Chrom', 'Pos'])
    print(annotated.shape)

    save_data(annotated, f"../final_intermediate_file/CADD_{data_name}_score_{loca}.pkl")
    return annotated


In [8]:
# CADD scores are merged for the HGMD and HGMDsg sets only in this cell; the
# full five-dataset list is kept on the next line, commented out.
# datasets = ["ClinVar", "VKGL", "HGMD", "HGMDsg", "real_world"]
datasets = ["HGMD", "HGMDsg"]

for data_name in datasets:
    process_and_merge_CADD(data_name)


(45056, 5) (46729, 28)
(46729, 33)
(35190, 5) (35191, 28)
(35191, 33)


# VEST-INDEL

In [73]:
def process_and_merge_VEST(data_name):
    """Attach VEST-INDEL scores to one dataset table and pickle the matches.

    Reads ``{data_name}.CSV``, strips ``chr`` from ``Chromosome``, collapses
    the five per-consequence score columns into one ``'VEST score'`` column
    (row-wise sum) and shifts ``Position`` down by one. The dataset alleles
    are converted to the same representation (first base removed, ``'-'``
    when nothing remains) in ``REF_ADJUSTED`` / ``ALT_ADJUSTED`` and used,
    with chromosome and position, as the merge key. Dataset rows without a
    VEST match are dropped. The merged table is pickled to
    ``../final_intermediate_file/VEST_{data_name}_score_{loca}.pkl``.

    The helper columns are added to a copy, so the module-level DataFrame
    named ``data_name`` is not modified.

    Parameters
    ----------
    data_name : str
        Name of a module-level DataFrame (e.g. ``"ClinVar"``); also selects
        the VEST file and names the output file.

    Returns
    -------
    pandas.DataFrame
        The dataset rows that have a VEST score, with the VEST columns first
        and ``REF_ADJUSTED`` / ``ALT_ADJUSTED`` last.

    Raises
    ------
    NameError
        If no module-level variable called ``data_name`` exists.
    """
    def _strip_anchor(allele):
        # Missing / non-text alleles and single-base alleles become '-';
        # otherwise the first base is removed.
        if pd.isnull(allele) or isinstance(allele, float) or len(allele) == 1:
            return '-'
        return allele[1:]

    VEST = pd.read_csv(f"/hwfssz1/ST_HEALTH/P20Z10200N0170/weiyilin/project/INDELpred/other_software/VEST-INDEL/{data_name}.CSV", dtype={'Chromosome': str, 'Position': int})
    VEST['Chromosome'] = VEST['Chromosome'].str.replace('chr', '', regex=False)
    cols_to_merge = [
        'VEST score (frameshift indels)',
        'VEST score (inframe indels)',
        'VEST score (stop-gain)',
        'VEST score (stop-loss)',
        'VEST score (splice site)',
    ]
    # NOTE(review): sum() skips NaN, so a row with no score in any of the
    # five columns ends up with 0.0 rather than NaN -- confirm this is
    # intended (min_count=1 would keep NaN).
    VEST['VEST score'] = VEST[cols_to_merge].sum(axis=1)
    VEST = VEST.drop(cols_to_merge, axis=1)
    # Shift positions by one so they match POS in the dataset table
    # (presumably VEST reports the base after the anchor -- TODO confirm).
    VEST['Position'] -= 1

    # Explicit namespace lookup instead of eval(): identical for a plain
    # variable name, but an arbitrary expression can no longer be executed.
    try:
        source_df = globals()[data_name]
    except KeyError:
        raise NameError(f"name '{data_name}' is not defined") from None

    # assign() returns a new frame, so the shared module-level table does not
    # silently gain two columns that leak into the other merges. Series.map
    # is element-wise and avoids the much slower row-wise DataFrame.apply.
    original_df = source_df.assign(
        REF_ADJUSTED=source_df['REF'].map(_strip_anchor),
        ALT_ADJUSTED=source_df['ALT'].map(_strip_anchor),
    )

    merged_df = pd.merge(VEST, original_df,
                         left_on=['Chromosome', 'Position', 'Reference base(s)', 'Alternate base(s)'],
                         right_on=['CHR', 'POS', 'REF_ADJUSTED', 'ALT_ADJUSTED'], how='right')
    print(VEST.shape,original_df.shape)
    print(merged_df[['Chromosome', 'Position']].isna().sum())
    merged_df = merged_df.dropna(subset=['Chromosome', 'Position'])
    save_data(merged_df, f"../final_intermediate_file/VEST_{data_name}_score_{loca}.pkl")
    return merged_df


In [74]:
# Merge the VEST-INDEL scores into every dataset; each call prints the frame
# shapes and the per-column missing-value counts, then pickles its result.
datasets = ["ClinVar", "VKGL", "HGMD", "HGMDsg", "real_world"]

for data_name in datasets:
    process_and_merge_VEST(data_name)


(54374, 5) (49385, 33)
Chromosome    16076
Position      16076
dtype: int64
(7779, 6) (7985, 31)
Chromosome    3032
Position      3032
dtype: int64
(22160, 5) (46729, 30)
Chromosome    24562
Position      24562
dtype: int64
(17483, 5) (35191, 30)
Chromosome    17702
Position      17702
dtype: int64
(3113, 5) (3329183, 30)
Chromosome    3326664
Position      3326664
dtype: int64


# CAPICE

In [9]:
def process_and_merge_CAPICE(data_name):
    """Attach the CAPICE output to one dataset table and pickle the matches.

    Reads ``{data_name}_capice.tsv`` (tab-separated), strips ``chr`` from the
    ``chr`` column, drops duplicate variants, and right-merges onto the
    module-level DataFrame named ``data_name`` using (chr, pos, ref, alt) as
    the key. Dataset rows without a CAPICE match are dropped. The merged
    table is pickled to
    ``../final_intermediate_file/CAPICE_{data_name}_score_{loca}.pkl``.

    Parameters
    ----------
    data_name : str
        Name of a module-level DataFrame (e.g. ``"HGMD"``); also selects the
        CAPICE file and names the output file.

    Returns
    -------
    pandas.DataFrame
        The dataset rows that have a CAPICE record, with the CAPICE columns
        first.
    """
    capice_keys = ["chr", "pos", "ref", "alt"]
    capice_scores = (
        pd.read_csv(
            f"/hwfssz1/ST_HEALTH/P20Z10200N0170/weiyilin/project/INDELpred/other_software/CAPICE/{data_name}_capice.tsv",
            sep="\t",
            dtype={'chr': str, "pos": int},
        )
        .assign(chr=lambda frame: frame['chr'].str.replace('chr', ''))
        .drop_duplicates(subset=capice_keys)
    )

    # NOTE(review): eval() resolves the dataset by variable name; only pass
    # trusted, hard-coded names.
    dataset_df = eval(data_name)

    annotated = capice_scores.merge(
        dataset_df,
        how='right',
        left_on=capice_keys,
        right_on=['CHR', 'POS', 'REF', 'ALT'],
    )
    print(capice_scores.shape, dataset_df.shape)

    # Report, then discard, the dataset rows that found no CAPICE partner.
    print(annotated[["chr", "pos"]].isna().sum())
    annotated = annotated.dropna(subset=["chr", "pos"])

    save_data(annotated, f"../final_intermediate_file/CAPICE_{data_name}_score_{loca}.pkl")
    return annotated


In [10]:
# CAPICE output is merged for the HGMD and HGMDsg sets only in this cell; the
# full five-dataset list is kept on the next line, commented out.
# datasets = ["ClinVar", "VKGL", "HGMD", "HGMDsg", "real_world"]
datasets = ["HGMD", "HGMDsg"]

for data_name in datasets:
    process_and_merge_CAPICE(data_name)


(45056, 11) (46729, 28)
chr    0
pos    0
dtype: int64
(35190, 11) (35191, 28)
chr    0
pos    0
dtype: int64
