In [59]:
import pandas as pd
import numpy as np
import os

In [60]:
import pickle
def save_data(data, outfile):
    """Pickle ``data`` into the file at ``outfile``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(outfile, 'wb') as sink:
        pickle.Pickler(sink).dump(data)

def load_data(infile):
    """Read back and return the object pickled in the file at ``infile``.

    Note: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(infile, 'rb') as source:
        return pickle.Unpickler(source).load()


In [61]:
# Numeric codes for the gene-region labels found in the 'Func.refGene' column.
# Label definitions: https://annovar.openbioinformatics.org/en/latest/user-guide/gene/
# Compound labels ('a;b') carry the same code as their individual parts, and
# every ncRNA_* label sits in the 2.x band.
dict_gene1 = {
    'downstream': 5,
    'exonic': 1,
    'exonic;splicing': 1,
    'intergenic': 6,
    'intronic': 4,
    'ncRNA_exonic': 2.1,
    'ncRNA_splicing': 2.1,
    'ncRNA_intronic': 2.4,
    'ncRNA_exonic;splicing': 2.1,
    'ncRNA_UTR5': 2.3,
    'splicing': 1,
    'upstream': 5,
    'upstream;downstream': 5,
    'UTR3': 3,
    'UTR5': 3,
    'UTR5;UTR3': 3,
}
# Numeric codes for the exonic-consequence labels found in the
# 'ExonicFunc.refGene' column. The '.' placeholder gets its own code (12)
# here rather than being treated as a missing value.
dict_gene2 = {
    '.': 12,
    'frameshift_deletion': 2,
    'frameshift_insertion': 1,
    'nonframeshift_deletion': 7,
    'nonframeshift_insertion': 6,
    'stopgain': 4,
    'stoploss': 5,
    'startloss': 0,  # added
    'unknown': 11,
}

In [62]:
# CLNSIG values that are kept as labelled examples: the pathogenic family
# first, then the benign family. Rows with any other CLNSIG are dropped.
tag_list = [
    'Pathogenic',
    'Likely_pathogenic',
    'Pathogenic/Likely_pathogenic',
    'Benign',
    'Likely_benign',
    'Benign/Likely_benign',
]


In [63]:
# Integer score (0-4) for each ClinVar review status; the encoded column is
# later named 'star'.
# Status definitions: https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/
dict_CLNREVSTAT = {
    'criteria_provided,_single_submitter': 1,
    'no_assertion_criteria_provided': 0,
    'criteria_provided,_multiple_submitters,_no_conflicts': 2,
    'reviewed_by_expert_panel': 3,
    'practice_guideline': 4,
}

In [64]:
# Columns pulled from the raw tables, in the order they are kept.
# Position i here corresponds to position i in `rename_col`.
columns_list = [
    # variant key and categorical annotations
    '# [1]CHROM_[2]POS_[3]REF_[4]ALT',
    '[9]Func.refGene',
    '[10]ExonicFunc.refGene',
    '[11]controls_AF_popmax',
    '0',
    # gene-region columns
    'exonic',
    'stream',
    'intergenic',
    'intronic',
    'ncRNA',
    'splicing',
    'UTR',
    # exonic-consequence columns
    'frameshift_deletion',
    'frameshift_insertion',
    'nonframeshift_deletion',
    'nonframeshift_insertion',
    'startloss',
    'stopgain',
    'stoploss',
    # ExAC scores
    'exac_syn_z',
    'exac_mis_z',
    'exac_lof_z',
    'exac_pLI',
    'exac_cnv_z',
    # RVIS / OE-ratio scores
    'RVIS[pop_maf_0.05%(any)]',
    '%RVIS[pop_maf_0.05%(any)]',
    'OE-ratio_[ExAC v2]',
    '%OE-ratio_[ExAC v2]',
    'alternative-RVIS[maf_0.0025%]',
    'alternative-%RVIS[maf_0.0025%]',
]
# Cleaned-up names assigned to the selected columns, position by position
# against `columns_list`. The first entry names the variant key; the
# remaining entries are the columns later converted to numeric.
rename_col = [
    # variant key and categorical annotations
    'INDEL_index',
    'Func',
    'ExonicFunc',
    'gnomad_genome_controls_AF_popmax',
    'length',
    # gene-region columns
    'exonic',
    'stream',
    'intergenic',
    'intronic',
    'ncRNA',
    'splicing',
    'UTR',
    # exonic-consequence columns
    'frameshift_deletion',
    'frameshift_insertion',
    'nonframeshift_deletion',
    'nonframeshift_insertion',
    'startloss',
    'stopgain',
    'stoploss',
    # ExAC scores
    'exac_syn_z',
    'exac_mis_z',
    'exac_lof_z',
    'exac_pLI',
    'exac_cnv_z',
    # RVIS / OE-ratio scores
    'RVIS_pop_maf_0_05',
    'p_RVIS_pop_maf_0_05',
    'OE-ratio_ExAC_v2',
    'p_OE-ratio_ExAC_v2',
    'alternative-RVIS_maf_0_0025',
    'alternative-p_RVIS_maf_0_0025',
]

### train

In [66]:
# Load and encode the training table.
# NOTE(review): data_dir is an absolute, machine-specific path; it is also
# reused by the test1/test2 cells below.
data_dir='/media/bgi/zhangtongda/bgi//zfssz2/ST_MCHRI/BIGDATA/USER/zhangtongda/ldfssz1_ztd_project/project/variang_predict2/INDELpred_0124/train_test_data/'
train_data_f='fin_train_data.csv'
train_data = pd.read_csv(os.path.join(data_dir, train_data_f), sep='\t', header=0)

# Keep only rows whose CLNSIG is one of the accepted labels. The explicit
# .copy() makes the filtered frame independent, so the 'tag' assignments
# below never write into a slice of the raw table.
train_data = train_data.loc[train_data['[6]CLNSIG'].isin(tag_list)].copy()

# Binary label: 1 for the pathogenic family, 0 for the benign family.
train_data.loc[train_data['[6]CLNSIG'].str.contains('athogenic'), 'tag'] = 1
train_data.loc[train_data['[6]CLNSIG'].str.contains('enign'), 'tag'] = 0

train_data = train_data[columns_list + ['tag', '[7]CLNREVSTAT']].copy()

# Encode the categorical columns. Results are assigned back instead of using
# `df[col].replace(..., inplace=True)`: that chained in-place form acts on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
train_data['[7]CLNREVSTAT'] = train_data['[7]CLNREVSTAT'].replace(dict_CLNREVSTAT)
train_data.columns = rename_col + ['tag', 'star']
train_data['Func'] = train_data['Func'].replace(dict_gene1)
train_data['ExonicFunc'] = train_data['ExonicFunc'].replace(dict_gene2)

# Must run after the ExonicFunc encoding, which maps '.' to its own code;
# any '.' still left in other columns is treated as missing.
train_data = train_data.replace('.', np.nan)

# Every column except the variant key becomes numeric, with missing values
# filled by 0 (assigned back for the same Copy-on-Write reason as above).
for column in rename_col[1:]:
    train_data[column] = pd.to_numeric(train_data[column]).fillna(0)


In [67]:
# Label is the binary 'tag' column; features are every column between the
# leading variant key and the trailing 'tag'/'star' pair.
y_train = train_data['tag']
x_train = train_data.iloc[:, 1:-2]
save_data([x_train, y_train], 'train_data.pkl')

### test1

In [68]:
# Load and encode the first test table (same CLNSIG/CLNREVSTAT layout as the
# training table; `data_dir` comes from the training cell).
test1_data_f='fin_test1_data.csv'
test1_data = pd.read_csv(os.path.join(data_dir, test1_data_f), sep='\t', header=0)

# Keep only rows whose CLNSIG is one of the accepted labels. The explicit
# .copy() makes the filtered frame independent, so the 'tag' assignments
# below never write into a slice of the raw table.
test1_data = test1_data.loc[test1_data['[6]CLNSIG'].isin(tag_list)].copy()

# Binary label: 1 for the pathogenic family, 0 for the benign family.
test1_data.loc[test1_data['[6]CLNSIG'].str.contains('athogenic'), 'tag'] = 1
test1_data.loc[test1_data['[6]CLNSIG'].str.contains('enign'), 'tag'] = 0

test1_data = test1_data[columns_list + ['tag', '[7]CLNREVSTAT']].copy()

# Encode the categorical columns. Results are assigned back instead of using
# `df[col].replace(..., inplace=True)`: that chained in-place form acts on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
test1_data['[7]CLNREVSTAT'] = test1_data['[7]CLNREVSTAT'].replace(dict_CLNREVSTAT)
test1_data.columns = rename_col + ['tag', 'star']
test1_data['Func'] = test1_data['Func'].replace(dict_gene1)
test1_data['ExonicFunc'] = test1_data['ExonicFunc'].replace(dict_gene2)

# Must run after the ExonicFunc encoding, which maps '.' to its own code;
# any '.' still left in other columns is treated as missing.
test1_data = test1_data.replace('.', np.nan)

# Every column except the variant key becomes numeric, with missing values
# filled by 0 (assigned back for the same Copy-on-Write reason as above).
for column in rename_col[1:]:
    test1_data[column] = pd.to_numeric(test1_data[column]).fillna(0)


In [69]:
# Label, review-status score and feature matrix for test set 1. Features are
# every column between the leading variant key and the trailing 'tag'/'star'.
y_test1 = test1_data['tag']
test1_star = test1_data['star']
x_test1 = test1_data.iloc[:, 1:-2]
save_data([x_test1, y_test1, test1_star], 'test1_data.pkl')

### test2

In [70]:
# Load and encode the second test table, which is labelled by the
# '[6]VKGL_clf' column (`data_dir` comes from the training cell).
test2_data_f='fin_test2_data.csv'
test2_data = pd.read_csv(os.path.join(data_dir, test2_data_f), sep='\t', header=0)

# Keep only the 'LB' / 'LP' rows. The explicit .copy() makes the filtered
# frame independent, so the 'tag' assignments below never write into a slice
# of the raw table.
test2_data = test2_data.loc[test2_data['[6]VKGL_clf'].isin(['LB', 'LP'])].copy()

# Binary label: 0 for 'LB', 1 for 'LP'.
test2_data.loc[test2_data['[6]VKGL_clf'].str.contains('LB'), 'tag'] = 0
test2_data.loc[test2_data['[6]VKGL_clf'].str.contains('LP'), 'tag'] = 1

test2_data = test2_data[columns_list + ['tag', '[7]VKGL_supp']].copy()

# Encode the categorical columns. Results are assigned back instead of using
# `df[col].replace(..., inplace=True)`: that chained in-place form acts on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
# Only the '1 lab' value is recoded; any other support value is left as-is.
test2_data['[7]VKGL_supp'] = test2_data['[7]VKGL_supp'].replace({'1 lab': 1})
test2_data.columns = rename_col + ['tag', 'lab']
test2_data['Func'] = test2_data['Func'].replace(dict_gene1)
test2_data['ExonicFunc'] = test2_data['ExonicFunc'].replace(dict_gene2)

# Must run after the ExonicFunc encoding, which maps '.' to its own code;
# any '.' still left in other columns is treated as missing.
test2_data = test2_data.replace('.', np.nan)

# Every column except the variant key becomes numeric, with missing values
# filled by 0 (assigned back for the same Copy-on-Write reason as above).
for column in rename_col[1:]:
    test2_data[column] = pd.to_numeric(test2_data[column]).fillna(0)


In [71]:
# Label, lab-support column and feature matrix for test set 2. Features are
# every column between the leading variant key and the trailing 'tag'/'lab'.
y_test2 = test2_data['tag']
test2_lab = test2_data['lab']
x_test2 = test2_data.iloc[:, 1:-2]
save_data([x_test2, y_test2, test2_lab], 'test2_data.pkl')