# data processing into a single dataframe


In [1]:
#!pip install pandas
import pandas as pd
# Feature set 5: a different direction -- what VL thinks will give good results.
#
# MK used:
#   for i in 2 3 4; do python3 keep_relevant_headers.py ../data/chr20-Y_with_gene/chr$i\_gene.tsv \
#   ../data/chr20-Y_with_gene/chr$i.out_set5.tsv --cols chr,pos,ref,alt,aapos,codonpos,Polyphen2_HVAR_pred,\
#   GenoCanyon_score,LINSIGHT,clinvar_clnsig,RVIS_ExAC,ExAC_cnv.score,SORVA_LOF_MAF0.005_HetOrHom; done

DATA_DIR = '../../../data/chr20-Y_with_gene/'
# Files are read and concatenated in exactly this order.
CHROMOSOMES = (20, 21, 22, 'M', 'X', 'Y')


def load_chromosome(chrom):
    """Read chr<chrom>.out_set5.tsv from DATA_DIR and index it by variant identifier.

    Parameters
    ----------
    chrom : int or str
        Chromosome label used in the file name (e.g. 20 or 'X').

    Returns
    -------
    pandas.DataFrame
        The file's rows indexed by ('chr', 'pos', 'ref', 'alt'), with an extra
        'position' column holding a copy of 'pos'.
    """
    path = DATA_DIR + 'chr' + str(chrom) + '.out_set5.tsv'
    chrom_df = pd.read_csv(path, sep='\t', encoding="UTF-8")
    # 'pos' moves into the index below; keep a plain-column copy for later use.
    chrom_df['position'] = chrom_df['pos']
    return chrom_df.set_index(['chr', 'pos', 'ref', 'alt'])  # index by identifier


# DataFrame.append was removed in pandas 2.0; a single concat over all the
# per-chromosome frames gives the same stacked result without re-copying the
# accumulated frame on every iteration.
df_all = pd.concat([load_chromosome(chrom) for chrom in CHROMOSOMES])

print('concat shape', df_all.shape)

print("shape before", df_all.shape)

# FIXME: '.' (the missing-score marker) is replaced with 0 here, so the dropna()
# that follows only removes rows that already contained NaN -- rows whose score
# was '.' are kept, despite what the message printed below says. Decide whether
# to drop them (replace with NaN) or impute them (0 may not be the right value).
# To see how many elements are NaN: print("SUM", pd.isnull(df).sum())
df = df_all.replace({'.': 0}).dropna()
print("shape after dropping all data with score = '.'", df.shape)

# The frame still contains some mixed labels; inspect them with:
#   df[~df['clinvar_clnsig'].isin(list(clinvar_labels.keys()))]
#
# Before remapping the clinical-significance labels to pathogenic/benign only,
# the label distributions can be checked with:
#   print("TOTAL SUM OF clinsig: \n", df['clinvar_clnsig'].value_counts())
#   print("PERCENT OF clinsig: \n", df['clinvar_clnsig'].value_counts(normalize=True))
#   print("CLINVAR LABELS \n", df.groupby('clinvar_clnsig').count())


# Per-column recoding of categorical values into numbers; the outer key is the
# column name, the inner dict maps each raw value to its numeric code.
# NOTE(review): 'Ancestral_allele' and 'Essential_gene_CRISPR' are not among the
# --cols listed for feature set 5 at the top of this cell -- confirm these
# entries are still needed.
features_mapping = {
    'Polyphen2_HVAR_pred': dict(D=-2, P=-1, B=1),
    'Ancestral_allele': {
        **dict.fromkeys('ACTG', 2),    # upper-case bases
        **dict.fromkeys('actgN', 1),   # lower-case bases and 'N'
        '-': -2,
    },
    'Essential_gene_CRISPR': dict(E=-1, N=1),
}

# Binary target: pathogenic-type labels -> 1, benign-type labels -> 0.
# Any clinvar_clnsig value that is not a key here is treated as unknown.
clinvar_labels = {
    **dict.fromkeys(
        ('Pathogenic', 'Likely_pathogenic', 'Pathogenic/Likely_pathogenic'), 1
    ),
    **dict.fromkeys(
        ('Benign/Likely_benign', 'Likely_benign', 'Benign'), 0
    ),
}

# Recode the categorical feature columns to their numeric values.
df = df.replace(features_mapping)
print(list(clinvar_labels.keys()))
print(df.head())

# Keep only variants whose label is one of the known pathogenic/benign keys;
# everything else (unknown or mixed labels) is dropped. The explicit .copy()
# makes the filtered frame independent, so the column assignment below writes
# to a real frame rather than a slice (avoids SettingWithCopyWarning).
df = df[df['clinvar_clnsig'].isin(list(clinvar_labels.keys()))].copy()

print("DF SHAPE", df.shape)
print(df.head())

# Binary target column: 1 for pathogenic-type labels, 0 for benign-type labels.
df['Problematic'] = df['clinvar_clnsig'].map(clinvar_labels)

# The raw label is no longer needed once 'Problematic' exists.
df = df.drop(columns=['clinvar_clnsig'])
print("shape after dropping Unk", df.shape)
print(df.head())


# Class balance of the target, as absolute counts and as fractions.
print("TOTAL SUM OF LABELS: \n", df['Problematic'].value_counts())
print("PERCENT OF LABELS: \n", df['Problematic'].value_counts(normalize=True))

concat shape (21191, 10)
shape before (21191, 10)
shape after dropping all data with score = '.' (21184, 10)
['Pathogenic', 'Likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Benign/Likely_benign', 'Likely_benign', 'Benign']
                    aapos codonpos  Polyphen2_HVAR_pred GenoCanyon_score  \
chr pos    ref alt                                                         
20  408749 C   A        4        2                    1                1   
    409917 C   T       20        2                    0                1   
    409971 G   A       38        2                    0         0.999999   
    417543 A   T       62        2                   -1                1   
    417560 A   G       68        1                    1          0.99997   

                   LINSIGHT          clinvar_clnsig     RVIS_ExAC  \
chr pos    ref alt                                                  
20  408749 C   A          0  Uncertain_significance  -0.713915349   
    409917 C   T          0  Unc

## The following cells add the mutation-count columns
* Note that the various kb ranges are selected to span a wide range of resolutions. 

In [2]:
# Work on a copy so that `df` from the previous cell is left unchanged when the
# mutation-count columns are added to `new_df`.
new_df = df.copy()

def count_mutations(pos, kb, df):
    """Count the rows of `df` whose 'position' lies strictly within `kb` kb of `pos`.

    Parameters
    ----------
    pos : number
        Centre of the window.
    kb : number
        Half-width of the window in kilobases (multiplied by 1000 internally).
    df : pandas.DataFrame
        Frame with a 'position' column.

    Returns
    -------
    int
        Number of rows with pos - kb*1000 < position < pos + kb*1000. Both
        bounds are exclusive; a row located exactly at `pos` is counted.

    Only the 'position' column is inspected: if `df` spans several chromosomes,
    rows from all of them are compared against `pos`.
    """
    half_width = kb * 1000
    positions = df['position']
    in_window = (positions > pos - half_width) & (positions < pos + half_width)
    return int(in_window.sum())


# Window half-widths (in kb) for the neighbour-count features.
WINDOW_SIZES_KB = (1, 5, 10, 30, 100)

# Genomic positions are only comparable within one chromosome, but new_df holds
# all chromosomes. Split the positions by the 'chr' index level so every variant
# is compared only against variants on its own chromosome; counting over the
# whole frame would treat variants on other chromosomes with similar coordinates
# as neighbours.
chrom_labels = new_df.index.get_level_values('chr')
variants_by_chrom = {
    chrom: new_df.loc[chrom_labels == chrom, ['position']]
    for chrom in chrom_labels.unique()
}

# One 'Mutation<Nkb' column per window size. row.name is the
# (chr, pos, ref, alt) index tuple, so row.name[0] is the row's chromosome.
for kb in WINDOW_SIZES_KB:
    new_df['Mutation<' + str(kb) + 'kb'] = new_df.apply(
        lambda row: count_mutations(row['position'], kb, variants_by_chrom[row.name[0]]),
        axis=1,
    )
new_df.shape

(10154, 15)

In [3]:
# Persist the finished feature table so it can be reloaded without re-running
# the cells above. 'position' was only a helper column for the neighbour
# counts, so it is removed before saving.
new_df = new_df.drop('position', axis=1)
new_df.to_pickle("../pickled/chr20-Y_featureset5.pkl")