In [1]:
'''
Preprocessing script for SNPs, mostly covering the translation part:
synonymous vs. non-synonymous
'''
import pandas as pd, numpy as np, re, sys, json
from pathlib import Path
from multiprocessing import Pool
from collections import Counter

# Primary data directory; fall back to a per-user scratch directory when the
# primary one does not exist on this machine.
in_path = Path('/d/data/plasmo/nat_out/v2')
if not in_path.is_dir():
    # pathlib never expands '~' implicitly; without expanduser() every path
    # below would point into a literal '~' directory under the current
    # working directory.
    in_path = Path('~/scratch/plasmo/preprocess').expanduser()

# All inputs and the output live side by side in in_path.
in_file = in_path / 'snps_by_genes.tsv'           # SNP table read in the loading cell
out_path = in_path / 'gene_scores_blosum2.tsv'    # destination of the final score table
genes_file = in_path / 'filtered_genes.txt'       # not read in the visible cells
ref_file = in_path / '3d7.fasta'                  # not read in the visible cells
sift_file = in_path / 'sift.tsv'                  # SIFT annotation table
blosum_file = in_path / 'BLOSUM62.txt'            # BLOSUM substitution matrix

In [2]:
# Load the SNP table; its first two columns become a two-level row index
# (read downstream as an 'id:chromosome' label and a position).
in_df = pd.read_csv(in_file, sep='\t', index_col=[0, 1])

# Load the SIFT annotations, indexed by their first two columns. Missing
# fields are replaced with the literal string 'NA' so the scoring code can
# compare them as plain strings.
sift_df = pd.read_csv(sift_file, sep='\t', index_col=[0, 1]).fillna('NA')

# Load the BLOSUM matrix (whitespace-delimited, row labels in the first column).
# The separator is a raw string: '\s' is not a valid Python string escape and
# triggers an invalid-escape-sequence warning in a plain literal.
blosum_df = pd.read_csv(blosum_file, sep=r'\s+', index_col=0)

In [3]:
# Group the SNP table by 'CHROM' so that each group can be scored on its own.
# NOTE(review): 'CHROM' is presumably the name of the first index level
# (the 'id:chromosome' label) - confirm against the input file header.
by_genes = in_df.groupby('CHROM') #each iteration yields (name, df)

In [19]:
# Scoring constants used by _score_allele.
FS_SCORE = 60        # flat score assigned to a frameshift variant
BLOSUM_OFFSET = 11   # substitution score is BLOSUM_OFFSET minus the BLOSUM entry


def _score_allele(snp, ref, alts, sift_info):
    """Score a single observed allele at one position.

    Parameters
    ----------
    snp : the observed allele.
    ref : the reference allele at this position.
    alts : array-like of the annotated alternative alleles at this position.
    sift_info : DataFrame of the SIFT rows for this position, indexed by
        'ALT_ALLELE' and holding 'REF_AMINO', 'ALT_AMINO' and 'VARIANT_TYPE'.

    Returns
    -------
    int
        0 when the allele equals the reference, is not annotated, has no
        amino-acid annotation (and is not a frameshift), or is synonymous;
        FS_SCORE for a frameshift; otherwise BLOSUM_OFFSET minus the BLOSUM
        entry for (reference amino acid, alternative amino acid).

    Raises
    ------
    Exception
        When the BLOSUM lookup fails; the original error is chained as the cause.
    """
    if snp == ref:
        return 0  # no mutation
    if snp not in alts:
        return 0  # no annotation

    annotation = sift_info.loc[snp]
    # Several SIFT rows may share the same alternative allele; use the first.
    if len(annotation.shape) > 1:
        annotation = annotation.head(1).squeeze()

    ref_aa = annotation['REF_AMINO']
    alt_aa = annotation['ALT_AMINO']

    # 'NA' is the placeholder written for missing fields when sift_df is loaded.
    if ref_aa == 'NA' or alt_aa == 'NA':
        if str(annotation['VARIANT_TYPE']).startswith('FRAMESHIFT'):
            return FS_SCORE
        return 0

    if ref_aa == alt_aa:
        return 0  # synonymous change

    try:
        return BLOSUM_OFFSET - int(blosum_df.loc[ref_aa, alt_aa])
    except Exception as err:
        # 'except Exception' (not a bare except) lets KeyboardInterrupt through
        # untouched; chaining keeps the real cause in the traceback.
        print('error at ', snp, ref, alt_aa, ref_aa, annotation)
        raise Exception from err


def toScore(row):
    """Score every element of one SNP row.

    Parameters
    ----------
    row : pandas.Series
        One row of the SNP table. Its name must be a pair
        ('id:chromosome', position); its values are the alleles to score.

    Returns
    -------
    pandas.Series
        Integer scores with the same index as ``row``. All zeros when the
        position has no entry in the global ``sift_df``.
    """
    # The row label is ('id:chromosome', position); only the chromosome part
    # is needed to address the SIFT table.
    chrom = row.name[0].split(':')[1]
    pos = int(row.name[1])

    try:
        sift_info = sift_df.loc[(chrom, pos)]
    except KeyError:
        # Position not annotated: every element scores 0.
        return pd.Series(0, index=row.index, name=row.name)

    # A lookup on a fully unique index returns a single row as a Series;
    # normalise it to a one-row DataFrame so the code below handles both cases.
    if isinstance(sift_info, pd.Series):
        sift_info = sift_info.to_frame().T

    ref = sift_info['REF_ALLELE'].values[0]
    alts = sift_info['ALT_ALLELE'].values
    # Re-index on a new object instead of mutating a slice of sift_df in place.
    by_alt = sift_info.set_index('ALT_ALLELE')

    # Extra arguments go through args= only; a stray positional argument would
    # be taken as Series.apply's convert_dtype (or clash with args).
    return row.apply(_score_allele, args=(ref, alts, by_alt))

def worker(args):
    """Compute the summed scores for one group of SNP rows.

    Parameters
    ----------
    args : sequence
        ``args[0]`` is the group name and ``args[1]`` the DataFrame of SNP
        rows belonging to it (the pair shape produced by iterating a groupby).

    Returns
    -------
    pandas.DataFrame
        A single-row frame indexed by the group name, holding the
        column-wise sum of the per-row scores returned by ``toScore``.
    """
    group_name = args[0]
    group_snps = args[1]

    # Score each row, then add the scores up column by column.
    per_row_scores = group_snps.apply(toScore, axis=1)
    column_totals = per_row_scores.apply(sum, axis=0)

    # One column named after the group, flipped into one row.
    as_column = pd.DataFrame(column_totals, columns=[group_name])
    return as_column.transpose()

# Score all groups in a process pool: pool.map calls worker once per
# (name, df) pair yielded by by_genes and collects the returned frames in a list.
with Pool() as pool:
    res_dfs = pool.map(worker, by_genes)

# Serial alternative without a process pool:
# res_dfs = map(worker, by_genes)

# Stack the one-row frames returned by worker into a single table (one row per group).
res = pd.concat(res_dfs)

(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,)
(14,

KeyboardInterrupt: 

In [None]:
# Write the score table as TSV, transposed so that each group becomes a column.
res.T.to_csv(out_path, sep='\t')