In [1]:
'''
Preprocess script for SNPs, mostly covering the translation part
synonomous vs non-synonomous
'''
import pandas as pd, numpy as np, re, sys, json
from pathlib import Path
from multiprocessing import Pool
from collections import Counter

in_path = Path('/d/data/plasmo/nat_out/v3')
in_file = in_path / 'snps_by_genes.tsv'
out_path = in_path / 'gene_scores.tsv'
codons_file = in_path / 'codons.json'
genes_file = in_path / 'filtered_genes.txt'
ref_file = in_path / '3d7.fasta'
frames_file = in_path / 'frames.tsv'

In [2]:
#This is an adapted function to fetch the strandness from the gene location
def getLocation(row):
    col_names = ['id', 'loc', 'name', 'symbol']
    p = '[(]([+-])[)]'
    
    loc = row[col_names[1]]
    position = re.search(p, loc) #position is the search object
    strand = position.group(1).strip()
        
    return pd.Series([row[col_names[0]], strand])

genes_df = pd.read_csv(genes_file, sep='\t')
strand_df = genes_df.apply(getLocation, axis=1) #modified version containing decomposed position
strand_df.columns = ['id', 'strand']
strand_df.set_index('id', inplace=True)


In [3]:
#loading the reference sequence to eventually get codons
p = '(?s)[>](.+?) [|].+?\n(.+?)(?=[>]|$)'
with open(ref_file, 'r') as input:
    data = input.read()
    data = re.findall(p, data)
    ref_dict = {}
    for i, e in data:
        ref_dict[i] = e.replace('\n', '')
    data = None

In [4]:
#loading the actual SNPs
in_df = pd.read_csv(in_file, sep='\t', index_col = [0,1])


In [5]:
#loading the frames
frame_df = pd.read_csv(frames_file, sep='\t', index_col = [0, 1], header=None)

In [6]:
with open(codons_file, 'r') as input:
    codons = json.loads(input.read())
    
by_genes = in_df.groupby('CHROM') #each iteration yields (name, df)

In [7]:
#function for assigning score to each element of each row
def toScore(row):
    #the stipulation here is that the 'ref' are the 5 reference bases surrounding the SNP (+2, -2)
    #scoring scheme: regular = 1, stop codon or indel = 3.
    def getSingleScore(snp, ref, frame, rev):       
        def revComp(s):
            z_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
            return ''.join([z_dict[l] for l in s][::-1])
        
        base_score = 1
        strong_score = 3
        
        #input QC
        if snp == '.' or snp == '*':
            return 0 #not a snp
        
        if ref == '.':
            return 0 #illegal ref
        
        if snp == ref[2]:
            return 0 #no mutation
        
        if len(snp) > 1 or len(ref) > 1:
            return 0
        
        return base_score

        
        mut = ref[:2] + snp + ref[3:]
        frames = [(0,3), (1,4), (2,5)]
        
        #reverse complement and adjust frame
        if rev:
            ref = revComp(ref)
            mut = revComp(mut)
            if frame == 0:
                frame = 1
            elif frame == 1:
                frame = 0
                
        if frame == 0:
            frame = 3
        try:
            start, end = frames[frame - 1]
        except Exception as e:
            print('error frame was ' + str(frame))
            raise e
        
        r = codons[ref[start:end]] #original residue
        m = codons[mut[start:end]] #mutant residue
        
        #scoring
        if r == m:
            c = 0
        elif r == 'Z' or m == 'Z': #Z is the stop codon, which we give more attention to
            c = strong_score
        else:
            c = base_score   #mutation = 1
        
        #report
#         print('{r}[{r_n}] to {m}[{m_n}] mutation, for a score of {c}'.format(r=r, m=m, c=c, r_n = ref[start:end], m_n = mut[start:end] ))
            
        return c
    
       
    #the name of the chromosome should be id:chromosome
    full_id = row.name[0].split(':')
    id = full_id[0]
    chr = full_id[1]
    pos = int(row.name[1])
    frame = frame_df.loc[row.name[0], pos].values[0]
    rev = strand_df['strand'].loc[id] == '-'
    
    if np.isnan(frame):
        frame = 0
        ref_chunk = '.' #return 0 is the same as removing the position, really.
    else:
        frame = int(frame) % 3
        try:
            ref_chunk = ref_dict[chr][pos - 3: pos + 2]
        except:
            ref_chunk = '.'
            print('ref chunk out of bound at {0} {1}'.format(row.name[0], pos))
    
    scores = row.apply(getSingleScore, 1, args = (ref_chunk, frame, rev))

    return scores

def worker(args):
    #hard coded args
    name = args[0]
    snps_df = args[1]
    
    scores = snps_df.apply(toScore, axis = 1)
    result = scores.apply(sum, axis = 0)
    result = pd.DataFrame(result, columns = [name]).T
    return result

with Pool(6) as pool:
    res_dfs = pool.map(worker, by_genes)
res = pd.concat(res_dfs)
print(res)

                             PD0459-Cx.GT  PD0460-C.GT  PD0461-C.GT  \
PF3D7_0102200:Pf3D7_01_v3               0            0            0   
PF3D7_0102500:Pf3D7_01_v3               0            0            0   
PF3D7_0102600:Pf3D7_01_v3               0            0            0   
PF3D7_0102800:Pf3D7_01_v3               0            0            0   
PF3D7_0102900:Pf3D7_01_v3               0            0            0   
PF3D7_0103100:Pf3D7_01_v3               0            0            0   
PF3D7_0103200:Pf3D7_01_v3               0            0            0   
PF3D7_0103500:Pf3D7_01_v3               0            0            0   
PF3D7_0104000:Pf3D7_01_v3               0            0            0   
PF3D7_0104100:Pf3D7_01_v3               0            0            0   
PF3D7_0104200:Pf3D7_01_v3               0            0            0   
PF3D7_0104300:Pf3D7_01_v3               0            0            0   
PF3D7_0104400:Pf3D7_01_v3               0            0            0   
PF3D7_

In [8]:
res.T.to_csv(out_path, sep='\t')