In [11]:
#!/usr/bin/env python
# Copyright (C) 2017 Sur Herrera Paredes

# Imports
import os
import sutilspy
import csv
import numpy as np
import scipy.stats as stats
import argparse

In [25]:
# Arguments for ipython
args = argparse.Namespace()
args.indir = "/home/sur/micropopgen/exp/2017/today5/test/"
args.test = "MK"
args.outfile = "/home/sur/micropopgen/exp/2017/today5/mk_results.txt"
args.metadata_file = "/home/sur/micropopgen/exp/2017/today5/map.txt"
args.group1 = "Supragingival plaque"
args.group2 = "Tongue dorsum"
args.min_count = 3
args.nrows = 200

# indir = 'snps/'
# test = 'MK'
# outfile = 'mk_results.txt'
# metadata_file = 'map.txt'
# group1 = 'A'
# group2 = 'B'
# min_count = 3
# nrows = 30000

In [26]:
class GenomeSite:
    """A class for represintinc sites in genome that have potential SNPS"""
    
    def __init__(self,site_id, contig, position, ref_allele = '',
                 major_allele = '',
                 minor_allele = '', locus_type = '', gene_id = '',
                 aminoacid_A = '',
                 aminoacid_C = '', aminoacid_G = '', aminoacid_T = ''):
        self.id = site_id
        self.contig = contig
        self.position = position
        self.ref_allele = ref_allele
        self.major_allele = major_allele
        self.minor_allele = minor_allele
        self.locus_type = locus_type
        self.gene_id = gene_id
        self.aminoA = aminoacid_A
        self.aminoC = aminoacid_C
        self.aminoG = aminoacid_G
        self.aminoT = aminoacid_T
    
    def codon_aminoacid(self, base):
        if base in ['A','a']:
            return(self.aminoA)
        elif base in ['C','c']:
            return(self.aminoA)
        elif base in ['G','g']:
            return(self.aminoA)
        elif base in ['T','t']:
            return(self.aminoA)
        else:
            raise ValueError("base must be one of the four canonical nucleoties")
    
    def substitution_type(self):
        substitution_type = ''
        if self.codon_aminoacid(base = self.major_allele) == self.codon_aminoacid(base = self.minor_allele):
            substitution_type = 'synonymous'
        else:
            substitution_type = 'non-synonymous'
        
        return(substitution_type)
            

In [27]:
class Gene:
    """A class for representing a gene"""
    
    def __init__(self, gene_id,contig,start,end, strand = ''):
        if(start > end):
            raise ValueError("Start cannot be greater than end")
        self.id = gene_id
        self.contig = contig
        self.start = start
        self.end = end
        self.strand = strand
    
    def extend(self, pos):
        if pos > self.end:
            self.end = pos
        elif pos < self.start:
            self.start = pos
    
    def print(self):
        print("===Gene===")
        print(">Gene id: {}".format(self.id))
        print(">Gene contig: {}".format(self.contig))
        print(">Gene start: {}".format(str(self.start)))
        print(">Gene end: {}".format(str(self.end)))
            
    

In [28]:
class MKtest:
    """A class for holding the McDonald-Kreitmant test"""
    
    def __init__(self, name, Ds = 0, Dn = 0, Ps = 0, Pn = 0):
        self.name = name
        self.Dn = Dn
        self.Ds = Ds
        self.Ps = Ps
        self.Pn = Pn
    
    def update(self, Ds = 0, Dn = 0, Ps = 0, Pn = 0):
        """Update the contigency matrix"""
        self.Dn += Dn
        self.Ds += Ds
        self.Ps += Ps
        self.Pn += Pn
    
    def calculate(self):
        """Calculate the McDonald Kreitman ratio"""
        ratio = (self.Dn / self.Ds) / (self.Pn / self.Ps)
        return(ratio)
    
    def alpha(self):
        """Calculate the Smith & Eyre-Walker alpha"""
        alpha = 1 - ((self.Ds * self.Dn) / (self.Ps * self.Pn))
        return(alpha)
    
    def test(self):
        res = stats.fisher_exact([[self.Ds,self.Ps],[self.Dn,self.Pn]])
        return(res)
    

        

In [29]:
# Check files exist in input directory
# Convert into function
file_list = os.listdir(args.indir)
if 'snps_freq.txt' not in file_list:
    raise FileNotFoundError("Could not find snps_freq.txt at {}".format(args.indir))
if 'snps_info.txt' not in file_list:
    raise FileNotFoundError("Could not find snps_info.txt at {}".format(args.indir))
if 'snps_depth.txt' not in file_list:
    raise FileNotFoundError("Could not find snps_depth.txt at {}".format(args.indir))
if not os.path.isfile(args.metadata_file):
    raise FileNotFoundError("Could not find metadata file {}".format(args.metadata_file))


In [30]:
# Read metadata
Groups = sutilspy.io.process_run_list(args.metadata_file, 1, 0, header = True)
Samples = sutilspy.io.process_run_list(args.metadata_file, 0, 1, header = True)


> Processing map of runs
	Processed 54 runs in 3 samples

> Processing map of runs
	Processed 54 runs in 54 samples


In [32]:
# Read info
Genes = {}
Sites = {}
with open(args.indir + '/snps_info.txt') as info_fh:
    header = info_fh.readline()
    header = header.split('\t')
    print(header)
    info_reader = csv.reader(info_fh, delimiter = '\t')
    i = 0
    
    # Set columns
    site_id_col = 0
    contig_col = 1
    pos_col = 2
    ref_allele_col = 3
    major_allele_col = 4
    minor_allele_col = 5
    locus_type_col = 11
    gene_id_col = 12
    aminoacids_col = 15
    
    print("============HEADERs============")
    print(">Site id: {}".format(header[site_id_col]))
    print(">Contig: {}".format(header[contig_col]))
    print(">Position: {}".format(header[pos_col]))
    print(">Ref allele: {}".format(header[ref_allele_col]))
    print(">Major allele: {}".format(header[major_allele_col]))
    print(">Minor allele: {}".format(header[minor_allele_col]))
    print(">Locus type: {}".format(header[locus_type_col]))
    print(">Gene id: {}".format(header[gene_id_col]))
    print(">Aminoacids: {}".format(header[aminoacids_col]))
    
    #
    for row in info_reader:
        i += 1
        if i > args.nrows:
            break
        #print(row)
        #print(row[gene_id_col], row[site_id_col])
        #print(row[aminoacids_col])
        gene = row[gene_id_col]
        site_id = row[site_id_col]
        aminoacids = row[aminoacids_col]
        #print(aminoacids)
        #print(site_id)
        
        if gene == 'NA':
            # skip intergenig regions
            continue
            
        #print("\tgene")
        # Get aminoacid per position
        aa = aminoacids.split(',')
        #print(aa)
        
        # Define site
        #print(site_id)
        Sites[site_id] = GenomeSite(site_id = site_id,
                                    contig = row[contig_col],
                                    position = row[pos_col],
                                    ref_allele = row[ref_allele_col],
                                    major_allele = row[major_allele_col],
                                    minor_allele = row[minor_allele_col],
                                    locus_type = row[locus_type_col],
                                    gene_id = gene, aminoacid_A = aa[0],
                                    aminoacid_C = aa[1],
                                    aminoacid_G = aa[2],
                                    aminoacid_T = aa[3])
        
        # For genes
        if gene in Genes:
            # update genes
            Genes[gene].extend(row[pos_col])
            #print(gene)
            #print(Genes[gene])
            #Genes[gene].print()

        else:
            # Define gene
            Genes[gene] = Gene(gene_id=gene, contig = row[contig_col],
                               start = row[pos_col], end = row[pos_col])
            #Genes[gene].print()
            #print(Genes[gene])
        
 
info_fh.close()

['site_id', 'ref_id', 'ref_pos', 'ref_allele', 'major_allele', 'minor_allele', 'count_samples', 'count_a', 'count_c', 'count_g', 'count_t', 'locus_type', 'gene_id', 'snp_type', 'site_type', 'amino_acids\n']
>Site id: site_id
>Contig: ref_id
>Position: ref_pos
>Ref allele: ref_allele
>Major allele: major_allele
>Minor allele: minor_allele
>Locus type: locus_type
>Gene id: gene_id
>Aminoacids: amino_acids



In [33]:
#print(Groups)
print("Number of sites: {}".format(str(len(Sites))))
print("Number of genes: {}".format(str(len(Genes))))

Number of sites: 117
Number of genes: 2


In [36]:
# Chose sites based on depth in groups to compare
Counts = {}
with open(args.indir + '/snps_depth.txt') as depth_fh:
    header = depth_fh.readline()
    header = header.rstrip()
    header = header.split('\t')
    
    # Get sample and column indices
    samples = header[1:]
    indices = {}
    for s in samples:
        indices[s] = header.index(s)
    print(indices)
    
    
    depth_reader = csv.reader(depth_fh, delimiter = '\t')
    i = 0
    for row in depth_reader:
        i += 1
        if i > args.nrows:
            break 
        #print(row)
        
        site_id = row[0]
        #print(site_id)
        if not site_id in Sites:
            continue
        
        # Get all counts
        counts = row[1:]
        counts = list(map(int,counts))
        #print(counts)
        
        counts = [int(c >= args.min_count) for c in counts]
        
        # Get counts per group
        samples1 = [int(counts[ indices[l] - 1 ]) for l in Groups[args.group1]]
        samples2 = [int(counts[ indices[l] - 1 ]) for l in Groups[args.group2]]
        samples1 = sum(samples1)
        samples2 = sum(samples2)
        #print(samples1)
        #print(samples2)
        if not ((samples1 > 0 and samples2 > 0) and (samples1 > 1 or samples2 > 1)):
            # delete
            #print(site_id)
            if site_id in Sites:
                del Sites[site_id]
        else:
            # NOTE: ASSUMING SAME ORDER IN SAMPLES BETWEEN SITES
            Counts[site_id] = counts
        


depth_fh.close()

{'SRS011140': 1, 'SRS011247': 2, 'SRS013711': 3, 'SRS016575': 4, 'SRS054687': 5, 'SRS011243': 6, 'SRS049389': 7, 'SRS017439': 8, 'SRS017533': 9, 'SRS020856': 10, 'SRS017808': 11, 'SRS011310': 12, 'SRS017810': 13, 'SRS019607': 14, 'SRS017814': 15, 'SRS019389': 16, 'SRS018357': 17, 'SRS011152': 18, 'SRS018665': 19, 'SRS018157': 20, 'SRS018739': 21, 'SRS018145': 22, 'SRS012285': 23, 'SRS017691': 24, 'SRS015755': 25, 'SRS017511': 26, 'SRS017227': 27, 'SRS017209': 28, 'SRS017139': 29, 'SRS019974': 30, 'SRS019980': 31, 'SRS016569': 32, 'SRS016225': 33, 'SRS016037': 34, 'SRS015762': 35, 'SRS014894': 36, 'SRS014888': 37, 'SRS016200': 38, 'SRS043663': 39, 'SRS014573': 40, 'SRS045049': 41, 'SRS013723': 42, 'SRS013705': 43, 'SRS012279': 44, 'SRS011306': 45, 'SRS050029': 46, 'SRS051244': 47, 'SRS051941': 48, 'SRS052227': 49, 'SRS052604': 50, 'SRS052876': 51, 'SRS055118': 52, 'SRS055426': 53, 'SRS058053': 54}


In [37]:
print("Number of sites: {}".format(str(len(Sites))))
print("Number of genes: {}".format(str(len(Genes))))
print("Sites with counts: {}".format(str(len(Counts))))

Number of sites: 117
Number of genes: 2
Sites with counts: 117


In [42]:
# Read frequencies and calculate 
print(Groups)
MK = {}
with open(args.indir + '/snps_freq.txt') as freqs_fh:
    header = freqs_fh.readline()
    header = header.rstrip()
    header = header.split('\t')
    
    # Get sample and column indices
    samples = header[1:]
    indices = {}
    for s in samples:
        indices[s] = header.index(s)
    print(indices)
    print(header)
    
    freqs_reader = csv.reader(freqs_fh, delimiter = '\t')
    i = 0
    for row in freqs_reader:
        i += 1
        if i > args.nrows:
            break
        
        # Check if site was selected based on sites
        site_id = row[0]
        if not site_id in Sites:
            #print("==Skipping")
            continue
        
        print("==========================")
        gene = Sites[site_id].gene_id
        s_type = Sites[site_id].substitution_type()
        present_index = np.array(Counts[site_id])
        group_index = np.array([Samples[s][0] for s in samples])
        #print(row)
        #print(site_id)
        #allele_frequencies = row.split()
        #print("Major Allele: {}".format(Sites[site_id].major_allele))
        #print("Minor Allele: {}".format(Sites[site_id].minor_allele))
        #print("Substitution type: {}".format(s_type))
        #print("Gene: {}".format(gene))
        print(present_index)
        print(group_index)
        
        # Create MKtest if needed
        if gene not in MK:
            MK[gene]= MKtest(name=gene)
            
        # find allele per sample
        allele_freqs = np.array([int(float(f) < 0.5) for f in row[1:]])
        #print(allele_freqs)
        
        # Remove non covered positions
        ii = np.where(present_index)
        group_index = group_index[ii]
        allele_freqs = allele_freqs[ii]
        #print(group_index)
        #print(allele_freqs)
        
        # Count alleles per group
        group1_count = allele_freqs[np.where(group_index == args.group1)].sum()
        group2_count = allele_freqs[np.where(group_index == args.group2)].sum()
        print(group1_count)
        print(group2_count)
        
        if group1_count > 0 and group2_count > 0:
            fixed = False
        elif group1_count > 0 or group2_count > 0:
            fixed = True
#         else:
#             print("===============")
#             print(row)
#             print(group_index)
#             print(allele_freqs)
#             raise ValueError("At least one of the counts must be non-zero")
        
        if s_type is 'synonymous':
            if fixed:
                MK[gene].update(Ds = 1)
            else:
                MK[gene].update(Ps = 1)
        elif s_type is 'non-synonymous':
            if fixed:
                MK[gene].update(Dn = 1)
            else:
                MK[gene].update(Pn = 1)
        else:
            raise ValueError("Invalid substitution type")
            
        #print("==========================")
        
        

freqs_fh.close()

{'Tongue dorsum': ['SRS011140', 'SRS011243', 'SRS011306', 'SRS012279', 'SRS013705', 'SRS014573', 'SRS014888', 'SRS015762', 'SRS016037', 'SRS016225', 'SRS016569', 'SRS017209', 'SRS017439', 'SRS017533', 'SRS017808', 'SRS018145', 'SRS018357', 'SRS018739', 'SRS019389', 'SRS019607', 'SRS019974', 'SRS020856', 'SRS043663', 'SRS049389', 'SRS052227', 'SRS054687', 'SRS055426'], 'Supragingival plaque': ['SRS011152', 'SRS012285', 'SRS013723', 'SRS014894', 'SRS015755', 'SRS016200', 'SRS016575', 'SRS017139', 'SRS017227', 'SRS017511', 'SRS017691', 'SRS017814', 'SRS018157', 'SRS018665', 'SRS019980', 'SRS051244', 'SRS051941', 'SRS052604', 'SRS052876', 'SRS058053'], 'Buccal mucosa': ['SRS011247', 'SRS011310', 'SRS013711', 'SRS017810', 'SRS045049', 'SRS050029', 'SRS055118']}
{'SRS011140': 1, 'SRS011247': 2, 'SRS013711': 3, 'SRS016575': 4, 'SRS054687': 5, 'SRS011243': 6, 'SRS049389': 7, 'SRS017439': 8, 'SRS017533': 9, 'SRS020856': 10, 'SRS017808': 11, 'SRS011310': 12, 'SRS017810': 13, 'SRS019607': 14, 'SR

In [45]:
Genes[ Sites[site_id].gene_id ].print()

===Gene===
>Gene id: 997829.3.peg.1
>Gene contig: JNOS01000002
>Gene start: 692
>Gene end: 692


In [43]:
print("Number of sites: {}".format(str(len(Sites))))
print("Number of genes: {}".format(str(len(Genes))))
print("Sites with counts: {}".format(str(len(Counts))))
print("Genes with MK: {}".format(str(len(MK))))

Number of sites: 117
Number of genes: 2
Sites with counts: 117
Genes with MK: 2


In [51]:
args.tables = '/home/sur/micropopgen/exp/2017/today5/tables.txt'
with open(args.outfile,mode='w') as fh and open(args.tables) as th:
    for gene,mk in MK.items():
        th.write("=============================================\n")
        th.write(gene)
        th.write("\t\tFixed\tPolymorphic\n\tSynonymous\t{}\t{}\n\tnon-synonymous\t{}\t{}\n".format(mk.Ds,mk.Ps,mk.Dn,mk.Pn))
        #ratio = mk.calculate()
        oddratio,pval = mk.test()
        res = [gene, str(oddratio), str(pval)]
        th.write(res + "\n")
        fh.write("\t".join(res) + "\n")
        #alpha = mk.alpha()
        #print("MK ratio is: {}".format(str(ratio)))
        #print("MK alpha is: {}".format(str(alpha)))
fh.close()
th.close()

SyntaxError: invalid syntax (<ipython-input-51-30fa42f78291>, line 2)