In [6]:
import pandas as pd
from MKtest import *
import numpy as np

In [7]:
class ARGS:
    """Attribute bag holding the settings used by the MK-test cells.

    Every constructor argument is stored unchanged on the instance under the
    same name.  The object is handed as-is to the MKtest helpers called in
    this notebook (``confirm_midas_merge_files`` and
    ``calculate_contingency_tables``).

    Parameters
    ----------
    indir : input directory (the notebook passes a merged-SNPs directory).
    metadata_file : path given to ``process_metadata_file``.
    group1, group2 : names of the two groups to compare.
    test : name of one statistic, or ``'all'``.
    outfile, tables : output names; not read by any cell visible here --
        presumably consumed by the MKtest helpers (TODO confirm).
    min_count, nrows : not read by any cell visible here -- presumably
        filtering / row-limit options of the MKtest helpers (TODO confirm).
    pseudocount : forwarded to ``calculate_statistic``.
    permutations : number of permuted data sets to generate.
    seed : value passed to ``np.random.seed`` before permuting.
    """

    def __init__(self, indir, metadata_file,
                 group1, group2, test='hg',
                 outfile='mk_results', min_count=1,
                 nrows=float('inf'), tables='mk_tables',
                 pseudocount=0, permutations=0, seed=5):
        # Inputs and the comparison to run
        self.indir, self.metadata_file = indir, metadata_file
        self.group1, self.group2 = group1, group2
        self.test = test

        # Outputs and reading options
        self.outfile, self.min_count = outfile, min_count
        self.nrows, self.tables = nrows, tables

        # Statistic and permutation options
        self.pseudocount, self.permutations = pseudocount, permutations
        self.seed = seed


In [24]:
# Settings for this run: compare the 'Buccal mucosa' and 'Tongue dorsum'
# groups, request every supported statistic (test='all'), and use
# 10 permutations with a pseudocount of 1 and a fixed seed.
# NOTE(review): indir and metadata_file are absolute paths under /home/sur,
# so the notebook only runs where that directory layout exists.
# NOTE(review): indir ends in a double slash -- confirm this is intentional.
args = ARGS(indir='/home/sur/micropopgen/data/test_data/midas/merged.snps/Veillonella_parvula_57794//',
            metadata_file='/home/sur/micropopgen/data/test_data/midas/map.txt',
            group1='Buccal mucosa',
            group2='Tongue dorsum',
            outfile='mktest/Veillonella_parvula_57794/mk_results.Buccal.mucosa_Tongue.dorsum.txt',
            tables='mktest/Veillonella_parvula_57794/mk_tables.Buccal.mucosa_Tongue.dorsum.txt',
            nrows=5000,
            pseudocount=1,
            permutations=10,
            seed=5,
            test='all')

In [25]:
# Step 1: check the inputs described by args before doing any work
# (presumably verifies that the expected MIDAS merge files are present
# in args.indir -- confirm against MKtest).
print("Checking MIDAS files exist")
confirm_midas_merge_files(args)

# Step 2: read the mapping (metadata) file.
# Creates dictionaries that hold all the samples per group (Groups),
# and the group to which each sample belongs (Samples).
# TODO: probably should change this to pandas.
print("Read metadata")
Samples, Groups = process_metadata_file(args.metadata_file)

# Step 3: build the MK contingency tables for the observed grouping.
# MK is used below as a mapping from gene id to an MKtest object.
print("Calculate MK contingency tables")
MK, Genes = calculate_contingency_tables(Samples, Groups, args)
# Debugging aids, left disabled:
# print(MK)
# print([MK])


Checking MIDAS files exist
	All files found
Read metadata
Calculate MK contingency tables
	Read snps_info.txt
>Site id: site_id
>Contig: ref_id
>Position: ref_pos
>Ref allele: ref_allele
>Major allele: major_allele
>Minor allele: minor_allele
>Locus type: locus_type
>Gene id: gene_id
>Aminoacids: amino_acids

Number of sites: 4675
Number of genes: 9
	Chose sites based on depth in groups to compare
Number of sites: 4591
Number of genes: 9
Sites with counts: 4591
	Read frequencies and calculate
Processing snp_freq.txt
Number of sites: 4591
Number of genes: 9
Sites with counts: 4591
Genes with MK: 9


In [26]:
# Build the list of MK tables used by the cells below: element 0 holds the
# observed tables, followed by one element per permutation.
#
# MK is always wrapped in a list (even with zero permutations) because the
# later cells index it as MK[0].  If this cell has already been run, MK is
# already a list; in that case only the observed tables are kept, so that
# re-running the cell neither nests the list nor accumulates permutations.
if isinstance(MK, list):
    MK = MK[:1]
else:
    MK = [MK]

if args.permutations > 0:
    print("Permuting")
    print("Seed is {}".format(str(args.seed)))
    # Seed numpy's global RNG so the permutations are repeatable
    # (assumes process_metadata_file draws from np.random -- TODO confirm).
    np.random.seed(args.seed)
    for i in range(args.permutations):
        # Re-read the metadata with permute=True and recompute the tables
        Sp, Gp = process_metadata_file(args.metadata_file, permute=True)
        mk, genes = calculate_contingency_tables(Sp, Gp, args)
        MK.append(mk)

        print("========")
        print(len(MK))

print(MK)

Permuting
Seed is 5
	Read snps_info.txt
>Site id: site_id
>Contig: ref_id
>Position: ref_pos
>Ref allele: ref_allele
>Major allele: major_allele
>Minor allele: minor_allele
>Locus type: locus_type
>Gene id: gene_id
>Aminoacids: amino_acids

Number of sites: 4675
Number of genes: 9
	Chose sites based on depth in groups to compare
Number of sites: 4666
Number of genes: 9
Sites with counts: 4666
	Read frequencies and calculate
Processing snp_freq.txt
Number of sites: 4666
Number of genes: 9
Sites with counts: 4666
Genes with MK: 9
2
	Read snps_info.txt
>Site id: site_id
>Contig: ref_id
>Position: ref_pos
>Ref allele: ref_allele
>Major allele: major_allele
>Minor allele: minor_allele
>Locus type: locus_type
>Gene id: gene_id
>Aminoacids: amino_acids

Number of sites: 4675
Number of genes: 9
	Chose sites based on depth in groups to compare
Number of sites: 4669
Number of genes: 9
Sites with counts: 4669
	Read frequencies and calculate
Processing snp_freq.txt
Number of sites: 4669
Number of 

In [27]:
# Work out which statistics will be computed.
# 'all' selects every supported test; any other value must name exactly
# one supported test, which is wrapped in a list so `test` is always a list.
supported_tests = ['NI', 'ratio', 'hg',
                   'G', 'G_Yates', 'G_Williams',
                   'alpha']
test = args.test
if test == 'all':
    test = supported_tests
elif test in supported_tests:
    test = [test]
else:
    raise ValueError("Test not supported")

# Column names: fixed gene/count columns, then one column per statistic,
# then one '<statistic>.pval' column per statistic.
header_base = ['gene', 'contig', 'start', 'end',
               'Dn', 'Ds', 'Pn', 'Ps']
pval_list = ['{}.pval'.format(t) for t in test]
header = header_base + test + pval_list
print(test)
print(pval_list)


['NI', 'ratio', 'hg', 'G', 'G_Yates', 'G_Williams', 'alpha']
['NI.pval', 'ratio.pval', 'hg.pval', 'G.pval', 'G_Yates.pval', 'G_Williams.pval', 'alpha.pval']


In [31]:
# Inspect one gene from the observed (first) set of tables:
# show its id, the table object, and its four counts, one per line.
MK_true = MK[0]
gene = list(MK_true.keys())[1]
mk = MK_true[gene]
print(gene, mk, mk.Dn, mk.Ds, mk.Pn, mk.Ps, sep='\n')

686660.3.peg.2
<MKtest.MKtest object at 0x7fcdcdfb7470>
1
3
213
815


In [32]:
# Permutation test for one gene.
# perm_table has one row per element of MK (row 0 = observed data) and one
# column per statistic in `test`.  Rows stay NaN when the gene is absent
# from that element of MK.
perm_table = np.full(shape=(int(args.permutations + 1), len(test)), fill_value=np.nan)
for row, p in enumerate(MK):
    print("=========")
    if gene in p:
        p_stat = calculate_statistic(p[gene], test, pseudocount=args.pseudocount)
        p_res = [p_stat[t] for t in test]
        perm_table[row] = p_res

# P-value per statistic: fraction of available (non-NaN) values that are at
# least as large as the observed one.  NaN entries never satisfy `>=`, so
# they are also removed from the denominator rather than counted as
# permutations that fell below the observed value.
perm_pvals = ((perm_table >= perm_table[0]).sum(axis=0)
              / (perm_table.shape[0] - np.isnan(perm_table).sum(axis=0)))



In [34]:
perm_table

array([[ 0.28024639,  1.90654206,  0.5245098 ,  0.50705433,  0.06023802,
         0.44456913,  0.4754902 ],
       [ 0.58176216,  3.8173516 ,  0.26196172,  1.66654593,  0.58110037,
         1.3765688 ,  0.73803828],
       [ 0.57885174,  3.7918552 ,  0.26372315,  0.82743491,  0.01925641,
         0.58256738,  0.73627685],
       [ 0.27730319,  1.89366516,  0.52807646,  0.24931232,  0.03479782,
         0.19474211,  0.47192354],
       [ 0.27730319,  1.89366516,  0.52807646,  0.24931232,  0.03479782,
         0.19474211,  0.47192354],
       [ 0.57885174,  3.7918552 ,  0.26372315,  0.82743491,  0.01925641,
         0.58256738,  0.73627685],
       [ 0.57885174,  3.7918552 ,  0.26372315,  0.82743491,  0.01925641,
         0.58256738,  0.73627685],
       [ 0.57885174,  3.7918552 ,  0.26372315,  0.82743491,  0.01925641,
         0.58256738,  0.73627685],
       [ 0.57885174,  3.7918552 ,  0.26372315,  0.82743491,  0.01925641,
         0.58256738,  0.73627685],
       [ 0.577814  ,  3.7828

In [54]:
# Toy check of a NaN-aware p-value: row 0 plays the observed data, row 1 a
# permutation with one missing value.  NaN entries fail the `>=` comparison
# and are also subtracted from the denominator.
a = np.array([[0, 1, 2, 3],
              [1, 2, 3, np.nan]])
nperm = 2
np.sum(a >= a[0], axis=0) / (nperm - np.sum(np.isnan(a), axis=0))

  This is separate from the ipykernel package so we can avoid doing imports until


array([ 1.,  1.,  1.,  1.])

nan

In [112]:
def test_by_permutation(gene, MK, permutations, test, pval_list, pseudocount):
    nperm = int(permutations + 1)
    perm_table = np.full(shape=(nperm, len(test)), fill_value=np.nan)
    row = 0
    for p in MK:
        # print("=========")
        # print(p)
        if gene in p:
            p_stat = calculate_statistic(p[gene], test, pseudocount=pseudocount)
            p_res = [p_stat[t] for t in test]
            perm_table[row] = p_res

        row = row + 1
#     print(perm_table)
#     print("shape", perm_table.shape)
    
    # Pvalues
    nperms = nperm - np.isnan(perm_table).sum(axis=0)
    perm_pvals =(perm_table >= perm_table[0]).sum(axis=0) / nperms
    nperm_names = [''.join([t, '.nperm']) for t in test]
#     print("hola")
#     print("real", perm_table[0])
#     print("pvals", perm_pvals)
#     print("nperms", nperms)
    
#     print("============")
    # Result
    keys = np.concatenate((test, pval_list, nperm_names))
    vals = np.concatenate((perm_table[0], perm_pvals, nperms))
    vals = np.array(vals, dtype=np.character)
    res = dict(zip(keys,vals))
    
    return res

In [114]:
# Run the permutation test for the gene selected above.  The number of
# permutations and the pseudocount come from `args` rather than being typed
# in again, so this call cannot drift from the configuration that was used
# to build MK.
res = test_by_permutation(gene, MK, args.permutations, test, pval_list, args.pseudocount)
res

{'G': b'0.5070543291398715',
 'G.nperm': b'11.0',
 'G.pval': b'0.8181818181818182',
 'G_Williams': b'0.4445691298803579',
 'G_Williams.nperm': b'11.0',
 'G_Williams.pval': b'0.8181818181818182',
 'G_Yates': b'0.06023802398248529',
 'G_Yates.nperm': b'11.0',
 'G_Yates.pval': b'0.18181818181818182',
 'NI': b'0.2802463897406891',
 'NI.nperm': b'11.0',
 'NI.pval': b'0.8181818181818182',
 'alpha': b'0.47549019607843135',
 'alpha.nperm': b'11.0',
 'alpha.pval': b'0.8181818181818182',
 'hg': b'0.5245098039215687',
 'hg.nperm': b'11.0',
 'hg.pval': b'0.2727272727272727',
 'ratio': b'1.9065420560747663',
 'ratio.nperm': b'11.0',
 'ratio.pval': b'0.8181818181818182'}

In [102]:
# Show the result dict as a one-row table: orient='index' makes one row per
# key, and the transpose turns those keys into columns.
pd.DataFrame.from_dict(res,orient='index').T

Unnamed: 0,NI,ratio,hg,G,G_Yates,G_Williams,alpha,NI.pval,ratio.pval,hg.pval,...,G_Yates.pval,G_Williams.pval,alpha.pval,NI.nperm,ratio.nperm,hg.nperm,G.nperm,G_Yates.nperm,G_Williams.nperm,alpha.nperm
0,0.280246,1.906542,0.52451,0.507054,0.060238,0.444569,0.47549,0.818182,0.818182,0.272727,...,0.181818,0.818182,0.818182,11.0,11.0,11.0,11.0,11.0,11.0,11.0


In [100]:
# Display the raw result dict.
# NOTE(review): the saved output of this cell shows float values and its
# execution count (100) is lower than that of the cell defining
# test_by_permutation (112), so the output likely predates the current
# function definition -- re-run top to bottom to confirm.
res

{'G': 0.50705432913987147,
 'G.nperm': 11.0,
 'G.pval': 0.81818181818181823,
 'G_Williams': 0.44456912988035791,
 'G_Williams.nperm': 11.0,
 'G_Williams.pval': 0.81818181818181823,
 'G_Yates': 0.060238023982485289,
 'G_Yates.nperm': 11.0,
 'G_Yates.pval': 0.18181818181818182,
 'NI': 0.28024638974068911,
 'NI.nperm': 11.0,
 'NI.pval': 0.81818181818181823,
 'alpha': 0.47549019607843135,
 'alpha.nperm': 11.0,
 'alpha.pval': 0.81818181818181823,
 'hg': 0.52450980392156865,
 'hg.nperm': 11.0,
 'hg.pval': 0.27272727272727271,
 'ratio': 1.9065420560747663,
 'ratio.nperm': 11.0,
 'ratio.pval': 0.81818181818181823}

In [111]:
# Check what dtype=np.character produces: per the saved output, each element
# becomes a byte string (b'1', b'2', b'3').
# NOTE(review): np.character is an abstract NumPy type; newer NumPy releases
# appear to deprecate using it as a dtype -- confirm for the installed version.
print(list(np.array([1,2,3],dtype=np.character)))

[b'1', b'2', b'3']
