In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
def determine_site_dist(map, depth, freq, info, depth_thres = 1):
    """Classify every site as fixed or polymorphic between sample groups.

    For each site, the samples with sufficient depth are cross-tabulated by
    (allele frequency >= 0.5) versus ``Group``. A 2x2 table whose main
    diagonal or anti-diagonal is entirely zero means each frequency class
    occurs in only one group, so the site is 'fixed'; any other 2x2 table is
    'polymorphic'. When the table is not 2x2 the site cannot be classified.

    Parameters
    ----------
    map : pandas.DataFrame
        Sample table indexed by sample ID, with a ``Group`` column. The name
        shadows the ``map`` builtin; it is kept because callers pass it by
        keyword.
    depth, freq : pandas.DataFrame
        Site-by-sample tables. Row ``i`` of each is used for site ``i`` and
        their columns must contain every label in ``map.index``.
    info : pandas.DataFrame
        Only its number of rows is used: one classification per row.
    depth_thres : int, optional
        Minimum depth a sample needs at a site to be counted.

    Returns
    -------
    list
        One entry per site: 'fixed', 'polymorphic' or ``np.nan``.
    """

    # Sample labels are the same for every site, so look them up once
    samples = map.index

    Dist = []
    for i in range(info.shape[0]):
        # Attach this site's depth and frequency to the sample table
        # (assignment aligns on the sample IDs)
        site = map.copy()
        site['depth'] = depth.loc[depth.index[i], samples]
        site['freq'] = freq.loc[freq.index[i], samples]

        # Remove samples without information for site
        site = site[site.depth >= depth_thres]

        # Rows: freq >= 0.5 (False/True); columns: groups.
        # A plain ndarray is used instead of the discouraged np.matrix.
        counts = pd.crosstab(site.freq >= 0.5, site.Group).values
        if counts.shape == (2, 2):
            main_diag_empty = (counts.diagonal() == 0).all()
            anti_diag_empty = (np.fliplr(counts).diagonal() == 0).all()
            if main_diag_empty or anti_diag_empty:
                mutation_type = 'fixed'
            else:
                mutation_type = 'polymorphic'
        else:
            mutation_type = np.nan

        Dist.append(mutation_type)

    return Dist

In [3]:
def determine_mutation_effect(r):
    """Classify one site as synonymous ('s') or non-synonymous ('n').

    Meant to be used with ``DataFrame.apply(..., axis=1)``. The row ``r``
    must provide the allele counts ``count_a``, ``count_c``, ``count_g`` and
    ``count_t`` plus a comma-separated ``amino_acids`` string whose entries
    are matched positionally to those four counts.

    Only alleles with a positive count are considered. The site is 's' when
    all of them map to the same amino acid, otherwise 'n'.
    """

    allele_columns = ['count_a', 'count_c', 'count_g', 'count_t']
    observed = r.loc[allele_columns] > 0

    # Amino acids of the alleles that were actually seen
    residues = np.array(r.amino_acids.split(sep=','))[observed]

    # Indexing the first entry raises IndexError if no allele was observed
    reference = residues[0]
    return 's' if all(reference == residues) else 'n'

In [4]:
def mktest_fisher_exact(g):
    """Return the two-sided Fisher's exact test p-value for one gene.

    ``g`` must expose the counts ``Dn``, ``Ds``, ``Pn`` and ``Ps``. They are
    arranged as the MK contingency table ``[[Dn, Pn], [Ds, Ps]]``.
    """

    nonsynonymous_row = [g.Dn, g.Pn]
    synonymous_row = [g.Ds, g.Ps]
    contingency = np.array([nonsynonymous_row, synonymous_row])

    # The odds ratio is not needed; a one-sided test would use
    # alternative='greater' instead.
    _, p_value = stats.fisher_exact(contingency, alternative='two-sided')

    return p_value

In [5]:
# Input files: the sample map and the per-species SNP tables (depth, site
# info, allele frequencies).
# NOTE(review): these are absolute, machine-specific paths; presumably MIDAS
# output judging by the directory names - adjust before running elsewhere.
map_file = "/home/sur/micropopgen/data/test_data/midas/map.txt"
depth_file = "/home/sur/micropopgen/data/test_data/midas/merged.snps/Streptococcus_sp_60086/snps_depth.txt"
info_file = "/home/sur/micropopgen/data/test_data/midas/merged.snps/Streptococcus_sp_60086/snps_info.txt"
freqs_file = "/home/sur/micropopgen/data/test_data/midas/merged.snps/Streptococcus_sp_60086/snps_freq.txt"
# Values of the map's Group column to keep
groups = ['Buccal mucosa', 'Tongue dorsum']
# Minimum depth for a sample to be counted at a site
depth_thres = 1
# Minimum mean depth across sites for a sample to be kept
cov_thres = 3

In [6]:
# Load the sample map, index it by sample ID (the ID column is kept), and
# retain only the samples whose Group is listed in `groups`.
# NOTE: `map` shadows the Python builtin; the name is kept because the
# following cells refer to it.
map = pd.read_csv(map_file, sep="\t").set_index('ID', drop=False)
map = map[map.Group.isin(groups)].copy()

In [None]:
# Read data
info = pd.read_csv(info_file, sep="\t")
depth = pd.read_csv(depth_file, sep="\t")
freq = pd.read_csv(freqs_file, sep="\t")

# Remove non gene sites. The same row mask is applied to all three tables,
# which relies on them listing the same sites in the same row order.
# `info` is copied because columns are assigned to it below; assigning to
# the filtered slice directly can trigger pandas' SettingWithCopyWarning.
ii = ~info.gene_id.isnull()
info = info.loc[ii, :].copy()
depth = depth.loc[ii, :]
freq = freq.loc[ii, :]

# Remove site_id columns so that only sample columns remain
depth = depth.drop(axis=1, labels='site_id')
freq = freq.drop(axis=1, labels='site_id')

# Get list of genes
gene_ids = info.gene_id.unique()

# subset for tests
# info = info.head(5000)
# depth = depth.head(5000)
# freq = freq.head(5000)

# Determine effect of sites (this is constant and independent of samples)
info['Effect'] = info.apply(determine_mutation_effect, axis=1)

In [None]:
# Check that sample names match between freq and depth.
# Index.equals also handles a differing number of columns; an element-wise
# `==` would fail with an unrelated length error before reaching the
# message below.
if not freq.columns.equals(depth.columns):
    raise ValueError("Columns don't match between freq and depth files")

# Remove samples from other groups
ci = depth.columns.isin(map.ID)
depth = depth.loc[:, ci]
freq = freq.loc[:, ci]

# Reorder map so its rows follow the sample order of the depth columns
map = map.loc[depth.columns,:]

# Calculate coverage in sites (mean depth per sample across all sites)
map['coverage'] = depth.mean(axis=0)

# Remove samples below coverage
ci = map.coverage >= cov_thres
map = map.loc[ci, :]
depth = depth[map.index]
freq = freq[map.index]

if not map.index.equals(depth.columns):
    raise ValueError("Samples in map and depth don't match")

# Determine type of mutation
info['Type'] = determine_site_dist(map=map, depth=depth, freq=freq, info=info, depth_thres=depth_thres)
info.head()

In [None]:
# Calculate MK contingency table per gene.
# Rows are collected in a list and the frame is built once:
# Series.append / DataFrame.append no longer exist in pandas >= 2.0, and
# growing a frame row by row is quadratic.
# The count columns therefore come out with an integer dtype.
mk_records = []
# sort=False keeps genes in order of first appearance, like .unique().
# groupby skips missing gene_id values; those were removed when loading.
for g, dat in info.groupby('gene_id', sort=False):
    tab = pd.crosstab(dat.Effect, dat.Type, rownames=['Effect'], colnames=['Type'])
    # Guarantee a full 2x2 table even if a category is absent for this gene
    tab = tab.reindex(index=pd.Index(['n','s']), columns=pd.Index(['fixed', 'polymorphic']), fill_value=0)
    mk_records.append({
        'Gene': g,
        'Dn': tab.loc['n', 'fixed'],
        'Ds': tab.loc['s', 'fixed'],
        'Pn': tab.loc['n', 'polymorphic'],
        'Ps': tab.loc['s', 'polymorphic'],
    })
Genes = pd.DataFrame(mk_records, columns=['Gene', 'Dn', 'Ds', 'Pn', 'Ps'])
Genes.head()

In [None]:
# Calculate ratio: (Dn * Ps) / (Ds * Pn).
# A zero denominator yields inf or nan, so divide/invalid warnings are
# silenced for this computation only. np.errstate restores the previous
# NumPy error settings on exit instead of leaving them globally on 'raise'.
with np.errstate(divide='ignore', invalid='ignore'):
    Genes['ratio'] = pd.to_numeric(Genes.Dn * Genes.Ps) / pd.to_numeric(Genes.Ds * Genes.Pn)

# Treat infinite ratios as undefined
Genes = Genes.replace(np.inf, np.nan)

# Fisher's exact test p-value for each gene's MK table
Genes['hg.pval'] = Genes.apply(mktest_fisher_exact, axis=1)
Genes.head()

In [None]:
# Distribution of Fisher's exact test p-values, restricted to genes whose
# ratio is defined. Labels are added so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(Genes['hg.pval'][~pd.isna(Genes.ratio)], bins=20)
ax.set(
    xlabel="Fisher's exact test p-value",
    ylabel="Number of genes",
    title="MK test p-values (genes with a defined ratio)",
);

In [None]:
# Genes.loc[~pd.isna(Genes.ratio), :]

In [None]:
# Genes ordered from the smallest to the largest Fisher's exact p-value
Genes.sort_values(by='hg.pval')

In [None]:
%timeit np.matrix(site_crosstab)[:,::-1].diagonal()
%timeit np.fliplr(np.matrix(site_crosstab)).diagonal()