In [1]:
import pandas as pd
import numpy as np

In [None]:
def determine_site_dist(map, depth, freq, info, depth_thres = 1):
    """For all sites, determine if they are fixed or polymorphic"""

    Dist = []
    for i in range(info.shape[0]):
        # Add samples IDs as map header and match samples
        # in map with samples in depth

        # Create site data frame
        site = map.copy()
        site['depth'] = depth.loc[ depth.index[i], map.index ]
        site['freq'] = freq.loc[ freq.index[i], map.index ]
        # print(site.head())

        # Remove samples without information for site
        site = site[site.depth >= depth_thres]
        # print(site.shape)
        # print(site.head())
        # site


        # Determine if it is polymorphic or fixed
        site_crosstab = pd.crosstab(site.freq >= 0.5, site.Group)
        # site_crosstab

        if site_crosstab.shape == (2,2):
            if (np.matrix(site_crosstab).diagonal() == [0,0]).all() or (np.fliplr(np.matrix(site_crosstab)).diagonal() == [0, 0]).all():
                mutation_type = 'fixed'
            else:
                mutation_type = 'polymorphic'
        else:
            mutation_type = np.nan

        Dist.append(mutation_type)
    
    return(Dist)

In [174]:
map_file = "/home/sur/micropopgen/data/test_data/midas/map.txt"
depth_file = "/home/sur/micropopgen/data/test_data/midas/merged.snps/Streptococcus_sp_60086/snps_depth.txt"
info_file = "/home/sur/micropopgen/data/test_data/midas/merged.snps/Streptococcus_sp_60086/snps_info.txt"
freqs_file = "/home/sur/micropopgen/data/test_data/midas/merged.snps/Streptococcus_sp_60086/snps_freq.txt"
groups = ['Buccal mucosa', 'Tongue dorsum']
depth_thres = 1
cov_thres = 3

In [None]:
map = pd.read_csv(map_file, sep="\t")
map.head()

In [None]:
info = pd.read_csv(info_file, sep="\t")
info.head()

In [None]:
depth = pd.read_csv(depth_file, sep="\t")
depth.head()

In [None]:
freq = pd.read_csv(freqs_file, sep="\t")
freq.head()

In [None]:
freq.shape

In [None]:
depth.shape

In [None]:
info.shape

In [None]:
groups

In [None]:
type(map.Group)

In [247]:
map = pd.read_csv(map_file, sep="\t")
map.index = map.ID
map = map.loc[map.Group.isin(groups),:].copy()

In [248]:
# Read data
info = pd.read_csv(info_file, sep="\t")
depth = pd.read_csv(depth_file, sep="\t")
freq = pd.read_csv(freqs_file, sep="\t")

# Remove non gene sites
ii = ~info.gene_id.isnull()
info = info.loc[ii, :]
depth = depth.loc[ii, :]
freq = freq.loc[ii, :]

# Remove site_id columns
depth = depth.drop(axis=1, labels='site_id')
freq = freq.drop(axis=1, labels='site_id')

# Get list of genes
gene_ids = info.gene_id.unique()

# subset for tests
info = info.head(100)
depth = depth.head(100)
freq = freq.head(100)

In [249]:
# Check that sample names match between freq and depth
if not all(freq.columns == depth.columns):
    raise ValueError("Columns don't match between freq and depth files")

    
# Remove samples from other groups
ci = depth.columns.isin(map.ID)
depth = depth.loc[:, ci]
freq = freq.loc[:, ci]

# Reorder map
map = map.loc[depth.columns,:]
# print(map.head())
# print(map.shape)

# Calculate coverage in sites
map['coverage'] = depth.mean(axis=0)
# print(map.head())
# print(map.shape)

# Remove samples below coverage
ci = map.coverage >= cov_thres
map = map.loc[ci, :]
depth = depth[map.index]
freq = freq[map.index]

if not all(map.index == depth.columns[1:]):
    raise ValueError("Samples in map and depth don't match")

In [301]:


print(Dist)
info['Type'] = Dist
info.head()

[nan, nan, nan, 'polymorphic', nan, 'polymorphic', 'polymorphic', 'polymorphic', 'polymorphic', 'polymorphic', 'polymorphic', nan, 'polymorphic', nan, 'polymorphic', 'fixed', 'polymorphic', nan, 'polymorphic', 'polymorphic', 'fixed', 'fixed', nan, 'polymorphic', 'polymorphic', 'fixed', 'fixed', 'polymorphic', 'polymorphic', 'fixed', 'polymorphic', 'polymorphic', 'polymorphic', 'polymorphic', nan, 'polymorphic', nan, 'polymorphic', nan, nan, nan, nan, nan, nan, nan, nan, 'fixed', 'fixed', 'fixed', 'fixed', 'fixed', nan, 'polymorphic', 'fixed', nan, 'fixed', nan, 'polymorphic', 'fixed', 'fixed', nan, nan, nan, nan, nan, nan, nan, nan, nan, 'polymorphic', nan, 'polymorphic', 'polymorphic', nan, nan, nan, 'fixed', 'fixed', 'polymorphic', nan, nan, 'polymorphic', nan, 'polymorphic', 'polymorphic', 'polymorphic', 'polymorphic', 'polymorphic', 'fixed', nan, 'polymorphic', nan, nan, nan, 'fixed', 'fixed', 'polymorphic', 'fixed', 'polymorphic', nan]


Unnamed: 0,site_id,ref_id,ref_pos,ref_allele,major_allele,minor_allele,count_samples,count_a,count_c,count_g,count_t,locus_type,gene_id,snp_type,site_type,amino_acids,Type
12,419,CM002128,419,A,A,T,42,440,0,0,17,CDS,1316408.3.peg.1,bi,3D,"I,M,I,I",
13,425,CM002128,425,G,G,T,42,1,0,428,9,CDS,1316408.3.peg.1,bi,3D,"I,M,I,I",
14,437,CM002128,437,A,A,T,41,353,0,0,17,CDS,1316408.3.peg.1,bi,4D,"S,S,S,S",
15,443,CM002128,443,A,A,G,38,275,0,37,0,CDS,1316408.3.peg.1,bi,4D,"P,P,P,P",polymorphic
16,453,CM002128,453,T,T,C,32,0,6,0,259,CDS,1316408.3.peg.1,bi,1D,"I,S,T,N",


[nan,
 nan,
 nan,
 'polymorphic',
 nan,
 'polymorphic',
 'polymorphic',
 'polymorphic',
 'polymorphic',
 'polymorphic',
 'polymorphic',
 nan,
 'polymorphic',
 nan,
 'polymorphic',
 'fixed',
 'polymorphic',
 nan,
 'polymorphic',
 'polymorphic',
 'fixed',
 'fixed',
 nan,
 'polymorphic',
 'polymorphic',
 'fixed',
 'fixed',
 'polymorphic',
 'polymorphic',
 'fixed',
 'polymorphic',
 'polymorphic',
 'polymorphic',
 'polymorphic',
 nan,
 'polymorphic',
 nan,
 'polymorphic',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'fixed',
 'fixed',
 'fixed',
 'fixed',
 'fixed',
 nan,
 'polymorphic',
 'fixed',
 nan,
 'fixed',
 nan,
 'polymorphic',
 'fixed',
 'fixed',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'polymorphic',
 nan,
 'polymorphic',
 'polymorphic',
 nan,
 nan,
 nan,
 'fixed',
 'fixed',
 'polymorphic',
 nan,
 nan,
 'polymorphic',
 nan,
 'polymorphic',
 'polymorphic',
 'polymorphic',
 'polymorphic',
 'polymorphic',
 'fixed',
 nan,
 'polymorphic',
 nan,
 nan,
 nan,
 'fixed',
 'fi

TypeError: Required argument 'object' (pos 1) not found

In [137]:
np.matrix(site_crosstab)

matrix([[ 1, 39],
        [ 0,  2]])

In [135]:
np.matrix(site_crosstab).diagonal(axis1=0, axis2=1)

matrix([[1, 2]])

In [136]:
np.matrix(site_crosstab).diagonal(axis1=1, axis2=0)

matrix([[1, 2]])

In [149]:
%timeit np.matrix(site_crosstab)[:,::-1].diagonal()

29.9 µs ± 1.58 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [150]:
%timeit np.fliplr(np.matrix(site_crosstab)).diagonal()

30.7 µs ± 404 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


12