In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
import time
import numpy as np
import pandas as pd
import optimize_parameters as opt
import distance_calculations as dist
from multiprocessing.pool import Pool
from scipy import special
from contigs_clustering import cluster_by_connecting_centroids

def optimize_prior_fortrimers(*argv):
    """Run ``opt.obtain_optimized_alphas`` over paired inputs in a process pool.

    Only the first two positional arguments are read. They are iterated in
    lockstep, and each resulting pair is handed to
    ``opt.obtain_optimized_alphas`` as its two positional arguments.

    Returns
    -------
    tuple
        ``(alpha_values, awa_values)``: ``awa_values`` is the array built from
        the per-pair results, and ``alpha_values`` is its sum along axis 1.
    """
    first_inputs, second_inputs = argv[0], argv[1]
    work_items = list(zip(first_inputs, second_inputs))

    with Pool() as workers:
        optimized = workers.starmap(opt.obtain_optimized_alphas, work_items)

    awa_values = np.array(optimized)
    return awa_values.sum(axis=1), awa_values



if __name__ == "__main__":
    # Driver cell: loads read counts and k-mer counts from tmp_dir, fits the
    # gamma and Dirichlet priors through the `opt` helpers, then runs
    # cluster_by_connecting_centroids.

    tmp_dir = "/big/work/metadevol/benchmark_dataset1/"
    s = time.time()  # overall start time (not reported anywhere below)

    # clustering parameters (default)
    d0 = 1
    min_shared_contigs = 5

    # load contig read counts
    # NOTE: read_pickle can execute arbitrary code; only load trusted files.
    read_counts = pd.read_pickle(tmp_dir + 'X_pickle')
    contig_length = pd.read_csv(tmp_dir + 'contigs_umap_hoverdata1', header=None, usecols=[3], sep=' ')
    contig_length = np.column_stack(contig_length.to_numpy())
    read_counts = read_counts.to_numpy()
    n_size, total_contigs = np.shape(read_counts)
    Rc_reads = read_counts.sum(axis=0)
    Rn_reads = read_counts.sum(axis=1)

    ss = time.time()
    gamma_shape, gamma_scale = opt.optimize_gammaparameters(Rc_reads, contig_length, total_contigs)
    print('obtained gamma parameters in', time.time()-ss,'seconds')

    ss = time.time()
    dirichlet_prior = opt.optimize_alpha(read_counts, Rc_reads, Rn_reads, n_size)
    print('obtained alpha parameter in', time.time()-ss,'seconds')
    # Scale the prior by each entry's share of Rn_reads.
    dirichlet_prior_persamples = dirichlet_prior * Rn_reads / Rn_reads.sum()

    # load kmer counts
    dat = pd.read_csv(tmp_dir + "kmer_counts", header=None)
    dat = dat.to_numpy()

    # 256 counts per row; the row count is inferred from the data instead of
    # being hard-coded, so the cell also works for other datasets.
    tetramer_counts = dat.reshape(-1, 256)
    # Halve the counts and put the 256 count columns on axis 0.
    tetramer_counts = (tetramer_counts / 2).transpose()
    # Row totals, split into 64 consecutive groups of 4.
    trimercountsper_nt = np.split(tetramer_counts.sum(axis=1), 64)

    # The matching 64 sub-matrices of 4 rows each.
    trimer_submatrices = np.vsplit(tetramer_counts, 64)
    ss = time.time()
    awa_values = optimize_prior_fortrimers(trimer_submatrices, trimercountsper_nt)
    dirichlet_prior_perkmers = awa_values[0]
    dirichlet_prior_kmers = awa_values[1]

    # NOTE(review): `cluster_parameters` is not defined in this cell. Its
    # construction is commented out below, and `kmer_counts` / `Rc_kmers` are
    # not defined here either, so the call relies on state left in the kernel
    # and raises NameError on a fresh run. Restore once those are available.
#     cluster_parameters = list([read_counts, Rc_reads, contig_length, n_size, total_contigs, \
#                           dirichlet_prior, dirichlet_prior_persamples, gamma_shape, gamma_scale, \
#                           kmer_counts, Rc_kmers, dirichlet_prior_kmers, dirichlet_prior_perkmers, \
#                           d0, min_shared_contigs])

    clusters, numclust_incomponents = cluster_by_connecting_centroids(cluster_parameters)

    # Elapsed time since `ss` was last reset, i.e. k-mer prior optimisation
    # plus clustering.
    print(time.time()-ss,'seconds')

obtained gamma parameters in 0.2241685390472412 seconds
obtained alpha parameter in 0.8197908401489258 seconds
computing cluster_by_connecting_centroids
entering iterative distance_calculation
[ 491.22998 3918.417    240.9895  ... 3496.1682   344.35776  888.63275]
[ 858.8697 7608.098   403.5993 ... 6172.4277  709.7136 1600.7379]
[ 647.2992  5933.977    336.65137 ... 4798.2153   532.217   1118.81   ]
[ 525.0357   4164.3125    286.18814  ... 1575.8514    374.83936
   92.499725]
[ 423.58533 3596.9124   338.05542 ... 3030.2788   335.89124 1054.855  ]
[ 547.4194  4342.551    156.46701 ... 3631.0842   454.8168  1318.0332 ]
[ 511.21185 3310.192    245.61172 ... 2749.907    372.14484  900.7191 ]
[ 244.54356  205.45839  180.72606 ... 2440.8936   107.05566  980.8401 ]
[ 577.5475  2915.4841   286.33624 ... 2668.9038   500.8729  1027.8271 ]
[ 582.26776 3360.1782   307.47028 ... 2084.235    433.80658 1118.2493 ]
[ 607.9224  2848.871    323.89413 ... 2316.2202   455.11343  676.459  ]
[ 522.8489   29

In [None]:
# Build a C-contiguous array from the tuple. The module is imported as `np`,
# and the NumPy function is `ascontiguousarray` (there is no `ascontiguous`).
a = np.ascontiguousarray((1024, 1024, 5))

In [15]:
# Display the per-k-mer Dirichlet prior (assigned from awa_values[0] in the clustering cell).
dirichlet_prior_perkmers

array([[1.73455597, 1.72745033, 1.73247135, 1.80395531],
       [1.76207882, 1.68478761, 1.76814774, 1.78563456],
       [1.86340203, 1.75492729, 1.73656529, 1.76823071],
       [1.74253835, 1.69898486, 1.69893153, 1.78389677],
       [1.82215512, 1.7466299 , 1.7188554 , 1.77000003],
       [1.65710051, 1.87751495, 1.6718135 , 1.84420038],
       [1.81392846, 1.80268165, 1.77321941, 1.80569182],
       [1.78031858, 1.69784611, 1.74342559, 1.7828012 ],
       [1.77974384, 1.76835523, 1.75043537, 1.70845876],
       [1.82139175, 1.75517995, 1.71479821, 1.84436176],
       [1.75049738, 1.75293616, 1.79409628, 1.7625124 ],
       [1.69382886, 1.78677495, 1.71560862, 1.86101218],
       [1.74222551, 1.68182518, 1.71707094, 1.77414329],
       [1.82752458, 1.74490762, 1.74557745, 1.75073337],
       [1.78405268, 1.76583534, 1.6786469 , 1.78579492],
       [1.75835104, 1.79854267, 1.72540146, 1.71582605],
       [1.67165822, 1.7757106 , 1.64759976, 1.83863696],
       [1.7700738 , 1.80980622,

In [63]:
# Shape of the transposed per-sample Dirichlet prior.
np.shape(dirichlet_prior_persamples.T)

(1, 20)

In [62]:
# Load the saved 1D k-mer count array, view it as a 2D array of shape
# (total_contigs, 256) -- one row per contig, one column per 4-mer -- and
# halve every count.
kmer_counts = np.load(tmp_dir + "kmer_counts.npy").reshape(total_contigs, 256) / 2

In [68]:
# Shape of the first row of kmer_counts.
np.shape(kmer_counts[0])

(256,)

In [17]:
import os
import scipy.special as sc
import timeit
# os.cpu_count() returns None when the count cannot be determined; fall back
# to 1 so `cpus` is always a usable chunk/worker count.
cpus = os.cpu_count() or 1

In [39]:
tp = ThreadPool(cpus)

%timeit np.concatenate(tp.map(sc.gammaln, np.array_split((Rc_reads[0] + Rc_reads + dirichlet_prior),cpus)))

2.08 ms ± 122 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [77]:
# Evaluate gammaln on Rc_reads[0] + Rc_reads + dirichlet_prior in `cpus` chunks
# through the thread pool, then join the chunk results into one array.
np.concatenate(tp.map(sc.gammaln, np.array_split((Rc_reads[0] + Rc_reads + dirichlet_prior),cpus)))

array([ 2564.4305539 , 22673.78950047,  1627.87859116, ...,
       12327.80807743,  2321.14144908,  3547.64576089])

In [19]:
# Display Rc_reads as a C-contiguous array.
np.ascontiguousarray(Rc_reads)

array([ 242.5 , 2955.25,   87.  , ..., 1634.  ,  203.  ,  397.5 ])

In [101]:
# Try simdefy's log_gamma on xx.
import simdefy

simdefy.init()


# As recorded in this cell's output, this call raised a ValueError asking for
# a dense float32 numpy array.
simdefy.log_gamma(xx)

ValueError: Arguemnt has to be a dense float32 numpy array

In [121]:
# simdefy.log_gamma asks for a dense float32 array. Cast AFTER adding the
# prior: adding a float64 prior to a float32 array promotes the sum back to
# float64. ascontiguousarray also guarantees a dense C-ordered layout.
xx = np.ascontiguousarray(Rc_kmers + dirichlet_prior_kmers, dtype=np.float32)

In [123]:
# Indices of entries of xx below exp(-5); empty index arrays mean there are none.
np.nonzero(xx<np.exp(-5))

(array([], dtype=int64), array([], dtype=int64))

In [74]:
# Memory-layout flags of the first column of Rc_kmers.
Rc_kmers[:,0].flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [128]:
# First four columns of kmer_counts, all rows.
kmer_counts[:,0:4]

array([[ 0.   , 10.875, 13.75 ,  6.875],
       [ 1.375,  6.625,  0.75 ,  4.25 ],
       [38.125,  0.625,  4.625,  1.375],
       ...,
       [ 8.25 , 13.375,  9.875,  9.5  ],
       [ 2.   , 18.375,  2.625,  2.25 ],
       [ 2.125,  3.25 ,  3.375,  1.375]])

In [13]:
# Split the first row of kmer_counts into 4 equal parts and join them back together.
np.concatenate(np.split(kmer_counts[0],4))

array([  0.   ,  10.875,  13.75 ,   6.875,  10.   ,   7.875,  11.125,
         0.   ,  12.625,   3.375,   0.5  ,   0.   ,   5.25 ,   2.375,
         8.875,   3.875,   0.875,   0.   ,   0.5  ,   5.5  ,   2.   ,
         0.25 ,   0.75 ,   0.5  ,   0.25 ,   1.375,   5.375,   0.   ,
         0.875,   0.   ,   1.125,   0.75 ,   1.375,   1.375,   0.   ,
        11.375,   6.375,  18.25 ,   0.125,   0.   ,   0.   ,   3.625,
         0.   ,   5.   ,   0.25 ,   4.875,   2.   ,  12.625,  16.25 ,
         1.375, 121.   ,   0.875,   0.875,   3.375,   0.   ,   0.375,
         0.5  ,   0.125,   1.75 ,   1.125,   2.625,   0.75 ,  15.875,
         8.375,   6.25 ,   8.5  ,  11.5  ,   8.75 ,   5.5  ,   3.375,
         5.125,   9.5  ,   6.875,   9.25 ,   5.625,  14.375,   1.875,
         0.   ,   6.   ,   0.   ,   0.25 ,   0.375,   0.625,   0.5  ,
         0.   ,   1.125,   0.   ,   0.625,   0.375,   0.75 ,  11.5  ,
         0.   ,   3.   ,   2.375,  34.625,   0.   ,   7.875,   0.125,
         0.375,   1.

In [17]:
# First row of kmer_counts.
kmer_counts[0]

array([  0.   ,  10.875,  13.75 ,   6.875,  10.   ,   7.875,  11.125,
         0.   ,  12.625,   3.375,   0.5  ,   0.   ,   5.25 ,   2.375,
         8.875,   3.875,   0.875,   0.   ,   0.5  ,   5.5  ,   2.   ,
         0.25 ,   0.75 ,   0.5  ,   0.25 ,   1.375,   5.375,   0.   ,
         0.875,   0.   ,   1.125,   0.75 ,   1.375,   1.375,   0.   ,
        11.375,   6.375,  18.25 ,   0.125,   0.   ,   0.   ,   3.625,
         0.   ,   5.   ,   0.25 ,   4.875,   2.   ,  12.625,  16.25 ,
         1.375, 121.   ,   0.875,   0.875,   3.375,   0.   ,   0.375,
         0.5  ,   0.125,   1.75 ,   1.125,   2.625,   0.75 ,  15.875,
         8.375,   6.25 ,   8.5  ,  11.5  ,   8.75 ,   5.5  ,   3.375,
         5.125,   9.5  ,   6.875,   9.25 ,   5.625,  14.375,   1.875,
         0.   ,   6.   ,   0.   ,   0.25 ,   0.375,   0.625,   0.5  ,
         0.   ,   1.125,   0.   ,   0.625,   0.375,   0.75 ,  11.5  ,
         0.   ,   3.   ,   2.375,  34.625,   0.   ,   7.875,   0.125,
         0.375,   1.