In [20]:
import time
import numpy as np
import pandas as pd
from multiprocessing import Pool
from optimize_alpha import optimize_alpha
from distance_calculations import distance


def compute_distance(dat, c, Rc, N, an, alpha):
    # pool = Pool()
    dist = distance(dat[:,c], dat, Rc[c], Rc, N, an, alpha)
    return dist

def cluster_by_centroids(dat, d0, N, C, Rc, Rn):

    members = []
    cluster_curr = 0
    cluster_assigned = np.zeros(C) - 1
    dists = []

    s = time.time()
    a     = optimize_alpha(dat, Rc, Rn, N)
    an    = a * Rn / Rn.sum()
    alpha = an.sum()

    print("alpha optimized is:",a, "and took", time.time() - s, "seconds\n")

    s = time.time()
    iterate_ind = np.argsort(Rc)[::-1]
    for c in iterate_ind:
    # for c in np.arange(C):

        if cluster_assigned[c] < 0 :

            dist = compute_distance(dat, c, Rc, N, an, alpha)
            reassign_ind = np.intersect1d(np.nonzero(cluster_assigned < 0)[0], np.nonzero(np.array(dist < d0))[0])
            members.append(reassign_ind)
            cluster_assigned[reassign_ind] = cluster_curr
            dists.append([dist])
            cluster_curr += 1

    # with open("/big/work/metadevol/benchmark_dataset1/initial_clusters_new", 'a') as file:
    #     for f in range(len(members)):
    #         for q in members[f]:
    #             file.write(str(q) + " " + str(f) + "\n")

    print("Obtained {} clusters from initial clustering".format(len(members)))
    print("Initial clustering took:", time.time() - s,"seconds")

    return members, np.array(dists).squeeze()


def count_numbers_of_sharedcontigs(dists, d1, min_shared_contigs):
    s = time.time()
    K, C = np.shape(dists)
    shared = np.zeros([K,K])
    # shared1 = np.zeros([K,K])
    neigh = np.array(dists < d1).astype(int)
    neigh_sum = neigh.sum(axis=0)
    for c in np.arange(C):
        if neigh_sum[c] >= 2:
            non_zeroind = neigh[:,c].nonzero()
            k_ind = np.array(np.meshgrid(non_zeroind,non_zeroind)).T.reshape(-1,2)
            # k_ind = k_ind[k_ind[:,0] != k_ind[:,1]]
            for i in k_ind:
                shared[i[0],i[1]] += 1
    shared = np.array(shared > min_shared_contigs).astype(int)

    # for c in np.arange(C):

    #     neigh = np.nonzero(np.array(dists)[:,c] < d1)[0]
    #     if len(neigh) >= 2:
    #         for k in neigh:
    #             for l in neigh:
    #                 shared[k,l] += 1

    # shared = np.array(shared > min_shared_contigs).astype(int)
    # print(shared1)

    links = []
    for k in np.arange(K):
        links.append(list(np.nonzero(shared[k])))
    print("count_numbers_of_sharedcontigs took ", time.time()-s, "seconds")
    return links


def recursive_set_allneighbors(k, component_curr, components, links):
    components[k] = component_curr
    for l in links[k][0]:

        if components[l] < 0 :

            recursive_set_allneighbors(l, component_curr, components, links)


def find_connected_components(links):
    s = time.time()
    K = np.shape(links)[0]
    components = np.zeros(K).astype(int) - 1
    component_curr = 0

    for k in np.arange(K):

        if components[k] < 0:

            recursive_set_allneighbors(k, component_curr, components, links)
            component_curr += 1

    if (component_curr != len(set(components))):
        raise Exception("problem with component calculations")
        exit()
    print("find_connected_components took ", time.time()-s, "seconds")
    num_components = component_curr
    numclust_incomponents = np.unique(components, return_counts=True)[1]
    print(numclust_incomponents)
    return components, num_components, numclust_incomponents


def merge_members_by_connnected_components(components, num_components, members):
    K = len(components)
    clusters = [[] for i in range(num_components)]

    for k in np.arange(K):
        clusters[components[k]].append(members[k])
 
    # for i in np.arange(num_components):
        # clusters[i] = np.unique(np.concatenate(clusters[i]).ravel())
       
    return clusters


def cluster_by_connecting_centroids(dat, d0, min_shared_contigs):
    s = time.time()
    contig_names = dat.columns
    dat = dat.to_numpy()
    N, C = np.shape(dat)

    Rc    = dat.sum(axis=0)
    Rn    = dat.sum(axis=1)

    members, dists = cluster_by_centroids(dat, d0, N, C, Rc, Rn)
    # d1 = d0 * np.sqrt(N)
    d1 = d0 * 2
    # d1 = d0 
    links = count_numbers_of_sharedcontigs(dists, d1, min_shared_contigs)
    components, num_components, numclust_incomponents = find_connected_components(links)
    print("number of connected components", num_components)
    clusters = merge_members_by_connnected_components(components, num_components, members)

    print("count_by_connecting_centroids took ", time.time()-s, "seconds")
    return clusters, numclust_incomponents

if __name__ == "__main__":
    s = time.time()
    print("clustering initiated"+'\n')
    tmp_dir = "/big/work/metadevol/benchmark_dataset1/"
    # tmp_dir = "/big/work/metadevol/scripts/bamtools_api/build/"
    dat = pd.read_pickle(tmp_dir + 'X_pickle')
    d0 = 1
    min_shared_contigs = 5
    clusters, numclust_incomponents = cluster_by_connecting_centroids(dat, d0, min_shared_contigs)
    print("overall time taken for new clustering is: ", time.time()-s)






clustering initiated

alpha optimized is: 8.445274503124452 and took 0.8554117679595947 seconds

Obtained 37 clusters from initial clustering
Initial clustering took: 2.337231397628784 seconds
count_numbers_of_sharedcontigs took  0.8554587364196777 seconds
find_connected_components took  0.0004763603210449219 seconds
[2 1 2 1 2 1 2 7 1 1 1 1 1 1 1 1 2 1 1 2 3 2]
number of connected components 22
count_by_connecting_centroids took  4.058035612106323 seconds
overall time taken for new clustering is:  4.066399812698364


  result = asarray(a).shape


In [32]:
import time
import numpy as np
import pandas as pd
import tensorflow as tf
from new_clustering_algorithm import cluster_by_connecting_centroids

def initialize_Z(W_t, dat):
    
    W = np.transpose(W_t)
    lmda = 0.1
    inverse_term = np.linalg.inv(np.eye(W.shape[0]) + (lmda ** -1) * np.matmul(W,W_t))
    woodbury = (lmda ** -1) * np.eye(W.shape[1]) - np.matmul((lmda ** -2) * W_t , np.matmul(inverse_term, W))
    print(np.matmul(W_t,dat)[:,11977])
    return np.matmul(woodbury, np.matmul(W_t,dat))


# def maximize_function(w, z, x, qt):
    
#     mean = tf.matmul(w, z)
#     mean = tf.math.maximum(0.1 * qt * x, 1e-20 + tf.nn.relu(mean))
#     W_term = Lw * tf.reduce_sum(tf.abs(w))
#     Z_term = Lz * tf.reduce_sum(tf.abs(z))
#     block_penalty = Lzp * np.sqrt(sum(lc) / len(lc)) * sum(tf.norm(z / np.sqrt(lc), axis = 1))
#     R = - W_term - Z_term - block_penalty
#     negative_log_likelihood = - (tf.reduce_sum(- mean + tf.multiply(x, tf.math.log(mean))) + R)
#     # negative_log_likelihood = - tf.reduce_sum(- mean + tf.multiply(x, tf.math.log(mean))) 
        
#     return negative_log_likelihood


def maximize_regularizedfunction(w, z, x, qt):
    mean = tf.matmul(w, z)
    mean = tf.math.maximum(0.1 * qt * x, 1e-20 + tf.nn.relu(mean))

    """ L1 norm """
    W_term = Lw * tf.reduce_sum(tf.abs(w))
    Rc = tf.reduce_sum(z, axis=0)
    Z_term = Lz * tf.reduce_sum(tf.abs(z)/Rc)

    """ block penalty """
    W_bterm = Lwp * tf.reduce_sum(tf.norm(w, axis=0))
    Z_bterm = Lzp * tf.reduce_sum(tf.norm(z/Rc, axis=1))
    R = - W_term - Z_term - W_bterm - Z_bterm

    negative_log_likelihood = - (tf.reduce_sum(- mean + tf.multiply(x, tf.math.log(mean))) + R)
        
    return negative_log_likelihood


def optimize_wz(W, Z, X, qt, opt):
        
    with tf.GradientTape() as tape:
        loss = maximize_regularizedfunction(W, Z, X, qt)
        LL1.append(loss)
    optimized = opt.minimize(loss, [W, Z], tape = tape)
    
    return optimized


def calc_aic(w, z, x, qt):

    mean = tf.matmul(w, z)
    mean = tf.math.maximum(0.1 * qt * x, 1e-20 + tf.nn.relu(mean))
    log_likelihood = tf.reduce_sum(- mean + tf.multiply(x, tf.math.log(mean)))
    AIC_score = log_likelihood - tf.math.count_nonzero(tf.nn.relu(w), dtype="float64") - tf.math.count_nonzero(tf.nn.relu(z), dtype="float64")
    
    return AIC_score



def nmf_with_adam(W, Z, X, n, revert_flag, convergence_criterion, Lw, Lz, Lzp, AIC_check_value):

        LL1 = []
        W_nzcount = []
        Z_nzcount = []

        opt = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
        X = tf.convert_to_tensor(X)
        X = tf.Variable(X, trainable = False)
        W = tf.Variable(W, trainable = True)
        Z = tf.Variable(Z, trainable = True)
        W_prebest = tf.Variable(W, trainable = False)
        Z_prebest = tf.Variable(Z, trainable = False)

        for i in range(n):
            qt = np.exp(-i/10)
            optimize = optimize_wz(W, Z, X, qt, opt)
            # if len(LL1) >= 150:
            #     if abs((LL1[-1] - LL1[-100])/ LL1[-1]) < convergence_criterion and abs((LL1[-50] - LL1[-150])/ LL1[-50]) < convergence_criterion and revert_flag == 0:
            #         AIC_score = calc_aic(W, Z, X, qt)
            #         if AIC_score > AIC_check_value:
            #             # opt = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
            #             print(AIC_score, "new AIC is higher than the old one", AIC_check_value, i+1)
            #             Lz = 2 * Lz
            #             Lzp = 2 * Lzp
            #             W_prebest.assign(W)
            #             Z_prebest.assign(Z)
            #             AIC_check_value = AIC_score
            #             print(W_prebest[0][0], W[0][0], " is set at",  i+1)
            #             print(Z_prebest[0][0], Z[0][0], " is set at",  i+1)
            #             # revert_flag = 0

            #         else:
            #             W.assign(W_prebest)
            #             print("else condition", i)
            # #             print(W_prebest[0][0], W[0][0], "iteration ", i)
            # #             print(Z_prebest[0][0], Z[0][0], "iteration before correction", i)
            #             # Z.assign(tf.Variable(initialize_Z(tf.transpose(W), X)))
            # #             # print(Z[0][0], "iteration after correction", i)
            # #             # print(initialize_Z(tf.transpose(W), X))
            #             Z.assign(Z_prebest)
            #             Lz = Lz / 2
            #             Lzp = Lzp / 2
            #             revert_flag = 1

            #     if revert_flag == 1:
            #         if abs((LL1[-1] - LL1[-100])/ LL1[-1]) < convergence_criterion * 0.01 and abs((LL1[-50] - LL1[-150])/ LL1[-50]) < convergence_criterion * 0.01:
            #             break

            W.assign(tf.nn.relu(W))
            W.assign(W / tf.reduce_sum(W, axis = 0))
            Z.assign(tf.nn.relu(Z))

            W_nzcount.append(tf.math.count_nonzero(W>0.01))
            Z_nzcount.append(tf.math.count_nonzero(Z>0.01))


            if (i+1 == 50 or i+1 == 100 or i+1 == 500 or i+1 == 1000 or i+1 == 9000 or i+1 == 10000):
            #         print(np.count_nonzero(W>0.01, axis=0), "W matrix at iteration", i)
        #         print(np.count_nonzero(Z>0.01, axis=1), "Z matrix at iteration", i)
        #         np.save("/big/work/metadevol/scripts/bamtools_api/build/count_plot/W_matrix_trial"+ str(i+1), W)
        #         np.save("/big/work/metadevol/scripts/bamtools_api/build/count_plot/Z_matrix_trial"+ str(i+1), Z)

                Rc_c  = np.sum(Z, axis=0)
                pb_c  = Z / Rc_c
                cov_b = np.sum(Z, axis=1) / np.sum((np.array(lc) * Z) / Rc_c, axis=1)
                # np.savetxt(tmp_dir + "count_plot/bin_coverage_trial"+ str(num_iterations),cov_b.reshape(len(cov_b),1), fmt="%f")
                pb_min = 0.8 * (cov_b.reshape(len(cov_b),1) * np.sum(np.square(pb_c), axis=0) \
                                / np.sum(cov_b.reshape(len(cov_b),1) * pb_c, axis=0))
                pb_min[pb_min > 0.5] = 0.5
                contig_assign0 = tf.argmax(pb_c/pb_min, axis=0).numpy()
                print(len(set(contig_assign0)), "new assignment at iteration", i)
                print(len(set(tf.argmax(pb_c, axis=0).numpy())), "old assignment at iteration", i)
                Bz_bc =[]
                for f in range(np.shape(Z)[1]):
                    Bz_bc.append(Z[contig_assign0[f],f])
                bins0 = np.c_[contig_assign0, np.array(Bz_bc).reshape(len(Bz_bc),1), np.array(contig_names)]
        #         bins0 = bins0[np.lexsort((bins0[:,1], bins0[:,0]))]
                print("Number of total bins ", len(set(bins0[:,0])), " from new assignment automatic differentiation \n")
                # np.savetxt(tmp_dir + "count_plot/contig_bins_" + str(i+1) + "trial", bins0, delimiter = ",", fmt=['%d','%f','%d'])

        return W, Z


if __name__ == "__main__":
    s = time.time()
    print("clustering initiated"+'\n')
    tmp_dir = "/big/work/metadevol/benchmark_dataset1/"
    # tmp_dir = "/big/work/metadevol/scripts/bamtools_api/build/"
    dat = pd.read_pickle(tmp_dir + 'X_pickle')
    d0 = 1
    min_shared_contigs = 5
    clusters, numclust_incomponents = cluster_by_connecting_centroids(dat, d0, min_shared_contigs)
    print("overall time taken for new clustering is: ", time.time()-s)

    # print(clusters)
    dat = dat.to_numpy()
    for k in numclust_incomponents:
        connected_component_count = []
        if k == 3 :
            print(clusters[k])
        #     for i in clusters[k]:
        #         dat_s = dat[:, i]
        #         connected_component_count.append(dat_s.sum(axis=1))

        # if connected_component_count:
        #     print(len(np.array(connected_component_count)), k)
        #     print(np.array(connected_component_count))
            # W_t = cluster_members_count/cluster_members_count.sum(axis=1,keepdims=1)
            # Z_matrix = initialize_Z(W_t, dat)
            # print("Initializing W and Z matrices took:",time.time()-s1,'seconds\n')

            # print("Optimizing W and Z matrices"+'\n')

            # s1 = time.time()
            

            # Lw = Lz = 1
            # Lwp = 2 * Lw
            # Lzp = 2 * Lw
            
            # AIC_check_value = 0.0

            # convergence_criterion = 0.001
            
            # revert_flag = 0

            # num_iterations = 1000

            # """ W and Z optimization using SGD with automatic differentiation in Adam """
            # W1, Z1 = nmf_with_adam(np.transpose(W_t), Z_matrix, dat, num_iterations, revert_flag, convergence_criterion, Lw, Lz, Lzp, AIC_check_value)





    

clustering initiated

alpha optimized is: 8.445274503124452 and took 0.8735842704772949 seconds

Obtained 37 clusters from initial clustering
Initial clustering took: 2.3102469444274902 seconds
count_numbers_of_sharedcontigs took  0.7183699607849121 seconds
find_connected_components took  0.00038695335388183594 seconds
[2 1 2 1 2 1 2 7 1 1 1 1 1 1 1 1 2 1 1 2 3 2]
number of connected components 22
count_by_connecting_centroids took  3.911520004272461 seconds
overall time taken for new clustering is:  3.9179892539978027
[array([    9,    22,    24, ..., 45750, 45752, 45759])]


  result = asarray(a).shape


In [10]:
for f, fi in zip(clusters, range(len(clusters))):
    f = map(str,f)
    for q in f:
        with open("/big/work/metadevol/benchmark_dataset1/" + "connected_components_new", 'a') as file:
        # with open("/big/work/metadevol/scripts/bamtools_api/build/" + "connected_components_new", 'a') as file:
            file.write(str(q) + " " + str(fi) + '\n')

In [None]:
for f in xx[xx[:,0] != xx[:,1]]:
    print(f[0], " - ", f[1])

0  -  1
0  -  4
1  -  0
1  -  4
4  -  0
4  -  1


In [None]:
%timeit list(itertools.permutations([0,1,4], 2))
%timeit np.array(np.meshgrid(np.array([0, 1, 4]),np.array([0, 1, 4]))).T.reshape(-1,2)

KeyboardInterrupt: 

In [6]:
import os
os.path.dirname(os.path.abspath("distance_calculations.py"))

'/big/work/metadevol/scripts'