In [3]:
import pandas as pd
import numpy as np
import bin_assignments as assign
import nmf_connected_components as nmf

def multiplicative_updates(W, Z, X, n, method):
    
    lw = 0.0
    lz = 0.8
    beta = 0.5
    convergence_criteria = np.exp(-15)
    epsilon_reg = 1e-05
    loss_values = []

    if method == 0:
        
        for i in range(n):
            
            mean = np.matmul(Z.T, W)
            mean = 1e-10 + nmf.np_relu(mean)
            X_by_mean = X / mean
            
            W = np.multiply(W, nmf.np_relu(np.matmul(Z, X_by_mean)))
            W = W / np.array([W.sum(axis = 1)]).T
            Z = np.multiply(Z, nmf.np_relu(np.matmul(W, X_by_mean.T)))

            loss_values.append(nmf.maximize_function(W, Z, X))
            
            if len(loss_values) >= 10 :
                
                if ((loss_values[i] - loss_values[i-10]) / loss_values[i]) < convergence_criteria:
                    AIC = nmf.calc_aic(W, Z, X)
                    # print("Function is converged")
                    break
                else:
                    raise RuntimeError(f'function not converged') 
        print(AIC, "within method mul update")
        return  Z, AIC
    else:
        raise Exception("")

def assign_shortcontigs(working_dir, sel_inds, Rc_reads, contigs, bins_):
    contig_names = contigs[:,1]
    contig_length = contigs[:,2].astype(int)
    fractional_counts = pd.read_csv(working_dir + "total_readcount", header=None,sep=' ', engine="pyarrow")
    read_counts = fractional_counts.pivot_table(index = 1, columns = 0, values = 2)
    del(fractional_counts)

    read_counts = read_counts.to_numpy().T
    read_counts_sel =  read_counts[sel_inds]

    read_counts_n = np.delete(read_counts, sel_inds, axis=0)
    del(read_counts)

    bins_ = bins_.T

    print(bins_, np.shape(bins_), np.shape(Rc_reads))
    Rc_reads_bins = Rc_reads[bins_[:,0]]
    print(np.shape(Rc_reads_bins), np.min(Rc_reads_bins))
    
    if np.min(Rc_reads_bins) != 0.0:
        
        bins_withRc = np.column_stack((bins_, Rc_reads_bins.T)).astype(int)
        bins_withRc = pd.DataFrame(bins_withRc)
        bins_withRc.columns = ['ind','bin','Rc']
        bin_selectedinds = bins_withRc.loc[bins_withRc.groupby('bin').Rc.idxmax()]['ind']

        W_bins = read_counts_sel[bin_selectedinds] / read_counts_sel[bin_selectedinds].sum(axis=1, keepdims=True)
        
        Z = nmf.initialize_Z(W_bins, read_counts_n)

        contig_length_n =np.delete(contig_length, sel_inds)
        contig_names_n = np.delete(contig_names, sel_inds)
        contig_names = contig_names[sel_inds]

        split_count = 3
        Z_parts = np.array_split(Z, split_count, axis=1)
        read_counts_npart = np.array_split(read_counts_n, split_count, axis=0)
        Z_optimized = []
        AIC_values = []
        
        for f in range(split_count):
            Z_opt, AIC = multiplicative_updates(W_bins, Z_parts[f], read_counts_npart[f], 1000, 0)
            Z_optimized.append(Z_opt)
            AIC_values.append(AIC)

        print(np.shape(np.concatenate(Z_optimized, axis=1)), len(Z_optimized))
        Z_optimized = np.concatenate(Z_optimized, axis=1)
        # print(np.shape(Z_optimized))
        # bin_assign = assign.assignment(Z_optimized, contig_length_n, contig_names_n, 0, short=True)
        # initial_bins = np.stack((contig_names[bins_[0]], bins_[1]))
        # print(initial_bins)
        # total_bins = np.vstack([initial_bins, bin_assign])
        # print(total_bins)
        return Z_optimized, contig_length_n, contig_names_n

    else:
        
        raise RuntimeError("some contigs may have zero total count. Filter them before processing")

In [1]:
import numpy as np
import pandas as pd

working_dir = '/big/work/metadevol/cami2_datasets/marine/pooled_assembly/all_alignment/tmp/'

bins_ = np.loadtxt(working_dir + 'bin_assignments_density_inds', dtype=int)

contigs = pd.read_csv(working_dir + 'selected_contigs', header=None, sep=' ').to_numpy()

fractional_counts = pd.read_csv(working_dir + "total_readcount", header=None,sep=' ')
read_counts = fractional_counts.pivot_table(index = 1, columns = 0, values = 2)
del(fractional_counts)
contig_length = contigs[:,2].astype(int)
read_counts = read_counts.to_numpy().T
total_contigs_source = read_counts.shape[0]
print(total_contigs_source, "shape 0")

long_contigs = np.nonzero(contig_length>=2500)[0]
read_counts = read_counts[long_contigs]

""" process high read counts """
Rc_reads = read_counts.sum(axis=1)

print(np.shape(Rc_reads), len(long_contigs), np.shape(read_counts))

707974 shape 0
(122715,) 122715 (122715, 10)


In [4]:
Z_opt, contig_length_n, contig_names_n = assign_shortcontigs(working_dir, long_contigs, Rc_reads, contigs, bins_)

[[ 57841      1]
 [ 93491      1]
 [ 33135      2]
 ...
 [121930   3890]
 [122008   3891]
 [122360   3892]] (122715, 2) (122715,)
(122715,) 5.48333
-441761135.76450735 within method mul update
-442672140.1069196 within method mul update
(3892, 585259) 2


In [5]:
np.shape(Z_opt)

(3892, 585259)

In [47]:
fractional_counts = pd.read_csv(working_dir + "total_readcount", header=None,sep=' ')
read_counts = fractional_counts.pivot_table(index = 1, columns = 0, values = 2)
read_counts = read_counts.to_numpy().T

np.min(read_counts.sum(axis=1))

0.5

In [7]:
import numpy as np

def get_binindex(bins):
    
    sequential_index = dict(np.vstack([np.unique(bins),np.arange(len(np.unique(bins)))]).T)
    bins = [sequential_index[x] for x in bins]

    return np.array(bins)



def assignment(Z_bc, contig_length, mode):
    
    Rc  = np.sum(Z_bc, axis=0)
    print("here")
    if mode == 0:

        weights = Z_bc ** 5 / Rc ** 4
        cov_b1 = np.sum(weights  * (Z_bc / contig_length) , axis = 1)
        cov_b2 = weights.sum(axis=1)
        cov_b = cov_b1 / cov_b2
        pi_bc = (Z_bc / cov_b[:,None]) / (Z_bc / cov_b[:,None]).sum(axis = 0)
        pi_bc[pi_bc < 0.5] = 0.0
        poorprob_inds = np.nonzero(pi_bc.sum(axis=0)==0)[0]
        print(len(poorprob_inds), np.shape(pi_bc), flush=True)
        pi_bc = np.delete(pi_bc, poorprob_inds, axis=1)
        print(np.shape(pi_bc))
        bins = np.argmax(pi_bc, axis=0)
        bins = get_binindex(bins)
        # bins_m = np.nonzero(pi_bc >= np.max(pi_bc, axis=0) * 0.6)
    
    else:

        weights = Z_bc / Rc
        selected_inds = np.nonzero(weights>0.95)[1]
        print(len(selected_inds), "length of selected_inds")
        
        if selected_inds.size != 0:

            cov_b1 = (Z_bc[:,selected_inds] / contig_length[selected_inds]).sum(axis=1)
            cov_b2 = Z_bc[:,selected_inds].sum(axis=1)
        
        else:

            cov_b1 = (Z_bc ** 2 / contig_length).sum(axis=1)
            cov_b2 = Z_bc.sum(axis=1)

        cov_b = cov_b1 / cov_b2
        pi_bc = (Z_bc / cov_b[:,None]) / (Z_bc / cov_b[:,None]).sum(axis = 0)

        print(np.max(pi_bc),np.max(pi_bc, axis=0))
        pi_bc1 = pi_bc
        pi_bc1[pi_bc1 < 0.7] = 0.0
        poorprob_inds = np.nonzero(pi_bc1.sum(axis=0)==0)[0]

        bins = np.argmax(pi_bc, axis=0)
        bins = get_binindex(bins)

        # bins_m = np.nonzero(pi_bc >= np.max(pi_bc, axis=0) * 0.6)

    return bins, poorprob_inds

In [8]:
# short_bins_0, poorprob_inds = assignment(Z_opt, contig_length_n, mode=1)

here
0 length of selected_inds


In [29]:
Rc  = np.sum(Z_opt, axis=0)
weights = Z_opt / Rc

cov_b1 = (Z_opt ** 2 / contig_length).sum(axis=1)
cov_b2 = Z_opt.sum(axis=1)

cov_b = cov_b1 / cov_b2
pi_bc = (Z_opt / cov_b[:,None]) / (Z_opt / cov_b[:,None]).sum(axis = 0)

# print(np.max(pi_bc),np.max(pi_bc, axis=0))
# pi_bc1 = pi_bc
# pi_bc1[pi_bc1 < 0.7] = 0.0
# poorprob_inds = np.nonzero(pi_bc1.sum(axis=0)==0)[0]

bins = np.argmax(pi_bc, axis=0)
bins = get_binindex(bins)



In [42]:
x = [np.array([[0,1,3],[1,2,3]]),np.array([[0,1,3]]),np.array([[0,1,3],[1,2,3]])]
np.savetxt("x",x)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.

In [32]:
cov1_b1 = (Z_opt ** 2 / contig_length_n).sum(axis=1)
cov1_b2 = Z_opt.sum(axis=1)

cov1_b = cov1_b1 / cov1_b2
pi_bc1 = (Z_opt / cov1_b[:,None]) / (Z_opt / cov1_b[:,None]).sum(axis = 0)


In [33]:
np.max(pi_bc1)

0.20968806541088583

In [None]:
import bin_assignments as assign

bin_assign, bin_m_assign = assign.assignment(Z_opt, contig_length_n, 0)

  weights = Z_bc ** 5 / Rc ** 4


In [None]:
# argmax = np.argmax(Z_opt, axis=0)

# """ Assignment """
Z_assign = Z_opt
Rc_c  = np.sum(Z_assign, axis=0)
pb_c  = Z_assign / Rc_c
cov_b = np.sum(Z_assign, axis=1) / np.sum((np.array(contig_length_n) * Z_assign) / Rc_c, axis=1)
pb_min = 0.8 * (cov_b.reshape(len(cov_b),1) * np.sum(np.square(pb_c), axis=0) \
                / np.sum(cov_b.reshape(len(cov_b),1) * pb_c, axis=0))
pb_min[pb_min > 0.5] = 0.5
contig_assign0 = np.argmax(pb_c/pb_min, axis=0)

print(len(set(contig_assign0)), "number of bins")
print(len(set(np.argmax(Z_assign, axis=0))), "just max")

87 number of bins
197 just max


In [81]:
import bin_assignments as assign
# bins_, bins_m = assign.assignment(Z_opt, contig_length_n, 0)
bins_1, bins_m1 = assign.assignment(Z_opt, contig_length_n, contig_names_n, 0)

TypeError: assignment() takes 3 positional arguments but 4 were given

In [19]:
np.savetxt(working_dir + "/bin_assignments_shortcontigs_single_check", np.stack((contig_names_n, short_bins_0)).T, fmt='%s\t%d')

In [None]:
np.savetxt(working_dir + "try/length_2500/bin_assignments_shortcontigs_multi", np.vstack((contig_names_n[bin_m_assign[1]], bin_m_assign[0])).T, fmt='%s\t%d')

In [None]:
np.savetxt(working_dir + "try/length_2500/bin_assignments_shortcontigs_argmax", np.vstack((contig_names_n, argmax)).T, fmt='%s\t%d')

In [None]:
np.savetxt(working_dir + "/bin_assignments_shortcontigs_single", np.stack((contig_names_n, bins_)).T, fmt='%s\t%d')

In [None]:
bins_1

array([1641, 1425, 1499, ..., 1008, 1672, 1446])

In [None]:
bins_

array([439, 635, 804, ..., 202,  78, 299])

In [None]:
bins_1

array([1641, 1425, 1499, ..., 1008, 1672, 1446])

In [None]:
np.savetxt(working_dir + "/bin_assignments_shortcontigs_single1", np.stack((contig_names_n, bins_1)).T, fmt='%s\t%d')

In [68]:
xx = np.array([[0,0.2,0.5],[0.0,0.9,0], [0.0,0.5,0.7]])

xx[xx < 0.3] = 0.0

yy = np.argmax(xx, axis=0)

np.nonzero(yy)[0]

array([1, 2])

In [78]:
inds = np.nonzero(xx.sum(axis=0)==0)[0]

In [75]:
xx.sum(axis=0)

array([0. , 1.4, 1.2])

In [79]:
np.delete(xx, inds, axis=1)

array([[0. , 0.5],
       [0.9, 0. ],
       [0.5, 0.7]])