<a href="https://colab.research.google.com/github/wasineer-dev/braid/blob/develop/inputBioPlex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from scipy.ndimage import gaussian_filter1d 
from time import time as timer

class CountBioplexMatrix:
    """Pairwise observation/trial counts derived from a pulldown incidence matrix.

    Each used row of ``incidence`` is one pulldown; a non-zero entry marks a
    protein as present in that pulldown.

    Attributes set by the constructor:
        nProteins:    number of columns of ``incidence``.
        mObserved:    (nProteins, nProteins) int matrix.  For every pulldown
                      containing protein ``a`` the pulldown's row is added to
                      row ``a`` and to column ``a``.  For a 0/1 incidence
                      matrix, entry ``[a, b]`` is therefore twice the number
                      of pulldowns containing both ``a`` and ``b``.
        mTrials:      (nProteins, nProteins) int matrix whose entry ``[a, b]``
                      is (#pulldowns containing a) + (#pulldowns containing b).
        lstAdjacency: dict mapping each protein index to the set of other
                      indices ``j`` with ``mTrials[i, j] > 0``.
    """

    def __init__(self, filePath, bait_inds, incidence):
        """Build the count matrices and the adjacency list.

        Args:
            filePath:  unused; kept so this class shares the call signature
                       expected by CInputBioplex's ``cpmFunc`` argument.
            bait_inds: sized collection; only its length is used, selecting
                       the first ``len(bait_inds)`` rows of ``incidence``.
            incidence: 2-D NumPy array (pulldowns x proteins).

        Raises:
            AssertionError: if an internal consistency check fails.
        """
        _, nProteins = incidence.shape
        self.nProteins = nProteins

        self.mObserved = np.zeros(shape=(nProteins, nProteins), dtype=int)
        # Number of used pulldowns in which each protein is present.
        pulldownCounts = np.zeros(nProteins, dtype=int)
        for i in range(len(bait_inds)):
            row = incidence[i]
            members = np.flatnonzero(row)
            if members.size == 0:
                continue
            # One array update per pulldown replaces one Python-level update
            # per (pulldown, member) pair; the additions commute, so the
            # result is identical.  `members` has no duplicates, which makes
            # the fancy-indexed in-place additions safe.
            self.mObserved[members, :] += row
            self.mObserved[:, members] += row[:, np.newaxis]
            pulldownCounts[members] += 1

        # Every member of a pulldown adds 1 to its whole row and its whole
        # column, hence trials[a, b] = count[a] + count[b].
        self.mTrials = np.add.outer(pulldownCounts, pulldownCounts)

        # Row sums and column sums must agree for every protein.
        assert np.array_equal(self.mTrials.sum(axis=1), self.mTrials.sum(axis=0))

        #
        # Create the adjacency list
        #
        self.lstAdjacency = {}
        for i in np.arange(nProteins):
            trials = self.mTrials[i]
            # Observed counts are read from the upper triangle, i.e.
            # mObserved[min(i, j), max(i, j)] for every j.
            observed = np.concatenate((self.mObserved[:i, i], self.mObserved[i, i:]))
            assert np.all(observed <= trials)
            neighbours = np.flatnonzero(trials > 0)
            self.lstAdjacency[i] = set(neighbours[neighbours != i])
                    
#
# TODO: cpmFunc can be countSpokeModel or countMatrixModel
#
class CInputBioplex:
    """Bait/prey table loaded from a tab-separated file, plus result writers.

    Attributes set by the constructor:
        nProteins:       number of distinct symbols over baits and preys.
        aSortedProteins: sorted array of the distinct symbols; a protein's
                         index everywhere else is its position in this array.
        incidence:       (nBaits, nProteins) 0/1 int matrix; row r is the r-th
                         bait in sorted order, column c is set when protein c
                         is listed as a prey of that bait.
        observationG:    whatever ``cpmFunc`` returns for this incidence matrix.
    """

    def __init__(self, filePath, cpmFunc):
        """Read the table and build the incidence matrix.

        Args:
            filePath: path of a tab-separated file with the columns
                      'Bait Symbol' and 'Prey Symbol'.
            cpmFunc:  callable invoked as
                      ``cpmFunc(filePath, range(nBaits), incidence)``,
                      e.g. CountBioplexMatrix.
        """
        super().__init__()

        df = pd.read_csv(filePath, sep='\t')
        # dtype=str lets NumPy pick the width of the longest symbol.  A fixed
        # 'U21' silently truncates longer symbols and can merge distinct
        # proteins into one.
        bait_list = np.asarray(df['Bait Symbol'], dtype=str)
        prey_list = np.asarray(df['Prey Symbol'], dtype=str)
        del df

        # np.unique already returns its result sorted.
        aSortedBaits = np.unique(bait_list)
        self.aSortedProteins = np.unique(np.append(bait_list, prey_list))

        self.nProteins = len(self.aSortedProteins)
        nProteins = self.nProteins
        nBaits = len(aSortedBaits)

        print('Number of baits = ', nBaits)
        print('Number of preys = ', len(np.unique(prey_list)))
        print('Number of proteins = ', nProteins)

        bait_rows = np.searchsorted(aSortedBaits, bait_list)
        prey_inds = np.searchsorted(self.aSortedProteins, prey_list)

        # NOTE(review): a bait is marked in its own row only when it is also
        # listed as its own prey -- confirm that this is intended.
        self.incidence = np.zeros((nBaits, nProteins), dtype=int)
        self.incidence[bait_rows, prey_inds] = 1

        self.observationG = cpmFunc(filePath, range(nBaits), self.incidence)

    def writeCluster2File(self, baseName, matQ, indVec):
        """Write the per-protein assignment and the per-cluster member lists.

        Two files are written:
          * ``baseName + ".tab"``: one line per protein with its symbol, its
            cluster index and the largest entry of its own row of ``matQ``.
          * ``bioplex_out.csv`` (fixed name, current directory): one line per
            cluster 0..nCols-1 listing its members, each followed by a tab;
            an empty cluster produces an empty line.

        Args:
            baseName: output path without the ".tab" extension.
            matQ:     2-D array, one row per protein and one column per cluster.
            indVec:   cluster index of each protein.
        """
        nRows, nCols = matQ.shape
        filePath = baseName + ".tab"
        with open(filePath, "w") as fh:
            for i in range(nRows):
                # The score comes from the protein's own row.  Indexing matQ
                # with the cluster index would report another protein's value.
                fh.write(str(self.aSortedProteins[i]) + '\t' + str(indVec[i]) + '\t' + str(max(matQ[i])) + '\n')

        # Group once instead of rescanning every protein for every cluster.
        members = {}
        for i in range(nRows):
            members.setdefault(indVec[i], []).append(self.aSortedProteins[i])
        with open("bioplex_out.csv", "w") as fh:
            for k in range(nCols):
                fh.write(''.join(name + '\t' for name in members.get(k, [])) + '\n')

    def writeLabel2File(self, indVec):
        """Write one line per distinct label to ``bioplex_out.csv``.

        Lines appear in order of first occurrence of each label in ``indVec``;
        within a line, members are written in ascending protein index, each
        followed by a tab.

        Args:
            indVec: label of each protein.
        """
        clusters = {}
        for i, k in enumerate(indVec):
            clusters.setdefault(k, []).append(i)

        with open("bioplex_out.csv", "w") as fh:
            for memberIds in clusters.values():
                fh.write(''.join(self.aSortedProteins[v] + '\t' for v in memberIds) + '\n')
            
def clustering(inputSet, Nk, psi, fileName):
    """Cluster the proteins of ``inputSet`` by mean-field annealing and save the result.

    Args:
        inputSet: object exposing ``observationG`` (with an ``nProteins``
                  attribute) and ``writeCluster2File``, e.g. a CInputBioplex.
        Nk:       number of clusters handed to the annealer.
        psi:      weight forwarded unchanged to ``CMeanFieldAnnealing.estimate``.
        fileName: base name forwarded to ``inputSet.writeCluster2File``.

    Prints the wall-clock time spent in ``estimate``; returns nothing.
    """
    observations = inputSet.observationG
    proteinCount = observations.nProteins
    annealer = CMeanFieldAnnealing(proteinCount, Nk)

    started = timer()
    annealer.estimate(observations, proteinCount, Nk, psi)
    elapsed = timer() - started
    print("Time running MFA: ", elapsed)

    # Turn the soft assignments into hard labels before writing them out.
    annealer.find_argmax()
    inputSet.writeCluster2File(fileName, annealer.mIndicatorQ, annealer.indicatorVec)

# Load the BioPlex 2.0 directed-edge table and build its count matrices with
# CountBioplexMatrix.  The path presumably points at a Google Drive mounted in
# Colab -- adjust it when running elsewhere.
bioPlex2 = CInputBioplex("/content/drive/MyDrive/BioPlex_2.0_293T_DirectedEdges.tsv", CountBioplexMatrix)


Number of baits =  5157
Number of preys =  8800
Number of proteins =  10961


In [2]:
# Stop the notebook early unless TensorFlow reports '/device:GPU:0' as its GPU
# device; later cells place their computation on that device explicitly.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

In [4]:
# Upper bound on the number of annealing sweeps run by tf_annealing.
MAX_ITERATION = 20

class CMeanFieldAnnealing:
    """Soft cluster assignment of proteins by mean-field annealing.

    Attributes:
        lstExpectedLikelihood: list created empty by the constructor; not
                               written by any method of this class.
        mIndicatorQ:           (Nproteins, Nk) matrix of soft assignments, one
                               row per protein.
        indicatorVec:          hard label per protein, set by find_argmax().
    """

    def __init__(self, Nproteins, Nk):
        self.lstExpectedLikelihood = []
        self.mIndicatorQ = np.zeros((Nproteins, Nk), dtype=float)

    @staticmethod
    def _xlogy(count, prob):
        """Return count*log(prob), with the convention 0*log(0) = 0."""
        if count == 0:
            return 0.0
        return count*np.log(prob)

    def tf_annealing(self, mix_p, mObservationG, Nproteins, Nk, psi):
        """Refine ``self.mIndicatorQ`` in place with sequential softmax updates.

        For protein ``i`` the cost of each cluster is
        ``(mTrials - mObserved)[i] . Q  +  psi * mObserved[i] . (1 - Q)`` and
        row ``i`` of Q becomes ``softmax(-gamma * cost)``.  Rows are updated
        one after another, so later rows already see the earlier updates.
        ``gamma`` starts at 1000 and drops by 100 after each sweep; sweeping
        stops once ``gamma <= 1`` or after MAX_ITERATION sweeps.

        Args:
            mix_p:         unused; kept for interface compatibility.
            mObservationG: object with ``mTrials`` and ``mObserved`` matrices.
            Nproteins:     number of rows of Q to update.
            Nk:            unused; kept for interface compatibility.
            psi:           weight applied to the observed counts.
        """
        matA = tf.convert_to_tensor(mObservationG.mTrials - mObservationG.mObserved, dtype=tf.float32)
        matB = tf.convert_to_tensor(psi*mObservationG.mObserved, dtype=tf.float32)
        # A variable with sliced assignment keeps Q in one buffer; re-stacking
        # a TensorArray for every protein copies the whole matrix each time.
        tQ = tf.Variable(self.mIndicatorQ[:Nproteins], dtype=tf.float32)

        gamma = 1000.0
        nIteration = 0
        while nIteration < MAX_ITERATION and gamma > 1.0:
            for i in range(Nproteins):
                fn_out = tf.tensordot(matA[i], tQ, axes=1)
                fp_out = tf.tensordot(matB[i], 1.0 - tQ, axes=1)

                mLogLikelihood = fn_out + fp_out
                tQ[i].assign(tf.nn.softmax(-gamma*mLogLikelihood))

            nIteration += 1
            gamma = gamma - 100.0
        print("Initialize with MFA: num. iterations = ", nIteration)
        self.mIndicatorQ = tQ.numpy()

    def estimate(self, mObservationG, Nproteins, Nk, psi):
        """Randomly initialise the soft assignments and run the annealing.

        Args:
            mObservationG: object with ``mTrials`` and ``mObserved`` matrices.
            Nproteins:     number of proteins (rows of Q).
            Nk:            number of clusters (columns of Q).
            psi:           weight forwarded to tf_annealing.

        Returns:
            None; the result is left in ``self.mIndicatorQ``.
        """
        print('psi = ', psi)

        mix_p = (1.0/float(Nk))*np.ones(Nk, dtype=float)
        alpha1 = 1e-8
        # One draw for the whole matrix consumes the random stream in the
        # same row-major order as drawing row by row.
        q = np.random.uniform(0.0, 1.0, size=(Nproteins, Nk))
        # alpha1 is added to each of the Nk entries of a row, so the
        # denominator needs alpha1*Nk for the row to sum to one.
        self.mIndicatorQ[:Nproteins] = (q + alpha1)/(q.sum(axis=1, keepdims=True) + alpha1*Nk)

        self.tf_annealing(mix_p, mObservationG, Nproteins, Nk, psi)

    def find_argmax(self):
        """Set ``self.indicatorVec`` to the index of the largest entry of each row of Q."""
        self.indicatorVec = np.argmax(self.mIndicatorQ, axis=1)

    def computeErrorRate(self, mObservationG, Nproteins):
        """Estimate error rates and the negative log-likelihood of the hard clustering.

        Every ordered pair (i, j) with j in ``lstAdjacency[i]`` contributes
        t = mTrials[i, j] trials and s = mObserved[i, j] observations.  Within
        a cluster the t - s missing observations count as false negatives;
        across clusters the s observations count as false positives.

        Args:
            mObservationG: object with ``mTrials``, ``mObserved`` and
                           ``lstAdjacency``.
            Nproteins:     number of proteins to visit (indices 0..Nproteins-1).

        Returns:
            Tuple ``(fn, fp, counts, likelihood)``: the false-negative rate,
            the false-positive rate, the total number of errors and the
            binomial negative log-likelihood evaluated at those rates.

        Raises:
            AssertionError: if some pair has more observations than trials.
        """
        self.find_argmax()

        nClusters = len(np.unique(self.indicatorVec))
        print("Number of clusters used: " + str(nClusters))

        mTrials = np.asarray(mObservationG.mTrials)
        mObserved = np.asarray(mObservationG.mObserved)

        countFn = 0
        countFp = 0
        sumSameCluster = 0
        sumDiffCluster = 0
        for i in range(Nproteins):
            neighbours = mObservationG.lstAdjacency[i]
            js = np.fromiter(neighbours, dtype=np.intp, count=len(neighbours))
            t = mTrials[i, js]
            s = mObserved[i, js]
            assert np.all(s <= t)
            same = self.indicatorVec[js] == self.indicatorVec[i]
            countFn += int(np.sum((t - s)[same]))
            sumSameCluster += int(np.sum(t[same]))
            countFp += int(np.sum(s[~same]))
            sumDiffCluster += int(np.sum(t[~same]))

        counts = countFn + countFp
        fn = 0.0
        fp = 0.0
        if (sumSameCluster > 0):
            fn = float(countFn)/float(sumSameCluster)
        if (sumDiffCluster > 0):
            fp = float(countFp)/float(sumDiffCluster)

        # Binomial negative log-likelihood.  It is algebraically the same
        # quantity as
        #   countFn*(log(1-fp) - log(fn)) + countFp*(log(1-fn) - log(fp))
        #   - sum_pairs [ s*log(1-fn) + (t-s)*log(1-fp) ],
        # but grouping the terms by rate makes every log(0) appear with a
        # zero coefficient, so the result stays finite when a rate is 0 or 1.
        likelihood = -(self._xlogy(countFn, fn)
                       + self._xlogy(sumSameCluster - countFn, 1.0 - fn)
                       + self._xlogy(countFp, fp)
                       + self._xlogy(sumDiffCluster - countFp, 1.0 - fp))
        return (fn, fp, counts, likelihood)

In [None]:
def hill_climbing(inputSet, Nk):
    """Scan psi and record how the clustering likelihood changes from step to step.

    A first fit with psi = 0.3 provides the starting likelihood.  The same
    annealer is then refitted for every psi in ``np.arange(1.0, 10.5, 0.2)``
    and, for each step, the ratio
    ``(Nk - likelihood) / (Nk - previous likelihood)`` is stored.

    Args:
        inputSet: object exposing ``observationG`` (with ``nProteins``), e.g.
                  a CInputBioplex.
        Nk:       number of clusters.

    Returns:
        Tuple ``(x_values, aics)``: the psi values and the per-step ratios.
    """
    nProteins = inputSet.observationG.nProteins
    cmfa = CMeanFieldAnnealing(nProteins, Nk) # default

    with tf.device('/device:GPU:0'):
      cmfa.estimate(inputSet.observationG, nProteins, Nk, 0.3)
    (fn, fp, errs, f_last) = cmfa.computeErrorRate(inputSet.observationG, nProteins)
    x_values = np.arange(1.0, 10.5, 0.2)
    aics = np.zeros(len(x_values), dtype=float)
    for i, psi in enumerate(x_values):
        ts = timer()
        with tf.device('/device:GPU:0'):
          cmfa.estimate(inputSet.observationG, nProteins, Nk, psi)
        te = timer()
        print("Time running MFA: ", te-ts)
        (fn, fp, errs, likelihood) = cmfa.computeErrorRate(inputSet.observationG, nProteins)
        # estimate() returns nothing, so the objective value reported for
        # this psi is the likelihood computed from the resulting clustering.
        print("x = ", psi, "f(x) = ", likelihood)
        print("\tLikelihood =", likelihood)
        aics[i] = (Nk - likelihood)/(Nk - f_last)
        f_last = likelihood
    return (x_values, aics)


# Scan psi with 700 clusters and collect the per-step likelihood ratios.
x_values, aics = hill_climbing(bioPlex2, 700)

# Smooth the ratios with a Gaussian kernel (sigma = 1 sample), take a discrete
# second derivative, and report the psi values at which its sign changes.
# Each entry of infls is the index of the sample just before a sign change.
aics_filter = gaussian_filter1d(aics, 1)
aics_d2 = np.gradient(np.gradient(aics_filter))
infls = np.where(np.diff(np.sign(aics_d2)))[0]
print("psi = ", x_values[infls])


psi =  0.3


In [None]:
% matplotlib inline

In [None]:
import matplotlib.pyplot as plt

# Plot the smoothed and the raw ratio curves against psi, and mark every
# detected sign change of the second derivative with a black vertical line.
plt.plot(x_values, aics_filter, label='AIC Filter')
plt.plot(x_values, aics, label='AIC')
for i, infl in enumerate(infls):
    plt.axvline(x=x_values[infl], color='k')
# Anchor the legend to the right of the axes so it does not cover the curves.
plt.legend(bbox_to_anchor=(1.5, 1.0))
plt.show()