<a href="https://colab.research.google.com/github/wasineer-dev/braid/blob/develop/inputSet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import numpy as np
import tensorflow as tf

from time import time as timer

MAX_ITERATION = 20

class CMeanFieldAnnealing:

    def __init__(self, Nproteins, Nk):
        self.lstExpectedLikelihood = []
        self.mIndicatorQ = np.zeros((Nproteins, Nk), dtype=float)

    def tf_annealing(self, mix_p, mObservationG, Nproteins, Nk, psi):

        matA = tf.convert_to_tensor(mObservationG.mTrials - mObservationG.mObserved, dtype=tf.float32)
        matB = tf.convert_to_tensor(psi*mObservationG.mObserved, dtype=tf.float32)
        tfArray = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        for i in range(Nproteins):
            tfArray = tfArray.write(i, self.mIndicatorQ[i])
        
        gamma = 1000.0
        nIteration = 0
        while(nIteration < MAX_ITERATION and gamma > 1.0):
            for i in range(Nproteins):        
                tQ = tfArray.stack()
                fn_out = tf.tensordot(matA[i], tQ, axes=1) 
                fp_out = tf.tensordot(matB[i], 1.0 - tQ, axes=1)

                mLogLikelihood = fn_out + fp_out
                tfArray = tfArray.write(i, tf.nn.softmax(-gamma*mLogLikelihood))

            nIteration += 1
            gamma = gamma - 100.0
        print("Initialize with MFA: num. iterations = ", nIteration)
        self.mIndicatorQ = tfArray.stack().numpy()

    def estimate(self, mObservationG, Nproteins, Nk, psi):
        
        print('psi = ', psi)

        mix_p = (1.0/float(Nk))*np.ones(Nk, dtype=float)
        alpha1 = 1e-8
        for i in range(Nproteins):
            self.mIndicatorQ[i] = np.random.uniform(0.0, 1.0, size=Nk)
            self.mIndicatorQ[i] = (self.mIndicatorQ[i] + alpha1)/(np.sum(self.mIndicatorQ[i]) + alpha1*Nproteins)

        self.tf_annealing(mix_p, mObservationG, Nproteins, Nk, psi)
        
    def find_argmax(self):
        self.indicatorVec = np.argmax(self.mIndicatorQ, axis=1)

    def computeErrorRate(self, mObservationG, Nproteins):
        
        # self.find_lin_dependent()
        self.find_argmax()

        rnk = np.linalg.matrix_rank(self.mIndicatorQ)
        print("Indicator matrix had rank = " + str(rnk))
        nClusters = len(np.unique(self.indicatorVec))
        print("Number of clusters used: " + str(nClusters))

        countFn = 0
        countFp = 0
        sumSameCluster = 0
        sumDiffCluster = 0
        for i in range(Nproteins):
            for j in mObservationG.lstAdjacency[i]:
                t = mObservationG.mTrials[i][j]
                s = mObservationG.mObserved[i][j]
                assert(s <= t)
                if (self.indicatorVec[i] == self.indicatorVec[j]):
                    countFn += (t - s)
                    sumSameCluster += t
                else:
                    countFp += s
                    sumDiffCluster += t

        counts = countFn + countFp
        fn = 0.0
        fp = 0.0
        if (sumSameCluster > 0):
            fn = float(countFn)/float(sumSameCluster)
        if (sumDiffCluster > 0):
            fp = float(countFp)/float(sumDiffCluster)
        likelihood = countFn*(-np.log(fn)) + countFp*(-np.log(fp)) 
        for i in range(Nproteins):
            for j in mObservationG.lstAdjacency[i]:
                t = mObservationG.mTrials[i][j]
                s = mObservationG.mObserved[i][j]
                likelihood += -s*(np.log(1.0-fn)) - (t-s)*(np.log(1.0-fp))
        return (fn, fp, counts, likelihood)
        
class CountMatrixModel:
    
    def __init__(self, nProteins, bait_inds, incidence):

        self.nProteins = nProteins
        self.mObserved = np.zeros(shape=(nProteins, nProteins), dtype=int)
        for i, bait in zip(range(len(bait_inds)), bait_inds):
            for j in range(nProteins):
                if incidence[i,j]:
                    self.mObserved[j,:] += incidence[i,:] 
                    self.mObserved[:,j] += incidence[i,:]
    
        self.mTrials = np.zeros(shape=(nProteins, nProteins), dtype=int)
        for i, bait in zip(range(len(bait_inds)), bait_inds):
            for j in range(nProteins):
                if incidence[i,j]:
                    self.mTrials[j,:] += np.ones(nProteins, dtype=int) 
                    self.mTrials[:,j] += np.ones(nProteins, dtype=int)

        for i in range(nProteins):
            assert(np.sum(self.mTrials[i,:]) == np.sum(self.mTrials[:,i]))

        #
        # Create the adjacency list
        #
        self.lstAdjacency = {}
        for i in np.arange(nProteins):
            self.lstAdjacency[i] = set()
            for j in np.arange(nProteins):
                t = self.mTrials[i][j]
                if (i < j):
                    s = self.mObserved[i][j] 
                else:
                    s = self.mObserved[j][i] 
                assert(s <= t)
                if (i != j and t > 0):
                    self.lstAdjacency[i].add(j)
#
# TODO: cpmFunc can be countSpokeModel or countMatrixModel
#
class CInputSet:

    def __init__(self, filename, cpmFunc=None):
        super().__init__()

        listBaits = list()
        with open(filename) as fh:
            setProteins = set()
            for line in fh:
                lst = line.rstrip().split(',')
                bait = lst[0]
                listBaits.append(bait)
                setProteins = setProteins.union(set(lst))
            print('Number of proteins ' + str(len(setProteins)))
            fh.close()

        self.aSortedProteins = np.sort(np.array(list(setProteins), dtype='U21'))
        bait_inds = np.searchsorted(self.aSortedProteins, np.array(listBaits, dtype='U21'))
        
        print('Number of purifications ' + str(len(bait_inds)))

        nProteins = len(self.aSortedProteins)
        self.incidence = np.zeros(shape=(len(bait_inds), nProteins), dtype=int)
        with open(filename) as fh:
            lineCount = 0
            for line in fh:
                lst = line.rstrip().split(',')
                prey_inds = np.searchsorted(self.aSortedProteins, np.array(lst, dtype='U21'))           
                for id in prey_inds:
                    self.incidence[lineCount][id] = 1
                lineCount += 1
            fh.close()
            
        self.observationG = CountMatrixModel(nProteins, bait_inds, self.incidence)

    def writeCluster2File(self, baseName, matQ, indVec):
        nRows, nCols = matQ.shape
        filePath = baseName + ".tab"
        with open(filePath, "w") as fh:
            for i in range(nRows):
                ind = indVec[i]
                fh.write(self.aSortedProteins[i] + '\t' + str(indVec[i]) + '\t' + str(max(matQ[ind])) + '\n')
            fh.close()
        filePath = baseName + ".csv"
        with open(filePath, "w") as fh:
            for k in range(nCols):
                inds = list(i for i in range(nRows) if indVec[i] == k)
                for j in inds:
                    protein = self.aSortedProteins[j].split('__')[0] 
                    fh.write(protein + '\t')
                fh.write('\n')
            fh.close()
    
    def writeLabel2File(self, indVec):
        clusters = {}
        for i,k in enumerate(indVec):
            if k not in clusters.keys():
                clusters[k] = set()
            clusters[k].add(i)

        with open("out.csv", "w") as fh:
            for i, k in enumerate(clusters):
                for v in clusters[k]:
                    protein = self.aSortedProteins[v].split('__')[0] 
                    fh.write(protein + '\t')
                fh.write('\n')
            fh.close()

def clustering(inputSet, Nk, psi, baseName):
    fn = 0.8
    fp = 0.04
    nProteins = inputSet.observationG.nProteins
    cmfa = CMeanFieldAnnealing(nProteins, Nk) # default

    funcInfer = cmfa

    ts = timer()
    # alpha = 1e-2
    funcInfer.estimate(inputSet.observationG, nProteins, Nk, psi) 
    te = timer()
    print("Time running MFA: ", te-ts)
    
    funcInfer.find_argmax()
    
    inputSet.writeCluster2File(baseName, funcInfer.mIndicatorQ, funcInfer.indicatorVec)
    

    

In [48]:
inputSet = CInputSet("/content/drive/MyDrive/gavin2006.csv")

Number of proteins 2760
Number of purifications 2166


In [None]:
from time import time as timer

def hill_climbing(inputSet, Nk, step=0.5):

    nProteins = inputSet.observationG.nProteins
    cmfa = CMeanFieldAnnealing(nProteins, Nk) # default

    funcInfer = cmfa        

    funcInfer.estimate(inputSet.observationG, nProteins, Nk, 0.3)
    (fn, fp, errs, f_last) = funcInfer.computeErrorRate(inputSet.observationG, nProteins)
    x_values = np.arange(1.0, 10.5, step)
    y_values = np.zeros(len(x_values), dtype=float)
    aics = np.zeros(len(x_values), dtype=float) 
    for i, psi in enumerate(x_values):
        ts = timer()
        f_value = funcInfer.estimate(inputSet.observationG, nProteins, Nk, psi) 
        te = timer()
        print("Time running MFA: ", te-ts)
        print("x = ", psi, "f(x) = ", f_value)
        (fn, fp, errs, likelihood) = funcInfer.computeErrorRate(inputSet.observationG, nProteins)
        print("\tLikelihood =", likelihood)
        y_values[i] = likelihood
        aics[i] = (Nk + likelihood)
        f_last = likelihood

    return (x_values, aics, y_values)

x_values, aics, y_values = hill_climbing(inputSet, 300, step=0.5)

psi =  0.3
Initialize with MFA: num. iterations =  10
Indicator matrix had rank = 300
Number of clusters used: 300
psi =  1.0
Initialize with MFA: num. iterations =  10
Time running MFA:  106.10355043411255
x =  1.0 f(x) =  None
Indicator matrix had rank = 300
Number of clusters used: 300
	Likelihood = 5748785.574634841
psi =  1.5
Initialize with MFA: num. iterations =  10
Time running MFA:  107.31715846061707
x =  1.5 f(x) =  None
Indicator matrix had rank = 300
Number of clusters used: 300
	Likelihood = 5614516.030721573
psi =  2.0
Initialize with MFA: num. iterations =  10
Time running MFA:  106.86240315437317
x =  2.0 f(x) =  None
Indicator matrix had rank = 300
Number of clusters used: 300
	Likelihood = 5544625.152860571
psi =  2.5
Initialize with MFA: num. iterations =  10
Time running MFA:  106.9421136379242
x =  2.5 f(x) =  None
Indicator matrix had rank = 300
Number of clusters used: 300
	Likelihood = 5494283.476093733
psi =  3.0
Initialize with MFA: num. iterations =  10
Time

In [None]:
nProteins = inputSet.observationG.nProteins
Nk = 300
bics = np.log(nProteins)*Nk + 2.0*y_values

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
from scipy.ndimage import gaussian_filter1d

y_values = np.exp(np.min(y_values) - y_values)
y_filter = gaussian_filter1d(y_values, 2)
d2 = np.gradient(np.gradient(y_filter))
aics_d2 = np.gradient(np.gradient(aics))
infls = np.where(np.diff(np.sign(d2)))[0]
print("psi = ", x_values[infls])

fig = plt.figure()
plt.plot(x_values, y_values, label='AIC')
plt.plot(x_values, y_filter, label='AIC Filter')
for i, infl in enumerate(infls):
    plt.axvline(x=x_values[infl], color='k')
plt.legend(bbox_to_anchor=(1.5, 1.0))
plt.show()

In [None]:
def clustering(inputSet, k, psi):
    nProteins = inputSet.observationG.nProteins
    cmfa = CMeanFieldAnnealing(nProteins, k) # default

    funcInfer = cmfa        

    funcInfer.estimate(inputSet.observationG, nProteins, k, psi)
    (fn, fp, errs, f_last) = funcInfer.computeErrorRate(inputSet.observationG, nProteins)
    return f_last
    
max_k = int(inputSet.observationG.nProteins/2)
ks = np.arange(100, max_k, 100)
for infl in infls[:1]:
    ls = []
    for i, k in enumerate(ks):
        ls.append(clustering(inputSet, k, x_values[infl]))
    bics = []
    for i, f in enumerate(ls):
        bics.append(np.log(nProteins)*float(ks[i]) + 2.0*ls[i])
    plt.plot(ks, bics, label="{:.2f}".format(x_values[infl]))
plt.legend(bbox_to_anchor=(1.5, 1.0))
plt.show()

In [None]:
xs = ks[2:]
ys = bics[2:]
plt.plot(xs, np.min(ys)/ys)
plt.show()

In [None]:
plt.plot(ks[3:], bics[3:])
plt.show()