## Goal
Count data is from Pi ([p,n]) which has no anchor words, and all samples are from one of the K groups purely

In [62]:
import sys
sys.path.insert(0,'../Top_code/')
import Top
import numpy as np
import time
import random
random.seed(12345)
np.random.seed(12345)

K = 5
p = 3000
N = 5000
n = 50

## generate A (with no anchor words: each row has more than one nonzero entry)
A = np.zeros(shape=[p,K])

for row in A:
    n_sample = np.random.randint(2,K,1)
    row_indx = np.random.randint(0,K,n_sample)
    row_val = np.random.uniform(0,1,n_sample)
    row[row_indx] = row_val
### scale A so that column sums to 1
A = A/A.sum(axis = 0)

## generate W (each column has only one nonzero entry: 1)
WT = np.zeros(shape=[n,K])
for row in WT:
    row[np.random.randint(0,K,1)] = 1
W = WT.transpose()


## generate Pi
Pi = np.dot(A,W)

## generate X
## generate X
X = np.zeros(shape = [p,n])
for column in range(n):
    X[:,column] = np.random.multinomial(N,Pi[:,column])


In [63]:
## note empty documents to facilitae comparison
zero_row= np.where(X.sum(axis=1)==0)
zero_row

(array([1074, 1191, 1373, 1722]),)

In [64]:
## experiments
start = time.time()
top_nonanchor = Top.Top(X,T=1)
print("run finished after: ", str(time.time()-start))

run finished after:  1.5460395812988281


In [65]:
top_nonanchor["K"]

8

## Comment:
K is slightly larger

In [66]:
A = top_nonanchor["A"] 

In [69]:
W = np.linalg.lstsq(A,Pi[np.where(X.sum(axis=1)!=0)[0],:])

  if __name__ == '__main__':


In [92]:
diff = Pi[np.where(X.sum(axis=1)!=0)[0],:]-np.dot(A,W[0])
square_loss = np.square(diff).sum()

square_loss

0.0014340726935883636

In [82]:
loglik = Pi[np.where(X.sum(axis=1)!=0)[0],:] * np.log(np.dot(A,W[0]))
loglik

  if __name__ == '__main__':


array([[-0.00710235, -0.        , -0.00627993, ..., -0.00710235,
        -0.00710235, -0.00710235],
       [-0.00129825, -0.00013398, -0.00325632, ..., -0.00129825,
        -0.00129825, -0.00129825],
       [-0.        , -0.00652262, -0.00670559, ..., -0.        ,
        -0.        , -0.        ],
       ...,
       [-0.        , -0.00898087, -0.        , ..., -0.        ,
        -0.        , -0.        ],
       [-0.00095257, -0.        , -0.        , ..., -0.00095257,
        -0.00095257, -0.00095257],
       [-0.        , -0.0038374 , -0.00160485, ..., -0.        ,
        -0.        , -0.        ]])

## Comment:
Least square loss is good. But too many 0s given. This way of solving W is not good.

In [86]:
A.shape

(2996, 8)

In [87]:
Pi.shape

(3000, 50)

In [88]:
np.savetxt("../Top_data/Pi_K5.csv", Pi, delimiter=",")
np.savetxt("../Top_data/countX_K5.csv", X, delimiter=",")
np.savetxt("../Top_data/topA_K5.csv", A, delimiter=",")


## See how accurate Top can estimate K

In [32]:
def experiment_with_p(p, K = 5, n = 50):
    #K = 5
    #p = 3000
    N = 1500
    #n = 50

    ## generate A
    A = np.zeros(shape=[p,K])

    for row in A:
        n_sample = np.random.randint(2,K,1)
        row_indx = np.random.randint(0,K,n_sample)
        row_val = np.random.uniform(0,1,n_sample)
        row[row_indx] = row_val
    ### scale A so that column sums to 1
    A = A/A.sum(axis = 0)

    ## generate W (each column has only one nonzero entry: 1)
    WT = np.zeros(shape=[n,K])
    for row in WT:
        row[np.random.randint(0,K,1)] = 1
    W = WT.transpose()


    ## generate Pi
    Pi = np.dot(A,W)

    ## generate X
    ## generate X
    X = np.zeros(shape = [p,n])
    for column in range(n):
        X[:,column] = np.random.multinomial(N,Pi[:,column])
    
    return(Top.Top(X,T=1)["K"])


In [35]:
def experiment_with_p_mean(p,n, K = 5, nn = 50):
    k = 0
    for i in range(n):
        k += experiment_with_p(p, K = K, n = nn)
    k = k/n
    return(k)

In [18]:
experiment_with_p_mean(100,10)

4.1

In [27]:
[experiment_with_p_mean(100*x,10) for x in range(1,10)]

[4.1, 4.5, 4.4, 4.4, 5.0, 5.1, 5.0, 5.6, 5.6]

In [28]:
[experiment_with_p_mean(100*x,10, K = 10) for x in range(1,10)]

[5.5, 5.3, 6.0, 5.8, 6.2, 6.2, 7.1, 7.0, 7.5]

In [29]:
[experiment_with_p_mean(100*x,10, K = 20) for x in range(1,10)]

[7.3, 7.4, 7.9, 8.1, 7.9, 8.5, 9.0, 8.6, 8.6]

In [30]:
[experiment_with_p_mean(150*x,10, K = 20) for x in range(1,10)]

[7.8, 7.7, 7.7, 8.1, 8.7, 8.8, 8.6, 9.8, 9.7]

#### Comment:
It seems not accurate, but maybe we need to give it more samples.

In [36]:
[experiment_with_p_mean(150*x,10, K = 20, nn = 200) for x in range(1,10)]

[15.1, 14.0, 14.9, 14.6, 15.5, 16.3, 15.6, 17.0, 17.3]

## Comment:
Seems that Top can find pretty good Ks, given samples are sufficient