## Goal

I find an improved implementation of anchor-word algorithm at https://aclanthology.org/D19-1504.pdf . I want to see how well it performs on a simulated dataset that satsifies the "anchor-word" assumption. 

In [1]:
import os
import sys
import pandas as pd
from scipy import sparse
from sklearn.decomposition import NMF, LatentDirichletAllocation

import numpy as np
import matplotlib.pyplot as plt

script_dir = "../"
sys.path.append(os.path.abspath(script_dir))
from file2 import *
from factorize import *
from smallsim_functions_anchor import *
from misc import *


np.random.seed(123)

## small, uncorrelated example

In [2]:
n = 1000
p = 400
k = 4
doc_len = 1000

sim = smallsim_independent(n = n, p = p, k = k, doc_len = doc_len)
X = sparse.coo_matrix(sim["X"])
L = sim["L"]
F = sim["F"]
anchor_words = sim["anchor_words"]
id_m = sim["id_m"]

Bows = X2Bows(X)
C, D1, D2 = bows2C(Bows, min_tokens=0)

[file.bows2C] Start constructing dense C...
- Counting the co-occurrence for each document...
+ Finish constructing C and D!
  - The sum of all entries = 1.000000
  - Elapsed Time = 0.6452 seconds


In [3]:
S, B, A, Btilde, Cbar, C_rowSums, diagR, C = factorizeC(C, K=k, rectifier='AP', optimizer='activeSet')

+ Start rectifying C...
+ Start alternating projection
  - 1-th iteration... (3.040130e-05 / 4.621196e-10)
  - 2-th iteration... (1.491654e-08 / 4.621200e-10)
  - 3-th iteration... (9.905184e-09 / 4.621204e-10)
  - 4-th iteration... (6.577451e-09 / 4.621207e-10)
  - 5-th iteration... (4.367703e-09 / 4.621209e-10)
  - 6-th iteration... (2.900339e-09 / 4.621211e-10)
  - 7-th iteration... (1.925949e-09 / 4.621212e-10)
  - 8-th iteration... (1.278912e-09 / 4.621213e-10)
  - 9-th iteration... (8.492528e-10 / 4.621213e-10)
  - 10-th iteration... (5.639404e-10 / 4.621214e-10)
  - 11-th iteration... (3.744807e-10 / 4.621214e-10)
  - 12-th iteration... (2.486713e-10 / 4.621214e-10)
  - 13-th iteration... (1.651285e-10 / 4.621214e-10)
  - 14-th iteration... (1.096524e-10 / 4.621214e-10)
  - 15-th iteration... (7.281394e-11 / 4.621214e-10)
+ Finish alternating projection
  - Elapsed seconds = 0.0690

  - Finish rectifying C! [0.069019]
+ Start finding the set of anchor bases S...
[inference.findS

## Evaulate results

In [4]:
topic_idx = match_topics(F, B).astype(int)
topic_idx

array([0, 1, 3, 2])

In [5]:
np.column_stack((S[topic_idx], anchor_words))

array([[ 85,  85],
       [367, 367],
       [110, 110],
       [319, 319]])

In [6]:
cand_set = set(np.unique(id_m))
[w in cand_set for w in S]

[True, True, True, True]

In [7]:
F[anchor_words,:].round(2)

array([[0.07, 0.  , 0.  , 0.  ],
       [0.  , 0.08, 0.  , 0.  ],
       [0.  , 0.  , 0.07, 0.  ],
       [0.  , 0.  , 0.  , 0.08]])

In [8]:
B[anchor_words[topic_idx],:].round(3)

array([[0.069, 0.   , 0.   , 0.   ],
       [0.   , 0.077, 0.   , 0.   ],
       [0.   , 0.   , 0.08 , 0.   ],
       [0.   , 0.   , 0.   , 0.072]])

In [9]:
# compare A and LLt/n
A_reorder = A[topic_idx,:]
A_reorder = A_reorder[:, topic_idx]
A_reorder.round(decimals=2)

array([[0.2 , 0.02, 0.02, 0.02],
       [0.02, 0.18, 0.02, 0.02],
       [0.02, 0.02, 0.17, 0.02],
       [0.02, 0.02, 0.02, 0.19]])

In [10]:
L = sim["L"]
(L.T.dot(L)/n).round(decimals=2)

array([[0.2 , 0.02, 0.02, 0.02],
       [0.02, 0.17, 0.02, 0.02],
       [0.02, 0.02, 0.17, 0.02],
       [0.02, 0.02, 0.02, 0.2 ]])

LDA can find those anchor words very well too

In [11]:
# lda = LatentDirichletAllocation(n_components=k, max_iter=20,
#                                 random_state=0)
# lda.fit(X)

# B2 = (lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]).T
# idx = match_topics(B2, F).astype(int)
# B2[anchor_words[idx],:].round(3)