 ![GM](./Graphical_Model.png)


$z_n$ sampling<br>
\begin{equation}\begin{aligned}z_n \sim P(z_n|z_{1:n-1,n+1:N}, x_{1:N}, \pi_{1:K}, \lambda_{1:K})=\frac{\prod_{k=1}^K[\pi_k\prod_{v=1}^V Poisson(x_{nv}|\lambda_{kv})]^{z_{nk}}}{\sum_{z_n}\prod_{k=1}^K[\pi_k\prod_{v=1}^V Poisson(x_{nv}|\lambda_{kv})]^{z_{nk}}}\end{aligned}\end{equation}

$\lambda_{kv}$ sampling<br>
\begin{equation}\begin{aligned}\lambda_{kv} \sim P(\lambda_{kv}|z_{1:N}, x_{1:N}, \pi_{1:k-1,k+1:K})=P(\lambda_{kv}|x_{1:N})\propto P(x_{1:N}|\lambda_{kv})P(\lambda_{kv})=Gamma(\alpha_{kv}+\beta)\end{aligned}\end{equation}$\alpha_{kv}$ : # of times word v is observed in cluster k

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import poisson, gamma, dirichlet, multinomial
from tqdm import tqdm

In [2]:
data = pd.DataFrame()
vocabs = []
with open('./vocab.txt', 'r') as f:
    for line in f:
        vocabs.append(line.split()[0])

rows = []
count = []
with open('./ap.dat', 'r') as f:
    for line in f:
        tmp = line.split(' ')
        count.append(tmp[0])
        tmp = tmp[1:]
        row = [0] * len(vocabs)
        for elem in tmp:
            index, value = elem.split(':')
            row[int(index)] = int(value)
        rows.append(row)
            
x_data = pd.DataFrame(rows, columns = vocabs)


In [3]:
def initialize_params(x_data, K,alpha):
    N, V = x_data.shape
    topic_assignments = np.random.randint(0, K, size=N)
    topic_word_counts = np.zeros((K, V))
    topic_counts = np.zeros(K)
    lambdas = np.random.gamma(1.0, 1.0, size=(K, V))
    pis = dirichlet.rvs(alpha * np.ones(K), size=N)
 

    for i in range(N):
        topic = topic_assignments[i]
        topic_counts[topic] += 1
        topic_word_counts[topic] += x_data[i]

    return topic_assignments, topic_word_counts, topic_counts, lambdas, pis

In [4]:
gamma.rvs(a=[1,100], scale=1.0)

array([ 3.06202069, 93.64119029])

In [5]:
def gibbs_sampling(x_data, K, alpha, beta, num_iters=1000):
    N, V = x_data.shape
    topic_assignments, topic_word_counts, topic_counts, lambdas, pis = initialize_params(x_data, K, alpha)
    for iteration in tqdm(range(num_iters)):
        for i in range(N):
            current_topic = topic_assignments[i]
            topic_counts[current_topic] -= 1
            topic_word_counts[current_topic] -= x_data[i]

            log_topic_probs = np.zeros(K)
            for k in range(K):
                log_topic_word_prob = np.sum(poisson.logpmf(x_data[i], mu=lambdas[k]))
                log_topic_prior_prob = np.log(pis[i, k])
                log_topic_probs[k] = log_topic_word_prob + log_topic_prior_prob

            # Subtract the max log probability to avoid numerical instability
            log_topic_probs = log_topic_probs - np.max(log_topic_probs)
            topic_probs = np.exp(log_topic_probs)
            topic_probs /= topic_probs.sum()

            # Sample new_topic from a multinomial distribution
            new_topic = np.random.choice(K, p=topic_probs)

            topic_assignments[i] = new_topic
            topic_counts[new_topic] += 1
            topic_word_counts[new_topic] += x_data[i]

            # Update lambdas for the new topic
            lambdas[new_topic] = gamma.rvs(a=topic_word_counts[new_topic] + beta, scale=1.0)

            # Update pi for the current document
            pis[i] = dirichlet.rvs(alpha + topic_counts)

    return topic_assignments, topic_word_counts, topic_counts, lambdas, pis

In [6]:
## Very Crude Pruning TODO Make this more reliable!

vocabs = []
with open('./vocab.txt', 'r') as f:
    for line in f:
        vocabs.append(line.strip())
        
len(vocabs)
del vocabs[8000:-1]
del vocabs[0:2000]
len(vocabs)
pruned_x = x_data[vocabs]

In [7]:

# Set parameters
K = 4  # Number of topics
alpha = 1.0  # Dirichlet prior parameter
beta = 1.0  # Gamma prior parameter
# num_iters = 100  # Number of iterations 
num_iters = 40 # Reduced Iterations 

### Take Pruned Value
# x_data_np = x_data.values
x_data_np = pruned_x.values


# Run the Gibbs sampler
print(x_data_np.shape)
topic_assignments, topic_word_counts, topic_counts, lambdas, pis = gibbs_sampling(x_data_np, K, alpha, beta, num_iters)


(2246, 6001)


100%|██████████| 40/40 [01:50<00:00,  2.76s/it]


In [8]:
top_word_index = {}
for idx, ld in enumerate(lambdas):
    top_word_index[idx] =  sorted(range(len(ld)), key=lambda x: ld[x])[-30:]
    top_word_index[idx].reverse()

In [9]:
toplist = pd.DataFrame()

In [10]:
for topic in top_word_index:
    # print(f"Topic K={topic}:   ", )
    tmp =[]
    for idx in top_word_index[topic]:
        tmp.append(vocabs[idx])
    toplist[topic] = tmp

In [11]:
toplist

Unnamed: 0,0,1,2,3
0,shamir,venus,reporter,terrorists
1,pictures,creek,metal,damages
2,cook,vargas,sandinista,species
3,veterans,administrations,signal,auction
4,col,electoral,device,premier
5,polish,chosen,species,reed
6,putting,exploded,portland,partners
7,peres,atmosphere,kohl,scandal
8,honduran,southwell,basic,irancontra
9,demjanjuk,fda,dallas,fly


In [12]:
vocabs

['providing',
 'administrations',
 'seats',
 'violated',
 'journalists',
 'looks',
 'cable',
 'sandinista',
 'brothers',
 'drove',
 'facing',
 'arizona',
 'flew',
 'monthly',
 'basic',
 'premier',
 'involvement',
 'poland',
 'aviation',
 'whites',
 'winning',
 'anderson',
 'kids',
 'encourage',
 'boost',
 'resignation',
 'prompted',
 'pictures',
 'obviously',
 'investigating',
 'extended',
 'express',
 'ensure',
 'alcohol',
 'degree',
 'transfer',
 'successful',
 'rival',
 'col',
 'goal',
 'overnight',
 'substantial',
 'partly',
 'route',
 'rescue',
 'affect',
 'practices',
 'matters',
 'putting',
 'path',
 'liberation',
 'va',
 'highly',
 'negative',
 'prove',
 'baghdad',
 'regular',
 'arguments',
 'sector',
 'alabama',
 'alan',
 'quake',
 'supports',
 'watching',
 'nicaraguan',
 'restructuring',
 'particular',
 'giant',
 'scale',
 'cutting',
 'negotiate',
 'declining',
 'germans',
 'runs',
 'discussions',
 'advertising',
 'grew',
 'pace',
 'grant',
 'dealing',
 'mondays',
 'founded',