In [65]:
import numpy as np
import pandas as pd
from scipy.stats import poisson, gamma, dirichlet, multinomial
from tqdm import tqdm


In [2]:
# Run preprocess.py as a separate Python process through the IPython shell escape.
# NOTE(review): presumably this step produces data.csv, which is read further down — confirm.
!python preprocess.py

In [52]:
# Load the contents of data.csv (path relative to the working directory) into a DataFrame.
x_data = pd.read_csv('data.csv')


In [58]:
# Keep every row but drop the first and the last column.
# NOTE: this overwrites x_data, so the cell is not idempotent — running it
# again trims one more column from each side.
x_data = x_data.iloc[:,1:-1]

In [69]:
def initialize_params(x_data, K, alpha):
    """Draw a random starting state for the Gibbs sampler.

    Parameters
    ----------
    x_data : array-like of shape (N, V)
        Count matrix with one row per item. Anything ``np.asarray`` can
        turn into a 2-D array is accepted (ndarray, DataFrame, nested list).
    K : int
        Number of topics.
    alpha : float
        Concentration of the symmetric Dirichlet used to draw ``pis``.

    Returns
    -------
    topic_assignments : ndarray of shape (N,)
        Topic index in ``[0, K)`` drawn uniformly at random for every row.
    topic_word_counts : ndarray of shape (K, V)
        Column-wise sum of the rows currently assigned to each topic.
    topic_counts : ndarray of shape (K,)
        Number of rows currently assigned to each topic.
    lambdas : ndarray of shape (K, V)
        Initial rates, drawn from Gamma(shape=1, scale=1).
    pis : ndarray of shape (N, K)
        One Dirichlet(alpha, ..., alpha) draw per row.
    """
    # Positional row access (x_data[i]) is only valid on an ndarray; on a
    # DataFrame it would be a column lookup, so normalise the input first.
    x_data = np.asarray(x_data)
    N, V = x_data.shape

    # The three random draws are kept in this order so that a seeded global
    # RNG yields the same initial state as before.
    topic_assignments = np.random.randint(0, K, size=N)
    lambdas = np.random.gamma(1.0, 1.0, size=(K, V))
    pis = dirichlet.rvs(alpha * np.ones(K), size=N)

    # Sufficient statistics of the initial assignment, computed without a
    # Python-level loop over the rows.
    topic_counts = np.bincount(topic_assignments, minlength=K).astype(float)
    topic_word_counts = np.zeros((K, V))
    # np.add.at accumulates correctly when the same topic index repeats,
    # which a plain fancy-indexed "+=" would not.
    np.add.at(topic_word_counts, topic_assignments, x_data)

    return topic_assignments, topic_word_counts, topic_counts, lambdas, pis

In [74]:
def gibbs_sampling(x_data, K, alpha, beta, num_iters=1000):
    """Run a Gibbs sampler that assigns every row of a count matrix to a topic.

    Each row ``x_data[i]`` is scored under K topics whose per-column counts
    are independent Poisson with rates ``lambdas[k]``. In every sweep, for
    each row: the row is removed from its topic's statistics, a new topic is
    sampled from the normalised product of Poisson likelihood and ``pis[i]``,
    the statistics are updated, and then ``lambdas[new_topic]`` and
    ``pis[i]`` are resampled.

    Parameters
    ----------
    x_data : array-like of shape (N, V)
        Count matrix; converted with ``np.asarray``.
    K : int
        Number of topics.
    alpha : float
        Dirichlet concentration added to the topic counts when resampling
        ``pis[i]``.
    beta : float
        Shape of the Gamma prior on the Poisson rates (added to the summed
        counts of a topic).
    num_iters : int, default 1000
        Number of full sweeps over the rows.

    Returns
    -------
    tuple
        ``(topic_assignments, topic_word_counts, topic_counts, lambdas, pis)``
        with shapes ``(N,)``, ``(K, V)``, ``(K,)``, ``(K, V)`` and ``(N, K)``.
    """
    # Rows are accessed positionally below, which requires an ndarray.
    x_data = np.asarray(x_data)
    N, V = x_data.shape
    topic_assignments, topic_word_counts, topic_counts, lambdas, pis = initialize_params(x_data, K, alpha)

    for _ in tqdm(range(num_iters)):
        for i in range(N):
            x_i = x_data[i]

            # Take row i out of the statistics of its current topic.
            current_topic = topic_assignments[i]
            topic_counts[current_topic] -= 1
            topic_word_counts[current_topic] -= x_i

            # Log-likelihood of row i under every topic in one broadcast call
            # ((V,) against (K, V)) instead of K separate logpmf calls.
            log_topic_probs = poisson.logpmf(x_i, mu=lambdas).sum(axis=1) + np.log(pis[i])

            # Subtract the max log probability to avoid numerical instability
            log_topic_probs = log_topic_probs - np.max(log_topic_probs)
            topic_probs = np.exp(log_topic_probs)
            topic_probs /= topic_probs.sum()

            # Sample the new topic from the resulting categorical distribution
            new_topic = np.random.choice(K, p=topic_probs)

            topic_assignments[i] = new_topic
            topic_counts[new_topic] += 1
            topic_word_counts[new_topic] += x_i

            # Resample the rates of the new topic from their Gamma posterior.
            # With n_k Poisson observations per column the posterior rate is
            # (prior rate + n_k); the prior rate is taken to be 1, matching
            # the unit scale used before. Leaving scale at 1.0 would make the
            # rates grow with the number of rows in the topic.
            posterior_scale = 1.0 / (1.0 + topic_counts[new_topic])
            lambdas[new_topic] = gamma.rvs(a=topic_word_counts[new_topic] + beta, scale=posterior_scale)

            # Resample pi for the current row; rvs returns shape (1, K).
            pis[i] = dirichlet.rvs(alpha + topic_counts)[0]

    return topic_assignments, topic_word_counts, topic_counts, lambdas, pis

In [75]:

# Set parameters
K = 10  # Number of topics
alpha = 1.0  # Dirichlet prior parameter
beta = 1.0  # Gamma prior parameter
num_iters = 1000  # Number of iterations
seed = 0  # Seed for NumPy's global RNG

# Seed NumPy's global RNG before sampling so a rerun from a fresh kernel
# reproduces the same chain.
np.random.seed(seed)

x_data_np = x_data.values
# Run the Gibbs sampler
print(x_data_np.shape)
# gibbs_sampling returns five arrays; unpacking only four raises
# "ValueError: too many values to unpack" after the whole run has finished.
topic_assignments, topic_word_counts, topic_counts, lambdas, pis = gibbs_sampling(x_data_np, K, alpha, beta, num_iters)


(2246, 10473)


  0%|          | 0/1000 [00:09<?, ?it/s]


KeyboardInterrupt: 