# COLLAPSED GIBBS SAMPLING

The idea of this first part we implement a collapsed Gibbs Sampling

---

# Preprocessing

---

In [16]:
import numpy as np 
import spacy
import random
from tqdm import tqdm
from collections import Counter
import lda

In [8]:
X = lda.datasets.load_reuters()
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()

In [100]:
# Dirichlet , Hyperparamaters , tokonize
ALPHA = 0.1
BETA = 0.1
NUM_TOPICS = 600
sp = spacy.load("en_core_web_sm")

np.random.seed(42)
random.seed(42)

In [64]:
def generate_frequencies(data):
    freqs = Counter()
    all_stopwords = sp.Defaults.stop_words
    nr_tokens = 0

    for doc in data:
        for doc in titles:
            tokens = sp.tokenizer(doc)
            for token in tokens :
                token_text = token.text.lower()
                if token_text not in all_stopwords and token.is_alpha:
                    nr_tokens +=1
                    freqs[token_text] +=1
    
    return freqs
           
    

def get_vocab(freqs , freq_threeshold):
    vocab = {}
    vocab_idx_str = {}
    vocab_idx = 0

    for word in freqs :
        if freqs[word] >= freq_threeshold:
            vocab[word] = vocab_idx
            vocab_idx_str[vocab_idx] = word 
            vocab_idx += 1
    return vocab , vocab_idx_str


def tokonize_dataset(data, vocab):
    nr_tokens = 0
    nr_docs = 0
    docs = []

    for doc in data:
        tokens  = sp.tokenizer(doc)

        if len(tokens) > 1:
            doc =  []
            for token in tokens :
                token_text  = token.text.lower()
                if token_text in vocab: 
                    doc.append(token_text)
                    nr_tokens +=1
            nr_docs +=1 
            docs.append(doc)

    corpus = []
    for doc in docs:
        corpus_d = []

        for token in doc:
            corpus_d.append(vocab[token])

        corpus.append(np.asarray(corpus_d))
    return docs , corpus

In [72]:
freqs = generate_frequencies(titles)
vocabu , vocab_idx_str = get_vocab(freqs , 3)
docs , corpus = tokonize_dataset(titles , vocabu)
vocab_size = len(vocabu)

---

In [102]:
def LDA_collapsed_gibbs(corpus , num_iteration):
    Z = []
    num_docs = len(corpus)
    for _ , doc in enumerate(corpus):
        Zd  = np.random.randint(low = 0 , high = NUM_TOPICS , size = len(doc))
        Z.append(Zd)

    ndk = np.zeros((num_docs , NUM_TOPICS))
    for d in range(num_docs):
        for k in range(NUM_TOPICS):
            ndk[d , k]   = np.sum(Z[d]==k)
    
    nkw = np.zeros((NUM_TOPICS , vocab_size))
    for doc_idx , doc  in enumerate(corpus):
        for i , word in enumerate(doc):
            topic = Z[doc_idx][i]
            nkw[topic , word] += 1
    
    nk = np.sum(nkw  , axis =1 )
    topic_list = [i for i  in range(NUM_TOPICS)]

    for _ in tqdm(range(num_iteration)):
        for doc_idx, doc in enumerate(corpus):  # Fixed variable naming here
            for i, word in enumerate(doc):
                topic = Z[doc_idx][i]
                nkw[topic, word] -= 1
                ndk[doc_idx, topic] -= 1
                nk[topic] -= 1
            
            # Make sure ALPHA and BETA are correctly defined and used
            p_z = ((ndk[doc_idx, :] + ALPHA) / (np.sum(ndk[doc_idx, :]) + NUM_TOPICS * ALPHA)) * \
                  ((nkw[:, word] + BETA) / (nk + vocab_size * BETA))
                  
            new_topic = np.random.choice(topic_list, p=p_z / p_z.sum())
            Z[doc_idx][i] = new_topic
            nkw[new_topic, word] += 1
            ndk[doc_idx, new_topic] += 1
            nk[new_topic] += 1

   # return Z , ndk , nkw , nk
    return p_z


    

 

In [105]:
def LDA_collapsed_gibbs(corpus, num_iterations):
    Z = []
    num_docs = len(corpus)
    for _, doc in enumerate(corpus):
        Zd = np.random.randint(low=0, high=NUM_TOPICS, size=len(doc))
        Z.append(Zd)

    ndk = np.zeros((num_docs, NUM_TOPICS))
    for d in range(num_docs):
        for k in range(NUM_TOPICS):
            ndk[d, k] = np.sum(Z[d] == k)

    nkw = np.zeros((NUM_TOPICS, vocab_size))
    for doc_idx, doc in enumerate(corpus):
        for i, word in enumerate(doc):
            topic = Z[doc_idx][i]
            nkw[topic, word] += 1

    nk = np.sum(nkw, axis=1)
    topic_list = np.arange(NUM_TOPICS)

    for _ in tqdm(range(num_iterations)):
        for doc_idx, doc in enumerate(corpus):
            for i, word in enumerate(doc):
                topic = Z[doc_idx][i]
                nkw[topic, word] -= 1
                ndk[doc_idx, topic] -= 1
                nk[topic] -= 1

                p_z = ((ndk[doc_idx, :] + ALPHA) / (np.sum(ndk[doc_idx, :]) + NUM_TOPICS * ALPHA)) * \
                      ((nkw[:, word] + BETA) / (nk + vocab_size * BETA))

                # Ensure p_z is non-negative
                p_z = np.maximum(p_z, 0)

                # Normalize p_z to ensure it sums to 1 (if not all zeros)
                if np.sum(p_z) > 0:
                    p_z /= np.sum(p_z)
                else:
                    # Handle the case where all probabilities are 0
                    # This should ideally not happen but is included for robustness
                    p_z = np.ones(len(p_z)) / len(p_z)

                new_topic = np.random.choice(topic_list, p=p_z)
                Z[doc_idx][i] = new_topic
                nkw[new_topic, word] += 1
                ndk[doc_idx, new_topic] += 1
                nk[new_topic] += 1

    # return Z, ndk, nkw, nk
    return Z, ndk, nkw, nk


In [109]:
Z , ndk , nkw , nk  = LDA_collapsed_gibbs(corpus , 2000)

100%|██████████| 2000/2000 [02:32<00:00, 13.09it/s]


---

In [120]:
phi = nkw / nk.reshape(NUM_TOPICS , 1)
num_words = 10
for k in range(NUM_TOPICS):
    most_common_words = np.argsort(phi[k])[::-1][:num_words]
    print(f"Topic {k} most important wwords: ")

    for word in most_common_words:
        print(vocab_idx_str[word])
    
    print("\n")

Topic 0 most important wwords: 
indonesia
germany
york
madrid
quiet
frail
stocks
buddhist
prince
croatia


Topic 1 most important wwords: 
feature
return
monterey
glittering
veil
anniversary
priesthood
romanians
enjoy
colourful


Topic 2 most important wwords: 
philippines
grave
anniversary
priesthood
romanians
enjoy
colourful
choice
candidates
veil


Topic 3 most important wwords: 
russia
angeles
tsar
mourns
nuns
seven
recalls
civil
hospital
veil


Topic 4 most important wwords: 
germany
patricia
banker
thickens
priesthood
romanians
enjoy
colourful
choice
candidates


Topic 5 most important wwords: 
chicago
dallas
enduring
announces
priesthood
romanians
enjoy
colourful
choice
candidates


Topic 6 most important wwords: 
sicily
london
harriman
france
dies
zakopane
peace
berates
thickens
romanians


Topic 7 most important wwords: 
romania
riot
banker
secrecy
anniversary
priesthood
romanians
enjoy
colourful
choice


Topic 8 most important wwords: 
live
atlanta
secrecy
priesthood
romanian

  phi = nkw / nk.reshape(NUM_TOPICS , 1)
