In [187]:
from matplotlib import pyplot as plt
from scipy.stats import multinomial,gamma,dirichlet,poisson
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.special import psi  
from scipy.special import digamma, gammaln

In [323]:
class CAVI():
    def __init__(self, data,pi_a,gamma_a,gamma_b,K):

        # Number of Topics
        self.K = K

        # For Data & Size
        self.data = data
        self.N, self.V = data.shape

        # For PI (PI ~ Dirichlet)
        self.pi_a = pi_a

        # For Lambda (Lambda ~ Gamma)
        self.gamma_a = gamma_a
        self.gamma_b = gamma_b

        self.elbo = []

    def init_params(self):
        self.pis = [self.pi_a] * self.K
        self.pis = self.pis / np.sum(self.pis)
        self.lambdas = gamma.rvs(self.gamma_a, 1 / self.gamma_b, size=(self.K,self.V))
        self.Z = np.zeros((self.N, self.K))
        for i in range(self.N):
            self.Z[i] = multinomial.rvs(n=1, p=self.pis)
        self.expected_counts = np.dot(self.Z.T, self.data)
        print('----initialize random variables finished----')
        #print(f'PIS: {self.pis}, Lambdas: {self.lambdas}, Z:{self.Z}')

    def update(self):
        
        self.update_z()
        self.expected_counts = np.dot(self.Z.T, self.data)
        self.update_pi()
        self.update_lambda()
        
        #for k in range(self.K):
        #    print(self.lambdas[k])
        #print('----update parameters finished----')
        
    
    def update_pi(self):
        self.pis = np.sum(self.Z, axis=0) + self.pi_a
        self.pis = self.pis / np.sum(self.pis)
    
    def update_lambda(self):
        for k in range(self.K):
            self.lambdas[k] = (self.expected_counts[k] + self.gamma_a) / (self.gamma_b + np.sum(self.Z[:, k]))
            #print(self.expected_counts[k],np.sum(self.Z[:, k]))

    def update_z(self):
        for i in range(self.N):  # for each document
            self.Z[i] = np.exp(psi(self.pis) + np.dot(self.data[i], (psi(self.lambdas.T) - np.log(self.lambdas.T))))
            self.Z[i] /= np.sum(self.Z[i])  # normalize
        self.Z = np.eye(self.Z.shape[1])[np.argmax(self.Z, axis=1)]
     

    def calculate_elbo(self):
        expected_log_likelihood = self.compute_log_likelihood()
        expected_entropy = self.compute_entropy()
        return expected_log_likelihood - expected_entropy
    
    
    
    def compute_log_likelihood(self):
        exp_poisson = np.sum(poisson.logpmf(self.data,np.dot(self.Z,self.lambdas)))
        exp_gamma = np.sum(gamma.logpdf(self.lambdas, self.gamma_a, scale= 1/self.gamma_b))
        exp_multinomial = np.sum(multinomial.logpmf(self.Z, n=1, p=self.pis))
        #print(f'{self.pis}')
        exp_dirichlet = dirichlet.logpdf(self.pis, np.ones(self.K)*self.pi_a)
        
        return exp_poisson + exp_multinomial + exp_dirichlet + exp_gamma
    
    def compute_entropy(self):
        exp_log_q_Z = np.sum(multinomial.logpmf(self.Z, n=1, p=self.pis))
        exp_log_q_pi = dirichlet.logpdf(self.pis, np.ones(self.K)*self.pi_a)
        exp_log_q_lambda = np.sum(gamma.logpdf(self.lambdas, self.gamma_a, scale=1/self.gamma_b))

        return exp_log_q_Z + exp_log_q_pi + exp_log_q_lambda

    def fit(self, max_iter=100):
        self.init_params()
        print('----start iteration----')
        for i in tqdm(range(max_iter),total=max_iter,desc='VI',ncols=100,ascii=' =',leave=True):
            self.update()
            self.elbo.append(self.calculate_elbo())
            if (len(self.elbo) >= 2) and (np.abs(self.elbo[-1] - self.elbo[-2]) < 1e-7):
                print(i)
                break
        print('----finish iteration----')


In [275]:
data = pd.DataFrame()
vocabs = []
with open('./vocab.txt', 'r') as f:
    for line in f:
        vocabs.append(line.split()[0])

rows = []
count = []
with open('./ap.dat', 'r') as f:
    for line in f:
        tmp = line.split(' ')
        count.append(tmp[0])
        tmp = tmp[1:]
        row = [0] * len(vocabs)
        for elem in tmp:
            index, value = elem.split(':')
            row[int(index)] = int(value)
        rows.append(row)
            
x_data = pd.DataFrame(rows, columns = vocabs)

In [276]:
vocabs = []
with open('./vocab.txt', 'r') as f:
    for line in f:
        vocabs.append(line.strip())
        
tmp = x_data.sum()
tmp = np.array(tmp)
to_be_pruned = []
for idx, elem in enumerate(tmp):
    if elem > 1000 or elem < 10:
        to_be_pruned.append(idx)

for i in sorted(to_be_pruned, reverse=True):
    del vocabs[i]
    
pruned_x = x_data[vocabs]

In [277]:
pruned_x.sum()

officials      1000
soviet          999
united          998
bush            949
time            948
               ... 
chicken          10
homosexuals      10
rocked           10
locate           10
frohnmayer       10
Length: 7262, dtype: int64

In [345]:
gamma_a = 1
gamma_b = 2
pi_a = 0.1
K = 8

In [346]:
# cavi = CAVI(data=x_data.values,pi_a=pi_a,gamma_a=gamma_a,gamma_b=gamma_b,K=K)
cavi = CAVI(data=pruned_x.values,pi_a=pi_a,gamma_a=gamma_a,gamma_b=gamma_b,K=K)

In [347]:
cavi.fit(max_iter=100)

----initialize random variables finished----
----start iteration----


  self.Z[i] /= np.sum(self.Z[i])  # normalize
VI:   6%|===                                                        | 6/100 [00:33<08:45,  5.59s/it]

6
----finish iteration----





In [348]:
cavi.lambdas.shape

(8, 7262)

In [351]:
top_word_index = {}
for idx, ld in enumerate(cavi.lambdas):
    top_word_index[idx] =  sorted(range(len(ld)), key=lambda x: ld[x])[-30:]
    top_word_index[idx].reverse()

toplist = pd.DataFrame()
for topic in top_word_index:
    # print(f"Topic K={topic}:   ", )
    tmp =[]
    for idx in top_word_index[topic]:
        tmp.append(vocabs[idx])
    toplist[topic] = tmp

In [352]:
toplist

Unnamed: 0,0,1,2,3,4,5,6,7
0,soviet,index,national,frohnmayer,killed,inc,stock,billboard
1,united,share,hunter,locate,plane,company,points,test
2,officials,points,stamps,rocked,air,billion,exchange,publications
3,bush,exchange,shark,homosexuals,official,military,reported,transmission
4,time,shares,n,chicken,reported,died,news,copyright
5,three,financial,williams,bankrupt,today,agreement,agency,lithuania
6,billion,stock,border,wishes,killing,american,thursday,permission
7,today,rose,th,hinted,died,officer,service,appear
8,told,volume,rates,lengthy,statement,communist,market,popular
9,national,times,man,rid,airport,director,officials,magazine


In [364]:
pruned_x

(2246, 7262)

In [369]:
documents = []

for doc_vector in pruned_x.values:
    document = []
    for word_idx, count in enumerate(doc_vector):
        document += [vocabs[word_idx]]*count
    documents.append(document)

['three', 'federal', 'friday', 'just', 'spokesman', 'dont', 'like', 'like', 'four', 'four', 'found', 'found', 'meeting', 'chief', 'ago', 'killed', 'killed', 'yearold', 'yearold', 'yearold', 'good', 'good', 'high', 'long', 'school', 'school', 'school', 'school', 'school', 'school', 'school', 'family', 'death', 'know', 'see', 'authorities', 'charges', 'died', 'fire', 'saturday', 'saturday', 'students', 'students', 'students', 'students', 'students', 'ms', 'little', 'th', 'went', 'condition', 'big', 'george', 'outside', 'private', 'didnt', 'didnt', 'building', 'investigation', 'believe', 'morning', 'trying', 'arrested', 'shot', 'shot', 'shot', 'shot', 'shot', 'third', 'third', 'charged', 'officers', 'lot', 'senior', 'name', 'name', 'church', 'release', 'main', 'running', 'wall', 'appeared', 'spent', 'parents', 'parents', 'murder', 'murder', 'primary', 'serious', 'identified', 'identified', 'body', 'student', 'student', 'request', 'fired', 'fired', 'schools', 'apparently', 'apparently', 'w

In [370]:
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

# 가정: documents는 전체 문서 데이터를 담고 있습니다.
dictionary = Dictionary(documents)

# 각 토픽의 상위 단어들을 리스트로 변환합니다.
topics = toplist.values.tolist()

# Coherence 모델을 생성하고 Coherence 점수를 계산합니다.
cm = CoherenceModel(topics=topics, dictionary=dictionary, texts=documents, coherence='c_v')

# coherence 점수를 출력합니다.
print('Coherence Score: ', cm.get_coherence())