In [2]:
from src.vectorize import *
from src.lda import *
import pandas as pd
import numpy as np
from keras.utils import to_categorical
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import random




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yanis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import des données et pré-processing

On importe 10 000 articles Wikipedia.

In [3]:
data = pd.read_csv("C:\\Users\\yanis\\OneDrive\\Documents\\ENSAE 3A\\Sequential MC\\SSM-PROJECT\\wiki_data.csv",
                    encoding='utf-8',
                    delimiter=";")
del data["Unnamed: 0"]
display(data.head())
print("Number of articles: {}".format(len(data)))

Unnamed: 0,Text,Title
0,"Joseph Harold Greenberg (May 28, 1915 – May 7,...",Joseph Greenberg
1,"Pauline Donalda, (March 5, 1882 – October 22,...",Pauline Donalda
2,This is a list of German football transfers in...,List of German football transfers summer 2017
3,"Lester Hudson III (born August 7, 1984) is an ...",Lester Hudson
4,"Monique Ganderton (born August 6, 1980) is a C...",Monique Ganderton


Number of articles: 10000


1. On tokenize chaque article: "Joseph Harold Greenberg" devient ["Jospeh", "Harold", "Greenberg"]
2. On filtre les articles: on retire les articles composés de moins de 500 mots
3. On vectorise le texte: on associe à chaque mot son indice dans le vocabulaire général.

Exemple: si tous les articles peuvent contenir comme mots: ["pomme", "poire", "chocolat", "eau", "banane"], l'article ["chocolat", "banane", "pomme"] devient [2, 4, 0]

In [4]:
vectorized_data=vectorize_data(data=data, min_number_words=500)
display(vectorized_data.head())

Unnamed: 0,Text,Title,tokenized_text,vectorized_text
0,"Joseph Harold Greenberg (May 28, 1915 – May 7,...",Joseph Greenberg,"[Joseph, Harold, Greenberg, (, May, 28, ,, 191...","[68836, 61034, 58682, 1182, 80975, 13183, 1242..."
3,"Lester Hudson III (born August 7, 1984) is an ...",Lester Hudson,"[Lester, Hudson, III, (, born, August, 7, ,, 1...","[75482, 63781, 64659, 1182, 134097, 26662, 181..."
4,"Monique Ganderton (born August 6, 1980) is a C...",Monique Ganderton,"[Monique, Ganderton, (, born, August, 6, ,, 19...","[84017, 56054, 1182, 134097, 26662, 17386, 124..."
6,The white bikini of Ursula Andress (also known...,White bikini of Ursula Andress,"[The, white, bikini, of, Ursula, Andress, (, a...","[115808, 188693, 133263, 165140, 120335, 24145..."
15,"""Breakout"" is a single from British act Swing ...",Breakout (Swing Out Sister song),"[``, Breakout, '', is, a, single, from, Britis...","[127812, 32984, 6, 156142, 127813, 178055, 149..."


En attendant d'avoir $z^*_{1:T}$, on applique le LDA sur le corpus avec K=50 topics. Chaque mot a alors un poids relativement à chaque topic.

Exemple: le mot banane a les poids suivant: {"topic 1": 0.3, "topic 2": 0.1, "topic 3": 0.05,...}. 

On associe au mot le topic avec le poids le plus élevé. En l'occurence 1. De cette façon, on crée index_topic=$(z_1,...,z_T)$

In [5]:
NUM_TOPICS=50
lda=LDA(num_topics=NUM_TOPICS, random_state=123)
vectorized_data_lda=lda.run(vectorized_data)
display(vectorized_data_lda.head())

Unnamed: 0,Text,Title,tokenized_text,vectorized_text,index_topic
0,"Joseph Harold Greenberg (May 28, 1915 – May 7,...",Joseph Greenberg,"[Joseph, Harold, Greenberg, (, May, 28, ,, 191...","[68836, 61034, 58682, 1182, 80975, 13183, 1242...","[21, 21, 24, 21, 18, 18, 7, 18, 6, 18, 21, 7, ..."
3,"Lester Hudson III (born August 7, 1984) is an ...",Lester Hudson,"[Lester, Hudson, III, (, born, August, 7, ,, 1...","[75482, 63781, 64659, 1182, 134097, 26662, 181...","[21, 16, 18, 21, 11, 39, 21, 7, 16, 21, 10, 15..."
4,"Monique Ganderton (born August 6, 1980) is a C...",Monique Ganderton,"[Monique, Ganderton, (, born, August, 6, ,, 19...","[84017, 56054, 1182, 134097, 26662, 17386, 124...","[30, 30, 21, 11, 39, 27, 7, 16, 21, 10, 7, 1, ..."
6,The white bikini of Ursula Andress (also known...,White bikini of Ursula Andress,"[The, white, bikini, of, Ursula, Andress, (, a...","[115808, 188693, 133263, 165140, 120335, 24145...","[16, 7, 11, 34, 13, 13, 21, 19, 3, 15, 1, 34, ..."
15,"""Breakout"" is a single from British act Swing ...",Breakout (Swing Out Sister song),"[``, Breakout, '', is, a, single, from, Britis...","[127812, 32984, 6, 156142, 127813, 178055, 149...","[2, 2, 27, 10, 7, 2, 32, 41, 39, 2, 2, 6, 1, 4..."


On applique un padding, i.e une longueur maximale sur les articles ($x$) et les indices de topics ($z$). De façon arbitraire, on fixe le padding à 1000.
* Si l'article/la liste de topics est composé de plus de 1000 caractères alors, on supprime les derniers
* Si l'article/la liste de topics est composé de moins de 1000 caractères alors, on ajoute des 0 à la fin
Cette opération est faite car la taille d'inpput de LSTM est unique

In [6]:
SEQUENCE_LENGTH=1000

def apply_padding(sequence, max_length=SEQUENCE_LENGTH):
    padded_sequence = sequence[:max_length] + [0] * max(0, max_length - len(sequence))
    return padded_sequence

In [7]:
vectorized_data_lda_padding=vectorized_data_lda.copy()
vectorized_data_lda_padding['tokenized_text'] = vectorized_data_lda_padding['tokenized_text'].apply(apply_padding)
vectorized_data_lda_padding['vectorized_text'] = vectorized_data_lda_padding['vectorized_text'].apply(apply_padding)
vectorized_data_lda_padding['index_topic'] = vectorized_data_lda_padding['index_topic'].apply(apply_padding)

On définit un one hot encoding des listes d'indices de topics. Chaque liste $z$, de longueur SEQUENCE_LENGTH=200, est convertie en une matrice de dimensions SEQUENCE_LENGTH*NUM_TOPICS = $1000 \times 50$. L'élement $(i,j)$ de cette matrice vaut 1 si le i-ème topic de la séquence correspond au j-ème topic dans l'ensemble des topics.

In [8]:
def one_hot_encode_list(topic_list, vocab_size):
    return to_categorical(topic_list, num_classes=vocab_size)

In [9]:
array_z=vectorized_data_lda_padding['index_topic'].values
array_z_one_hot_encoded = np.array([one_hot_encode_list(lst, NUM_TOPICS) for lst in array_z])

# LSTM

array_z_one_hot_encoded correspond à l'échantillon de topics $\mathcal{D}_z$ one-hot encodés. Chaque liste de topics $z=(z_1,...,z_T)$ de l'échantillon est une matrice de dimensions $1000 \times 50$. Lorsque l'on aura implémenté le gibbs sampler, cet échantillon n'existera plus: en réalité, les $z$ ne sont pas observés. On remplacera l'échantillon par $z^*_{1:T}$.

On prend un $z$ de l'échantillon en faisant comme si c'était un $z^*$.

In [10]:
z_one_hot=array_z_one_hot_encoded[0]
print(z_one_hot.shape)

(1000, 50)


In [11]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, topics, model_length):
        self.topics=topics
        self.model_length=model_length

    def __len__(self):
        return len(self.topics)-self.model_length

    def __getitem__(self, index):
        input_sequence=torch.tensor(self.topics[index:index+self.model_length, :])
        target_sequence=torch.tensor(self.topics[index+1:index+self.model_length+1, :])

        return input_sequence, target_sequence

In [12]:
import torch.nn.functional as F

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, model_length):
        super(LSTM, self).__init__()
        self.input_size = input_size #dimension d'entrée (NUM_TOPICS)
        self.hidden_size = hidden_size #nombre de neurones de la couche cachée
        self.output_size = output_size #dimension d'outputs (NUM_TOPICS)
        self.model_length=model_length

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        # self.softmax = nn.Softmax(dim=1) 

    def forward(self, x, prev_state):
        output, state = self.lstm(x, prev_state)
        output=self.fc(output)
        probabilities = F.softmax(output[:, -1, :], dim=1)
        return probabilities, state
    
    def init_state(self):
        return (torch.zeros(1, 1, self.hidden_size), #(NUM_LAYERS, BATCH SIZE, NUM_NEURONES)
                torch.zeros(1, 1, self.hidden_size))
    
    def train_model(self, dataset, optimizer, criterion):
        state_h, state_c = self.init_state()
        self.train()
        for t, (x, y) in enumerate(dataset):
            optimizer.zero_grad()
            softmax , (state_h, state_c) = self(x, (state_h, state_c)) #softmax= p(z_{t+1}|z_1:t)
            loss = criterion(softmax, y[:, -1, :])
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            print({'loss': loss.item() })
    
    def predict_next_probability(self, input_sequence):
        state_h, state_c = self.init_state()

        # Forward pass jusqu'à t-1
        for t in range(len(input_sequence)):
            input_t = input_sequence[t].unsqueeze(0).unsqueeze(0)
            _, (state_h, state_c) = self(input_t, (state_h, state_c))

        # Obtenez les probabilités pour x_t
        input_t = input_sequence[-1].unsqueeze(0).unsqueeze(0)
        probabilities, _ = self(input_t, (state_h, state_c))

        return probabilities
    
    def sample_next_z(self, input_sequence):
        proba = self.predict_next_probability(input_sequence)
        return torch.multinomial(proba, 1).item()

In [13]:
MODEL_LENGTH=200
dataset=Dataset(topics=z_one_hot, model_length=MODEL_LENGTH)
dataloader = DataLoader(dataset, batch_size=1)
print(dataset.__getitem__(0)[0].shape)
print(dataset.__getitem__(0)[1].shape)

torch.Size([200, 50])
torch.Size([200, 50])


In [14]:
HIDDEN_SIZE=64
model=LSTM(input_size=NUM_TOPICS, hidden_size=HIDDEN_SIZE, output_size=NUM_TOPICS, model_length=MODEL_LENGTH)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [15]:
model.train_model(dataloader, optimizer, criterion)

{'loss': 3.9110419750213623}
{'loss': 3.9143152236938477}
{'loss': 3.913928985595703}
{'loss': 3.9124531745910645}
{'loss': 3.9105522632598877}
{'loss': 3.9129912853240967}
{'loss': 3.9142117500305176}
{'loss': 3.9132254123687744}
{'loss': 3.910290479660034}
{'loss': 3.9109063148498535}
{'loss': 3.910454750061035}
{'loss': 3.913980484008789}
{'loss': 3.9103245735168457}
{'loss': 3.9103775024414062}
{'loss': 3.9103240966796875}
{'loss': 3.911623239517212}
{'loss': 3.9108684062957764}
{'loss': 3.910801410675049}
{'loss': 3.913306713104248}
{'loss': 3.9101240634918213}
{'loss': 3.9120521545410156}
{'loss': 3.910109519958496}
{'loss': 3.912790060043335}
{'loss': 3.912604331970215}
{'loss': 3.9132800102233887}
{'loss': 3.9147961139678955}
{'loss': 3.9139156341552734}
{'loss': 3.9151620864868164}
{'loss': 3.915220260620117}
{'loss': 3.912921905517578}
{'loss': 3.914076566696167}
{'loss': 3.9107680320739746}
{'loss': 3.9135546684265137}
{'loss': 3.910661458969116}
{'loss': 3.91335129737854}
{

In [16]:
z_one_hot_tensor = torch.FloatTensor(z_one_hot)
z_one_hot_tensor.shape

torch.Size([1000, 50])

In [17]:
model.predict_next_probability(z_one_hot_tensor)

tensor([[0.0187, 0.0221, 0.0180, 0.0207, 0.0183, 0.0178, 0.0196, 0.0219, 0.0235,
         0.0221, 0.0198, 0.0218, 0.0238, 0.0207, 0.0188, 0.0213, 0.0204, 0.0211,
         0.0187, 0.0181, 0.0223, 0.0188, 0.0229, 0.0231, 0.0184, 0.0182, 0.0190,
         0.0175, 0.0184, 0.0192, 0.0171, 0.0211, 0.0201, 0.0184, 0.0171, 0.0184,
         0.0188, 0.0213, 0.0188, 0.0205, 0.0212, 0.0187, 0.0181, 0.0213, 0.0214,
         0.0198, 0.0215, 0.0189, 0.0222, 0.0204]], grad_fn=<SoftmaxBackward0>)

In [18]:
model.predict_next_probability(z_one_hot_tensor).detach().numpy()[0]

array([0.01866646, 0.02212409, 0.01800099, 0.02065765, 0.01831837,
       0.01777819, 0.01962738, 0.02190126, 0.02348709, 0.02208743,
       0.01979189, 0.02176312, 0.02375273, 0.02074218, 0.01878493,
       0.02132924, 0.02036189, 0.02106654, 0.01868558, 0.01806166,
       0.02233759, 0.01879182, 0.02288273, 0.02312025, 0.01842999,
       0.01815892, 0.01897769, 0.01746651, 0.01839547, 0.01919946,
       0.01708745, 0.02105109, 0.02012437, 0.01837844, 0.01713775,
       0.01840677, 0.01877698, 0.02133535, 0.01881841, 0.02047467,
       0.02121725, 0.01867016, 0.01805095, 0.02134924, 0.0213901 ,
       0.0198335 , 0.02154742, 0.01893549, 0.02224816, 0.02041735],
      dtype=float32)

In [19]:
model.sample_next_z(z_one_hot_tensor)

45

## SSM

In [20]:
class SSM:
    def __init__(self, num_words, num_topics, T):
        self.num_words = num_words
        self.num_topics = num_topics
        self.T = T
        self.phi = np.zeros((T, num_words, num_topics))

    def compute_MLE_SSM(self, ech_x, ech_z):
        def compute_MLE_SSM_time_t_zt(t, z_t, ech_x, ech_z, num_words):
            phi = np.zeros(num_words)
            list_probas = []
            for j in range(1,num_words+1):
                num = len(np.where((ech_x[:,t-1]==j)&(ech_z[:,t-1]==z_t))[0])/ech_x.shape[0]
                phi[j-1] = num
                list_probas.append(num)

            denom = np.sum(np.array(list_probas))
            phi = phi / (denom + 1e-6)
            return phi
        
        def compute_MLE_SSM_time_t(t, ech_x, ech_z, num_words, num_topics):
            phi = np.zeros((num_words, num_topics))
            for k in range(1, num_topics+1):
                z_t = k
                phi_zt = compute_MLE_SSM_time_t_zt(t=t, z_t=z_t, ech_x=ech_x, ech_z=ech_z, num_words=num_words)
                phi[:,k-1] = phi_zt
            return phi
        
        for t in range(self.T):
            self.phi[t] = compute_MLE_SSM_time_t(t, ech_x, ech_z, self.num_words, self.num_topics)
    
    def predict_proba(self, t, z_t):
        return self.phi[t-1][:,z_t-1]
    
    def sample_xt(self, t, z_t):
        proba = self.predict_proba(t, z_t)
        sampled_xt = np.random.choice(len(proba), p=proba)
        return sampled_xt
        

In [21]:
num_samples = 10
T = 200
ech_z = np.random.randint(1, NUM_TOPICS, size=(num_samples, T))
ech_z

array([[43, 12, 25, ..., 15, 34, 44],
       [46,  1, 25, ...,  5, 26, 48],
       [ 5, 45,  1, ..., 22, 28,  6],
       ...,
       [25, 15, 17, ..., 21, 17,  8],
       [25, 12, 39, ..., 27, 44, 43],
       [21, 47, 40, ..., 30, 42,  1]])

In [22]:
NUM_WORDS = 500
ech_x = np.random.randint(1, NUM_WORDS, size=(num_samples, T))
ech_x

array([[ 70, 490, 153, ..., 126,  29, 403],
       [ 60, 345, 163, ..., 147,  26, 187],
       [ 84, 456, 115, ..., 461, 147,  18],
       ...,
       [366, 224, 215, ...,  97, 237,  65],
       [376, 166, 441, ..., 408, 367,  78],
       [195,  82, 337, ..., 375, 436, 119]])

In [23]:
ssm = SSM(num_words=NUM_WORDS, num_topics=NUM_TOPICS, T=200)

In [24]:
ssm.compute_MLE_SSM(ech_x, ech_z)
ssm.phi.shape

(200, 500, 50)

$\phi[t,i,j]=$ probabilité au temps $t$ que le mot $i$ apparaisse sachant le topic $j$

In [25]:
zt=10
t=4
ssm.predict_proba(t, zt).shape

(500,)

## Particle Gibbs

In [150]:
def compute_alpha_unnormalized(t, z_1_t_minus_1, num_topics, num_voc, lstm, ssm):
    z_1_t_minus_1=z_1_t_minus_1-1
    z_one_hot = to_categorical(z_1_t_minus_1, num_classes=num_topics)
    z_one_hot_tensor = torch.FloatTensor(z_one_hot)
    softmax = lstm.predict_next_probability(z_one_hot_tensor).detach().numpy()[0]
    phi_t = ssm.phi[t-1]
    alpha = np.array([np.dot(softmax, phi_t[j,:]) for j in range(num_voc)])
    return alpha

In [151]:
def compute_alpha_normalized(t, z_1_t_minus_1, num_topics, num_voc, lstm, ssm):
    num = compute_alpha_unnormalized(t, z_1_t_minus_1, num_topics, num_voc, lstm, ssm)
    denom = np.sum(num)
    return num/(denom+1e-6)

In [152]:
def compute_gamma_unnormalized(t, xt, z_1_t_minus_1, num_topics, lstm, ssm):
    z_1_t_minus_1=z_1_t_minus_1-1
    z_one_hot = to_categorical(z_1_t_minus_1, num_classes=num_topics)
    z_one_hot_tensor = torch.FloatTensor(z_one_hot)
    softmax = lstm.predict_next_probability(z_one_hot_tensor).detach().numpy()[0]
    phi_t = ssm.phi[t-1]
    phi_xt = phi_t[xt-1, :]
    return np.multiply(softmax, phi_xt)

In [153]:
def compute_gamma_normalized(t, xt, z_1_t_minus_1, num_topics, lstm, ssm):
    num = compute_gamma_unnormalized(t, xt, z_1_t_minus_1, num_topics, lstm, ssm)
    denom = np.sum(num)
    return num/(denom+1e-6)

In [154]:
##Initialisation
P=50
NUM_TOPICS=50
NUM_WORDS=200
T=20

lstm = model
ssm = ssm

Z_matrix=np.zeros((P, T+1))
alpha_matrix=np.zeros((P,T+1))
ancestor_matrix=np.zeros((P,T+1))


z_1_T_star = np.random.choice(a=range(1,NUM_TOPICS+1), size=T)
x = np.random.choice(a=range(1,NUM_WORDS+1), size=T)

In [None]:
def particle_gibbs(P, num_topics, num_words, T, lstm_model, ssm_model, x, previous_z_1_T_star):
    ##Init
    Z_matrix=np.zeros((P, T+1))
    alpha_matrix=np.zeros((P,T+1))
    ancestor_matrix=np.zeros((P,T+1))
    ##t=0
    z_0 = np.random.choice(a=range(1,num_topics+1), size=P)
    alpha_0 = np.repeat(1/P, P)
    Z_matrix[:,0] = z_0
    alpha_matrix[:,0] = alpha_0

    #z[k:n]: du k-ème au n-1 ème
    for t in range(1,T+1):
        print(t)
        a_t_minus_1 = 1 #ok
        z_1_t = previous_z_1_T_star[:t] #ok
        ancestor_matrix[0,t-1] = a_t_minus_1 #ok
        Z_matrix[0, 1:t+1] = z_1_t #ok

        for p in range(2,P+1):
            alpha_t_minus_1_p=alpha_matrix[:,t-1]
            #a_t_minus_1_p = np.random.choice(a=range(1,P+1), p=alpha_t_minus_1_p, size=1)[0] #ok
            a_t_minus_1_p = np.argmax(alpha_t_minus_1_p)+1
            ancestor_matrix[p-1, t-1] = a_t_minus_1_p #ok
            if t ==1:
                z_1_t_minus_1_a_t_minus_1_p = Z_matrix[int(a_t_minus_1_p)-1, 0] #ok
                z_1_t_minus_1_a_t_minus_1_p = np.array([z_1_t_minus_1_a_t_minus_1_p]) #ok
                gamma_t_p = compute_gamma_normalized(t=t,
                                                xt=x[t-1], 
                                                z_1_t_minus_1=z_1_t_minus_1_a_t_minus_1_p,
                                                num_topics = num_topics,
                                                lstm = lstm_model,
                                                ssm = ssm_model)
                z_t_p = np.argmax(gamma_t_p)+1
                z_1_t_p = z_t_p
            else:
                z_1_t_minus_1_a_t_minus_1_p = Z_matrix[int(a_t_minus_1_p)-1, 1:(t-1)+1] #ok
                gamma_t_p = compute_gamma_normalized(t=t,
                                                xt=x[t-1], 
                                                z_1_t_minus_1=z_1_t_minus_1_a_t_minus_1_p,
                                                num_topics = num_topics,
                                                lstm = lstm_model,
                                                ssm = ssm_model)
                z_t_p = np.argmax(gamma_t_p)+1
                #z_t_p = np.random.choice(a=range(1, NUM_TOPICS+1), p=gamma_t_p, size=1)[0]
                z_1_t_p = np.append(z_1_t_minus_1_a_t_minus_1_p, z_t_p)
            
            Z_matrix[p-1, 1:t+1] = z_1_t_p

        
        for p in range(1, P+1):
            a_t_minus_1_p = ancestor_matrix[p-1, t-1]
            if t ==1:
                z_1_t_minus_1_a_t_minus_1_p = Z_matrix[int(a_t_minus_1_p)-1, 0] #ok
                z_1_t_minus_1_a_t_minus_1_p = np.array([z_1_t_minus_1_a_t_minus_1_p]) #ok
            else:
                z_1_t_minus_1_a_t_minus_1_p = Z_matrix[int(a_t_minus_1_p)-1, 1:(t-1)+1]
            
            alpha_t_p = compute_alpha_normalized(t=t,
                                                z_1_t_minus_1=z_1_t_minus_1_a_t_minus_1_p,
                                                num_topics=num_topics,
                                                num_voc=num_words,
                                                lstm=lstm,
                                                ssm=ssm)
            alpha_t_p = alpha_t_p[x[t-1]-1]
            alpha_matrix[p-1,t] = alpha_t_p
    alpha_T=alpha_matrix[:,-1]
    r = np.argmax(alpha_T)+1
    alpha_T_r = alpha_T[r-1, -1]
    z_1_T = Z_matrix[alpha_T_r-1, 1:]
    return z_1_T

In [155]:
##t=0
z_0 = np.random.choice(a=range(1,NUM_TOPICS+1), size=P)
alpha_0 = np.repeat(1/P, P)
Z_matrix[:,0] = z_0
alpha_matrix[:,0] = alpha_0

#z[k:n]: du k-ème au n-1 ème
for t in range(1,T+1):
    print(t)
    a_t_minus_1 = 1 #ok
    z_1_t = z_1_T_star[:t] #ok
    ancestor_matrix[0,t-1] = a_t_minus_1 #ok
    Z_matrix[0, 1:t+1] = z_1_t #ok

    for p in range(2,P+1):
        alpha_t_minus_1_p=alpha_matrix[:,t-1]
        #a_t_minus_1_p = np.random.choice(a=range(1,P+1), p=alpha_t_minus_1_p, size=1)[0] #ok
        a_t_minus_1_p = np.argmax(alpha_t_minus_1_p)+1
        ancestor_matrix[p-1, t-1] = a_t_minus_1_p #ok
        if t ==1:
            z_1_t_minus_1_a_t_minus_1_p = Z_matrix[int(a_t_minus_1_p)-1, 0] #ok
            z_1_t_minus_1_a_t_minus_1_p = np.array([z_1_t_minus_1_a_t_minus_1_p]) #ok
            gamma_t_p = compute_gamma_normalized(t=t,
                                             xt=x[t-1], 
                                             z_1_t_minus_1=z_1_t_minus_1_a_t_minus_1_p,
                                             num_topics = NUM_TOPICS,
                                             lstm = lstm,
                                             ssm = ssm)
            z_t_p = np.argmax(gamma_t_p)+1
            z_1_t_p = z_t_p
        else:
            z_1_t_minus_1_a_t_minus_1_p = Z_matrix[int(a_t_minus_1_p)-1, 1:(t-1)+1] #ok
            gamma_t_p = compute_gamma_normalized(t=t,
                                             xt=x[t-1], 
                                             z_1_t_minus_1=z_1_t_minus_1_a_t_minus_1_p,
                                             num_topics = NUM_TOPICS,
                                             lstm = lstm,
                                             ssm = ssm)
            z_t_p = np.argmax(gamma_t_p)+1
            #z_t_p = np.random.choice(a=range(1, NUM_TOPICS+1), p=gamma_t_p, size=1)[0]
            z_1_t_p = np.append(z_1_t_minus_1_a_t_minus_1_p, z_t_p)
        
        Z_matrix[p-1, 1:t+1] = z_1_t_p

    
    for p in range(1, P+1):
        a_t_minus_1_p = ancestor_matrix[p-1, t-1]
        if t ==1:
            z_1_t_minus_1_a_t_minus_1_p = Z_matrix[int(a_t_minus_1_p)-1, 0] #ok
            z_1_t_minus_1_a_t_minus_1_p = np.array([z_1_t_minus_1_a_t_minus_1_p]) #ok
        else:
            z_1_t_minus_1_a_t_minus_1_p = Z_matrix[int(a_t_minus_1_p)-1, 1:(t-1)+1]
        
        alpha_t_p = compute_alpha_normalized(t=t,
                                             z_1_t_minus_1=z_1_t_minus_1_a_t_minus_1_p,
                                             num_topics=NUM_TOPICS,
                                             num_voc=NUM_WORDS,
                                             lstm=lstm,
                                             ssm=ssm)
        alpha_t_p = alpha_t_p[x[t-1]-1]
        alpha_matrix[p-1,t] = alpha_t_p


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [162]:
alpha_T=alpha_matrix[:,-1]
r = np.argmax(alpha_T)+1
r

1

In [161]:
alpha_T_r = alpha_T[r-1]
alpha_T_r

0.0

In [164]:
alpha_T_r = alpha_T[r-1]
z_1_T = Z_matrix[int(alpha_T_r)-1, 1:]

In [None]:
# vocab = set()

# for token in vectorized_data_lda["tokenized_text"]:
#     vocab.update([word for word in token])

# vocab = sorted(vocab)
# NUM_WORDS=len(vocab)
# print('Unique words: {}'.format(NUM_WORDS))

Unique words: 193476


In [None]:
# def compute_MLE_SSM_time_t_zt(t, z_t, ech_x, ech_z, num_words):
#     phi = np.zeros(num_words)
#     list_probas = []
#     for j in range(1,num_words+1):
#         num = len(np.where((ech_x[:,t-1]==j)&(ech_z[:,t-1]==z_t))[0])/ech_x.shape[0]
#         phi[j-1] = num
#         list_probas.append(num)

#     denom = np.sum(np.array(list_probas))
#     phi = phi / (denom + 1e-6)
#     return phi

In [None]:
# def compute_MLE_SSM_time_t(t, ech_x, ech_z, num_words, num_topics):
#     phi = np.zeros((num_words, num_topics))
#     for k in range(1, num_topics+1):
#         z_t = k
#         phi_zt = compute_MLE_SSM_time_t_zt(t=t, z_t=z_t, ech_x=ech_x, ech_z=ech_z, num_words=num_words)
#         phi[:,k-1] = phi_zt
#     return phi

In [None]:
# def compute_MLE_SSM(ech_x, ech_z, num_words, num_topics, T):
#     phi = np.empty((0, num_words, num_topics))
#     for t in range(T):
#         phi_t = compute_MLE_SSM_time_t(t, ech_x, ech_z, num_words, num_topics)
#         phi = np.concatenate([phi, phi_t[np.newaxis, :, :]])
#     return phi