In [2]:
from src.vectorize import *
from src.lda import *
import pandas as pd
import numpy as np
from keras.utils import to_categorical
import torch.distributions as dist
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yanis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import des données et pré-processing

On importe 10 000 articles Wikipedia.

In [3]:
data = pd.read_csv("C:\\Users\\yanis\\OneDrive\\Documents\\ENSAE 3A\\Sequential MC\\SSM-PROJECT\\wiki_data.csv",
                    encoding='utf-8',
                    delimiter=";")
del data["Unnamed: 0"]
display(data.head())
print("Number of articles: {}".format(len(data)))

Unnamed: 0,Text,Title
0,"Joseph Harold Greenberg (May 28, 1915 – May 7,...",Joseph Greenberg
1,"Pauline Donalda, (March 5, 1882 – October 22,...",Pauline Donalda
2,This is a list of German football transfers in...,List of German football transfers summer 2017
3,"Lester Hudson III (born August 7, 1984) is an ...",Lester Hudson
4,"Monique Ganderton (born August 6, 1980) is a C...",Monique Ganderton


Number of articles: 10000


1. On tokenize chaque article: "Joseph Harold Greenberg" devient ["Jospeh", "Harold", "Greenberg"]
2. On filtre les articles: on retire les articles composés de moins de 500 mots
3. On vectorise le texte: on associe à chaque mot son indice dans le vocabulaire général.

Exemple: si tous les articles peuvent contenir comme mots: ["pomme", "poire", "chocolat", "eau", "banane"], l'article ["chocolat", "banane", "pomme"] devient [2, 4, 0]

In [4]:
vectorized_data=vectorize_data(data=data, min_number_words=500)
display(vectorized_data.head())

Unnamed: 0,Text,Title,tokenized_text,vectorized_text
0,"Joseph Harold Greenberg (May 28, 1915 – May 7,...",Joseph Greenberg,"[Joseph, Harold, Greenberg, (, May, 28, ,, 191...","[68836, 61034, 58682, 1182, 80975, 13183, 1242..."
3,"Lester Hudson III (born August 7, 1984) is an ...",Lester Hudson,"[Lester, Hudson, III, (, born, August, 7, ,, 1...","[75482, 63781, 64659, 1182, 134097, 26662, 181..."
4,"Monique Ganderton (born August 6, 1980) is a C...",Monique Ganderton,"[Monique, Ganderton, (, born, August, 6, ,, 19...","[84017, 56054, 1182, 134097, 26662, 17386, 124..."
6,The white bikini of Ursula Andress (also known...,White bikini of Ursula Andress,"[The, white, bikini, of, Ursula, Andress, (, a...","[115808, 188693, 133263, 165140, 120335, 24145..."
15,"""Breakout"" is a single from British act Swing ...",Breakout (Swing Out Sister song),"[``, Breakout, '', is, a, single, from, Britis...","[127812, 32984, 6, 156142, 127813, 178055, 149..."


En attendant d'avoir $z^*_{1:T}$, on applique le LDA sur le corpus avec K=50 topics. Chaque mot a alors un poids relativement à chaque topic.

Exemple: le mot banane a les poids suivant: {"topic 1": 0.3, "topic 2": 0.1, "topic 3": 0.05,...}. 

On associe au mot le topic avec le poids le plus élevé. En l'occurence 1. De cette façon, on crée index_topic=$(z_1,...,z_T)$

In [5]:
NUM_TOPICS=50
lda=LDA(num_topics=NUM_TOPICS, random_state=123)
vectorized_data_lda=lda.run(vectorized_data)
display(vectorized_data_lda.head())

Unnamed: 0,Text,Title,tokenized_text,vectorized_text,index_topic
0,"Joseph Harold Greenberg (May 28, 1915 – May 7,...",Joseph Greenberg,"[Joseph, Harold, Greenberg, (, May, 28, ,, 191...","[68836, 61034, 58682, 1182, 80975, 13183, 1242...","[21, 21, 24, 21, 18, 18, 7, 18, 6, 18, 21, 7, ..."
3,"Lester Hudson III (born August 7, 1984) is an ...",Lester Hudson,"[Lester, Hudson, III, (, born, August, 7, ,, 1...","[75482, 63781, 64659, 1182, 134097, 26662, 181...","[21, 16, 18, 21, 11, 39, 21, 7, 16, 21, 10, 15..."
4,"Monique Ganderton (born August 6, 1980) is a C...",Monique Ganderton,"[Monique, Ganderton, (, born, August, 6, ,, 19...","[84017, 56054, 1182, 134097, 26662, 17386, 124...","[30, 30, 21, 11, 39, 27, 7, 16, 21, 10, 7, 1, ..."
6,The white bikini of Ursula Andress (also known...,White bikini of Ursula Andress,"[The, white, bikini, of, Ursula, Andress, (, a...","[115808, 188693, 133263, 165140, 120335, 24145...","[16, 7, 11, 34, 13, 13, 21, 19, 3, 15, 1, 34, ..."
15,"""Breakout"" is a single from British act Swing ...",Breakout (Swing Out Sister song),"[``, Breakout, '', is, a, single, from, Britis...","[127812, 32984, 6, 156142, 127813, 178055, 149...","[2, 2, 27, 10, 7, 2, 32, 41, 39, 2, 2, 6, 1, 4..."


On applique un padding, i.e une longueur maximale sur les articles ($x$) et les indices de topics ($z$). De façon arbitraire, on fixe le padding à 200.
* Si l'article/la liste de topics est composé de plus de 200 caractères alors, on supprime les derniers
* Si l'article/la liste de topics est composé de moins de 200 caractères alors, on ajoute des 0 à la fin
Cette opération est faite car la taille d'inpput de LSTM est unique

In [6]:
SEQUENCE_LENGTH=200

def apply_padding(sequence, max_length=SEQUENCE_LENGTH):
    padded_sequence = sequence[:max_length] + [0] * max(0, max_length - len(sequence))
    return padded_sequence

In [7]:
vectorized_data_lda_padding=vectorized_data_lda.copy()
vectorized_data_lda_padding['tokenized_text'] = vectorized_data_lda_padding['tokenized_text'].apply(apply_padding)
vectorized_data_lda_padding['vectorized_text'] = vectorized_data_lda_padding['vectorized_text'].apply(apply_padding)
vectorized_data_lda_padding['index_topic'] = vectorized_data_lda_padding['index_topic'].apply(apply_padding)

On définit un one hot encoding des listes d'indices de topics. Chaque liste $z$, de longueur SEQUENCE_LENGTH=200, est convertie en une matrice de dimensions SEQUENCE_LENGTH*NUM_TOPICS = $200 \times 50$. L'élement $(i,j)$ de cette matrice vaut 1 si le i-ème topic de la séquence correspond au j-ème topic dans l'ensemble des topics.

In [8]:
def one_hot_encode_list(topic_list, vocab_size):
    return to_categorical(topic_list, num_classes=vocab_size)

In [9]:
array_z=vectorized_data_lda_padding['index_topic'].values
array_z_one_hot_encoded = np.array([one_hot_encode_list(lst, NUM_TOPICS) for lst in array_z])

# LSTM

array_z_one_hot_encoded correspond à l'échantillon de topics $\mathcal{D}_z$ one-hot encodés. Chaque liste de topics $z=(z_1,...,z_T)$ de l'échantillon est une matrice de dimensions $200 \times 50$. Lorsque l'on aura implémenté le gibbs sampler, cet échantillon n'existera plus: en réalité, les $z$ ne sont pas observés. On remplacera l'échantillon par $z^*_{1:T}$.

On prend un $z$ de l'échantillon en faisant comme si c'était un $z^*$.

In [10]:
z_one_hot=array_z_one_hot_encoded[0]
print(z_one_hot.shape)

(200, 50)


In [11]:
X = z_one_hot[:-1,:]  # Toutes les lignes sauf la dernière
y = z_one_hot[1:,:]   # Toutes les lignes sauf la première

X_tensor = torch.FloatTensor(X)
y_tensor = torch.FloatTensor(y)

In [12]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, topics, labels):
        self.topics=topics
        self.labels=labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.topics[index,:], self.labels[index,:]

In [13]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.input_size = input_size #dimension d'entrée (NUM_TOPICS)
        self.hidden_size = hidden_size #nombre de neurones de la couche cachée
        self.output_size = output_size #dimension d'outputs (NUM_TOPICS)
        
        self.lstm = nn.LSTM(self.input_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.Softmax(dim=1) 

    def forward(self, x, prev_state):
        output, state = self.lstm(x, prev_state)
        output=self.fc(output)
        probabilities = self.softmax(output)
        return probabilities, state
    
    def init_state(self):
        return (torch.zeros(1, self.hidden_size),
                torch.zeros(1, self.hidden_size))

In [14]:
dataset=Dataset(topics=X_tensor, labels=y_tensor)
dataloader = DataLoader(dataset, batch_size=1)
print("x_0=z_0: {}".format(dataset.__getitem__(0)[0]))
print("y_0=z_1: {}".format(dataset.__getitem__(0)[1]))

x_0=z_0: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
y_0=z_1: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


In [15]:
HIDDEN_SIZE=64
model=LSTM(input_size=NUM_TOPICS, hidden_size=HIDDEN_SIZE, output_size=NUM_TOPICS)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [16]:
state_h, state_c = model.init_state()
for t, (z_t, z_t_next) in enumerate(dataloader):
    optimizer.zero_grad()
    softmax , (state_h, state_c) = model(z_t, (state_h, state_c)) #softmax= p(z_{t+1}|z_t)
    loss = criterion(softmax, z_t_next)

    index_z_t_next_pred = torch.multinomial(input=softmax[0], num_samples=1, replacement=True)
    z_t_next_pred= torch.eye(len(softmax[0]))[index_z_t_next_pred] #one hot encoding du topic prédit
    state_h = state_h.detach()
    state_c = state_c.detach()

    loss.backward()
    optimizer.step()

    print({ 't': t+1,'loss': loss.item() })

{'t': 1, 'loss': 3.913787841796875}
{'t': 2, 'loss': 3.909677028656006}
{'t': 3, 'loss': 3.9135024547576904}
{'t': 4, 'loss': 3.9133799076080322}
{'t': 5, 'loss': 3.913633346557617}
{'t': 6, 'loss': 3.9111168384552}
{'t': 7, 'loss': 3.9134764671325684}
{'t': 8, 'loss': 3.912024736404419}
{'t': 9, 'loss': 3.9132909774780273}
{'t': 10, 'loss': 3.9141995906829834}
{'t': 11, 'loss': 3.9113969802856445}
{'t': 12, 'loss': 3.9115915298461914}
{'t': 13, 'loss': 3.9137651920318604}
{'t': 14, 'loss': 3.911694288253784}
{'t': 15, 'loss': 3.9090847969055176}
{'t': 16, 'loss': 3.911741018295288}
{'t': 17, 'loss': 3.910820484161377}
{'t': 18, 'loss': 3.9118988513946533}
{'t': 19, 'loss': 3.9136240482330322}
{'t': 20, 'loss': 3.9113807678222656}
{'t': 21, 'loss': 3.9130139350891113}
{'t': 22, 'loss': 3.9100780487060547}
{'t': 23, 'loss': 3.91141676902771}
{'t': 24, 'loss': 3.9089269638061523}
{'t': 25, 'loss': 3.9134366512298584}
{'t': 26, 'loss': 3.9131181240081787}
{'t': 27, 'loss': 3.9124667644500

# SSM

Pour illustrer le fonctionnement, on prend le premier article vectorizé associé à la première liste de topics vectorizée. En théorie, cette liste de topics devrait être $z^*$. Pas besoin de one-hot-encoding. Le but est de trouver le maximum de vraisemblance $(\hat{\phi_z})$.

$\phi_z$ est une matrice de taille NUM_WORDS*NUM_TOPICS. $\phi_z[i,j]=$ Pr(mot i | topic j). Le MLE est donné par la contrepartie empirique de ces probabilités: $\frac{\text{Nombre de mots i dans l'échantillon associés au topic j}}{\sum_{i}\text{Nombre de mots i dans l'échantillon associés au topic j}}$

In [17]:
array_z=vectorized_data_lda_padding['index_topic'].values
z=np.array(array_z[0])
print(z.shape)
print(z)

(200,)
[21 21 24 21 18 18  7 18  6 18 21  7 16 21 28 15  7 22  7  3 19 18 29  7
 32  3 38 34  1  7 28 34 24 21 16 34 15 34 34 21 29 10 16 29 16 21 21 24
 28 11 29 18 18  7 18 44  5 15 18 16  7 18 21 21 32 18  3 29 28  2 21 15
  1 20 34 27  7 29  3  7 16 33 18 38  3 21 34 28 44  1  1 16  3  3 29 15
 21  1  1 43 43  7 29 15 44  1  7 22  1 10 19  7  2 43 21 34 43 19 34 34
 18 18 21 21  1 29  1 28  7 29 34  7 43 34  3 18 38 32  7 34 24 21 16  2
 32 38 34 18 41  7 29 28 32 15  7 43 43  3  6 21 38 19 43 34 18 34 21  1
  1 43 34 29 43 34  7 24 29 38 14  1 38 20 34 41  7  1 29 32  1 38 24 21
 16 34 34 29 34 22 28  1]


In [18]:
array_x=vectorized_data_lda_padding['vectorized_text'].values
x=np.array(array_x[0])
print(x.shape)
print(x)

(200,)
[ 68836  61034  58682   1182  80975  13183   1242   8867 192633  80975
  18147   1242  11580   1183 188139 129921  23715 158664   1242 157188
 159730 148870 152952 189165 138610 158665 185521 130019 183121 150335
 137193 165140 157659   1434  75829  48696 158437 130019 144754   1182
  79074 179225  20189  42585  11647   1183  68836  58682 188139 134097
 165380  80975  13183   1242   8867 183896  68370 166977 154473  33460
   1242  87590 126649   1434  62734 148179 151223 155610 188139 162918
   1434  26358 183121 128842 165140   5402   1242 152281 150196 127813
 168216 138612 154473 111425  60448   1434  61487 139234 183896 168675
 183121 168216 149491 183590 152952 158437   1434  22055 148116 152752
 175946   1242 152281 141424 183896 171364 127813 175938 135670 172111
 183103 127813 162928 165428   1434  61487 145676 131477  40660 120058
 154473  87590 126649   1434  47977 152952 176849 189547   1242 152281
 131571 127813 137172 182536 135001  54494  31743 138610  23715  65796

In [19]:
vocab = set()

for token in vectorized_data_lda["tokenized_text"]:
    vocab.update([word for word in token])

vocab = sorted(vocab)
NUM_WORDS=len(vocab)
print('Unique words: {}'.format(NUM_WORDS))

Unique words: 193476


In [20]:
word_topic_counts = np.zeros((NUM_WORDS, NUM_TOPICS))

for i in range(len(x)):
    word_topic_counts[x[i], z[i]] += 1

MLE_phi_z = word_topic_counts/(np.sum(word_topic_counts, axis=0)+1e-6)
print(MLE_phi_z)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
