In [1]:
from src.vectorize import *
from src.lda import *
import pandas as pd
import numpy as np
from keras.utils import to_categorical
import torch.distributions as dist
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yanis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import des données et pré-processing

On importe 10 000 articles Wikipedia.

In [2]:
data = pd.read_csv("C:\\Users\\yanis\\OneDrive\\Documents\\ENSAE 3A\\Sequential MC\\SSM-PROJECT\\wiki_data.csv",
                    encoding='utf-8',
                    delimiter=";")
del data["Unnamed: 0"]
display(data.head())
print("Number of articles: {}".format(len(data)))

Unnamed: 0,Text,Title
0,"Joseph Harold Greenberg (May 28, 1915 – May 7,...",Joseph Greenberg
1,"Pauline Donalda, (March 5, 1882 – October 22,...",Pauline Donalda
2,This is a list of German football transfers in...,List of German football transfers summer 2017
3,"Lester Hudson III (born August 7, 1984) is an ...",Lester Hudson
4,"Monique Ganderton (born August 6, 1980) is a C...",Monique Ganderton


Number of articles: 10000


1. On tokenize chaque article: "Joseph Harold Greenberg" devient ["Jospeh", "Harold", "Greenberg"]
2. On filtre les articles: on retire les articles composés de moins de 500 mots
3. On vectorise le texte: on associe à chaque mot son indice dans le vocabulaire général.

Exemple: si tous les articles peuvent contenir comme mots: ["pomme", "poire", "chocolat", "eau", "banane"], l'article ["chocolat", "banane", "pomme"] devient [2, 4, 0]

In [3]:
vectorized_data=vectorize_data(data=data, min_number_words=500)
display(vectorized_data.head())

Unnamed: 0,Text,Title,tokenized_text,vectorized_text
0,"Joseph Harold Greenberg (May 28, 1915 – May 7,...",Joseph Greenberg,"[Joseph, Harold, Greenberg, (, May, 28, ,, 191...","[68836, 61034, 58682, 1182, 80975, 13183, 1242..."
3,"Lester Hudson III (born August 7, 1984) is an ...",Lester Hudson,"[Lester, Hudson, III, (, born, August, 7, ,, 1...","[75482, 63781, 64659, 1182, 134097, 26662, 181..."
4,"Monique Ganderton (born August 6, 1980) is a C...",Monique Ganderton,"[Monique, Ganderton, (, born, August, 6, ,, 19...","[84017, 56054, 1182, 134097, 26662, 17386, 124..."
6,The white bikini of Ursula Andress (also known...,White bikini of Ursula Andress,"[The, white, bikini, of, Ursula, Andress, (, a...","[115808, 188693, 133263, 165140, 120335, 24145..."
15,"""Breakout"" is a single from British act Swing ...",Breakout (Swing Out Sister song),"[``, Breakout, '', is, a, single, from, Britis...","[127812, 32984, 6, 156142, 127813, 178055, 149..."


En attendant d'avoir $z^*_{1:T}$, on applique le LDA sur le corpus avec K=50 topics. Chaque mot a alors un poids relativement à chaque topic.

Exemple: le mot banane a les poids suivant: {"topic 1": 0.3, "topic 2": 0.1, "topic 3": 0.05,...}. 

On associe au mot le topic avec le poids le plus élevé. En l'occurence 1. De cette façon, on crée index_topic=$(z_1,...,z_T)$

In [4]:
NUM_TOPICS=50
lda=LDA(num_topics=NUM_TOPICS, random_state=123)
vectorized_data_lda=lda.run(vectorized_data)
display(vectorized_data_lda.head())

Unnamed: 0,Text,Title,tokenized_text,vectorized_text,index_topic
0,"Joseph Harold Greenberg (May 28, 1915 – May 7,...",Joseph Greenberg,"[Joseph, Harold, Greenberg, (, May, 28, ,, 191...","[68836, 61034, 58682, 1182, 80975, 13183, 1242...","[21, 21, 24, 21, 18, 18, 7, 18, 6, 18, 21, 7, ..."
3,"Lester Hudson III (born August 7, 1984) is an ...",Lester Hudson,"[Lester, Hudson, III, (, born, August, 7, ,, 1...","[75482, 63781, 64659, 1182, 134097, 26662, 181...","[21, 16, 18, 21, 11, 39, 21, 7, 16, 21, 10, 15..."
4,"Monique Ganderton (born August 6, 1980) is a C...",Monique Ganderton,"[Monique, Ganderton, (, born, August, 6, ,, 19...","[84017, 56054, 1182, 134097, 26662, 17386, 124...","[30, 30, 21, 11, 39, 27, 7, 16, 21, 10, 7, 1, ..."
6,The white bikini of Ursula Andress (also known...,White bikini of Ursula Andress,"[The, white, bikini, of, Ursula, Andress, (, a...","[115808, 188693, 133263, 165140, 120335, 24145...","[16, 7, 11, 34, 13, 13, 21, 19, 3, 15, 1, 34, ..."
15,"""Breakout"" is a single from British act Swing ...",Breakout (Swing Out Sister song),"[``, Breakout, '', is, a, single, from, Britis...","[127812, 32984, 6, 156142, 127813, 178055, 149...","[2, 2, 27, 10, 7, 2, 32, 41, 39, 2, 2, 6, 1, 4..."


On applique un padding, i.e une longueur maximale sur les articles ($x$) et les indices de topics ($z$). De façon arbitraire, on fixe le padding à 200.
* Si l'article/la liste de topics est composé de plus de 1000 caractères alors, on supprime les derniers
* Si l'article/la liste de topics est composé de moins de 1000 caractères alors, on ajoute des 0 à la fin
Cette opération est faite car la taille d'inpput de LSTM est unique

In [5]:
SEQUENCE_LENGTH=200

def apply_padding(sequence, max_length=SEQUENCE_LENGTH):
    padded_sequence = sequence[:max_length] + [0] * max(0, max_length - len(sequence))
    return padded_sequence

In [6]:
vectorized_data_lda_padding=vectorized_data_lda.copy()
vectorized_data_lda_padding['tokenized_text'] = vectorized_data_lda_padding['tokenized_text'].apply(apply_padding)
vectorized_data_lda_padding['vectorized_text'] = vectorized_data_lda_padding['vectorized_text'].apply(apply_padding)
vectorized_data_lda_padding['index_topic'] = vectorized_data_lda_padding['index_topic'].apply(apply_padding)

On définit un one hot encoding des listes d'indices de topics. Chaque liste $z$, de longueur SEQUENCE_LENGTH=200, est convertie en une matrice de dimensions SEQUENCE_LENGTH*NUM_TOPICS = $200 \times 50$. L'élement $(i,j)$ de cette matrice vaut 1 si le i-ème topic de la séquence correspond au j-ème topic dans l'ensemble des topics.

In [7]:
def one_hot_encode_list(topic_list, vocab_size):
    return to_categorical(topic_list, num_classes=vocab_size)

In [8]:
array_z=vectorized_data_lda_padding['index_topic'].values
array_z_one_hot_encoded = np.array([one_hot_encode_list(lst, NUM_TOPICS) for lst in array_z])

# LSTM

array_z_one_hot_encoded correspond à l'échantillon de topics $\mathcal{D}_z$ one-hot encodés. Chaque liste de topics $z=(z_1,...,z_T)$ de l'échantillon est une matrice de dimensions $200 \times 50$. Lorsque l'on aura implémenté le gibbs sampler, cet échantillon n'existera plus: en réalité, les $z$ ne sont pas observés. On remplacera l'échantillon par $z^*_{1:T}$.

On prend un $z$ de l'échantillon en faisant comme si c'était un $z^*$.

In [9]:
z_one_hot=array_z_one_hot_encoded[0]
print(z_one_hot.shape)

(200, 50)


In [10]:
X = z_one_hot[:-1,:]  # Toutes les lignes sauf la dernière
y = z_one_hot[1:,:]   # Toutes les lignes sauf la première

X_tensor = torch.FloatTensor(X)
y_tensor = torch.FloatTensor(y)

In [11]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, topics, labels):
        self.topics=topics
        self.labels=labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.topics[index,:], self.labels[index,:]

In [12]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.input_size = input_size #dimension d'entrée (NUM_TOPICS)
        self.hidden_size = hidden_size #nombre de neurones de la couche cachée
        self.output_size = output_size #dimension d'outputs (NUM_TOPICS)
        
        self.lstm = nn.LSTM(self.input_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.Softmax(dim=1) 

    def forward(self, x, prev_state):
        output, state = self.lstm(x, prev_state)
        output=self.fc(output)
        probabilities = self.softmax(output)
        return probabilities, state
    
    def init_state(self):
        return (torch.zeros(1, self.hidden_size),
                torch.zeros(1, self.hidden_size))

In [13]:
dataset=Dataset(topics=X_tensor, labels=y_tensor)
dataloader = DataLoader(dataset, batch_size=1)
print("x_0=z_0: {}".format(dataset.__getitem__(0)[0]))
print("y_0=z_1: {}".format(dataset.__getitem__(0)[1]))

x_0=z_0: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
y_0=z_1: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


In [14]:
HIDDEN_SIZE=64
model=LSTM(input_size=NUM_TOPICS, hidden_size=HIDDEN_SIZE, output_size=NUM_TOPICS)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [15]:
state_h, state_c = model.init_state()
for t, (z_t, z_t_next) in enumerate(dataloader):
    optimizer.zero_grad()
    softmax , (state_h, state_c) = model(z_t, (state_h, state_c)) #softmax= p(z_{t+1}|z_t)
    loss = criterion(softmax, z_t_next)

    index_z_t_next_pred = torch.multinomial(input=softmax[0], num_samples=1, replacement=True)
    z_t_next_pred= torch.eye(len(softmax[0]))[index_z_t_next_pred] #one hot encoding du topic prédit
    state_h = state_h.detach()
    state_c = state_c.detach()

    loss.backward()
    optimizer.step()

    print({ 't': t+1,'loss': loss.item() })

{'t': 1, 'loss': 3.9132745265960693}
{'t': 2, 'loss': 3.910295248031616}
{'t': 3, 'loss': 3.913569450378418}
{'t': 4, 'loss': 3.9122750759124756}
{'t': 5, 'loss': 3.91237211227417}
{'t': 6, 'loss': 3.9097931385040283}
{'t': 7, 'loss': 3.9124233722686768}
{'t': 8, 'loss': 3.913525342941284}
{'t': 9, 'loss': 3.912706136703491}
{'t': 10, 'loss': 3.9136691093444824}
{'t': 11, 'loss': 3.909810781478882}
{'t': 12, 'loss': 3.9135122299194336}
{'t': 13, 'loss': 3.9137938022613525}
{'t': 14, 'loss': 3.9132325649261475}
{'t': 15, 'loss': 3.913527011871338}
{'t': 16, 'loss': 3.9097132682800293}
{'t': 17, 'loss': 3.914539098739624}
{'t': 18, 'loss': 3.9100239276885986}
{'t': 19, 'loss': 3.9137091636657715}
{'t': 20, 'loss': 3.911336660385132}
{'t': 21, 'loss': 3.9121599197387695}
{'t': 22, 'loss': 3.911240577697754}
{'t': 23, 'loss': 3.909759521484375}
{'t': 24, 'loss': 3.9141972064971924}
{'t': 25, 'loss': 3.913372039794922}
{'t': 26, 'loss': 3.913803815841675}
{'t': 27, 'loss': 3.909743070602417