# Classification d'opinion avec un RNN 



In [None]:
# import standard + 
# 
import numpy as np

import torch.nn.functional as F
import torch
import torch.nn as nn
from tqdm.autonotebook import tqdm
from torch.utils.data import Dataset, DataLoader

import os
import time
import logging
import re
# Alternative for NVIDIA GPUs:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use the Apple Metal backend (MPS) when it is available, the CPU otherwise.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
# To force the CPU: device = "cpu"
print(device)



In [None]:
from pathlib import Path
from IPython.display import display, HTML
from torch.utils.tensorboard import SummaryWriter

# Directory in which the TensorBoard event files are written
TB_PATH = "/tmp/logs/module-RNN"

# TensorBoard is used as an external tool:
# (1) run the printed command in a console; (2) paste the URL it gives into a browser.
display(
    HTML(
        "<h2>Informations</h2>"
        "<div>Pour visualiser les logs, tapez la commande : </div>"
    )
)
print(f"tensorboard --logdir {Path(TB_PATH).absolute()}")
print(
    "Une fois la commande lancer dans la console, copier-coller l'URL dans votre navigateur"
)



A. Chargement des données
------------------

Tout le code est fourni. Le cadre est le même que pour la classification de noms : many-to-one. La tâche est la classification d'opinion (*sentiment analysis* en anglais).



In [None]:
# Locations of the pre-trained GloVe vectors and of the IMDB review corpus
GLOVE_PATH = Path("data") / "glove"
DATASET_PATH = Path("data") / "aclImdb"
# Names of the class sub-folders
IMDB_CLASSES = ["neg", "pos"]

class FolderText(Dataset):
    """Text-classification dataset stored as one sub-folder of ``*.txt`` files per class.

    Args:
        classes: Class names; each one is both the name of a sub-folder of
            ``folder`` and, through its position in the sequence, the integer
            label attached to the files of that sub-folder.
        folder: Root directory containing the class sub-folders.
        tokenizer: Callable mapping a text (``str``) to a list of token ids.
        train_max_size: Maximum number of files kept **per class**
            (``None`` keeps every file).
        load: When true, file contents are read once at construction time;
            otherwise only the paths are stored and files are read on access.
    """

    def __init__(self, classes, folder: Path, tokenizer, train_max_size=None, load=False):
        self.tokenizer = tokenizer
        self.files = []
        self.filelabels = []
        self.labels = {key: ix for ix, key in enumerate(classes)}

        for label in classes:
            for count, file in enumerate((folder / label).glob("*.txt")):
                # Stop *before* exceeding the cap, so that exactly
                # ``train_max_size`` files are kept for this class.
                if train_max_size is not None and count >= train_max_size:
                    break
                self.files.append(file.read_text(encoding="utf-8") if load else file)
                self.filelabels.append(self.labels[label])

    def __len__(self):
        """Return the number of (text, label) examples."""
        return len(self.filelabels)

    def __getitem__(self, ix):
        """Return ``(token_ids, label)`` where ``token_ids`` is a tensor built from the tokenizer output."""
        s = self.files[ix]
        # ``s`` is already the text when the dataset was built with ``load=True``,
        # otherwise it is a path whose content is read lazily here.
        text = s if isinstance(s, str) else s.read_text(encoding="utf-8")
        return torch.tensor(self.tokenizer(text)), self.filelabels[ix]

def get_imdb_data(embedding_size=50, train_max_size=None):
    """Load everything needed to train on the IMDB opinion dataset.

    Args:
        embedding_size: Dimension of the GloVe vectors; selects the file
            ``glove.6B.<embedding_size>d.txt`` inside ``GLOVE_PATH``.
        train_max_size: Per-class cap forwarded to ``FolderText`` for both
            the train and the test split (``None`` keeps every file).

    Returns:
        A tuple ``(word2id, embeddings, train_dataset, test_dataset)``:

        - ``word2id``: dict mapping each word to its row index, with a final
          ``"__OOV__"`` entry used for out-of-vocabulary words;
        - ``embeddings``: numpy array with one GloVe row per word followed by
          an all-zero row for ``"__OOV__"``;
        - ``train_dataset`` / ``test_dataset``: ``FolderText`` datasets read
          from ``DATASET_PATH / "train"`` and ``DATASET_PATH / "test"``.
    """
    word_pattern = re.compile(r"\S+")

    logging.info("Loading embeddings")
    words, embeddings = [], []
    glove_file_path = GLOVE_PATH / ("glove.6B.%dd.txt" % embedding_size)
    # The context manager guarantees the file handle is released after parsing.
    with open(glove_file_path, encoding="utf-8") as glove_fn:
        for line in glove_fn:
            values = line.split()
            words.append(values[0])
            embeddings.append([float(x) for x in values[1:]])

    # The out-of-vocabulary token takes the index right after the last GloVe word
    # and is given a zero vector.
    OOVID = len(words)
    words.append("__OOV__")

    word2id = {word: ix for ix, word in enumerate(words)}
    embeddings = np.vstack((embeddings, np.zeros(embedding_size)))

    def tokenizer(t):
        # Lower-case, split on whitespace, map unknown words to the OOV id.
        return [word2id.get(x, OOVID) for x in word_pattern.findall(t.lower())]

    logging.info("Get the IMDB dataset")
    train_dataset = FolderText(
        IMDB_CLASSES, DATASET_PATH / "train", tokenizer, train_max_size, load=True
    )
    test_dataset = FolderText(
        IMDB_CLASSES, DATASET_PATH / "test", tokenizer, train_max_size, load=True
    )
    return word2id, embeddings, train_dataset, test_dataset




In [None]:
# Vocabulary, GloVe matrix and the two datasets, capped through train_max_size
(
    word2id,
    embeddings,
    train_dataset,
    test_dataset,
) = get_imdb_data(train_max_size=1000)

Prendre le temps de comprendre ce qui est chargé:

- nature des informations
- dimension des structures de données

In [None]:
# Quick inspection of the loaded data (uncomment a line to look at another object)

word2id
# embeddings
# train_dataset[0]

# vocabulary size, then number of rows and columns of the embedding matrix
print(len(word2id), *embeddings.shape)



Passage au data-loader

In [None]:
from torch.nn.utils.rnn import pad_sequence

# Number of examples per mini-batch
BATCH_SIZE = 100

def collate_fn(batch):
    """Assemble a list of ``(sequence, label)`` pairs into padded batch tensors.

    Returns a tuple ``(padded, lengths, labels)`` where ``padded`` stacks the
    sequences along dimension 1 (time first, zero-padded), ``lengths`` holds the
    original length of every sequence and ``labels`` the associated labels.
    """
    seqs, targets = zip(*batch)
    sizes = torch.tensor([len(s) for s in seqs])
    padded = pad_sequence(seqs, batch_first=False)
    return padded, sizes, torch.tensor(targets)

# Only the training split is shuffled; both splits share the same padding collate.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)



In [None]:

# Sanity check: pull one batch from the training loader and look at it

batch = next(iter(train_loader))
padded_sequences, lengths, labels = batch
print("Padded sequences:", padded_sequences)
print(padded_sequences.shape)
# print("Lengths:", lengths)
print("Labels:", labels)



## B. Création du réseau

Attention à la manière de gérer les embeddings

In [None]:

class RNNSent(nn.Module):
    """Recurrent opinion classifier with masked attention pooling over time.

    Pipeline: frozen pre-trained embeddings -> ``nn.RNN`` -> attention weights
    over the time steps (padding masked out) -> linear output layer.

    Args:
        hidden_size: Size of the recurrent hidden state.
        output_size: Number of output classes.
        embeddings: 2-D numpy array of pre-trained vectors, one row per
            vocabulary entry; copied into the embedding layer and kept fixed.
    """

    def __init__(self, hidden_size, output_size, embeddings):
        super().__init__()

        self.hidden_size = hidden_size
        self.input_size = len(embeddings[0])

        self.emb = nn.Embedding(len(embeddings), len(embeddings[0]))

        # Embedding INITIALISATION
        # 1. Copy the values held in the (numpy) embeddings structure
        # 2. Should the gradient be enabled on this module? If not, how is it disabled?
        # <CORRECTION>
        self.emb.weight.data.copy_(torch.from_numpy(embeddings))
        self.emb.weight.requires_grad = False
        # </CORRECTION>

        # CHOICE of the recurrent module
        self.rec = nn.RNN(self.input_size, self.hidden_size, nonlinearity='tanh')
        # self.rec = nn.LSTM(self.input_size, self.hidden_size)

        # ATTENTION to be added
        # - think about which kind of module is needed
        # <CORRECTION>
        self.attention = nn.Linear(hidden_size, 1, bias=False)
        # </CORRECTION>

        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, input, lengths=None):
        """Classify a batch of padded token-id sequences.

        Args:
            input: Integer tensor of shape ``(seq_len, batch)`` (time first).
            lengths: Real (unpadded) length of every sequence, one entry per
                batch element. When ``None``, every sequence is treated as
                spanning the full ``seq_len``.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(batch, output_size)`` and ``hidden`` holds the recurrent states
            of every time step, shape ``(seq_len, batch, hidden_size)``.
        """
        # Main steps
        # 1. map the integer input to embeddings
        # 2. run the recurrent module
        # 3. add the attention (not required for a first version)
        # 4. return the prediction computed from the pooled state
        maxlen = input.size(0)
        batch_size = input.size(1)

        # Lengths must live on the same device as the activations they index/mask.
        if lengths is None:
            lengths = torch.full(
                (batch_size,), maxlen, dtype=torch.long, device=input.device
            )
        else:
            lengths = torch.as_tensor(lengths, dtype=torch.long, device=input.device)

        # 1. map the integer input to embeddings
        xemb = self.emb(input)

        # 2. run the recurrent module
        hidden, _ = self.rec(xemb)   # RNN
        # hidden, (last, c) = self.rec(xemb) # LSTM => last[-1]

        # Last *real* hidden state of every sequence (padding ignored). This is
        # the attention-free pooling; step 3.3 replaces it by a weighted sum.
        last = hidden[lengths - 1, torch.arange(batch_size, device=input.device)]

        # 3. Attention (not required for a first version)
        # In practice this is just another way of building ``last``.
        # WARNING: with batches, a padding mask has to be built.

        # 3.1 attention scores
        # <CORRECTION>
        a = self.attention(hidden).squeeze(-1)
        # </CORRECTION>

        # 3.2 mask [must be understood]: position t of sequence b is valid
        # iff t < lengths[b]; padded positions get a score of -inf so that the
        # softmax gives them a weight of exactly zero.
        mask = torch.arange(maxlen, device=input.device).unsqueeze(1) < lengths.unsqueeze(0)
        masked_attn_scores = a.masked_fill(~mask, float('-inf'))

        # 3.3 attention weights (softmax over time) applied to the hidden states
        # <CORRECTION>
        a = F.softmax(masked_attn_scores, dim=0).unsqueeze(-1)
        last = torch.sum(a * hidden, dim=0)
        # </CORRECTION>

        # ``last`` is (batch, hidden_size): keep the batch dimension even when
        # the batch holds a single sequence.
        output = self.h2o(last)

        return output, hidden



In [None]:

# Hyper-parameters: size of the recurrent state and number of classes
n_hidden = 128
output_size = 2
# Build the network; the timestamp in its name is used to label the run
rnn = RNNSent(hidden_size=n_hidden, output_size=output_size, embeddings=embeddings)
rnn.name = f"RNNSent-{time.asctime()}"

In [None]:
# Smoke test: push a single batch through the network and the loss
loss = nn.CrossEntropyLoss()
x, lengths, y = next(iter(train_loader))
print(x.shape, y.shape)

# To inspect the padding mask by hand:
# maxlen = x.size()[0]
# mask = torch.arange(maxlen).unsqueeze(1).expand(maxlen, 100) < lengths.unsqueeze(0)
# print(mask)

yhat, hidden = rnn(x, lengths)
print(yhat.shape)
l = loss(yhat, y)

## C. Training

1. put the data into a DataLoader
2. choose a loss function 
3. run a standard training loop

In [None]:
# Evaluation metric
def accuracy(yhat, y):
    """Return the fraction of rows of ``yhat`` whose arg-max equals the index in ``y``.

    ``y`` holds class indexes and must be of shape ``(N,)`` or ``(N, 1)``.
    """
    assert y.dim() == 1 or y.size(1) == 1
    predicted = yhat.argmax(dim=1).view(y.size(0), -1)
    expected = y.view(-1, 1)
    hits = predicted.eq(expected)
    return hits.float().mean()

In [None]:

    
def train(model, epochs, train_loader, test_loader):
    """Train ``model`` with Adam and cross-entropy, logging to TensorBoard.

    Args:
        model: Network called as ``model(x, lengths)`` and returning
            ``(logits, hidden)``; must expose a ``name`` attribute, used for
            the TensorBoard run directory under ``TB_PATH``.
        epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of ``(x, lengths, y)`` training batches.
        test_loader: Iterable of ``(x, lengths, y)`` batches, evaluated every
            other epoch.

    Per-epoch average loss and accuracy are written under ``loss/train``,
    ``accuracy/train``, ``loss/test`` and ``accuracy/test``.
    """
    writer = SummaryWriter(f"{TB_PATH}/{model.name}")
    model = model.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=1e-3)   # optimizer choice
    print(f"running {model.name}")
    loss = nn.CrossEntropyLoss()                            # loss choice
    # loss = nn.CrossEntropyLoss(weight=cl_weight.to(device))
    try:
        for epoch in tqdm(range(epochs)):
            cumloss, cumacc, count = 0.0, 0.0, 0
            model.train()
            for x, lengths, y in train_loader:              # loop over the batches
                optim.zero_grad()
                x, y = x.to(device), y.to(device)           # y must be a tensor (not an int)
                yhat, _ = model(x, lengths)
                batch_loss = loss(yhat, y)
                batch_loss.backward()
                optim.step()
                # x is (seq_len, batch): the number of examples is given by y,
                # not by len(x). The last batch may be smaller than the others.
                n_examples = y.size(0)
                # .item() detaches the statistics from the autograd graph.
                cumloss += batch_loss.item() * n_examples
                cumacc += accuracy(yhat, y).item() * n_examples
                count += n_examples
            writer.add_scalar('loss/train', cumloss / count, epoch)
            writer.add_scalar('accuracy/train', cumacc / count, epoch)
            if epoch % 2 == 0:
                model.eval()
                with torch.no_grad():
                    cumloss, cumacc, count = 0.0, 0.0, 0
                    for x, lengths, y in test_loader:
                        x, y = x.to(device), y.to(device)
                        yhat, _ = model(x, lengths)
                        n_examples = y.size(0)
                        cumloss += loss(yhat, y).item() * n_examples
                        cumacc += accuracy(yhat, y).item() * n_examples
                        count += n_examples
                    writer.add_scalar('loss/test', cumloss / count, epoch)
                    writer.add_scalar('accuracy/test', cumacc / count, epoch)
    finally:
        # Flush pending events and release the log file even if training is interrupted.
        writer.close()


In [None]:
# Takes about 10 minutes on CPU
n_epoch = 20
train(model=rnn, epochs=n_epoch, train_loader=train_loader, test_loader=test_loader)


## D. Evaluating the Results

First inspect the predictions qualitatively, then compute the confusion matrix.


# Construction du sujet à partir de la correction

In [1]:
### <CORRECTION> ###
import re

# Turn this solution notebook into the student version: every span enclosed
# between an opening and a closing marker is replaced by " TODO ".

fname = "4_3_rnn_classif_attn-corr.ipynb"  # this file
fout = fname.replace("-corr", "")

# print("Fichier de sortie: ", fout )

# The marker name is inserted at run time so that the source of this cell never
# spells out a closing marker inside the pattern. Otherwise the lazy match that
# starts on the first line of this cell would stop in the middle of the pattern
# and leave a broken cell in the generated notebook.
tag = "CORRECTION"
pattern = re.compile(f"<{tag}>.*?</{tag}>", flags=re.DOTALL)

# Notebooks are UTF-8 JSON files; context managers close both files even on error.
with open(fname, "r", encoding="utf-8") as f:
    txt = f.read()

with open(fout, "w", encoding="utf-8") as f2:
    f2.write(pattern.sub(" TODO ", txt))

### </CORRECTION> ###