In [17]:
import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torchcrf import CRF
from sklearn import preprocessing

In [18]:
device = torch.device('cuda')

In [19]:
le = preprocessing.LabelEncoder()
le.fit(y+["<eos>"])
int_labels = le.transform(y)

In [20]:
max_len = 16
batch_size = 64
embed_size = 300 # modifier paramètre uniquement si vous utilisez d'autres embeddings pré-entraînés
hidden_size = 128

In [21]:
le_vocab = preprocessing.LabelEncoder()
vocab = [text for _, text, _, _ in X]+["<eos>"]

int_texts = le_vocab.fit_transform(vocab)
print(len(int_texts))
print(le_vocab.transform(["."]))
temp_x = []
final_x = []
temp_y = []
final_y = []

dot_symbol = le_vocab.transform(["."])
for x,y in zip(int_texts,int_labels) :
	temp_x.append(x)
	temp_y.append(y)
	if x == dot_symbol :
		final_x.append(temp_x)
		final_y.append(temp_y)
		temp_x = []
		temp_y = []

int_texts = final_x
int_labels = final_y
# print(le_vocab.inverse_transform(final_x[0]))
# print(le.inverse_transform(final_y[0]))

51507
[237]


In [22]:
X = torch.zeros(len(int_texts), max_len).long()
Y = torch.zeros(len(int_labels), max_len).long()
for i, (text, label) in enumerate(zip(int_texts, int_labels)):
	length = min(max_len, len(text))
	X[i,:length] = torch.LongTensor(text[:length])
	Y[i,:length] = torch.LongTensor(label[:length])
print(X[12])
print(Y[12])
print(le.inverse_transform(Y[12]))

tensor([3829, 4540, 8669, 4203, 8133, 4444, 6777, 3987, 6415, 4399, 3844, 7195,
        7930,  237,    0,    0])
tensor([ 0, 17,  0,  0, 40,  0,  0,  0, 36,  0,  0,  0, 12,  0,  0,  0])
['' 'n.person' '' '' 'v.social' '' '' '' 'v.emotion' '' '' '' 'n.group' ''
 '' '']


In [23]:
# pretrained_weights = torch.zeros(len(vocab), 300)
# with open('wiki-news-300d-1M.vec', encoding="utf-8") as fp:
# 	fp.readline()
# 	for line in fp:
# 		tokens = line.strip().split()
# 		if tokens[0].lower() in le_vocab.classes_:
# 			pretrained_weights[le_vocab.transform([tokens[0].lower()])] = torch.FloatTensor([float(x) for x in tokens[1:]])

In [24]:
import pickle

In [25]:
# with open(f"pretrained_weights.pkl", 'wb') as fo:
# 	pickle.dump(pretrained_weights, fo)

In [26]:
with open(f"pretrained_weights.pkl", 'rb') as fin:
	pretrained_weights = pickle.load(fin)

In [27]:
X = torch.zeros(len(int_texts), max_len).long()
Y = torch.zeros(len(int_labels), max_len).long()

for i, (text, label) in enumerate(zip(int_texts, int_labels)):
    length = min(max_len, len(text))
    X[i,:length] = torch.LongTensor(text[:length])
    Y[i,:length] = torch.LongTensor(label[:length])

print(X[12])
print(Y[12])

tensor([3829, 4540, 8669, 4203, 8133, 4444, 6777, 3987, 6415, 4399, 3844, 7195,
        7930,  237,    0,    0])
tensor([ 0, 17,  0,  0, 40,  0,  0,  0, 36,  0,  0,  0, 12,  0,  0,  0])


In [28]:
X_train = X[:2000]
Y_train = Y[:2000]
X_valid = X[2000:]
Y_valid = Y[2000:]

In [29]:
train_set = TensorDataset(X_train, Y_train)
valid_set = TensorDataset(X_valid, Y_valid)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size)

In [30]:
class RNN(nn.Module):
    def __init__(self, pretrained_weights, le):
        super().__init__()
        self.embed = nn.Embedding(len(le.classes_), embed_size, padding_idx=le.transform(['<eos>'])[0])
        self.embed.weight = nn.Parameter(pretrained_weights, requires_grad=False)
        self.rnn = nn.GRU(embed_size, hidden_size, bias=False, num_layers=1, bidirectional=False, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.decision = nn.Linear(hidden_size * 1 * 1, len(le.classes_))

    def forward(self, x):
        embed = self.embed(x)
        output, hidden = self.rnn(embed)
        return self.decision(self.dropout(output))

rnn_model = RNN(pretrained_weights, le_vocab)
rnn_model

RNN(
  (embed): Embedding(8802, 300, padding_idx=463)
  (rnn): GRU(300, 128, bias=False, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (decision): Linear(in_features=128, out_features=8802, bias=True)
)

In [31]:
with torch.no_grad():
  print(rnn_model(X[:2]).shape)

torch.Size([2, 16, 8802])


In [32]:
def perf(model, loader):
	criterion = nn.CrossEntropyLoss()
	model.eval()
	total_loss = correct = num_loss = num_perf = 0
	for x, y in loader:
		with torch.no_grad():
			y_scores = model(x)
			loss = criterion(y_scores.view(y.size(0) * y.size(1), -1), y.view(y.size(0) * y.size(1)))
			y_pred = torch.max(y_scores, 2)[1]
			mask = (y != 0)
			correct += torch.sum((y_pred.data == y) * mask)
			total_loss += loss.item()
			num_loss += len(y)
			num_perf += torch.sum(mask).item()
	return total_loss / num_loss, correct.item() / num_perf

perf(rnn_model, valid_loader)

(0.15382562442259354, 0.0013422818791946308)

In [33]:
def fit(model, epochs, train_loader, valid_loader):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(filter(lambda param: param.requires_grad, model.parameters()))
    for epoch in range(epochs):
        model.train()
        total_loss = num = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            y_scores = model(x)
            loss = criterion(y_scores.view(y.size(0) * y.size(1), -1), y.view(y.size(0) * y.size(1)))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num += len(y)
        print(epoch, total_loss / num, *perf(model, valid_loader))

0 0.09984938430786133 0.04736312952908603 0.0013422818791946308
1 0.03997862148284912 0.03957474367185072 0.020134228187919462
2 0.036693758964538574 0.037675662474198776 0.020134228187919462
3 0.034983372688293456 0.03594202480532906 0.020134228187919462
4 0.03353504294157028 0.03448308056051081 0.020134228187919462
5 0.032256755411624906 0.03332324867898768 0.020134228187919462
6 0.03126042276620865 0.0323540446433154 0.022818791946308724
7 0.030422228276729583 0.03162022476846522 0.022818791946308724
8 0.02986017894744873 0.031027098948305302 0.022818791946308724
9 0.029344615221023558 0.030538424172184685 0.022818791946308724
10 0.028907779097557068 0.030145177109674973 0.025503355704697986
11 0.02874341952800751 0.029790567403489895 0.032214765100671144
12 0.028331458508968354 0.029467047615484757 0.022818791946308724
13 0.02798076754808426 0.029168578711423008 0.06308724832214765
14 0.02771499103307724 0.02888127551837401 0.053691275167785234
15 0.027479027926921844 0.02864347736

In [37]:
fit(rnn_model, 50, train_loader, valid_loader)

0 0.022305285155773163 0.023033688014203853 0.287248322147651
1 0.02211806446313858 0.022806393151933498 0.29261744966442954
2 0.021842821419239045 0.022621241482821377 0.29261744966442954
3 0.02176356518268585 0.02246382561596957 0.2912751677852349
4 0.021542116284370423 0.022251335057345303 0.29395973154362415
5 0.021307790517807006 0.022029705345630646 0.3073825503355705
6 0.020948788225650786 0.02179977094585245 0.31140939597315437
7 0.02084458488225937 0.021584784442728214 0.32483221476510066
8 0.020576954782009126 0.02137730744752017 0.3275167785234899
9 0.020259391009807587 0.021165002476085316 0.3288590604026846
10 0.02005208510160446 0.020928542045029728 0.338255033557047
11 0.01984624046087265 0.020745701410553673 0.3476510067114094
12 0.019616094946861267 0.020571653138507496 0.3422818791946309
13 0.01924274468421936 0.020355243574489246 0.3704697986577181
14 0.01902614641189575 0.02016018195585771 0.3731543624161074
15 0.018830136954784395 0.019979444417086514 0.37315436241