In [1]:
# Load IPython's autoreload extension and use mode 2, which reloads imported modules
# before each cell executes, so edits to the project modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from Model import Model

from Generic_Torch_Model import Simple_GRU as GRU
from Generic_Torch_Model import NN_Model, NN_CRF_Model, extract_data, train_test_split_sentences, transform, \
                                load_pretrained_weights, transform_test, align_pred, write_data, combine_sst_mwe
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torchcrf import CRF
import pickle


def _append_mwe_feature(X_enc, y_mwe_enc, use_pos):
    """Concatenate MWE tag ids onto the encoded inputs as one extra last-axis feature.

    `y_mwe_enc` is viewed as (size(0), size(1), 1). When `use_pos` is false the
    inputs receive the same trailing axis first; when it is true they are
    concatenated as they are (they are then expected to already carry a last
    feature axis — TODO confirm against `transform_test`).
    """
    mwe_feature = y_mwe_enc.view(y_mwe_enc.size(0), y_mwe_enc.size(1), 1)
    if not use_pos:
        X_enc = X_enc.view(X_enc.size(0), X_enc.size(1), 1)
    return torch.cat((X_enc, mwe_feature), dim=-1)


def preprocess_train_eval(model_sst_generator, model_mwe_generator, epochs_sst, epochs_mwe,
                          generate_pretrained_weights=False, use_pos=False, use_mwe=False,
                          batch_size=64, max_len=16, eval_max_len=100):
    """Train an MWE tagger and a supersense (SST) tagger, then score them on a held-out split.

    The training file is split 70/30 without shuffling. The function prints the
    validation scores of both models and writes three files in the working
    directory: 'val.gold' (gold validation labels), 'val_before_comb.pred'
    (aligned predictions) and 'val.pred' (predictions after `combine_sst_mwe`).

    Parameters
    ----------
    model_sst_generator, model_mwe_generator : callable
        Called as ``generator(pretrained_weights, le, le_vocab)`` where `le` is the
        label mapping of the task; must return an object exposing
        ``fit(train_loader, valid_loader, epochs=...)``, ``score(...)`` and ``predict(X)``.
    epochs_sst, epochs_mwe : int
        Number of epochs forwarded to each model's ``fit``.
    generate_pretrained_weights : bool
        If true, build the embedding matrix from 'wiki-news-300d-1M.vec' and cache it
        in 'pretrained_weights_2.pkl'; otherwise load that cache.
    use_pos : bool
        Forwarded to `transform`; also selects how the MWE feature is concatenated.
    use_mwe : bool
        If true, the SST model receives MWE tags as an additional input feature.
    batch_size, max_len : int
        Forwarded to `transform` (and to the SST loaders rebuilt when `use_mwe` is set).
    eval_max_len : int
        `max_len` used when encoding the validation set for scoring and prediction.

    Returns
    -------
    None
    """
    # Load data. The context manager closes the handle, and the explicit encoding keeps
    # the read independent of the platform's locale default.
    with open('../dimsum-data-1.5/dimsum16.train', 'r', encoding='utf-8') as train_file:
        data = train_file.readlines()

    X, y_sst, y_mwe = extract_data(data)
    # Two splits with identical settings, one per label set; the assert checks that
    # both produced the same sentence partition.
    X_, X_val_, y_sst, y_sst_val = train_test_split_sentences(X, y_sst, test_size=0.3, random_state=0, shuffle=False)
    X, X_val, y_mwe, y_mwe_val = train_test_split_sentences(X, y_mwe, test_size=0.3, random_state=0, shuffle=False)
    assert (X == X_) and (X_val == X_val_)
    write_data(X_val, y_sst_val, y_mwe_val, 'val.gold')

    vocab_, le_vocab_, le_pos_, le_sst, train_loader_sst, valid_loader_sst = transform(X, y_sst, X_val, y_sst_val,
                                                                                       max_len, batch_size, use_pos)
    vocab, le_vocab, le_pos, le_mwe, train_loader_mwe, valid_loader_mwe = transform(X, y_mwe, X_val, y_mwe_val,
                                                                                    max_len, batch_size, use_pos)
    assert (vocab == vocab_) and (le_vocab == le_vocab_) and (le_pos == le_pos_)

    # Load pretrained weights (or generate and cache them the first time).
    if generate_pretrained_weights:
        pretrained_weights = load_pretrained_weights('wiki-news-300d-1M.vec', vocab_size=len(vocab), le_vocab=le_vocab)
        with open("pretrained_weights_2.pkl", 'wb') as fo:
            pickle.dump(pretrained_weights, fo)
    else:
        # NOTE: this is a pickle cache written by the branch above; only load files you trust.
        pretrained_weights = load_pretrained_weights('pretrained_weights_2.pkl', from_pickle=True)

    # Train the MWE model first.
    print('Model MWE:')
    model_mwe = model_mwe_generator(pretrained_weights, le_mwe, le_vocab)
    model_mwe.fit(train_loader_mwe, valid_loader_mwe, epochs=epochs_mwe)

    if use_mwe:
        # Rebuild the SST loaders with the MWE tags as an extra input feature.
        # Gold MWE tags are used here (training and validation loaders alike);
        # the scoring further down feeds the MWE model's predictions instead.
        _, y_mwe_enc = transform_test(le_vocab, le_mwe, X, y_mwe, max_len, le_pos)
        X_enc, y_sst_enc = transform_test(le_vocab, le_sst, X, y_sst, max_len, le_pos)
        _, y_mwe_enc_val = transform_test(le_vocab, le_mwe, X_val, y_mwe_val, max_len, le_pos)
        X_enc_val, y_sst_enc_val = transform_test(le_vocab, le_sst, X_val, y_sst_val, max_len, le_pos)

        X_enc = _append_mwe_feature(X_enc, y_mwe_enc, use_pos)
        X_enc_val = _append_mwe_feature(X_enc_val, y_mwe_enc_val, use_pos)

        train_set = TensorDataset(X_enc, y_sst_enc)
        valid_set = TensorDataset(X_enc_val, y_sst_enc_val)
        train_loader_sst = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        valid_loader_sst = DataLoader(valid_set, batch_size=batch_size)

        assert (vocab == vocab_) and (le_vocab == le_vocab_)

    print('\nModel SST:')
    model_sst = model_sst_generator(pretrained_weights, le_sst, le_vocab)
    model_sst.fit(train_loader_sst, valid_loader_sst, epochs=epochs_sst)

    # Encode the validation set once; the same tensors serve scoring and file output.
    _, y_sst_val_enc = transform_test(le_vocab, le_sst, X_val, y_sst_val, max_len=eval_max_len, le_pos=le_pos)
    X_val_enc, y_mwe_val_enc = transform_test(le_vocab, le_mwe, X_val, y_mwe_val, max_len=eval_max_len, le_pos=le_pos)

    # Score the MWE model on the validation set.
    precision, recall, f1_score, accuracy = model_mwe.score(X_val_enc, y_mwe_val_enc, le_mwe[''], le_mwe)
    print(f'\nModel MWE score: Acc={accuracy:.4} P={precision:.4f}, R={recall:.4f}, F1={f1_score:.4f}')

    # The SST model sees the *predicted* MWE tags when use_mwe is enabled.
    y_mwe_hat_val = model_mwe.predict(X_val_enc)
    if use_mwe:
        X_val_sst = _append_mwe_feature(X_val_enc, y_mwe_hat_val, use_pos)
    else:
        X_val_sst = X_val_enc
    precision, recall, f1_score, accuracy = model_sst.score(X_val_sst, y_sst_val_enc, le_sst[''], le_sst)
    print(f'Model SST score: Acc={accuracy:.4} P={precision:.4f}, R={recall:.4f}, F1={f1_score:.4f}')

    # Write the prediction files used for external evaluation.
    y_sst_hat_val = model_sst.predict(X_val_sst)
    y_mwe_hat_val_align = align_pred(X_val, y_mwe_hat_val, le_mwe, is_mwe=True)
    y_sst_hat_val_align = align_pred(X_val, y_sst_hat_val, le_sst, is_mwe=False)

    # Decode label ids back to strings; ids missing from the mapping become ''.
    rev_le_mwe = {v: k for k, v in le_mwe.items()}
    rev_le_sst = {v: k for k, v in le_sst.items()}
    y_mwe_hat_val_align = [rev_le_mwe.get(yi, '') for yi in y_mwe_hat_val_align]
    y_sst_hat_val_align = [rev_le_sst.get(yi, '') for yi in y_sst_hat_val_align]

    # Copies are passed so the helpers cannot alter the lists reused below.
    write_data(X_val, y_sst_hat_val_align.copy(), y_mwe_hat_val_align.copy(), 'val_before_comb.pred')
    y_sst_hat_val_comb, y_mwe_hat_val_comb = combine_sst_mwe(X_val, y_sst_hat_val_align.copy(), y_mwe_hat_val_align.copy())
    write_data(X_val, y_sst_hat_val_comb.copy(), y_mwe_hat_val_comb.copy(), 'val.pred')


In [19]:
# Small hand-built input for a manual check of combine_sst_mwe.
# The filler is named UNUSED rather than `_`: assigning `_` in a notebook shadows
# IPython's "last output" variable, which IPython then stops updating.
UNUSED = None

# Six numbered token rows followed by one '<eos>' row; only the first two columns are filled.
# NOTE(review): X has 7 rows while each label list below has 6 entries — confirm this is
# the layout combine_sst_mwe expects.
X = [[i + 1, UNUSED, UNUSED, UNUSED] for i in range(6)] + [['', '<eos>', UNUSED, UNUSED]]

# MWE tags containing an 'I' right after an 'O' and two consecutive 'B's —
# presumably chosen to exercise ill-formed tag sequences.
y_mwe = ['O', 'I', 'O', 'B', 'B', '']
y_sst = ['', 'n.act', '', '', 's.social', '']

combine_sst_mwe(X, y_sst, y_mwe)

(['', 'n.act', '', '', 's.social', ''], ['O', 'O', 'O', 'O', 'O', ''])

# Modèle GRU

In [3]:
# Experiment: plain GRU taggers for both tasks, without POS or MWE input features.
use_pos = False
use_mwe = False


def model_sst_generator(pretrained_weights, le, le_vocab):
    """Build the supersense model: a GRU wrapped in NN_Model with a cross-entropy criterion.

    `le` is the label mapping supplied by preprocess_train_eval. `le_vocab` is accepted
    because preprocess_train_eval passes it to every generator; it is not used here.
    The flags `use_pos` / `use_mwe` are read from the notebook globals at call time.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos, use_mwe=use_mwe)
    return NN_Model(model=gru, criterion=nn.CrossEntropyLoss(), optim=optim.Adam)


def model_mwe_generator(pretrained_weights, le, le_vocab):
    """Build the MWE model: same GRU setup as the supersense model, without the `use_mwe` flag.

    `le_vocab` is accepted for signature compatibility and is not used here.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos)
    return NN_Model(model=gru, criterion=nn.CrossEntropyLoss(), optim=optim.Adam)


preprocess_train_eval(model_sst_generator, model_mwe_generator, epochs_sst=50, epochs_mwe=50,
                      generate_pretrained_weights=False, use_pos=use_pos, use_mwe=use_mwe,
                      batch_size=64, max_len=16)

Model MWE:
1 0.018783527192135375 0.01197886611852381 0.7852340559220861
2 0.010670465474044969 0.009517150082521969 0.8080427269871191
3 0.009621973556221578 0.009063452171782652 0.8476280238768458
4 0.009332658883254064 0.008894829617606268 0.8502042098649073
5 0.009050702873032942 0.008628355194297102 0.8455545083254791
6 0.008863431831029386 0.008508428641491466 0.8412818096135721
7 0.00873481984222244 0.008524435779286755 0.8314169022934339
8 0.008597815975471422 0.008330274518165324 0.8497015394282124
9 0.008445383014335417 0.00830389882127444 0.8502670436694942
10 0.008388084380816193 0.008211312525802188 0.8393339616713792
11 0.008231974812127466 0.008241443977587753 0.8529688972667295
12 0.008198817355109666 0.008115473886330922 0.8469996858309771
13 0.008097243951381264 0.00810826068951024 0.8480678605089538
14 0.008017288188772493 0.008119388793905577 0.8495758718190386
15 0.007930779810212986 0.008069524375928772 0.8495130380144518
16 0.007828222299906852 0.0080495524116688

In [4]:
# Evaluation with 'dimsumeval.py': compare the predictions in val.pred against the
# gold labels in val.gold (both files are written by preprocess_train_eval).
!python ../dimsum-data-1.5/scripts/dimsumeval.py val.gold val.pred

[40m[97m[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
TEST 11 [('We', 'O', 'O'), ('called', 'O', 'O'), ('few', 'O', 'O'), ('companies', 'O', 'O'), ('before', 'O', 'O'), ('we', 'O', 'O'), ('decide', 'O', 'O'), ('to', 'O', 'O'), ('hire', 'O', 'O'), ('them', 'O', 'O'), ('.', 'O', 'O')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'PUNCT')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'P

TEST 26 [('However', 'O', 'O'), (',', 'O', 'O'), ('we', 'O', 'O'), ('waited', 'O', 'O'), ('and', 'O', 'O'), ('waited', 'O', 'O'), ('and', 'O', 'O'), ('in', 'B', 'O'), ('the', 'I', 'O'), ('mean', 'I', 'O'), ('time', 'I', 'O'), (',', 'O', 'O'), ('saw', 'O', 'O'), ('4', 'O', 'O'), ('groups', 'O', 'B'), ('of', 'O', 'I'), ('people', 'O', 'O'), ('simply', 'O', 'O'), ('just', 'O', 'O'), ('paraded', 'O', 'O'), ('in', 'O', 'O'), ('without', 'O', 'O'), ('signing', 'O', 'O'), ('there', 'O', 'O'), ('names', 'O', 'O'), ('.', 'O', 'O')]
[('This', 'DET'), ('sign', 'VERB'), ('in', 'ADP'), ('policy', 'NOUN'), ('is', 'AUX'), ('posted', 'VERB'), ('by', 'ADP'), ('the', 'DET'), ('restaurant', 'NOUN'), ('"', 'PUNCT'), ('no', 'DET'), ('reservation', 'NOUN'), (',', 'PUNCT'), ('sign', 'VERB'), ('your', 'PRON'), ('name', 'NOUN'), ('here', 'ADV'), ('"', 'PUNCT'), ('.', 'PUNCT')]
[('This', 'DET'), ('sign', 'VERB'), ('in', 'ADP'), ('policy', 'NOUN'), ('is', 'AUX'), ('posted', 'VERB'), ('by', 'ADP'), ('the', 'DET')

TEST 24 [('I', 'O', 'O'), ('thouhgt', 'O', 'O'), ('it', 'O', 'O'), ('would', 'O', 'O'), ('be', 'O', 'O'), ('out', 'B', 'O'), ('of', 'I', 'O'), ('my', 'O', 'O'), ('price', 'B', 'O'), ('range', 'I', 'O'), ('but', 'O', 'O'), ('they', 'O', 'O'), ('really', 'O', 'O'), ('worked', 'O', 'O'), ('with', 'O', 'O'), ('me', 'O', 'O'), ('and', 'O', 'O'), ('now', 'O', 'O'), ('I', 'O', 'O'), ('could', 'O', 'O'), ('nt', 'O', 'O'), ('be', 'O', 'O'), ('happier', 'O', 'O'), ('.', 'O', 'O')]
[('Jeff', 'PROPN'), ('and', 'CONJ'), ('Craig', 'PROPN'), ('are', 'VERB'), ('really', 'ADV'), ('good', 'ADJ'), ('at', 'SCONJ'), ('what', 'PRON'), ('they', 'PRON'), ('do', 'VERB'), ('and', 'CONJ'), ('know', 'VERB'), ('exactly', 'ADV'), ('how', 'ADV'), ('to', 'PART'), ('treat', 'VERB'), ('a', 'DET'), ('customer', 'NOUN'), ('.', 'PUNCT')]
[('Jeff', 'PROPN'), ('and', 'CONJ'), ('Craig', 'PROPN'), ('are', 'VERB'), ('really', 'ADV'), ('good', 'ADJ'), ('at', 'SCONJ'), ('what', 'PRON'), ('they', 'PRON'), ('do', 'VERB'), ('and', 

TEST 13 [('It', 'O', 'O'), ('used', 'B', 'O'), ('to', 'I', 'O'), ('be', 'O', 'O'), ('fabulous', 'O', 'O'), (',', 'O', 'O'), ('why', 'O', 'O'), ('did', 'O', 'O'), ('you', 'B', 'O'), ('guys', 'I', 'O'), ('change', 'O', 'O'), ('it', 'O', 'O'), ('??', 'O', 'O')]
[('Queso', 'NOUN'), ('should', 'AUX'), ('not', 'PART'), ('be', 'VERB'), ('watery', 'ADJ'), (':(', 'SYM'), ('.....', 'PUNCT')]
[('Queso', 'NOUN'), ('should', 'AUX'), ('not', 'PART'), ('be', 'VERB'), ('watery', 'ADJ'), (':(', 'SYM'), ('.....', 'PUNCT')]
TEST 7 [('Queso', 'O', 'O'), ('should', 'O', 'O'), ('not', 'O', 'O'), ('be', 'O', 'O'), ('watery', 'O', 'O'), (':(', 'O', 'O'), ('.....', 'O', 'O')]
[('***', 'SYM'), ('update', 'NOUN'), ('***', 'SYM'), ('NEVER', 'ADV'), ('MIND', 'VERB'), ('!', 'PUNCT')]
[('***', 'SYM'), ('update', 'NOUN'), ('***', 'SYM'), ('NEVER', 'ADV'), ('MIND', 'VERB'), ('!', 'PUNCT')]
TEST 6 [('***', 'O', 'O'), ('update', 'O', 'O'), ('***', 'O', 'O'), ('NEVER', 'B', 'O'), ('MIND', 'I', 'O'), ('!', 'O', 'O')]
[('T

[('Excellent', 'ADJ'), ('location', 'NOUN'), ('.', 'PUNCT')]
TEST 3 [('Excellent', 'O', 'O'), ('location', 'O', 'O'), ('.', 'O', 'O')]
[('Good', 'ADJ'), ('sports', 'NOUN'), ('bar', 'NOUN'), ('.', 'PUNCT')]
[('Good', 'ADJ'), ('sports', 'NOUN'), ('bar', 'NOUN'), ('.', 'PUNCT')]
TEST 4 [('Good', 'O', 'O'), ('sports', 'B', 'O'), ('bar', 'I', 'O'), ('.', 'O', 'O')]
[('Hyatt', 'PROPN'), ('web', 'NOUN'), ('site', 'NOUN'), ('improved', 'VERB'), ('.', 'PUNCT')]
[('Hyatt', 'PROPN'), ('web', 'NOUN'), ('site', 'NOUN'), ('improved', 'VERB'), ('.', 'PUNCT')]
TEST 5 [('Hyatt', 'O', 'O'), ('web', 'B', 'O'), ('site', 'I', 'O'), ('improved', 'O', 'O'), ('.', 'O', 'O')]
[('Accurate', 'ADJ'), ('check', 'NOUN'), ('-', 'PUNCT'), ('out', 'NOUN'), ('.', 'PUNCT')]
[('Accurate', 'ADJ'), ('check', 'NOUN'), ('-', 'PUNCT'), ('out', 'NOUN'), ('.', 'PUNCT')]
TEST 5 [('Accurate', 'O', 'O'), ('check', 'B', 'O'), ('-', 'I', 'O'), ('out', 'I', 'O'), ('.', 'O', 'O')]
[('Rooms', 'NOUN'), ('clean', 'ADJ'), ('.', 'PUNCT')]


# Modèle GRU avec POS

In [23]:
# Experiment: GRU taggers for both tasks with POS tags as an input feature.
use_pos = True
use_mwe = False


def model_sst_generator(pretrained_weights, le, le_vocab):
    """Build the supersense model: a GRU wrapped in NN_Model with a cross-entropy criterion.

    `le` is the label mapping supplied by preprocess_train_eval. `le_vocab` is accepted
    because preprocess_train_eval passes it to every generator; it is not used here.
    The flags `use_pos` / `use_mwe` are read from the notebook globals at call time.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos, use_mwe=use_mwe)
    return NN_Model(model=gru, criterion=nn.CrossEntropyLoss(), optim=optim.Adam)


def model_mwe_generator(pretrained_weights, le, le_vocab):
    """Build the MWE model: same GRU setup as the supersense model, without the `use_mwe` flag.

    `le_vocab` is accepted for signature compatibility and is not used here.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos)
    return NN_Model(model=gru, criterion=nn.CrossEntropyLoss(), optim=optim.Adam)


preprocess_train_eval(model_sst_generator, model_mwe_generator, epochs_sst=50, epochs_mwe=50,
                      generate_pretrained_weights=False, use_pos=use_pos, use_mwe=use_mwe,
                      batch_size=64, max_len=16)

Model MWE:
1 0.015058630294975832 0.009195774421095848 0.8487590323594094
2 0.008660963325849706 0.0074531406578090455 0.8476280238768458
3 0.007958427927755961 0.007119458748234643 0.8520263901979265
4 0.007632685110447626 0.006901280623343256 0.8523405592208608
5 0.007382517034505655 0.006672955076727602 0.8513980521520578
6 0.007201003607769529 0.006596323537329833 0.8557335846685517
7 0.007049229811188294 0.00647374333606826 0.8577442664153314
8 0.0068375971051829664 0.006327557543085681 0.8569274269557021
9 0.0066349795617742105 0.006125092878937722 0.8517750549795791
10 0.006477336943522207 0.006069021382265621 0.8561734213006598
11 0.006328950381839726 0.005949549087219768 0.8567389255419415
12 0.006181292322356987 0.005953347600168652 0.8584982720703739
13 0.006059609644112753 0.005888936513413986 0.8569274269557021
14 0.00596323518853842 0.005787425095008479 0.8566760917373547
15 0.005859358825652363 0.00572800724249747 0.8570530945648759
16 0.0057021950852058235 0.00562522538

In [24]:
# Evaluation with 'dimsumeval.py': compare the predictions in val.pred against the
# gold labels in val.gold (both files are written by preprocess_train_eval).
!python ../dimsum-data-1.5/scripts/dimsumeval.py val.gold val.pred

[40m[97m[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
TEST 11 [('We', 'O', 'O'), ('called', 'O', 'O'), ('few', 'O', 'O'), ('companies', 'O', 'O'), ('before', 'O', 'O'), ('we', 'O', 'O'), ('decide', 'O', 'O'), ('to', 'O', 'O'), ('hire', 'O', 'O'), ('them', 'O', 'O'), ('.', 'O', 'O')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'PUNCT')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'P

[('They', 'PRON'), ('have', 'VERB'), ('the', 'DET'), ('best', 'ADJ'), ('Egg', 'PROPN'), ('Drop', 'PROPN'), ('Soup', 'PROPN'), ('I', 'PRON'), ('have', 'AUX'), ('ever', 'ADV'), ('tasted', 'VERB'), ('.', 'PUNCT')]
TEST 12 [('They', 'O', 'O'), ('have', 'O', 'O'), ('the', 'O', 'O'), ('best', 'O', 'O'), ('Egg', 'B', 'O'), ('Drop', 'I', 'B'), ('Soup', 'I', 'I'), ('I', 'O', 'O'), ('have', 'O', 'O'), ('ever', 'O', 'O'), ('tasted', 'O', 'O'), ('.', 'O', 'O')]
[('We', 'PRON'), ('also', 'ADV'), ('love', 'VERB'), ('their', 'PRON'), ('Egg', 'NOUN'), ('Rolls', 'NOUN'), ('and', 'CONJ'), ('Spring', 'NOUN'), ('Rolls', 'NOUN'), ('.', 'PUNCT')]
[('We', 'PRON'), ('also', 'ADV'), ('love', 'VERB'), ('their', 'PRON'), ('Egg', 'NOUN'), ('Rolls', 'NOUN'), ('and', 'CONJ'), ('Spring', 'NOUN'), ('Rolls', 'NOUN'), ('.', 'PUNCT')]
TEST 10 [('We', 'O', 'O'), ('also', 'O', 'O'), ('love', 'O', 'O'), ('their', 'O', 'O'), ('Egg', 'B', 'O'), ('Rolls', 'I', 'O'), ('and', 'O', 'O'), ('Spring', 'B', 'O'), ('Rolls', 'I', 'O')

TEST 13 [('It', 'O', 'O'), ('used', 'B', 'O'), ('to', 'I', 'O'), ('be', 'O', 'O'), ('fabulous', 'O', 'O'), (',', 'O', 'O'), ('why', 'O', 'O'), ('did', 'O', 'O'), ('you', 'B', 'O'), ('guys', 'I', 'O'), ('change', 'O', 'O'), ('it', 'O', 'O'), ('??', 'O', 'O')]
[('Queso', 'NOUN'), ('should', 'AUX'), ('not', 'PART'), ('be', 'VERB'), ('watery', 'ADJ'), (':(', 'SYM'), ('.....', 'PUNCT')]
[('Queso', 'NOUN'), ('should', 'AUX'), ('not', 'PART'), ('be', 'VERB'), ('watery', 'ADJ'), (':(', 'SYM'), ('.....', 'PUNCT')]
TEST 7 [('Queso', 'O', 'O'), ('should', 'O', 'O'), ('not', 'O', 'O'), ('be', 'O', 'O'), ('watery', 'O', 'O'), (':(', 'O', 'O'), ('.....', 'O', 'O')]
[('***', 'SYM'), ('update', 'NOUN'), ('***', 'SYM'), ('NEVER', 'ADV'), ('MIND', 'VERB'), ('!', 'PUNCT')]
[('***', 'SYM'), ('update', 'NOUN'), ('***', 'SYM'), ('NEVER', 'ADV'), ('MIND', 'VERB'), ('!', 'PUNCT')]
TEST 6 [('***', 'O', 'O'), ('update', 'O', 'O'), ('***', 'O', 'O'), ('NEVER', 'B', 'O'), ('MIND', 'I', 'O'), ('!', 'O', 'O')]
[('T

[('Excellent', 'ADJ'), ('location', 'NOUN'), ('.', 'PUNCT')]
TEST 3 [('Excellent', 'O', 'O'), ('location', 'O', 'O'), ('.', 'O', 'O')]
[('Good', 'ADJ'), ('sports', 'NOUN'), ('bar', 'NOUN'), ('.', 'PUNCT')]
[('Good', 'ADJ'), ('sports', 'NOUN'), ('bar', 'NOUN'), ('.', 'PUNCT')]
TEST 4 [('Good', 'O', 'O'), ('sports', 'B', 'O'), ('bar', 'I', 'O'), ('.', 'O', 'O')]
[('Hyatt', 'PROPN'), ('web', 'NOUN'), ('site', 'NOUN'), ('improved', 'VERB'), ('.', 'PUNCT')]
[('Hyatt', 'PROPN'), ('web', 'NOUN'), ('site', 'NOUN'), ('improved', 'VERB'), ('.', 'PUNCT')]
TEST 5 [('Hyatt', 'O', 'O'), ('web', 'B', 'O'), ('site', 'I', 'O'), ('improved', 'O', 'O'), ('.', 'O', 'O')]
[('Accurate', 'ADJ'), ('check', 'NOUN'), ('-', 'PUNCT'), ('out', 'NOUN'), ('.', 'PUNCT')]
[('Accurate', 'ADJ'), ('check', 'NOUN'), ('-', 'PUNCT'), ('out', 'NOUN'), ('.', 'PUNCT')]
TEST 5 [('Accurate', 'O', 'O'), ('check', 'B', 'O'), ('-', 'I', 'O'), ('out', 'I', 'O'), ('.', 'O', 'O')]
[('Rooms', 'NOUN'), ('clean', 'ADJ'), ('.', 'PUNCT')]


# Modèles GRU pour les Supersens et GRU+CRF pour les MWEs (avec POS)

In [25]:
# Experiment: GRU for supersenses, GRU+CRF for MWEs, both with POS tags as an input feature.
use_pos = True
use_mwe = False


def model_sst_generator(pretrained_weights, le, le_vocab):
    """Build the supersense model: a GRU wrapped in NN_Model with a cross-entropy criterion.

    `le` is the label mapping supplied by preprocess_train_eval. `le_vocab` is accepted
    because preprocess_train_eval passes it to every generator; it is not used here.
    The flags `use_pos` / `use_mwe` are read from the notebook globals at call time.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos, use_mwe=use_mwe)
    return NN_Model(model=gru, criterion=nn.CrossEntropyLoss(), optim=optim.Adam)


def model_mwe_generator(pretrained_weights, le, le_vocab):
    """Build the MWE model: a GRU combined with a CRF layer inside NN_CRF_Model.

    The CRF is created with one tag per entry of `le` (``len(le)``).
    `le_vocab` is accepted for signature compatibility and is not used here.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos)
    crf = CRF(len(le), batch_first=True)
    return NN_CRF_Model(nn_model=gru, crf_model=crf, optim=optim.Adam)


preprocess_train_eval(model_sst_generator, model_mwe_generator, epochs_sst=50, epochs_mwe=50,
                      generate_pretrained_weights=False, use_pos=use_pos, use_mwe=use_mwe,
                      batch_size=64, max_len=16)

Model MWE:
1 8.73650453459049 5.866128836737738 0.8612001256676092
2 6.0982973826148035 5.434354390038385 0.8612001256676092
3 5.694844833333139 5.214844682481554 0.8612001256676092
4 5.435796262395279 4.963136672973633 0.8611372918630223
5 5.192962326226969 4.797886678907607 0.8611372918630223
6 4.9747922715064705 4.604034211900499 0.8611372918630223
7 4.801900374743867 4.495706272125244 0.8612001256676092
8 4.592578121365873 4.3401750670539005 0.8612001256676092
9 4.4483916841116855 4.161848327848646 0.8611372918630223
10 4.266334717669065 4.093575027253893 0.8616399622997173
11 4.112832026525635 3.9249284161461726 0.8613886270813698
12 3.951892027155066 3.9114678541819257 0.8625196355639334
13 3.8405554664386954 3.7499613867865667 0.8628338045868678
14 3.714590351437486 3.7611115879482693 0.8632108074143889
15 3.6051238282303895 3.6249323421054416 0.8640276468740182
16 3.4983473809285686 3.6384131961398656 0.8618912975180647
17 3.390390208165679 3.5225057866838245 0.8664781652529061

In [26]:
# Evaluation with 'dimsumeval.py': compare the predictions in val.pred against the
# gold labels in val.gold (both files are written by preprocess_train_eval).
!python ../dimsum-data-1.5/scripts/dimsumeval.py val.gold val.pred

[40m[97m[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
TEST 11 [('We', 'O', 'O'), ('called', 'O', 'B'), ('few', 'O', 'I'), ('companies', 'O', 'O'), ('before', 'O', 'O'), ('we', 'O', 'O'), ('decide', 'O', 'O'), ('to', 'O', 'O'), ('hire', 'O', 'O'), ('them', 'O', 'O'), ('.', 'O', 'O')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'PUNCT')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'P

TEST 19 [('I', 'O', 'O'), ('can', 'O', 'O'), ('not', 'O', 'O'), ('describe', 'O', 'O'), ('how', 'O', 'O'), ('delicious', 'O', 'O'), ('the', 'O', 'O'), ('mango', 'O', 'O'), ('and', 'O', 'O'), ('cheese', 'O', 'O'), ('pastries', 'O', 'O'), ('and', 'O', 'O'), ('the', 'O', 'O'), ('omelets', 'O', 'O'), ('are', 'B', 'O'), ('to', 'I', 'O'), ('die', 'I', 'O'), ('for', 'I', 'O'), ('.', 'O', 'O')]
[('Stop', 'VERB'), ('in', 'ADV'), ('and', 'CONJ'), ('have', 'VERB'), ('a', 'DET'), ('bite', 'NOUN'), ('you', 'PRON'), ('wo', 'AUX'), ("n't", 'PART'), ('regret', 'VERB'), ('it', 'PRON'), ('.', 'PUNCT')]
[('Stop', 'VERB'), ('in', 'ADV'), ('and', 'CONJ'), ('have', 'VERB'), ('a', 'DET'), ('bite', 'NOUN'), ('you', 'PRON'), ('wo', 'AUX'), ("n't", 'PART'), ('regret', 'VERB'), ('it', 'PRON'), ('.', 'PUNCT')]
TEST 12 [('Stop', 'B', 'O'), ('in', 'I', 'O'), ('and', 'O', 'O'), ('have', 'B', 'O'), ('a', 'I', 'O'), ('bite', 'I', 'O'), ('you', 'O', 'O'), ('wo', 'O', 'O'), ("n't", 'O', 'O'), ('regret', 'O', 'O'), ('it'

TEST 16 [('but', 'O', 'O'), ('sice', 'O', 'O'), ('we', 'O', 'O'), ('almost', 'O', 'O'), ('just', 'O', 'O'), ('slept', 'O', 'O'), ('there', 'O', 'O'), ('i', 'O', 'O'), ('ca', 'O', 'O'), ('nt', 'O', 'O'), ('give', 'O', 'O'), ('that', 'O', 'O'), ('good', 'O', 'O'), ('of', 'O', 'O'), ('a', 'O', 'O'), ('review', 'O', 'O')]
[('Review', 'NOUN'), ('on', 'ADP'), ('House', 'NOUN'), ('of', 'ADP'), ('Joy', 'PROPN'), ('Chinese', 'PROPN'), ('Restaurant', 'PROPN')]
[('Review', 'NOUN'), ('on', 'ADP'), ('House', 'NOUN'), ('of', 'ADP'), ('Joy', 'PROPN'), ('Chinese', 'PROPN'), ('Restaurant', 'PROPN')]
TEST 7 [('Review', 'O', 'O'), ('on', 'O', 'O'), ('House', 'B', 'O'), ('of', 'I', 'O'), ('Joy', 'I', 'O'), ('Chinese', 'I', 'B'), ('Restaurant', 'I', 'I')]
[('My', 'PRON'), ('family', 'NOUN'), ('and', 'CONJ'), ('I', 'PRON'), ('moved', 'VERB'), ('to', 'ADP'), ('San', 'PROPN'), ('Antonio', 'PROPN'), ('a', 'DET'), ('year', 'NOUN'), ('ago', 'ADV'), ('and', 'CONJ'), ('have', 'AUX'), ('tried', 'VERB'), ('almost', 

TEST 13 [('It', 'O', 'O'), ('used', 'B', 'O'), ('to', 'I', 'O'), ('be', 'O', 'O'), ('fabulous', 'O', 'O'), (',', 'O', 'O'), ('why', 'O', 'O'), ('did', 'O', 'O'), ('you', 'B', 'O'), ('guys', 'I', 'O'), ('change', 'O', 'O'), ('it', 'O', 'O'), ('??', 'O', 'O')]
[('Queso', 'NOUN'), ('should', 'AUX'), ('not', 'PART'), ('be', 'VERB'), ('watery', 'ADJ'), (':(', 'SYM'), ('.....', 'PUNCT')]
[('Queso', 'NOUN'), ('should', 'AUX'), ('not', 'PART'), ('be', 'VERB'), ('watery', 'ADJ'), (':(', 'SYM'), ('.....', 'PUNCT')]
TEST 7 [('Queso', 'O', 'O'), ('should', 'O', 'O'), ('not', 'O', 'O'), ('be', 'O', 'O'), ('watery', 'O', 'O'), (':(', 'O', 'O'), ('.....', 'O', 'O')]
[('***', 'SYM'), ('update', 'NOUN'), ('***', 'SYM'), ('NEVER', 'ADV'), ('MIND', 'VERB'), ('!', 'PUNCT')]
[('***', 'SYM'), ('update', 'NOUN'), ('***', 'SYM'), ('NEVER', 'ADV'), ('MIND', 'VERB'), ('!', 'PUNCT')]
TEST 6 [('***', 'O', 'O'), ('update', 'O', 'O'), ('***', 'O', 'O'), ('NEVER', 'B', 'O'), ('MIND', 'I', 'O'), ('!', 'O', 'O')]
[('T

[('Excellent', 'ADJ'), ('location', 'NOUN'), ('.', 'PUNCT')]
TEST 3 [('Excellent', 'O', 'O'), ('location', 'O', 'O'), ('.', 'O', 'O')]
[('Good', 'ADJ'), ('sports', 'NOUN'), ('bar', 'NOUN'), ('.', 'PUNCT')]
[('Good', 'ADJ'), ('sports', 'NOUN'), ('bar', 'NOUN'), ('.', 'PUNCT')]
TEST 4 [('Good', 'O', 'O'), ('sports', 'B', 'B'), ('bar', 'I', 'I'), ('.', 'O', 'O')]
[('Hyatt', 'PROPN'), ('web', 'NOUN'), ('site', 'NOUN'), ('improved', 'VERB'), ('.', 'PUNCT')]
[('Hyatt', 'PROPN'), ('web', 'NOUN'), ('site', 'NOUN'), ('improved', 'VERB'), ('.', 'PUNCT')]
TEST 5 [('Hyatt', 'O', 'B'), ('web', 'B', 'I'), ('site', 'I', 'I'), ('improved', 'O', 'O'), ('.', 'O', 'O')]
[('Accurate', 'ADJ'), ('check', 'NOUN'), ('-', 'PUNCT'), ('out', 'NOUN'), ('.', 'PUNCT')]
[('Accurate', 'ADJ'), ('check', 'NOUN'), ('-', 'PUNCT'), ('out', 'NOUN'), ('.', 'PUNCT')]
TEST 5 [('Accurate', 'O', 'O'), ('check', 'B', 'O'), ('-', 'I', 'O'), ('out', 'I', 'O'), ('.', 'O', 'O')]
[('Rooms', 'NOUN'), ('clean', 'ADJ'), ('.', 'PUNCT')]


# Modèles GRU+CRF pour les 2 (avec POS)

In [27]:
# Experiment: GRU+CRF taggers for both tasks, with POS tags as an input feature.
use_pos = True
use_mwe = False


def model_sst_generator(pretrained_weights, le, le_vocab):
    """Build the supersense model: a GRU combined with a CRF layer inside NN_CRF_Model.

    `le` is the label mapping supplied by preprocess_train_eval; the CRF is created with
    one tag per entry (``len(le)``). `le_vocab` is accepted because preprocess_train_eval
    passes it to every generator; it is not used here. The flags `use_pos` / `use_mwe`
    are read from the notebook globals at call time.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos, use_mwe=use_mwe)
    crf = CRF(len(le), batch_first=True)
    return NN_CRF_Model(nn_model=gru, crf_model=crf, optim=optim.Adam)


def model_mwe_generator(pretrained_weights, le, le_vocab):
    """Build the MWE model: same GRU+CRF setup as the supersense model, without the `use_mwe` flag.

    `le_vocab` is accepted for signature compatibility and is not used here.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos)
    crf = CRF(len(le), batch_first=True)
    return NN_CRF_Model(nn_model=gru, crf_model=crf, optim=optim.Adam)


preprocess_train_eval(model_sst_generator, model_mwe_generator, epochs_sst=50, epochs_mwe=50,
                      generate_pretrained_weights=False, use_pos=use_pos, use_mwe=use_mwe,
                      batch_size=64, max_len=16)

Model MWE:
1 9.309505484224111 5.909699281056722 0.8599434495758718
2 6.1253788940393346 5.487494087219238 0.861262959472196
3 5.742706335169391 5.227386029561361 0.861262959472196
4 5.459218658056597 5.002252197265625 0.861262959472196
5 5.247550507818315 4.803699122534858 0.8613886270813698
6 5.033872419824625 4.640222369299995 0.8614514608859567
7 4.8230935244661035 4.494088416629367 0.8614514608859567
8 4.658554781260467 4.354372130499946 0.8610744580584354
9 4.472766450346493 4.179850806130303 0.8609487904492618
10 4.3187902370497175 4.054551702075535 0.8609487904492618
11 4.149818013842243 3.9934366438123914 0.8606346214263274
12 4.032942303733905 3.858500697877672 0.8614514608859567
13 3.894058654548938 3.7522630903455947 0.8624568017593466
14 3.749660985957444 3.6695176866319446 0.8633364750235627
15 3.638720606366379 3.5950724124908446 0.8644046497015394
16 3.522630196946403 3.555061509874132 0.8633993088281495
17 3.430822056153943 3.520235374238756 0.8652843229657555
18 3.330

In [28]:
# Evaluation with 'dimsumeval.py'
!python ../dimsum-data-1.5/scripts/dimsumeval.py val.gold val.pred

[40m[97m[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
TEST 11 [('We', 'O', 'O'), ('called', 'O', 'B'), ('few', 'O', 'I'), ('companies', 'O', 'O'), ('before', 'O', 'O'), ('we', 'O', 'O'), ('decide', 'O', 'O'), ('to', 'O', 'O'), ('hire', 'O', 'O'), ('them', 'O', 'O'), ('.', 'O', 'O')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'PUNCT')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'P

[('We', 'PRON'), ('just', 'ADV'), ('happen', 'VERB'), ('to', 'PART'), ('stumble', 'VERB'), ('across', 'ADP'), ('this', 'DET'), ('little', 'ADJ'), ('restaurant', 'NOUN'), ('one', 'NUM'), ('day', 'NOUN'), ('when', 'ADV'), ('we', 'PRON'), ('had', 'VERB'), ('to', 'PART'), ('visit', 'VERB'), ('the', 'DET'), ('Bexar', 'PROPN'), ('County', 'PROPN'), ('Tax', 'PROPN'), ('Office', 'PROPN'), ('off', 'ADP'), ('of', 'ADP'), ('Bandera', 'PROPN'), ('Road', 'PROPN'), ('.', 'PUNCT')]
TEST 26 [('We', 'O', 'O'), ('just', 'O', 'O'), ('happen', 'B', 'O'), ('to', 'I', 'O'), ('stumble', 'B', 'O'), ('across', 'I', 'O'), ('this', 'O', 'O'), ('little', 'O', 'O'), ('restaurant', 'O', 'O'), ('one', 'O', 'O'), ('day', 'O', 'O'), ('when', 'O', 'O'), ('we', 'O', 'O'), ('had', 'B', 'O'), ('to', 'I', 'O'), ('visit', 'O', 'O'), ('the', 'O', 'O'), ('Bexar', 'B', 'O'), ('County', 'I', 'B'), ('Tax', 'I', 'I'), ('Office', 'I', 'O'), ('off', 'B', 'O'), ('of', 'I', 'O'), ('Bandera', 'B', 'O'), ('Road', 'I', 'O'), ('.', 'O', 

TEST 13 [('It', 'O', 'O'), ('used', 'B', 'O'), ('to', 'I', 'O'), ('be', 'O', 'O'), ('fabulous', 'O', 'O'), (',', 'O', 'O'), ('why', 'O', 'O'), ('did', 'O', 'O'), ('you', 'B', 'O'), ('guys', 'I', 'O'), ('change', 'O', 'O'), ('it', 'O', 'O'), ('??', 'O', 'O')]
[('Queso', 'NOUN'), ('should', 'AUX'), ('not', 'PART'), ('be', 'VERB'), ('watery', 'ADJ'), (':(', 'SYM'), ('.....', 'PUNCT')]
[('Queso', 'NOUN'), ('should', 'AUX'), ('not', 'PART'), ('be', 'VERB'), ('watery', 'ADJ'), (':(', 'SYM'), ('.....', 'PUNCT')]
TEST 7 [('Queso', 'O', 'O'), ('should', 'O', 'O'), ('not', 'O', 'O'), ('be', 'O', 'O'), ('watery', 'O', 'O'), (':(', 'O', 'O'), ('.....', 'O', 'O')]
[('***', 'SYM'), ('update', 'NOUN'), ('***', 'SYM'), ('NEVER', 'ADV'), ('MIND', 'VERB'), ('!', 'PUNCT')]
[('***', 'SYM'), ('update', 'NOUN'), ('***', 'SYM'), ('NEVER', 'ADV'), ('MIND', 'VERB'), ('!', 'PUNCT')]
TEST 6 [('***', 'O', 'O'), ('update', 'O', 'O'), ('***', 'O', 'O'), ('NEVER', 'B', 'O'), ('MIND', 'I', 'O'), ('!', 'O', 'O')]
[('T

[('Excellent', 'ADJ'), ('location', 'NOUN'), ('.', 'PUNCT')]
TEST 3 [('Excellent', 'O', 'O'), ('location', 'O', 'O'), ('.', 'O', 'O')]
[('Good', 'ADJ'), ('sports', 'NOUN'), ('bar', 'NOUN'), ('.', 'PUNCT')]
[('Good', 'ADJ'), ('sports', 'NOUN'), ('bar', 'NOUN'), ('.', 'PUNCT')]
TEST 4 [('Good', 'O', 'O'), ('sports', 'B', 'O'), ('bar', 'I', 'O'), ('.', 'O', 'O')]
[('Hyatt', 'PROPN'), ('web', 'NOUN'), ('site', 'NOUN'), ('improved', 'VERB'), ('.', 'PUNCT')]
[('Hyatt', 'PROPN'), ('web', 'NOUN'), ('site', 'NOUN'), ('improved', 'VERB'), ('.', 'PUNCT')]
TEST 5 [('Hyatt', 'O', 'B'), ('web', 'B', 'I'), ('site', 'I', 'I'), ('improved', 'O', 'O'), ('.', 'O', 'O')]
[('Accurate', 'ADJ'), ('check', 'NOUN'), ('-', 'PUNCT'), ('out', 'NOUN'), ('.', 'PUNCT')]
[('Accurate', 'ADJ'), ('check', 'NOUN'), ('-', 'PUNCT'), ('out', 'NOUN'), ('.', 'PUNCT')]
TEST 5 [('Accurate', 'O', 'O'), ('check', 'B', 'O'), ('-', 'I', 'O'), ('out', 'I', 'O'), ('.', 'O', 'O')]
[('Rooms', 'NOUN'), ('clean', 'ADJ'), ('.', 'PUNCT')]


# Modèles GRU+CRF pour les deux tâches (SST et MWE), avec les prédictions MWE et avec les POS

In [29]:
# Experiment switches: both the POS feature and the MWE feature are enabled.
use_pos = True
use_mwe = True


def model_sst_generator(pretrained_weights, le, le_vocab):
    """Factory for the SST model: a GRU network combined with a CRF.

    `le` is the label mapping (its length sets the number of CRF tags).
    `le_vocab` is accepted to match the generator signature but is not used.
    `use_pos` and `use_mwe` are read from the notebook globals at call time.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos, use_mwe=use_mwe)
    crf = CRF(len(le), batch_first=True)
    return NN_CRF_Model(nn_model=gru, crf_model=crf, optim=optim.Adam)


def model_mwe_generator(pretrained_weights, le, le_vocab):
    """Factory for the MWE model: same GRU+CRF layout, without the MWE input feature.

    `le_vocab` is accepted to match the generator signature but is not used.
    `use_pos` is read from the notebook globals at call time.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos)
    crf = CRF(len(le), batch_first=True)
    return NN_CRF_Model(nn_model=gru, crf_model=crf, optim=optim.Adam)


# Train both models and evaluate them on the validation split.
preprocess_train_eval(
    model_sst_generator,
    model_mwe_generator,
    epochs_sst=50,
    epochs_mwe=50,
    generate_pretrained_weights=False,
    use_pos=use_pos,
    use_mwe=use_mwe,
    batch_size=64,
    max_len=16,
)

Model MWE:
1 9.03992241612145 5.868716229332818 0.8614514608859567
2 6.025023778090345 5.500612884097629 0.8614514608859567
3 5.649516086061642 5.156338193681505 0.8614514608859567
4 5.399408184300224 5.012870587242974 0.8614514608859567
5 5.166729762675543 4.755643516116672 0.8614514608859567
6 4.977345835988668 4.64733034769694 0.8614514608859567
7 4.801272520319128 4.441998269822863 0.8614514608859567
8 4.627951174273524 4.304982418484158 0.8613886270813698
9 4.450990046018219 4.209365865919325 0.8614514608859567
10 4.302483747459019 4.078367413414849 0.8613257932767829
11 4.150582120758538 3.926039695739746 0.861262959472196
12 3.9918474184327555 3.8418737570444743 0.8620169651272385
13 3.877923332321108 3.769508960511949 0.862708136977694
14 3.733217012814803 3.6752215915256077 0.8637763116556707
15 3.609720950681704 3.626780536439684 0.8633364750235627
16 3.5058956708394193 3.5327776114145917 0.8656613257932768
17 3.4162141278088893 3.528756332397461 0.8650958215519949
18 3.32148

In [30]:
# Evaluation with 'dimsumeval.py'
!python ../dimsum-data-1.5/scripts/dimsumeval.py val.gold val.pred

[40m[97m[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
[('We', 'PRON'), ('called', 'VERB'), ('few', 'ADJ'), ('companies', 'NOUN'), ('before', 'SCONJ'), ('we', 'PRON'), ('decide', 'VERB'), ('to', 'PART'), ('hire', 'VERB'), ('them', 'PRON'), ('.', 'PUNCT')]
TEST 11 [('We', 'O', 'O'), ('called', 'O', 'B'), ('few', 'O', 'I'), ('companies', 'O', 'O'), ('before', 'O', 'O'), ('we', 'O', 'O'), ('decide', 'O', 'O'), ('to', 'O', 'O'), ('hire', 'O', 'O'), ('them', 'O', 'O'), ('.', 'O', 'O')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'PUNCT')]
[('They', 'PRON'), ('came', 'VERB'), ('on', 'ADP'), ('time', 'NOUN'), ('and', 'CONJ'), ('completed', 'VERB'), ('their', 'PRON'), ('work', 'NOUN'), ('quickly', 'ADV'), ('.', 'P

[('This', 'DET'), ('place', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('complete', 'ADJ'), ('embarrassment', 'NOUN'), ('.', 'PUNCT')]
[('This', 'DET'), ('place', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('complete', 'ADJ'), ('embarrassment', 'NOUN'), ('.', 'PUNCT')]
TEST 7 [('This', 'O', 'O'), ('place', 'O', 'O'), ('is', 'O', 'O'), ('a', 'O', 'O'), ('complete', 'O', 'O'), ('embarrassment', 'O', 'O'), ('.', 'O', 'O')]
[('the', 'DET'), ('2010', 'NUM'), ('Genesis', 'PROPN'), ('is', 'VERB'), ('grrrrrrrreeeaaat', 'ADJ'), ('!', 'PUNCT')]
[('the', 'DET'), ('2010', 'NUM'), ('Genesis', 'PROPN'), ('is', 'VERB'), ('grrrrrrrreeeaaat', 'ADJ'), ('!', 'PUNCT')]
TEST 6 [('the', 'O', 'O'), ('2010', 'B', 'O'), ('Genesis', 'I', 'O'), ('is', 'O', 'O'), ('grrrrrrrreeeaaat', 'O', 'O'), ('!', 'O', 'O')]
[('I', 'PRON'), ('am', 'VERB'), ('a', 'DET'), ('proud', 'ADJ'), ('owner', 'NOUN'), ('of', 'ADP'), ('a', 'DET'), ('brand', 'ADV'), ('new', 'ADJ'), ('2010', 'NUM'), ('Hyundai', 'PROPN'), ('Genesis', 'PROPN'), ('.', 

TEST 4 [('She', 'O', 'O'), ('is', 'O', 'O'), ('amazing', 'O', 'O'), ('.', 'O', 'O')]
[('I', 'PRON'), ('would', 'AUX'), ('recommend', 'VERB'), ('her', 'PRON'), ('to', 'ADP'), ('anyone', 'NOUN'), ('!', 'PUNCT')]
[('I', 'PRON'), ('would', 'AUX'), ('recommend', 'VERB'), ('her', 'PRON'), ('to', 'ADP'), ('anyone', 'NOUN'), ('!', 'PUNCT')]
TEST 7 [('I', 'O', 'O'), ('would', 'O', 'O'), ('recommend', 'O', 'O'), ('her', 'O', 'O'), ('to', 'O', 'O'), ('anyone', 'O', 'O'), ('!', 'O', 'O')]
[('The', 'DET'), ('best', 'ADJ'), ('pizza', 'NOUN'), ('ever', 'ADV'), ('i', 'PRON'), ('m', 'VERB'), ('fat', 'ADJ'), ('so', 'ADV'), ('i', 'PRON'), ('ve', 'AUX'), ('had', 'VERB'), ('a', 'DET'), ('ton', 'NOUN'), ('of', 'ADP'), ('pizza', 'NOUN'), ('other', 'ADJ'), ('than', 'ADP'), ('pizza', 'NOUN'), ('from', 'ADP'), ('chicago', 'PROPN'), ('it', 'PRON'), ('s', 'VERB'), ('the', 'DET'), ('best', 'ADJ')]
[('The', 'DET'), ('best', 'ADJ'), ('pizza', 'NOUN'), ('ever', 'ADV'), ('i', 'PRON'), ('m', 'VERB'), ('fat', 'ADJ'), ('

[('I', 'PRON'), ('considered', 'VERB'), ('just', 'ADV'), ('leaving', 'VERB'), ('after', 'SCONJ'), ('going', 'VERB'), ('inside', 'ADV'), ('and', 'CONJ'), ('nearly', 'ADV'), ('did', 'AUX'), ('.', 'PUNCT')]
TEST 11 [('I', 'O', 'O'), ('considered', 'O', 'O'), ('just', 'O', 'O'), ('leaving', 'O', 'O'), ('after', 'O', 'O'), ('going', 'O', 'O'), ('inside', 'O', 'O'), ('and', 'O', 'O'), ('nearly', 'O', 'O'), ('did', 'O', 'O'), ('.', 'O', 'O')]
[('Doctor', 'PROPN'), ('Gonzales', 'PROPN'), ('and', 'CONJ'), ('his', 'PRON'), ('entire', 'ADJ'), ('staff', 'NOUN'), ('are', 'VERB'), ('the', 'DET'), ('most', 'ADV'), ('professional', 'ADJ'), ('people', 'NOUN'), ('I', 'PRON'), ('have', 'AUX'), ('ever', 'ADV'), ('dealt', 'VERB'), ('with', 'ADP'), ('.', 'PUNCT')]
[('Doctor', 'PROPN'), ('Gonzales', 'PROPN'), ('and', 'CONJ'), ('his', 'PRON'), ('entire', 'ADJ'), ('staff', 'NOUN'), ('are', 'VERB'), ('the', 'DET'), ('most', 'ADV'), ('professional', 'ADJ'), ('people', 'NOUN'), ('I', 'PRON'), ('have', 'AUX'), ('e

[('Fantastic', 'ADJ'), ('food', 'NOUN'), ('served', 'VERB'), ('without', 'ADP'), ('pretense', 'NOUN'), (',', 'PUNCT'), ('very', 'ADV'), ('reasonably', 'ADV'), ('priced', 'VERB'), ('wine', 'NOUN'), ('selections', 'NOUN'), ('.', 'PUNCT')]
[('Fantastic', 'ADJ'), ('food', 'NOUN'), ('served', 'VERB'), ('without', 'ADP'), ('pretense', 'NOUN'), (',', 'PUNCT'), ('very', 'ADV'), ('reasonably', 'ADV'), ('priced', 'VERB'), ('wine', 'NOUN'), ('selections', 'NOUN'), ('.', 'PUNCT')]
TEST 12 [('Fantastic', 'O', 'O'), ('food', 'O', 'O'), ('served', 'O', 'O'), ('without', 'O', 'O'), ('pretense', 'O', 'O'), (',', 'O', 'O'), ('very', 'O', 'O'), ('reasonably', 'O', 'O'), ('priced', 'O', 'B'), ('wine', 'O', 'I'), ('selections', 'O', 'O'), ('.', 'O', 'O')]
[('A', 'DET'), ('great', 'ADJ'), ('place', 'NOUN'), ('to', 'PART'), ('go', 'VERB'), ('for', 'ADP'), ('dinner', 'NOUN'), ('after', 'ADP'), ('a', 'DET'), ('day', 'NOUN'), ('of', 'ADP'), ('wine', 'NOUN'), ('tasting', 'NOUN'), ('.', 'PUNCT')]
[('A', 'DET'), (

# Évaluation sur l'ensemble des données de test

In [1]:
from Model import Model

from Generic_Torch_Model import Simple_GRU as GRU
from Generic_Torch_Model import NN_Model, NN_CRF_Model, extract_data, train_test_split_sentences, transform, \
                                load_pretrained_weights, transform_test, align_pred, write_data, combine_sst_mwe
import torch
import torch.nn as nn
import torch.optim as optim
from torchcrf import CRF
import pickle


def _append_mwe_feature(X_enc, y_mwe, use_pos):
    """Return `X_enc` with the MWE tags `y_mwe` appended as one extra entry on the last axis.

    Both arguments must be tensors whose first two dimensions match
    (presumably sentences x tokens -- TODO confirm against `transform_test`).
    When `use_pos` is False the encoded input is first reshaped to
    (d0, d1, 1) so that it has a last axis to concatenate on; when True it is
    used as is (it is expected to already carry that axis).
    """
    mwe_feature = y_mwe.view(y_mwe.size(0), y_mwe.size(1), 1)
    if not use_pos:
        X_enc = X_enc.view(X_enc.size(0), X_enc.size(1), 1)
    return torch.cat((X_enc, mwe_feature), dim=-1)


def preprocess_train_test(model_sst_generator, model_mwe_generator, epochs_sst, epochs_mwe,
                          generate_pretrained_weights=False, use_pos=False, use_mwe=False,
                          batch_size=64, max_len=16):
    """Train the MWE and SST models on the full training file and predict the test file.

    Steps: read the train / blind-test files, encode them, load (or build and
    pickle) the pretrained embedding weights, fit the MWE model then the SST
    model, print precision / recall / F1 / accuracy of both on the test data,
    and write the predictions to 'val_before_comb.pred' (before
    `combine_sst_mwe`) and 'test.pred' (after it).

    Parameters
    ----------
    model_sst_generator, model_mwe_generator : callable
        Called as ``generator(pretrained_weights, label_mapping, le_vocab)``;
        must return an object exposing ``fit``, ``score`` and ``predict``.
    epochs_sst, epochs_mwe : int
        Number of epochs passed to each model's ``fit``.
    generate_pretrained_weights : bool
        If True, build the weights from 'wiki-news-300d-1M.vec' and pickle them
        to 'pretrained_weights_2.pkl'; otherwise load that pickle.
    use_pos : bool
        Forwarded to `transform`; also selects how the MWE feature is
        concatenated to the encoded input (see `_append_mwe_feature`).
    use_mwe : bool
        If True, the SST model receives MWE tags as an extra input feature:
        gold tags while fitting, the MWE model's predictions when scoring and
        predicting on the test data.
    batch_size : int
        Batch size of the data loaders.
    max_len : int
        Maximum sequence length used when encoding the training data.

    Returns
    -------
    None. Results are printed and written to disk.
    """
    # Imported locally so the function also works in a fresh kernel in which
    # only this cell's imports have been executed.
    from torch.utils.data import TensorDataset, DataLoader

    # Load data (context managers make sure both files are closed).
    # NOTE(review): the train file is read with the platform default encoding
    # while the test file is read as UTF-8 -- kept as is, confirm it is intended.
    with open('../dimsum-data-1.5/dimsum16.train', 'r') as train_file:
        data = train_file.readlines()
    with open('../dimsum-data-1.5/dimsum16.test.blind', 'r', encoding='utf-8') as test_file:
        data_test = test_file.readlines()

    # test_size=.0: nothing is held out, the splitter is only used to get the
    # same sentence-level structure as in the train/validation workflow.
    X, y_sst, y_mwe = extract_data(data)
    X_, _, y_sst, _ = train_test_split_sentences(X, y_sst, test_size=.0, random_state=0, shuffle=False)
    X, _, y_mwe, __ = train_test_split_sentences(X, y_mwe, test_size=.0, random_state=0, shuffle=False)
    assert (X == X_)
    X_test, y_sst_test, y_mwe_test = extract_data(data_test)
    X_test_, _, y_sst_test, _ = train_test_split_sentences(X_test, y_sst_test, test_size=.0, random_state=0, shuffle=False)
    X_test, _, y_mwe_test, __ = train_test_split_sentences(X_test, y_mwe_test, test_size=.0, random_state=0, shuffle=False)
    assert (X_test == X_test_)

    vocab_, le_vocab_, le_pos_, le_sst, train_loader_sst, test_loader_sst = transform(X, y_sst, X_test, y_sst_test,
                                                                                      max_len, batch_size, use_pos)
    vocab, le_vocab, le_pos, le_mwe, train_loader_mwe, test_loader_mwe = transform(X, y_mwe, X_test, y_mwe_test,
                                                                                   max_len, batch_size, use_pos)
    # Both calls see the same sentences, so they must agree on the input encoders.
    assert (vocab == vocab_) and (le_vocab == le_vocab_) and (le_pos == le_pos_)

    # Load Pretrained Weights (or generate for the first time)
    if generate_pretrained_weights:
        pretrained_weights = load_pretrained_weights('wiki-news-300d-1M.vec', vocab_size=len(vocab), le_vocab=le_vocab)
        with open("pretrained_weights_2.pkl", 'wb') as fo:
            pickle.dump(pretrained_weights, fo)
    else:
        pretrained_weights = load_pretrained_weights('pretrained_weights_2.pkl', from_pickle=True)

    # Train
    print('Model MWE:')
    model_mwe = model_mwe_generator(pretrained_weights, le_mwe, le_vocab)
    model_mwe.fit(train_loader_mwe, test_loader_mwe, epochs=epochs_mwe)

    if use_mwe:
        # Rebuild the SST loaders so that the input carries the MWE tags.
        X_enc, y_mwe_enc = transform_test(le_vocab, le_mwe, X, y_mwe, max_len, le_pos)
        X_enc, y_sst_enc = transform_test(le_vocab, le_sst, X, y_sst, max_len, le_pos)
        X_enc_test, y_mwe_enc_test = transform_test(le_vocab, le_mwe, X_test, y_mwe_test, max_len, le_pos)
        X_enc_test, y_sst_enc_test = transform_test(le_vocab, le_sst, X_test, y_sst_test, max_len, le_pos)

        # While fitting, the encoded reference MWE labels are used as the feature
        # (not the MWE model's predictions).
        train_set = TensorDataset(_append_mwe_feature(X_enc, y_mwe_enc, use_pos), y_sst_enc)
        test_set = TensorDataset(_append_mwe_feature(X_enc_test, y_mwe_enc_test, use_pos), y_sst_enc_test)
        train_loader_sst = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        test_loader_sst = DataLoader(test_set, batch_size=batch_size)

    print('\nModel SST:')
    model_sst = model_sst_generator(pretrained_weights, le_sst, le_vocab)
    model_sst.fit(train_loader_sst, test_loader_sst, epochs=epochs_sst)

    # Get Model F1-Score on test set
    # The test data is re-encoded with max_len=100 instead of the training max_len.
    X_test_enc, y_sst_test_enc = transform_test(le_vocab, le_sst, X_test, y_sst_test, max_len=100, le_pos=le_pos)
    X_test_enc, y_mwe_test_enc = transform_test(le_vocab, le_mwe, X_test, y_mwe_test, max_len=100, le_pos=le_pos)

    precision, recall, f1_score, accuracy = model_mwe.score(X_test_enc, y_mwe_test_enc, le_mwe[''], le_mwe)
    print(f'\nModel MWE score: Acc={accuracy:.4} P={precision:.4f}, R={recall:.4f}, F1={f1_score:.4f}')

    # Predict the MWE tags exactly once, from the plain encoded input. The SST
    # input is kept in its own variable so the MWE model never sees its own
    # predictions as a feature and the MWE column is appended only once.
    y_mwe_hat_test = model_mwe.predict(X_test_enc)
    if use_mwe:
        X_test_enc_sst = _append_mwe_feature(X_test_enc, y_mwe_hat_test, use_pos)
    else:
        X_test_enc_sst = X_test_enc

    precision, recall, f1_score, accuracy = model_sst.score(X_test_enc_sst, y_sst_test_enc, le_sst[''], le_sst)
    print(f'Model SST score: Acc={accuracy:.4} P={precision:.4f}, R={recall:.4f}, F1={f1_score:.4f}')

    # Write the prediction files used by the external evaluation script
    y_sst_hat_test = model_sst.predict(X_test_enc_sst)
    y_mwe_hat_test_align = align_pred(X_test, y_mwe_hat_test, le_mwe, is_mwe=True)
    y_sst_hat_test_align = align_pred(X_test, y_sst_hat_test, le_sst, is_mwe=False)
    rev_le_mwe = {v: k for k, v in le_mwe.items()}
    rev_le_sst = {v: k for k, v in le_sst.items()}
    # Decode; ids without a known label become the empty string.
    y_mwe_hat_test_align = [rev_le_mwe.get(yi, '') for yi in y_mwe_hat_test_align]
    y_sst_hat_test_align = [rev_le_sst.get(yi, '') for yi in y_sst_hat_test_align]
    # Copies are passed so the lists kept here are not affected by the callee.
    write_data(X_test, y_sst_hat_test_align.copy(), y_mwe_hat_test_align.copy(), 'val_before_comb.pred')
    y_sst_hat_test_comb, y_mwe_hat_test_comb = combine_sst_mwe(X_test, y_sst_hat_test_align, y_mwe_hat_test_align)
    write_data(X_test, y_sst_hat_test_comb.copy(), y_mwe_hat_test_comb.copy(), 'test.pred')

# Différence de score entre l'évaluation avec le modèle et l'évaluateur fourni,
# car le modèle fait l'évaluation uniquement sur les séquences de mots de taille inférieure à max_len,
# alors que l'évaluateur le fait sur tous les mots, même ceux coupés lors du pré-traitement

In [6]:
# Experiment switches: POS feature enabled, MWE feature disabled.
use_pos = True
use_mwe = False


def model_sst_generator(pretrained_weights, le, le_vocab):
    """Factory for the SST model: a GRU network trained with cross-entropy (no CRF).

    `le_vocab` is accepted to match the generator signature but is not used.
    `use_pos` and `use_mwe` are read from the notebook globals at call time.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos, use_mwe=use_mwe)
    loss = nn.CrossEntropyLoss()
    return NN_Model(model=gru, criterion=loss, optim=optim.Adam)


def model_mwe_generator(pretrained_weights, le, le_vocab):
    """Factory for the MWE model: a GRU network combined with a CRF.

    `le` is the label mapping (its length sets the number of CRF tags).
    `le_vocab` is accepted to match the generator signature but is not used.
    `use_pos` is read from the notebook globals at call time.
    """
    gru = GRU(pretrained_weights, le,
              embed_size=300, hidden_size=128,
              use_pos=use_pos)
    crf = CRF(len(le), batch_first=True)
    return NN_CRF_Model(nn_model=gru, crf_model=crf, optim=optim.Adam)


# Train on the full training file and write predictions for the test file.
preprocess_train_test(
    model_sst_generator,
    model_mwe_generator,
    epochs_sst=50,
    epochs_mwe=50,
    generate_pretrained_weights=False,
    use_pos=use_pos,
    use_mwe=use_mwe,
    batch_size=64,
    max_len=16,
)

Model MWE:
1 8.132709170113158 2.3801029968261718 1.0
2 5.762753956614496 2.072233680725098 1.0
3 5.358522151653705 2.180206787109375 1.0
4 5.057845538346612 1.8386101989746093 1.0
5 4.773624910615737 1.7384697113037109 1.0
6 4.516289251748411 1.8107745742797852 0.9981891513704831
7 4.294748931458702 1.8827135162353517 0.9968721705490163
8 4.0986575591462335 1.68231640625 0.9972837270557248
9 3.9336355328187267 2.1059583587646484 0.9911926907564409
10 3.781376280072183 1.7265038223266602 0.9925919828792493
11 3.636628755258257 1.2699452667236328 0.9963783027409663
12 3.510375769890008 1.2677659454345702 0.9947320767141329
13 3.378319411407935 1.324837371826172 0.9920981150711993
14 3.2761343470909465 1.164588035583496 0.9934150958926661
15 3.177141085046608 1.0042812690734864 0.9951436332208412
16 3.079178040860767 1.2120559616088866 0.9883941065108239
17 3.0062968784085062 1.0309064254760743 0.992427360276566
18 2.904366154798892 1.1216413269042969 0.9888056630175323
19 2.814539426663

In [7]:
# Evaluation with 'dimsumeval.py'
!python ../dimsum-data-1.5/scripts/dimsumeval.py ../dimsum-data-1.5/dimsum16.test_encoded.test test.pred

[40m[97m[('@JoJoLyrics', 'X'), ('I', 'PRON'), ('hear', 'VERB'), ('enough', 'ADV'), ('talking', 'VERB'), (',', 'PUNCT'), ('just', 'ADV'), ('turn', 'VERB'), ('round', 'ADV'), ('keep', 'VERB'), ('walking', 'VERB'), ('haha', 'INTJ'), ('in', 'ADP'), ('that', 'DET'), ('part', 'NOUN')]
[('@JoJoLyrics', 'X'), ('I', 'PRON'), ('hear', 'VERB'), ('enough', 'ADV'), ('talking', 'VERB'), (',', 'PUNCT'), ('just', 'ADV'), ('turn', 'VERB'), ('round', 'ADV'), ('keep', 'VERB'), ('walking', 'VERB'), ('haha', 'INTJ'), ('in', 'ADP'), ('that', 'DET'), ('part', 'NOUN')]
TEST 15 [('@JoJoLyrics', 'O', 'O'), ('I', 'O', 'O'), ('hear', 'O', 'O'), ('enough', 'O', 'O'), ('talking', 'O', 'O'), (',', 'O', 'O'), ('just', 'O', 'O'), ('turn', 'B', 'O'), ('round', 'I', 'O'), ('keep', 'O', 'O'), ('walking', 'O', 'O'), ('haha', 'O', 'O'), ('in', 'O', 'O'), ('that', 'O', 'O'), ('part', 'O', 'O')]
[('Photo', 'NOUN'), (':', 'PUNCT'), ('http://t.co/2zggP2a', 'X')]
[('Photo', 'NOUN'), (':', 'PUNCT'), ('http://t.co/2zggP2a', 'X'

[('Wind', 'NOUN'), ('2.2', 'NUM'), ('mph', 'NOUN'), ('NNW', 'NOUN'), ('.', 'PUNCT'), ('Barometer', 'NOUN'), ('1019.0', 'NUM'), ('mb', 'NOUN'), (',', 'PUNCT'), ('Rising', 'VERB'), ('.', 'PUNCT'), ('Temperature', 'NOUN'), ('15.5', 'NUM'), ('Â°C', 'NOUN'), ('.', 'PUNCT'), ('Rain', 'NOUN'), ('today', 'NOUN'), ('0.0', 'NUM'), ('mm', 'NOUN'), ('.', 'PUNCT'), ('Humidity', 'NOUN'), ('59%', 'NUM')]
TEST 22 [('Wind', 'O', 'O'), ('2.2', 'O', 'O'), ('mph', 'O', 'O'), ('NNW', 'O', 'O'), ('.', 'O', 'O'), ('Barometer', 'O', 'O'), ('1019.0', 'O', 'O'), ('mb', 'O', 'O'), (',', 'O', 'O'), ('Rising', 'O', 'O'), ('.', 'O', 'O'), ('Temperature', 'O', 'O'), ('15.5', 'O', 'O'), ('Â°C', 'O', 'O'), ('.', 'O', 'O'), ('Rain', 'O', 'O'), ('today', 'O', 'O'), ('0.0', 'O', 'O'), ('mm', 'O', 'O'), ('.', 'O', 'O'), ('Humidity', 'O', 'O'), ('59%', 'O', 'O')]
[('I', 'PRON'), ("'", 'PART'), ('ve', 'VERB'), ('just', 'ADV'), ('found', 'VERB'), ('out', 'PART'), ('that', 'CONJ'), ('I', 'PRON'), ("'", 'PART'), ('ve', 'VERB')

[('http://www.youtube.com/watch?v=DnrAJgkOG3w', 'X'), ('@justinbieber', 'X'), ('look', 'VERB'), ('what', 'PRON'), ('i', 'PRON'), ('made', 'VERB'), ('?', 'PUNCT')]
[('http://www.youtube.com/watch?v=DnrAJgkOG3w', 'X'), ('@justinbieber', 'X'), ('look', 'VERB'), ('what', 'PRON'), ('i', 'PRON'), ('made', 'VERB'), ('?', 'PUNCT')]
TEST 7 [('http://www.youtube.com/watch?v=DnrAJgkOG3w', 'O', 'O'), ('@justinbieber', 'O', 'O'), ('look', 'O', 'O'), ('what', 'O', 'O'), ('i', 'O', 'O'), ('made', 'O', 'O'), ('?', 'O', 'O')]
[('Would', 'VERB'), ('still', 'ADV'), ('have', 'VERB'), ('no', 'DET'), ('hesitation', 'NOUN'), ('about', 'ADP'), ('booking', 'VERB'), ('Resort', 'PROPN'), ('Hoppa', 'PROPN'), ('for', 'ADP'), ('our', 'DET'), ('next', 'ADJ'), ('holiday', 'NOUN'), ('in', 'ADP'), ('June', 'NOUN'), ('and', 'CONJ'), ('again', 'ADV'), ('after', 'ADP'), ('that', 'PRON'), ('.', 'PUNCT')]
[('Would', 'VERB'), ('still', 'ADV'), ('have', 'VERB'), ('no', 'DET'), ('hesitation', 'NOUN'), ('about', 'ADP'), ('booki

[('Be', 'VERB'), ('ready', 'ADJ'), ('for', 'ADP'), ('anything', 'NOUN'), ('because', 'ADP'), ('anything', 'NOUN'), ('goes', 'VERB'), ('with', 'ADP'), ('me', 'PRON'), ('.', 'PUNCT'), ('Check', 'VERB'), ('out', 'ADP'), ('http://minilien.fr/a0ku66', 'X'), ('#match', 'X'), ('#sex', 'X'), ('#real', 'X')]
TEST 16 [('Be', 'O', 'O'), ('ready', 'O', 'O'), ('for', 'O', 'O'), ('anything', 'O', 'O'), ('because', 'O', 'B'), ('anything', 'O', 'I'), ('goes', 'O', 'O'), ('with', 'O', 'O'), ('me', 'O', 'O'), ('.', 'O', 'O'), ('Check', 'B', 'B'), ('out', 'I', 'I'), ('http://minilien.fr/a0ku66', 'O', 'B'), ('#match', 'O', 'I'), ('#sex', 'O', 'O'), ('#real', 'O', 'O')]
[('It', 'PRON'), ("'s", 'AUX'), ('raining', 'VERB'), ('!!', 'PUNCT'), ('---', 'PUNCT')]
[('It', 'PRON'), ("'s", 'AUX'), ('raining', 'VERB'), ('!!', 'PUNCT'), ('---', 'PUNCT')]
TEST 5 [('It', 'O', 'O'), ("'s", 'O', 'O'), ('raining', 'O', 'O'), ('!!', 'O', 'O'), ('---', 'O', 'O')]
[('And', 'CONJ'), ('their', 'PRON'), ('aspiration', 'NOUN'), (

# Tests de la fonction combine_sst_mwe

In [15]:
# Manual check of combine_sst_mwe on a tiny hand-built input.

# Explicit filler for the columns this check does not set. The name `_` is
# deliberately not used: assigning to it in IPython shadows the "last output"
# variable, which then stops being updated for the rest of the session.
FILLER = None

# Three numbered rows followed by one '<eos>' row.
X = [[i + 1, FILLER, FILLER, FILLER] for i in range(3)] + [['', '<eos>', FILLER, FILLER]]
y_mwe = ['b', 'b', 'b', '']
y_sst = ['', '', '', '']

# Alternative inputs tried by hand, kept for reference.
# With a 10-row X, i.e. X = [[i + 1, FILLER, FILLER, FILLER] for i in range(10)]:
#y_mwe = ['I', 'b', 'b', 'I', 'b', 'B', 'I', 'O', 'O', 'O']
#y_mwe = ['I', 'I', 'I', 'I', 'o', 'B', 'B', 'O', 'O', 'O']
#y_mwe = ['O', 'O', 'O', 'B', 'o', 'O', 'O', 'O', 'O', 'O']
# With two 10-row sentences separated by '<eos>' rows:
#y_mwe = ['O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'B'] + ['I'] + ['O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'B'] + ['<eos>']
#y_sst = ['', '', '', '', '', 'v.social', '', 'n.act', '', ''] + [''] + ['', '', '', '', '', 'v.social', '', 'n.act', '', ''] + ['<eos>']

# Last expression of the cell: the result is displayed as the cell output.
combine_sst_mwe(X, y_sst, y_mwe)

(['', '', '', ''], ['O', 'O', 'O', '<eos>'])