In [1]:
import keras
import pandas as pd
import re
import numpy as np

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding

Using TensorFlow backend.


Імплементація мережі базується на матеріалах цього посту:
https://medium.com/@dev.elect.iitd/neural-machine-translation-using-word-level-seq2seq-model-47538cba8cd7

In [None]:
# Location of the tab-separated training file (presumably a Colab-mounted Google Drive folder).
path = '/content/drive/My Drive/opusparcus_v2/en-train-100K.txt'

In [None]:
# Read the training file into `dataset`: one list of tab-separated fields per line.
dataset = []
# The encoding is explicit so the result does not depend on the machine's locale
# (assumes the corpus file is UTF-8 -- TODO confirm).
with open(path, 'r', encoding='utf-8') as f:
    # Iterate the handle lazily instead of loading every line with readlines().
    for line in f:
        # Remove only the line terminator. Slicing off the last character
        # unconditionally would truncate the final field of a last line that
        # has no trailing newline.
        dataset.append(line.rstrip('\n').split('\t'))

In [None]:
# Name the seven tab-separated fields of every row; all values are still strings here.
df = pd.DataFrame(dataset, columns=['id', 'first', 'second', 'metric_1', 'metric_2', 'metric_3', 'metric_4'])

In [5]:
# Preview the first rows of the raw training frame.
df.head()

Unnamed: 0,id,first,second,metric_1,metric_2,metric_3,metric_4
0,en-N7,Jumby now wants to be born .,Jumby want birth .,77.5163,2.5,5,9
1,en-N8,It was a difficult and long delivery .,The delivery was difficult and long .,77.5163,2.5,5,14
2,en-N12,I like to be beautiful everyday .,I like to be pretty everyday .,77.5163,2.5,5,8
3,en-N22,Bernadette wants a prenup .,Bernadette wants to get a prenup .,77.5163,2.5,5,7
4,en-N45,Don 't say you don 't remember me .,Don 't tell me you don 't remember me .,74.3904,3.33333,5,7


Мабуть, краще було б мати один словник і для декодера, і для енкодера, але було концептуально простіше лишити так

In [None]:
# Lower-case both sentence columns so the vocabularies are case-insensitive.
for column in ('first', 'second'):
    df[column] = df[column].apply(str.lower)

# Wrap every target sentence in the START_/_END sentinel tokens.
# Note: the cell mutates `df` in place, so re-running it wraps the sentences again.
df['second'] = df['second'].apply(lambda sentence: 'START_ ' + sentence + ' _END')

In [None]:
# Encoder vocabulary: every distinct whitespace-separated token of the source sentences.
all_encoder_words = {word for sent in df['first'] for word in sent.split()}

# Decoder vocabulary: every distinct whitespace-separated token of the target sentences.
all_decoder_words = {word for sent in df['second'] for word in sent.split()}

In [None]:
# Longest source / target sentence, counted in space-separated tokens.
# These values fix the width of the padded input arrays built later.
max_length_src = max(len(sentence.split(' ')) for sentence in df['first'])
max_length_tar = max(len(sentence.split(' ')) for sentence in df['second'])

# Sorting makes the word -> id assignment deterministic between runs.
input_words = sorted(all_encoder_words)
target_words = sorted(all_decoder_words)

# Vocabulary sizes. Word ids start at 1 and id 0 is reserved for padding in BOTH
# vocabularies, so each size must be len(vocab) + 1. Previously only the decoder
# size had the +1, which left the highest encoder id outside the encoder
# Embedding table.
# NOTE: weights saved with the old (smaller) encoder size no longer match this shape.
num_encoder_tokens = len(all_encoder_words) + 1
num_decoder_tokens = len(all_decoder_words) + 1

# word -> id (1-based; 0 is the padding id)
input_token_index = {word: i for i, word in enumerate(input_words, start=1)}
target_token_index = {word: i for i, word in enumerate(target_words, start=1)}

# id -> word, used to turn predicted ids back into text
reverse_input_char_index = {i: word for word, i in input_token_index.items()}
reverse_target_char_index = {i: word for word, i in target_token_index.items()}

In [9]:
# Peek at the first ten entries of the sorted encoder vocabulary.
input_words[:10]

['!', '"', '#', '$', '%', "'", "'a", "'aime", "'all", "'am"]

In [10]:
# Check that lower-casing and the START_/_END wrapping were applied.
df.head()

Unnamed: 0,id,first,second,metric_1,metric_2,metric_3,metric_4
0,en-N7,jumby now wants to be born .,START_ jumby want birth . _END,77.5163,2.5,5,9
1,en-N8,it was a difficult and long delivery .,START_ the delivery was difficult and long . _END,77.5163,2.5,5,14
2,en-N12,i like to be beautiful everyday .,START_ i like to be pretty everyday . _END,77.5163,2.5,5,8
3,en-N22,bernadette wants a prenup .,START_ bernadette wants to get a prenup . _END,77.5163,2.5,5,7
4,en-N45,don 't say you don 't remember me .,START_ don 't tell me you don 't remember me ....,74.3904,3.33333,5,7


In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [12]:
# Source sentences are the model input, wrapped target sentences the expected output.
X, y = df['first'], df['second']
# Hold out 10% of the pairs for validation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=42)
# Display the resulting split sizes.
X_train.shape, X_test.shape

((90000,), (10000,))

In [None]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    """Endlessly yield ``([encoder_input, decoder_input], decoder_target)`` batches.

    Args:
        X: sliceable collection of source sentences (whitespace-tokenised strings).
        y: sliceable collection of target sentences, aligned with ``X``.
        batch_size: number of rows in every yielded array.

    Yields:
        A tuple ``([encoder_input_data, decoder_input_data], decoder_target_data)``
        of float32 arrays shaped ``(batch_size, max_length_src)``,
        ``(batch_size, max_length_tar)`` and
        ``(batch_size, max_length_tar, num_decoder_tokens)``.
        Unused positions, and unused rows of a short final batch, stay zero.

    Raises:
        KeyError: if a token is missing from ``input_token_index`` /
            ``target_token_index``.

    Note:
        The defaults for ``X`` and ``y`` are captured when the function is defined.
    """
    while True:
        for start in range(0, len(X), batch_size):
            stop = start + batch_size
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros(
                (batch_size, max_length_tar, num_decoder_tokens), dtype='float32')

            sentence_pairs = zip(X[start:stop], y[start:stop])
            for row, (input_text, target_text) in enumerate(sentence_pairs):
                # Encoder input: the id of every source token.
                for pos, word in enumerate(input_text.split()):
                    encoder_input_data[row, pos] = input_token_index[word]

                target_tokens = target_text.split()
                last_pos = len(target_tokens) - 1
                for pos, word in enumerate(target_tokens):
                    if pos < last_pos:
                        # Decoder input: every target token except the last one.
                        decoder_input_data[row, pos] = target_token_index[word]
                    if pos > 0:
                        # Decoder target: one-hot, shifted one step ahead of the
                        # decoder input, so the first token (START_) is skipped.
                        decoder_target_data[row, pos - 1, target_token_index[word]] = 1.

            yield [encoder_input_data, decoder_input_data], decoder_target_data

In [None]:
# Size shared by both embedding layers and both LSTM state vectors.
latent_dim = 100

# Encoder: token ids -> embeddings -> LSTM; id 0 is treated as padding (mask_zero).
# NOTE(review): an Embedding's input_dim must be larger than the largest token id fed
# into it -- confirm that num_encoder_tokens accounts for the reserved padding id 0.
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
# The embedding layer is kept in a variable because the inference decoder reuses it.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Use a softmax to generate a probability distribution over the target vocabulary for each time step
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model; categorical cross-entropy matches the one-hot decoder targets.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Sample counts are used to derive the number of generator steps per epoch.
train_samples = len(X_train)
val_samples = len(X_test)
# Rows per generated batch and number of passes over the training data.
batch_size = 128
epochs = 30

In [None]:
# Two independent endless generators: one over the training split, one over the held-out split.
train_generator = generate_batch(X_train, y_train, batch_size = batch_size)
validation_generator = generate_batch(X_test, y_test, batch_size = batch_size)

# Integer division: a trailing partial batch is not counted as a step.
model.fit_generator(
    generator=train_generator,
    steps_per_epoch=train_samples // batch_size,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=val_samples // batch_size,
)

# Persist the trained weights so they can be reloaded later.
model.save_weights('/content/drive/My Drive/opusparcus_v2/model_weights.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Reload the weights written by the training cell into the already-built model.
model.load_weights('/content/drive/My Drive/opusparcus_v2/model_weights.h5')

In [None]:
# Encode the input sequence to get the "thought vectors"
# (reuses the trained encoder layers; outputs the final [h, c] LSTM states).
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
# (the same decoder_lstm / decoder_dense layer objects as in the training model are reused).
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model: (token ids, h, c) -> (token probabilities, new h, new c)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [None]:
def decode_sequence(input_seq):
    """Greedily decode one output sentence for a single encoded input.

    Args:
        input_seq: encoder input passed unchanged to ``encoder_model.predict``
            (the loop assumes a batch of size 1).

    Returns:
        The decoded tokens, each preceded by a single space. The string ends
        with ``_END`` unless decoding stopped because it grew past 50 characters.

    Raises:
        KeyError: if the predicted id has no entry in ``reverse_target_char_index``.
    """
    # The encoder's final states seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # First decoder input: a 1x1 batch holding the START_ token id.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: the most probable token at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence = decoded_sentence + ' ' + sampled_char

        # Stop on the end marker, or once the text exceeds 50 characters.
        if sampled_char == '_END' or len(decoded_sentence) > 50:
            break

        # Feed the chosen token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

In [None]:
# Tab-separated development file, stored next to the training file.
dev_path = '/content/drive/My Drive/opusparcus_v2/en-dev.txt'

In [None]:
# Read the development file into `dataset` (this rebinds the name used for the training rows).
dataset = []
# The encoding is explicit so the result does not depend on the machine's locale
# (assumes the corpus file is UTF-8 -- TODO confirm).
with open(dev_path, 'r', encoding='utf-8') as f:
    # Iterate the handle lazily instead of loading every line with readlines().
    for line in f:
        # Remove only the line terminator. Slicing off the last character
        # unconditionally would truncate the score of a last line that has no
        # trailing newline.
        dataset.append(line.rstrip('\n').split('\t'))

In [None]:
# Name the four tab-separated fields of every dev row; all values are still strings here.
df_dev = pd.DataFrame(dataset, columns=['id', 'first', 'second', 'score'])

In [28]:
# Preview the first rows of the dev frame.
df_dev.head()

Unnamed: 0,id,first,second,score
0,en-D69,300 heavy horse ?,We have no chance .,1.5
1,en-D76,When 'd you last see him ?,When was the last time you saw him ?,4.0
2,en-D119,Anyone who can verify that ?,Can anyone corroborate that ?,3.5
3,en-D168,I 'm not promising anything .,"No promises , okay ?",3.0
4,en-D242,Nothing 's changed .,Things ain 't no different .,3.5


In [30]:
# Count how many dev pairs carry each score value.
df_dev['score'].value_counts()

4.0    409
3.5    319
3.0    287
1.0    213
2.5    105
2.0     74
1.5     48
Name: score, dtype: int64

In [None]:
# Apply the same normalisation as for the training data: lower-case both sentence columns.
for column in ('first', 'second'):
    df_dev[column] = df_dev[column].apply(str.lower)

# Wrap every reference paraphrase in the START_/_END sentinel tokens.
# Note: the cell mutates `df_dev` in place, so re-running it wraps the sentences again.
df_dev['second'] = df_dev['second'].apply(lambda sentence: 'START_ ' + sentence + ' _END')

In [32]:
# Scores were read as text; convert them to floats so they can be compared.
df_dev['score'] = df_dev['score'].astype(float)

# Keep only the dev pairs with a score of at least 3.
has_good_score = df_dev['score'] >= 3
df_dev_good_score = df_dev[has_good_score]

# Display how many pairs survive the filter.
df_dev_good_score.shape

(1015, 4)

In [None]:
# Source phrases and their START_/_END-wrapped reference paraphrases for the kept dev pairs.
X_dev, y_dev = df_dev_good_score['first'], df_dev_good_score['second']

In [35]:
! pip install py-rouge

Collecting py-rouge
[?25l  Downloading https://files.pythonhosted.org/packages/9c/1d/0bdbaf559fb7afe32308ebc84a2028600988212d7eb7fb9f69c4e829e4a0/py_rouge-1.1-py3-none-any.whl (56kB)
[K     |████████████████████████████████| 61kB 1.8MB/s eta 0:00:011
[?25hInstalling collected packages: py-rouge
Successfully installed py-rouge-1.1


In [None]:
# `rouge` is provided by the py-rouge package installed in the previous cell.
import rouge

# Evaluator configured to report only the ROUGE-L metric.
evaluator = rouge.Rouge(['rouge-l'])

In [None]:
# Batch generator over the dev pairs (two rows per batch).
# NOTE(review): generate_batch raises KeyError for words that are missing from the
# training vocabularies, and `dev_gen` is not referenced by the other cells visible here.
dev_gen = generate_batch(X_dev, y_dev, batch_size = 2)

Аж на цьому етапі я виявив велике упущення: архітектура мережі не була розрахована на невідомі слова. Спочатку я просто ігнорував такі випадки, їх було десь 130 із 1015 у dev-вибірці. Потім спробував невідоме слово заміняти просто на перше слово у словнику, це був знак оклику. Ясно, що такий підхід поганий з точки зору моделювання мови, але виявилось таке: я окремо перевіряв, як цей підхід працює на реченнях з невідомими словами, і модель все одно генерувала перефразування, іноді навіть влучні. Остаточна якість моделі трохи погіршилась (бо раніше ми погані приклади просто викидали, а зараз генеруємо для них не завжди вдалі перефразування), але я вирішив уже не перенавчати сітку. Я ж правильно розумію, що не вийшло би просто змінити виміри моделі, якось додавши на вхід ще одне слово? Тоді у моделі не було би вивчених ваг для цього слова.

In [None]:
def generate_data_to_inference(X, y, j):
    """Build padded encoder and decoder input rows for the j-th sentence pair.

    Tokens that are missing from the vocabularies map to id 0, which is also
    the padding id.

    Args:
        X: sliceable collection of source sentences (whitespace-tokenised strings).
        y: sliceable collection of target sentences, aligned with ``X``.
        j: position of the pair to encode; ``X[j:j+1]`` / ``y[j:j+1]`` are used,
            so an out-of-range ``j`` yields all-zero arrays.

    Returns:
        ``(encoder_input_data, decoder_input_data)``: float32 arrays of shape
        ``(1, max_length_src)`` and ``(1, max_length_tar)``.

    Raises:
        IndexError: if a sentence has more tokens than the corresponding
            array has columns.
    """
    encoder_input_data = np.zeros((1, max_length_src), dtype='float32')
    decoder_input_data = np.zeros((1, max_length_tar), dtype='float32')
    # The one-hot decoder target that used to be built here was never returned,
    # so it is no longer allocated or filled.
    for input_text, target_text in zip(X[j:j+1], y[j:j+1]):
        for t, word in enumerate(input_text.split()):
            encoder_input_data[0, t] = input_token_index.get(word, 0)
        # Decoder input: every target token except the last one.
        for t, word in enumerate(target_text.split()[:-1]):
            decoder_input_data[0, t] = target_token_index.get(word, 0)
    return encoder_input_data, decoder_input_data

In [None]:
# Decode a single dev pair (position k) as a spot check.
k = 128
# Note: the second returned value is the decoder *input* id array, not the reference text.
input_seq, actual_output = generate_data_to_inference(X_dev, y_dev, k)
decoded_sentence = decode_sequence(input_seq)

In [134]:
# Show the source phrase, the reference paraphrase and the model output side by side.
print('Input phrase:', X_dev[k:k+1].values[0])
# [6:-4] drops the leading 'START_' (6 characters) and the trailing '_END' (4 characters).
print('Actual paraphrase:', y_dev[k:k+1].values[0][6:-4])
# [:-4] drops the last four characters, i.e. '_END' when decoding ended on that token.
print('Predicted parafrase:', decoded_sentence[:-4])

Input phrase: actually , hum .
Actual paraphrase:  as a matter of fact . 
Predicted parafrase:  in fact , please . 


In [135]:
# Imported in this cell because the notebook's own `import tqdm` cell comes later,
# which made this loop fail with NameError on a fresh top-to-bottom run.
import tqdm

# Decode every kept dev pair and collect input / reference / prediction triples.
paraphrases = []
for k in tqdm.tqdm(range(len(X_dev))):
    try:
        input_seq = generate_data_to_inference(X_dev, y_dev, k)[0]
        decoded_sentence = decode_sequence(input_seq)
    except Exception as err:
        # Best effort: skip the pair, but report why. Unlike a bare `except:`,
        # this lets KeyboardInterrupt stop the loop.
        print(k, repr(err))
        continue

    # Drop the end marker only when it is really there. Decoding can also stop on
    # the length limit, and a blind [:-4] would then cut four characters of text.
    predicted = decoded_sentence
    if predicted.endswith('_END'):
        predicted = predicted[:-len('_END')]

    paraphrases.append({
        'input_phrase': X_dev[k:k+1].values[0],
        # [6:-4] drops the leading 'START_' and the trailing '_END'.
        'actual_output': y_dev[k:k+1].values[0][6:-4],
        'predicted_output': predicted,
    })

predicted_df = pd.DataFrame(paraphrases)
predicted_df.to_csv('/content/drive/My Drive/opusparcus_v2/predicted_df2.csv')

100%|██████████| 1015/1015 [00:18<00:00, 55.71it/s]


In [None]:
import tqdm

In [136]:
# Number of dev pairs for which a prediction was stored, and the number of columns.
predicted_df.shape

(1015, 3)

In [106]:
import nltk
# Download the 'punkt' tokenizer data (presumably needed by the ROUGE evaluator -- TODO confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Це метрика порахована по 885 реченнях, де використовувались тільки відомі слова

In [107]:
# ROUGE-L between the input phrases and the model outputs.
# NOTE(review): the reference paraphrases in `actual_output` are not used here, and
# py-rouge's get_scores presumably expects (hypothesis, references) -- confirm that
# this argument order and this choice of reference are intended.
evaluator.get_scores(predicted_df['input_phrase'], predicted_df['predicted_output'])

{'rouge-l': {'f': 0.43117841243448507,
  'p': 0.42363882378837475,
  'r': 0.45709475395916044}}

Це вже більш чесна метрика, де так чи інакше оцінюються всі 1015 перефразувань

In [137]:
# Same ROUGE-L computation, re-run after predictions were produced for all dev pairs.
# NOTE(review): the reference paraphrases in `actual_output` are not used here, and
# py-rouge's get_scores presumably expects (hypothesis, references) -- confirm that
# this argument order and this choice of reference are intended.
evaluator.get_scores(predicted_df['input_phrase'], predicted_df['predicted_output'])

{'rouge-l': {'f': 0.4285871993508369,
  'p': 0.4211491534962989,
  'r': 0.45425614888669064}}

Далі ручне анотування перефразувань

In [None]:
# Draw a reproducible 50-row sample of the predictions for manual annotation.
pred_sample = predicted_df.sample(50, random_state=42).copy()

In [None]:
# Export the sample so that it can be annotated by hand.
pred_sample.to_csv('/content/drive/My Drive/opusparcus_v2/predicted_sample2.csv')

In [141]:
# Load the hand-annotated copy of the sample.
pred_sample_annotated2 = pd.read_csv('/content/drive/My Drive/opusparcus_v2/predicted_sample_annotated2.csv')
# Mean of the is_good column, i.e. the share of rows marked good if the labels are 0/1.
pred_sample_annotated2['is_good'].mean()

0.48

In [142]:
# Show the annotated examples: input, model output and the manual label.
pred_sample_annotated2[['input_phrase', 'predicted_output', 'is_good']].head(50)

Unnamed: 0,input_phrase,predicted_output,is_good
0,get on the motherfucking ground .,get the fuck down .,1
1,do you want to give it another shot ?,would you like to try it ?,1
2,everything is about money .,it 's all about money .,1
3,do you want company ?,do you want me to have a seat ?,1
4,shut off that engine .,turn off the engine .,0
5,you 're a scam artist .,you 're a listen to .,0
6,what 's happening over here ?,what 's going on here ?,1
7,you 'll be working over here .,you 're gonna work out here .,1
8,no one should have to die alone .,you can 't die .,0
9,he approached me .,he 's my own .,0


По анотованих прикладах такі коментарі:
* Мережа пристойно впоралась на недовгих реченнях. Ті, що були довші (на око) перефразувались гірше
* Здалось, що мережа гарно вивчила прямі синоніми слів, а також якісь ідіоматичні синоніми або фразові дієслова
* Іноді мережа гарно перефразовує, але губить заперечення, що змінює весь сенс речення
* Хоча начебто схема роботи такої мережі більш-менш зрозуміла, все одно іноді не міг позбутись відчуття "Чому це взагалі працює і як їй вдається навіть такі перефразування робити?")
* Хоча фреймворки все навчання беруть на себе, треба вміти робити специфічну підготовку даних, якби я зараз сів повторити все з нуля, то точно щось пішло би не по плану