In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive so the dataset files become readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Lecture des fichiers 

In [None]:
# Directory (inside the mounted Drive) that holds the corpus files; the trailing
# slash matters because file names are appended by plain string concatenation.
data_dir = '/content/drive/MyDrive/DATASET/machine_translation/'

In [None]:
# Full paths of the French and English corpus files, built from data_dir.
data_path_fr = f'{data_dir}small_vocab_fr.txt'
data_path_en = f'{data_dir}small_vocab_en.txt'

In [None]:
# Read both corpora fully into memory. The context managers close the file
# handles deterministically instead of leaving them open until garbage collection.
with open(data_path_fr, 'r', encoding='utf8') as file_fr:
    text_fr = file_fr.read()
with open(data_path_en, 'r', encoding='utf8') as file_en:
    text_en = file_en.read()

In [None]:
# Preview the first 1000 characters of the raw French text.
text_fr[:1000]

In [None]:
# One sentence per line. Trailing newlines are stripped first so that a final
# line terminator does not produce an empty last "sentence", which would
# otherwise become an all-padding sample in the training data.
lines_fr = text_fr.rstrip('\n').split('\n')
lines_en = text_en.rstrip('\n').split('\n')

In [None]:
# Show the first ten French sentences.
lines_fr[:10]

In [None]:
# Show the first ten English sentences.
lines_en[:10]

In [None]:
# Print the first five English sentences, each followed by the French sentence
# at the same index.
for sample_idx in range(5):
    for label, sentences in (
        (' english sentence : ', lines_en),
        (' french sentence : ', lines_fr),
    ):
        print(sample_idx, label, sentences[sample_idx])
    print('\n')

In [None]:
# Count words by splitting on any whitespace. Splitting on ' ' alone fuses the
# last token of a line with the first token of the next one (they are separated
# by a newline, not a space) and counts empty strings for repeated spaces.
total_word_fr = text_fr.split()
total_word_en = text_en.split()
print('nombre de mots fr :',len(total_word_fr))
print('nombre de mots en : ', len(total_word_en))

nombre de mots fr : 1823439
nombre de mots en :  1685433


# Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
def tokenize(x):
    """Fit a Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: Texts handed to ``fit_on_texts`` and then to ``texts_to_sequences``.

    Returns:
        A ``(tokenizer, sequences)`` tuple: the fitted tokenizer and the
        result of ``texts_to_sequences(x)``.
    """
    # Custom filter string: note that '.' and ',' are not part of it.
    text_tokenizer = Tokenizer(filters='!"#$%&()*+-/:;<=>?@[\\]^_`{|}~\t\n')
    text_tokenizer.fit_on_texts(x)
    encoded = text_tokenizer.texts_to_sequences(x)
    return text_tokenizer, encoded

In [None]:
# Fit a tokenizer on the English sentences and get their integer sequences.
tok_en, sequences_en = tokenize(lines_en)

In [None]:
# Integer encoding of the first English sentence.
sequences_en[0]

[19, 25, 1, 10, 69, 6, 41, 2, 9, 5, 1, 57, 4, 46, 3]

In [None]:
# Fit a separate tokenizer on the French sentences and get their integer sequences.
tok_fr, sequences_fr = tokenize(lines_fr)

In [None]:
# English vocabulary: mapping from word to integer index.
tok_en.word_index

In [None]:
# French vocabulary: mapping from word to integer index.
tok_fr.word_index

In [None]:
# Show the first five sentences next to their integer encodings, English first
# and then French.
for sample_idx in range(5):
    for label, items in (
        (' english sentence : ', lines_en),
        (' english sequence : ', sequences_en),
        (' french sentence : ', lines_fr),
        (' french sequence : ', sequences_fr),
    ):
        print(sample_idx, label, items[sample_idx])
    print('\n')

# Padding des sequences

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Length of the longest tokenized sentence in each language.
max_len_sequence_en = max(map(len, sequences_en))
print('len max en :',max_len_sequence_en)
max_len_sequence_fr = max(map(len, sequences_fr))
print('len max fr :',max_len_sequence_fr)

len max en : 17
len max fr : 23


In [None]:
def padding(x,length):
    """Pad the sequences in ``x`` at the end using Keras ``pad_sequences``.

    Args:
        x: Sequences of integer ids.
        length: Forwarded as ``maxlen``; ``None`` leaves the choice of width
            to ``pad_sequences``.

    Returns:
        The array returned by ``pad_sequences(x, maxlen=length, padding='post')``.
    """
    return pad_sequences(x, maxlen=length, padding='post')

In [None]:
# Pad both languages with length=None, i.e. without forcing a width
# (the English result is stored under the generic name `padded`).
padded = padding(sequences_en,None)
padded_fr = padding(sequences_fr,None)

In [None]:
# Compare the first ten English sequences with their padded versions.
for sample_idx in range(10):
    for label, items in (
        (' original sequence : ', sequences_en),
        (' padded sequence :   ', padded),
    ):
        print(sample_idx, label, items[sample_idx])
    print('\n')

0  original sequence :  [19, 25, 1, 10, 69, 6, 41, 2, 9, 5, 1, 57, 4, 46, 3]
0  padded sequence :    [19 25  1 10 69  6 41  2  9  5  1 57  4 46  3  0  0]


1  original sequence :  [7, 22, 23, 1, 11, 64, 6, 45, 2, 9, 5, 1, 11, 53, 4, 47, 3]
1  padded sequence :    [ 7 22 23  1 11 64  6 45  2  9  5  1 11 53  4 47  3]


2  original sequence :  [24, 1, 11, 69, 6, 40, 2, 9, 5, 1, 11, 70, 4, 36, 3]
2  padded sequence :    [24  1 11 69  6 40  2  9  5  1 11 70  4 36  3  0  0]


3  original sequence :  [7, 22, 23, 1, 10, 66, 6, 36, 2, 9, 5, 1, 59, 4, 44, 3]
3  padded sequence :    [ 7 22 23  1 10 66  6 36  2  9  5  1 59  4 44  3  0]


4  original sequence :  [31, 13, 18, 15, 1, 7, 85, 2, 8, 32, 13, 18, 1, 7, 87, 3]
4  padded sequence :    [31 13 18 15  1  7 85  2  8 32 13 18  1  7 87  3  0]


5  original sequence :  [33, 14, 15, 1, 7, 93, 2, 8, 32, 14, 1, 7, 85, 3]
5  padded sequence :    [33 14 15  1  7 93  2  8 32 14  1  7 85  3  0  0  0]


6  original sequence :  [20, 1, 68, 6, 49, 2, 8, 5, 

In [None]:
# Compare the first ten French sequences with their padded versions.
for sample_idx in range(10):
    for label, items in (
        (' original sequence : ', sequences_fr),
        (' padded sequence :   ', padded_fr),
    ):
        print(sample_idx, label, items[sample_idx])
    print('\n')

0  original sequence :  [37, 36, 1, 10, 69, 39, 13, 26, 3, 8, 5, 1, 114, 4, 52, 2]
0  padded sequence :    [ 37  36   1  10  69  39  13  26   3   8   5   1 114   4  52   2   0   0
   0   0   0   0   0]


1  original sequence :  [6, 34, 33, 1, 14, 21, 4, 51, 3, 8, 5, 97, 71, 4, 53, 2]
1  padded sequence :    [ 6 34 33  1 14 21  4 51  3  8  5 97 71  4 53  2  0  0  0  0  0  0  0]


2  original sequence :  [103, 1, 14, 69, 4, 47, 3, 8, 5, 1, 14, 23, 4, 43, 2]
2  padded sequence :    [103   1  14  69   4  47   3   8   5   1  14  23   4  43   2   0   0   0
   0   0   0   0   0]


3  original sequence :  [6, 34, 33, 1, 10, 273, 4, 43, 3, 8, 5, 105, 21, 4, 50, 2]
3  padded sequence :    [  6  34  33   1  10 273   4  43   3   8   5 105  21   4  50   2   0   0
   0   0   0   0   0]


4  original sequence :  [42, 15, 17, 18, 1, 12, 84, 3, 7, 41, 15, 17, 1, 9, 85, 2]
4  padded sequence :    [42 15 17 18  1 12 84  3  7 41 15 17  1  9 85  2  0  0  0  0  0  0  0]


5  original sequence :  [22, 18, 19

# La pipeline

In [None]:
def preprocess(x,y):
    """Tokenize and pad the source texts ``x`` and the target texts ``y``.

    Each side gets its own tokenizer (via ``tokenize``) and is padded with
    ``padding(..., None)``.

    Args:
        x: Source-language texts.
        y: Target-language texts.

    Returns:
        ``(padded_x, padded_y, x_tok, y_tok)`` where ``padded_y`` carries an
        extra trailing axis of size 1 and ``x_tok`` / ``y_tok`` are the fitted
        tokenizers.
    """
    x_tok, x_ids = tokenize(x)
    y_tok, y_ids = tokenize(y)

    padded_x = padding(x_ids, None)
    # Targets get a trailing singleton axis appended after padding.
    padded_y = np.expand_dims(padding(y_ids, None), axis=-1)

    return padded_x, padded_y, x_tok, y_tok

In [None]:
# Run the whole pipeline with English as source and French as target.
# This rebinds padded_fr, tok_en and tok_fr, which earlier cells already defined.
padded_en, padded_fr, tok_en, tok_fr = preprocess(lines_en,lines_fr)

In [None]:
# Shape of the padded French targets, including the trailing axis added by preprocess.
padded_fr.shape

(137861, 23, 1)

In [None]:
# Display the padded English id matrix.
padded_en

array([[19, 25,  1, ...,  3,  0,  0],
       [ 7, 22, 23, ...,  4, 47,  3],
       [24,  1, 11, ...,  3,  0,  0],
       ...,
       [26,  1, 12, ...,  3,  0,  0],
       [ 7, 93,  1, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int32)

# Conversion en one-hot encoding

Format des données en one-hot encoding : x ( len(x), taille de la séquence paddée, longueur vocab x )

y ( len(y), taille de la séquence paddée, longueur vocab y )


In [None]:
# Sanity check: sentence count and padded array shapes for both languages.
print(len(lines_fr))
print(len(padded_fr))
# A bare expression in the middle of a cell is evaluated and then discarded, so
# the French shape is printed explicitly; only the last expression is displayed.
print(padded_fr.shape)
padded_en.shape


137861
137861


(137861, 17)

In [None]:
#x_ohe_data = np.zeros((padded_en.shape[0],padded_en.shape[1],len(tok_en.word_index)))
#y_ohe_data= np.zeros((padded_fr.shape[0],padded_fr.shape[1],len(tok_fr.word_index)))

In [None]:
#print(x_ohe_data.shape)
#print(y_ohe_data.shape)

# Decodage

In [None]:
# Summary of the prepared data: padded widths, vocabulary sizes, sample counts.
for label, value in (
    ("max len sequence en :", padded_en.shape[1]),
    ("max len sequence fr :", padded_fr.shape[1]),
    ("number words en :", len(tok_en.word_index)),
    ("number words fr :", len(tok_fr.word_index)),
    ("number sequences en :", len(padded_en)),
    ("number sequences fr :", len(padded_fr)),
):
    print(label, value)

max len sequence en : 17
max len sequence fr : 23
number words en : 226
number words fr : 348
number sequences en : 137861
number sequences fr : 137861


In [None]:
def sequence_to_sentence(sequence, tokenizer):
    """Decode a sequence of word indices back into a space-separated sentence.

    Args:
        sequence: Iterable of integer word indices. Indices that are absent
            from the tokenizer vocabulary (e.g. the 0 used for padding) are
            skipped.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The decoded words, each followed by a single space, concatenated into
        one string (empty string when nothing could be decoded).
    """
    # Invert the vocabulary once so each index is a constant-time lookup
    # instead of a full scan of word_index for every token.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    return ''.join(
        index_to_word[idx] + ' '
        for idx in sequence
        if idx in index_to_word
    )


In [None]:
# Padded id sequence of the English sentence at index 10.
padded_en[10]

array([ 7, 86,  1, 34, 13, 18, 15,  2,  8,  7, 89,  1, 32, 13, 18,  3,  0],
      dtype=int32)

In [None]:
# Decode that id sequence back to text to check the round trip through the tokenizer.
sentence = sequence_to_sentence(padded_en[10], tok_en)
sentence

'the lime is her least liked fruit , but the banana is my least liked . '

# Creation du modele

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Dropout, TimeDistributed, LSTM, Bidirectional, Embedding, RepeatVector
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [None]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile the embedding + bidirectional LSTM translation model.

    Args:
        input_shape: Accepted for interface compatibility; not used by the body.
        output_sequence_length: Accepted for interface compatibility; not used
            by the body.
        english_vocab_size: Input dimension of the ``Embedding`` layer.
        french_vocab_size: Number of units of the final softmax ``Dense`` layer.

    Returns:
        A ``Sequential`` model compiled with sparse categorical cross-entropy,
        the ``'adam'`` optimizer and the accuracy metric.
    """
    layer_stack = [
        Embedding(english_vocab_size, 256),
        # return_sequences=True keeps one output per time step.
        Bidirectional(LSTM(256, return_sequences=True)),
        Dense(1024, activation='relu'),
        Dropout(0.5),
        Dense(french_vocab_size, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss=sparse_categorical_crossentropy,
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Build the model through simple_model instead of repeating its layer stack
# inline, so the architecture is defined in exactly one place.
# The first two arguments are accepted by simple_model but not used by it.
# Vocabulary sizes are len(word_index) + 1 because the padded sequences also
# contain the id 0.
model = simple_model(
    padded_en.shape,
    padded_fr.shape[1],
    len(tok_en.word_index)+1,
    len(tok_fr.word_index)+1,
)

In [None]:
# Pad the English ids to the French sequence width, then append a trailing
# axis of size 1; the shape is printed after each step.
tmp_x = padding(x=padded_en, length=padded_fr.shape[1])
print(tmp_x.shape)
tmp_x = tmp_x[..., np.newaxis]
print(tmp_x.shape)

(137861, 23)
(137861, 23, 1)


In [None]:
# Train the inline-built model for up to 20 epochs with 20% of the samples held out for validation.
model.fit(tmp_x,padded_fr,batch_size = 1024, epochs=20, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

In [None]:
# Build a second model through simple_model; this is the one used by the test cells further down.
model_lstm = simple_model(tmp_x.shape,padded_fr.shape[1],len(tok_en.word_index)+1,len(tok_fr.word_index)+1)


In [None]:
#, input_length=input_shape[1], input_shape=input_shape[1:]

In [None]:
# Echo the four values that are passed to simple_model, one per line.
for value in (
    tmp_x.shape,
    padded_fr.shape[1],
    len(tok_en.word_index)+1,
    len(tok_fr.word_index)+1,
):
    print(value)

NameError: ignored

In [None]:
# Print the layer-by-layer architecture and parameter counts of model_lstm.
model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 256)         58112     
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 512)         1050624   
_________________________________________________________________
dense_2 (Dense)              (None, None, 1024)        525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 1024)        0         
_________________________________________________________________
dense_3 (Dense)              (None, None, 349)         357725    
Total params: 1,991,773
Trainable params: 1,991,773
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train model_lstm with the same settings as the first model
# (the commented-out line below is a leftover call on another model variable).
#simple_rnn_model.fit(tmp_x,padded_fr,batch_size=1024, epochs=20, validation_split=0.2)
model_lstm.fit(tmp_x,padded_fr,batch_size=1024, epochs=20, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fb7b0d71710>

In [None]:
#simple_rnn_model.save('machine_translation.h5')

In [None]:
from tensorflow.keras.models import load_model

In [None]:
#model = load_model('machine_translation.h5')

# Test 

In [None]:
# Compare the reference French translation with the model output for the first
# ten training sentences.
for sample_idx in range(10):
    true_translation = lines_fr[sample_idx]
    original_text = lines_en[sample_idx]
    print(original_text)
    print(true_translation)

    # Add a leading batch axis of size 1 before calling the model, then drop
    # it again from the output.
    new_tmp = np.expand_dims(tmp_x[sample_idx], axis=0)
    prediction = np.squeeze(model_lstm(new_tmp), axis=0)
    # Most probable vocabulary index at every time step, in one vectorised
    # call; the previous inner loop re-used the outer loop variable `i`.
    pred_sequence = np.argmax(prediction, axis=-1)
    true_pred = sequence_to_sentence(pred_sequence, tok_fr)
    print(true_pred)
    print("\n")






new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril . 


the united states is usually chilly during july , and it is usually freezing in november .
les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .
les états unis est généralement froid en juillet , et il gèle habituellement en novembre . 


california is usually quiet during march , and it is usually hot in june .
california est généralement calme en mars , et il est généralement chaud en juin .
california est généralement calme en mars , et il est généralement chaud en juin . 


the united states is sometimes mild during june , and it is cold in september .
les états-unis est parfois légère en juin , et il fait froid en septembre .
les états unis est parfois doux en juin , et il fait froid en septembre . 


your least li

# Test sur des phrases

In [None]:
# Load custom English sentences (one per line) and encode them with the
# English tokenizer fitted on the training data.
with open('test.txt','r',encoding='utf8') as test_file:
    test = test_file.read()
# Drop blank lines (e.g. the empty string left by a trailing newline) so they
# are not turned into all-padding inputs.
lines_test = [line for line in test.split('\n') if line.strip()]
custom_sequence = tok_en.texts_to_sequences(lines_test)
# Pad to the width of the training inputs instead of a hard-coded 23.
padded_test = padding(custom_sequence, tmp_x.shape[1])
padded_test = np.expand_dims(padded_test, axis=-1)



In [None]:
  # Translate every custom sentence. Iterating over the batch (instead of
  # squeezing axis 0) also works when test.txt holds more than one sentence.
  predictions = model_lstm.predict(padded_test)
  for prediction in predictions:
      # Most probable French word index at each time step.
      pred_sequence = np.argmax(prediction, axis=-1)
      true_pred = sequence_to_sentence(pred_sequence, tok_fr)
      print(true_pred)
      print("\n")

les pommes sont . 


