In [1]:
import os
import numpy as np
import collections
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
# config = ConfigProto()
# config.gpu_options.allow_growth = True
# session = InteractiveSession(config=config)

2022-05-01 17:56:06.022477: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-01 17:56:06.022508: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Preprocessing


In [3]:
import os
from nltk.tokenize import word_tokenize
# nltk.download("punkt")

In [4]:

def load_data(path):
    """
    Read a text file and return its content as a list of lines.

    The file is decoded explicitly as UTF-8 so that accented characters are
    read the same way on every platform instead of depending on the locale's
    default encoding.

    :param path: Path of the text file to read.
    :return: List of strings obtained by splitting the file content on the
        newline character. A file that ends with a newline therefore yields
        a trailing empty string.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()

    return data.split('\n')


In [5]:
def tokenize(x):
    """
    Fit a new Keras Tokenizer on x and encode x with it.
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    x_tk = Tokenizer()
    x_tk.fit_on_texts(x)
    # Encode the same sentences the tokenizer was just fitted on.
    sequences = x_tk.texts_to_sequences(x)
    return sequences, x_tk


In [6]:
def pad(x, length=None):
    """
    Pad every sequence in x at its end ('post' padding).
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # Debug trace: reports the container type of x on every call.
    print("type is:",type(x))
    padded = pad_sequences(x, maxlen=length, padding='post')
    return padded


In [7]:
def preprocess(x, y, en_max_len=None, fr_max_len=None):
    """
    Tokenize and pad the feature and label sentences.
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :param en_max_len: Length passed to pad() for x (None: longest sequence in x)
    :param fr_max_len: Length passed to pad() for y (None: longest sequence in y)
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids, en_max_len)
    y_padded = pad(y_ids, fr_max_len)

    # Append a trailing axis of size 1 to the labels: the original code does
    # this because Keras's sparse_categorical_crossentropy wants 3-D labels.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk


In [8]:
#training

In [9]:
def train_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size,
                x_train=None, y_train=None, batch_size=1024, epochs=1,
                validation_split=0.2, learning_rate=0.003):
    """
    Build, compile and train a model that incorporates embedding, encoder-decoder,
    and bidirectional RNN on x and y.

    :param input_shape: Tuple of input shape (shape of the padded English data);
        input_shape[1] is used as the input sequence length
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :param x_train: Training features. If None, the notebook-level variable
        preproc_english_sentences_train is used (original behaviour).
    :param y_train: Training labels. If None, the notebook-level variable
        preproc_french_sentences_train is used (original behaviour).
    :param batch_size: Batch size passed to model.fit
    :param epochs: Number of epochs passed to model.fit
    :param validation_split: Fraction of the training data held out by model.fit
    :param learning_rate: Learning rate of the Adam optimizer
    :return: The compiled Keras model after model.fit has been run on it
    """
    # Prefer explicitly passed data; fall back to the notebook globals so that
    # existing four-argument calls keep working unchanged.
    if x_train is None:
        x_train = preproc_english_sentences_train
    if y_train is None:
        y_train = preproc_french_sentences_train

    # Build the layers
    model = Sequential()
    # Embedding
    model.add(Embedding(english_vocab_size, 128, input_length=input_shape[1], input_shape=input_shape[1:]))
    # Encoder: collapses the input sequence into a single vector
    model.add(Bidirectional(GRU(128)))
    # Repeat the encoded vector once per output time step
    model.add(RepeatVector(output_sequence_length))
    # Decoder
    model.add(Bidirectional(GRU(128, return_sequences=True)))
    model.add(TimeDistributed(Dense(512, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    # Compile
    model.compile(loss=sparse_categorical_crossentropy,optimizer=Adam(learning_rate),metrics=['accuracy'])

    # Train
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=validation_split)
    model.summary()
    return model

In [10]:
# Test accuracy

In [11]:
def test_accuracy(model,preproc_english_sentences_test,preproc_french_sentences_test):
    
    result = model.evaluate(preproc_english_sentences_test,preproc_french_sentences_test, batch_size=1024)
    return result

In [12]:
def save_model(model,path):
    """
    Persist the model by calling its save method.
    :param model: Object exposing a save(path) method
    :param path: Destination handed to model.save
    :return: None
    """
    # Trace message emitted before the save is attempted.
    print("inside save_model")
    model.save(path)

In [13]:
def load_model(path):
    """
    Load a saved Keras model.
    :param path: Location handed to tf.keras.models.load_model
    :return: The loaded model
    """
    return tf.keras.models.load_model(path)

In [14]:
#prediction

In [15]:
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network; the argmax is taken along
        axis 1, i.e. one predicted word id per row
    :param tokenizer: Keras Tokenizer fit on the labels (only its word_index
        mapping of word -> id is used)
    :return: String that represents the text of the logits: the predicted
        words joined by single spaces, without leading or trailing space.
        Rows whose predicted id is 0 (the padding id) are skipped.
    """
    # Invert word -> id into id -> word for decoding.
    index_to_words = {_id: word for word, _id in tokenizer.word_index.items()}

    predicted_ids = np.argmax(logits, 1)
    # join() avoids both the leading space and repeated string concatenation.
    return " ".join(index_to_words[word_id] for word_id in predicted_ids if word_id != 0)

In [16]:
def final_predictions(model,sentence,french_tokenizer,english_tokenizer,preproc_english_sentences):
    """
    Translate one English sentence with the trained model.

    :param model: Trained model exposing predict(x, batch_size=...)
    :param sentence: English sentence as a single string
    :param french_tokenizer: Tokenizer fit on the French labels, used to decode
    :param english_tokenizer: Tokenizer fit on the English features, used to encode
    :param preproc_english_sentences: Preprocessed English data; only its last
        dimension is read, as the length the input is padded to
    :return: The decoded prediction as returned by logits_to_text
    """
    # Encode with the tokenizer's own pipeline (the same call tokenize() uses)
    # instead of a raw word_index lookup, so the sentence is normalised the way
    # the training data was. Words the tokenizer does not know are left out by
    # texts_to_sequences rather than raising KeyError.
    encoded = english_tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(encoded, maxlen=preproc_english_sentences.shape[-1], padding='post')
    predictions = model.predict(padded, batch_size=len(padded))
    # predictions holds one entry per input sentence; take the only one instead
    # of reshaping to a hard-coded (21, 345), which fits one dataset only.
    return logits_to_text(predictions[0], french_tokenizer)

In [17]:
#main

In [18]:
#1 Locations of the English and French sentence files.
# NOTE(review): these are absolute, machine-specific paths, and both names are
# reassigned to the loaded sentence lists in the following cell.
english_sentences = '/home/jai/Documents/projects/translator/data/small_vocab_en'
french_sentences = '/home/jai/Documents/projects/translator/data/small_vocab_fr'

In [19]:
#2 Load both files, preprocess them and split into train / test sets.
# NOTE(review): the path variables are overwritten with the loaded lists here,
# so this cell cannot be re-run without re-running the previous one first.
english_sentences =load_data(english_sentences)
french_sentences =load_data(french_sentences)
    
# english_tokenizer.word_index maps each unique word of the English data set to an integer id;
# the same holds for french_tokenizer and the French data set.
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =preprocess(english_sentences, french_sentences)

# Hold out 33% of the sentence pairs for testing; random_state fixes the split.
preproc_english_sentences_train, preproc_english_sentences_test, preproc_french_sentences_train, preproc_french_sentences_test = train_test_split(preproc_english_sentences, preproc_french_sentences, test_size=0.33,random_state=42)

type is: <class 'list'>
type is: <class 'list'>


In [20]:
# for jai, vik in english_tokenizer.word_index.items():
#     print(jai, vik)
#len(english_tokenizer.word_index)+1

In [25]:
#3 Build and train the model.
# preproc_english_sentences_train.shape -> dimensions of preproc_english_sentences_train
# preproc_french_sentences_train.shape[1] -> number of columns in preproc_french_sentences_train
# len(english_tokenizer.word_index) -> number of unique words in the English data set
# (+1 is added to each vocabulary size because id 0 is used for padding)

model=train_model(preproc_english_sentences_train.shape,preproc_french_sentences_train.shape[1],len(english_tokenizer.word_index)+1,len(french_tokenizer.word_index)+1)

2022-05-01 17:15:09.352344: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-01 17:15:09.352369: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-01 17:15:09.352385: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (vikesh-hp-pc): /proc/driver/nvidia/version does not exist
2022-05-01 17:15:09.352706: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-01 17:15:09.810184: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of th

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 128)           25600     
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               198144    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 21, 256)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 21, 256)           296448    
_________________________________________________________________
time_distributed (TimeDistri (None, 21, 512)           131584    
_________________________________________________________________
dropout (Dropout)            (None, 21, 512)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 21, 345)           1

In [26]:
# Test accuracy: evaluate the trained model on the held-out test split.
# The model was compiled with metrics=['accuracy'], so the two values shown
# below are presumably [loss, accuracy].
test_accuracy(model,preproc_english_sentences_test,preproc_french_sentences_test)



[1.9536659717559814, 0.509480357170105]

In [27]:
# Prediction: English sentence to translate with the trained model.
sentence = 'he saw a old yellow truck'

In [28]:
# Translate the sentence; preproc_english_sentences supplies the padding length.
final_predictions(model,sentence,french_tokenizer,english_tokenizer,preproc_english_sentences)

' il les les les et'