We use code from the [TensorFlow NMT Tutorial](https://www.tensorflow.org/alpha/tutorials/text/nmt_with_attention) for most of the preprocessing.

## Clean Data

1. Add a *start* and *end* token to each sentence.
2. Clean the sentences by removing special characters.
3. Create a word index and reverse word index (dictionaries mapping from word → id and id → word).
4. Pad each sentence to a maximum length.

In [0]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """Return *s* with all combining marks removed.

    The string is first NFD-normalized, which splits an accented character
    into its base character followed by combining marks (Unicode category
    ``Mn``); those marks are then dropped. Characters that have no such
    decomposition are returned unchanged, so the result is not guaranteed
    to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def preprocess_sentence(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` tokens.

    The sentence is lower-cased, stripped of surrounding whitespace and of
    combining accents (via ``unicode_to_ascii``), then passed through three
    regex substitutions before the boundary tokens are attached.

    Args:
        w: The raw sentence.

    Returns:
        The cleaned sentence as ``'<start> ' + cleaned + ' <end>'``.
    """
    text = unicode_to_ascii(w.lower().strip())

    # Applied in order:
    #   1. put a space on both sides of each of ? . ! , ¿
    #      e.g. "he is a boy." => "he is a boy . "
    #      Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    #   2. collapse every run of spaces and double-quote characters into
    #      a single space
    #   3. replace every run of characters other than a-z, A-Z, ? . ! , ¿
    #      with a single space
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    text = text.strip()

    # The start and end tokens tell the model when to begin and when to
    # stop predicting.
    return '<start> ' + text + ' <end>'

In [0]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return the cleaned sentences as two parallel lists: (ENGLISH, FRENCH)

import zipfile
def create_dataset(path_raw, path_target, num_examples):
    """Extract the raw corpus, clean it and return parallel sentence lists.

    The zip archive at ``path_raw`` is extracted into the current working
    directory, then ``path_target`` is read as UTF-8 text. Each line is
    split on ``"|"``; field 1 is used as the source sentence and field 2
    as the target sentence. Records whose field 1 is one of the
    placeholder titles ``"[In Process Citation]."`` or
    ``"[Not Available]."`` are skipped, as are lines that do not contain
    at least three fields. The remaining sentences have surrounding
    ``[``, ``]`` and ``.`` characters stripped, are lower-cased and passed
    through ``preprocess_sentence``.

    As a side effect the cleaned source sentences are written to
    ``en_clean.txt`` and the cleaned target sentences to ``fr_clean.txt``
    (one sentence per line, UTF-8) in the current working directory.

    Args:
        path_raw: Path of the zip archive to extract.
        path_target: Path of the text file to read after extraction
            (presumably a member of the archive; verify against caller).
        num_examples: Maximum number of sentence pairs to keep; ``None``
            keeps all of them.

    Returns:
        A tuple ``(src, tgt)`` of two lists of cleaned sentences, where
        ``src[i]`` and ``tgt[i]`` come from the same input line.

    Raises:
        FileExistsError: If ``en_clean.txt`` or ``fr_clean.txt`` already
            exists; the files are opened in exclusive-creation mode so an
            earlier export is never overwritten.
    """
    with zipfile.ZipFile(path_raw, 'r') as zip_ref:
        zip_ref.extractall('')

    # Titles that carry no translatable text.
    placeholders = ("[In Process Citation].", "[Not Available].")

    src_data = []
    tgt_data = []
    with open(path_target, encoding='UTF-8') as f:
        # The content is stripped before splitting, so the final element is
        # a real record and every line must be visited.
        for line in f.read().strip().split('\n'):
            fields = line.split("|")
            if len(fields) < 3:
                # Blank or truncated record: no source/target pair to use.
                continue
            if fields[1] in placeholders:
                continue
            src_data.append(fields[1].strip('[].').lower())
            tgt_data.append(fields[2].strip('[].').lower())

    src = [preprocess_sentence(s) for s in src_data[:num_examples]]
    tgt = [preprocess_sentence(s) for s in tgt_data[:num_examples]]

    # save a list of clean sentences to file
    def save_clean_txt(filename, dataset):
        # Explicit UTF-8: cleaned sentences may still contain the non-ASCII
        # character "¿", which a non-UTF-8 default encoding cannot always
        # encode.
        with open(filename, 'x', encoding='UTF-8') as f:
            for line in dataset:
                f.write(line)
                f.write("\n")
        print('Saved: {}'.format(filename))

    save_clean_txt('en_clean.txt', src)
    save_clean_txt('fr_clean.txt', tgt)

    return src, tgt

In [0]:
# path_raw = 'drive/My Drive/INFO7374-NeuralNetwork&AI/Final-Project/data/med/pubmed_en_fr.txt.zip'
# path_data = 'pubmed_en_fr.txt'

In [0]:
# src, tgt = create_dataset(path_raw, path_data, None)

Saved: en_clean.txt
Saved: fr_clean.txt
