# Google Translate API

This notebook demonstrates reading text from a file, sending it to the Google Translate API, receiving the translation, and writing the output to a file. (Nicolas Chan, first created 11/16/2017)

### Kernel Setup
```bash
conda create --name=translate python=3.6 ipykernel
source activate translate
ipython kernel install --user --name translate
pip install --upgrade google-cloud-translate nltk
```
### Credentials
IMPORTANT: Store your service account JSON credentials in `client_secret.json`.

In [None]:
# Configuration
# Every value a user is expected to tune lives in this cell; the cells below read them.
# input_filename is the text file to translate, output_filename receives the result,
# and target_language is the language code passed to the translation API.
input_filename = 'input.txt'
output_filename = 'output.txt'
target_language = 'en'

# punkt_tokenizer is the nltk resource used to identify where sentences end, so the
# text can be split into chunks at sentence boundaries.
# It should be set to the INPUT language (the language being translated FROM).
# For more info: http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
# Available languages: czech, dutch, estonian, french, greek, norwegian, portuguese,
#   spanish, turkish, danish, english, finnish, german, italian, polish, slovene, swedish
punkt_tokenizer = 'tokenizers/punkt/german.pickle'

# Maximum number of characters of text sent to Google Translate in one request.
# A larger limit yields better translations because Google Translate uses context.
# Too large a length might be rejected by Google Translate (2000 seems safe).
max_length = 2000

# When True, line breaks in the input are replaced by <br> before translating and
# restored afterwards.
# Set to True only if it makes sense to send chunks split by line breaks.
# If line breaks might occur in the middle of sentences, set this to False.
preserve_line_breaks = False

In [None]:
# Read input file contents
# read() returns the whole file as a single string, which is exactly what the
# per-line concatenation produced, without building the string piece by piece.
with open(input_filename, encoding='utf8') as file:
    input_contents = file.read()

In [None]:
# Function to combine sentences into larger chunks
def condense(lst, length):
    """Pack the strings in lst into space-joined chunks of at most length characters.

    Elements longer than length are first split at spaces (a warning reporting
    how many elements were split is printed). The resulting pieces are then
    joined greedily, in order, with a single space between them, starting a new
    chunk whenever adding the next piece would exceed length.

    Args:
        lst: list of strings (e.g. sentences), in document order.
        length: maximum number of characters allowed in one chunk.

    Returns:
        A list of non-empty chunk strings, each at most length characters long.
        An empty lst is returned unchanged.

    Raises:
        Exception: if, even after splitting at spaces, a single piece is still
            longer than length.
    """
    if len(lst) == 0:
        return lst

    # Split elements at spaces if they exceed length
    number_split = 0
    pieces = []
    for elem in lst:
        if len(elem) > length:
            number_split += 1
            pieces.extend(elem.split(' '))
        else:
            pieces.append(elem)

    if number_split > 0:
        print('WARNING! Had to split', number_split,
              'sentences because the sentence length exceeded', length, 'characters.')
    if max(len(piece) for piece in pieces) > length:
        # length is a number, so it must be converted before concatenation
        raise Exception('A single word exceeded ' + str(length) + ' characters')

    # Now that all pieces are guaranteed to be <= length,
    # combine them as long as the joined chunk does not exceed length.
    chunks = []
    current_chunk = ''
    for piece in pieces:
        if not current_chunk:
            # Start a chunk without a leading space (and never emit an empty chunk)
            current_chunk = piece
        elif len(current_chunk) + 1 + len(piece) <= length:
            # The +1 accounts for the joining space
            current_chunk += ' ' + piece
        else:
            chunks.append(current_chunk)
            current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

In [None]:
# Translate file contents
# Client is provided by the v2 module of google-cloud-translate; recent releases
# are not expected to expose it on the un-versioned google.cloud.translate module.
# Importing under an alias also keeps the module from sharing a name with the
# translate() function defined further down in this notebook.
try:
    from google.cloud import translate_v2 as gcloud_translate
except ImportError:
    # Fall back to the un-versioned module when translate_v2 is not available
    from google.cloud import translate as gcloud_translate
client = gcloud_translate.Client.from_service_account_json('client_secret.json')
def google_translate(text):
    """Translate text into target_language and return the translated string.

    Sends text to the API through the module-level client and returns the
    'translatedText' entry of the response.
    """
    return client.translate(text, target_language=target_language)['translatedText']

In [None]:
# Sentence splitting setup
# nltk's Punkt tokenizer identifies sentence breaks (http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt)
import nltk
import nltk.data

# Download the 'punkt' resource, then load the tokenizer selected by punkt_tokenizer
nltk.download('punkt')
tokenizer = nltk.data.load(punkt_tokenizer)

def translate(text):
    """Translate text chunk by chunk and return the joined translation.

    The text is tokenized into sentences, the sentences are condensed into
    chunks of at most max_length characters, each chunk is passed to
    google_translate, and the translated chunks are joined with single spaces.
    The sentence and chunk counts are printed along the way.
    """
    sentence_list = tokenizer.tokenize(text)
    print('Identified', len(sentence_list), 'sentences')

    pieces = condense(sentence_list, max_length)
    print('Condensed into', len(pieces), 'chunks')

    # Translate the chunks one at a time, keeping their order
    translated_pieces = []
    for piece in pieces:
        translated_pieces.append(google_translate(piece))
    return ' '.join(translated_pieces)

In [None]:
# NOTE(review): escape is imported here but not used anywhere in this cell.
from html import escape

# With preserve_line_breaks set, newlines travel through the translation as <br>
# and are turned back into newlines afterwards; otherwise the text is translated as is.
if preserve_line_breaks:
    input_contents = input_contents.replace('\n', '<br>')
    translation = translate(input_contents).replace('<br>', '\n')
else:
    translation = translate(input_contents)

In [None]:
# The translation contains escaped HTML characters such as '&#39;' for an apostrophe.
# To fix this, unescape the HTML entities back into plain characters.
from html import unescape
translation_text = unescape(translation)

In [None]:
# Write output to output file
# The with-block closes the file even if the write raises.
with open(output_filename, 'w', encoding='utf8') as output_file:
    output_file.write(translation_text)