# Google Translate API

This notebook demonstrates reading from a text file, sending it to the Google translate API, receiving the translation, and writing the output to file. (Nicolas Chan, First created 11/16/2017)

### Kernel Setup
```bash
conda create --name=translate python=3.6 ipykernel
source activate translate
ipython kernel install --user --name translate
pip install --upgrade google-cloud-translate nltk google-api-python-client
```
### Credentials
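IMPORTANT: Store your Google Drive OAuth client secret in `client_secret.json`, and store your Google Translate service account JSON key in the file named by `translate_secret` in the configuration cell below.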

In [None]:
# Configuration
# Every value a user is expected to tune lives in this cell; later cells read
# these names directly.

# Language code passed to Google Translate as the translation target.
target_language = 'en'

# Limit maximum character length of text sent to Google translate at once
# Larger limit yields better translations because Google translate uses context
# Too large of a length might be rejected by Google translate (maximum allowed: 5000)
# See: https://cloud.google.com/translate/faq
max_length = 4000

# Delay between sending chunks
delay = 20 # seconds

# Set to True only if it makes sense to send chunks split by line breaks.
# If line breaks might occur in the middle of sentences, set this to False.
# When True, line breaks are swapped for '<br>' before translation and
# restored afterwards.
preserve_line_breaks = True

# Google Drive folder IDs.
# input_folder: scanned for the next .txt file to translate.
# completed_folder: the input file is moved here after processing.
# results_folder: the translated file is uploaded here.
input_folder = '1dIVXCpexYecYUWj4NdL-4IhGzDMvOakN'
completed_folder = '1b9yvvDsm2lH6bLT8wRJnErFSdcMnfwYa'
results_folder = '1lVJoATSlYFyb6I_N8EwGVyhSn_AM6vuX'

# Credentials
# translate_secret: service account JSON used to build the Translate client.
# google_drive_secret: NOTE(review): not referenced by any later cell; the
# Drive authentication cell uses its own CLIENT_SECRET_FILE constant instead.
translate_secret = 'shapreau_translate.json'
google_drive_secret = 'client_secret.json'

## Google Drive Authentication
Google drive interaction uses code from `AdamAndersonFindSumerian.ipynb`

In [None]:
# Google Drive authentication based on AdamAndersonFindSumerian.ipynb
import codecs
import httplib2
import os
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.http import MediaFileUpload
from apiclient import discovery, errors
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

# OAuth scope string handed to the authorisation flow in get_credentials().
SCOPES = 'https://www.googleapis.com/auth/drive'
# Client-secrets file read by client.flow_from_clientsecrets() in get_credentials().
# NOTE(review): the configuration cell defines google_drive_secret with the
# same value but it is not used here -- confirm which one should be authoritative.
CLIENT_SECRET_FILE = 'client_secret.json'
# Assigned to flow.user_agent in get_credentials().
APPLICATION_NAME = 'gDriveConnect'

def get_credentials():
    """Load cached Google Drive OAuth credentials, authorising if necessary.

    Credentials are cached in ``~/.credentials/gDriveConnect.json``. When the
    cache is missing or holds invalid credentials, the OAuth flow described by
    CLIENT_SECRET_FILE and SCOPES is run and its result is saved to the cache.

    Returns:
        The credentials object obtained from the cache or from the OAuth flow.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    # exist_ok makes this safe whether or not the directory already exists.
    os.makedirs(credential_dir, exist_ok=True)
    credential_path = os.path.join(credential_dir, 'gDriveConnect.json')
    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        # run_flow needs a flags namespace. Build it from oauth2client's own
        # parser with an explicit empty argv so the Jupyter kernel's
        # command-line arguments are never parsed by mistake.
        flags = tools.argparser.parse_args([])
        credentials = tools.run_flow(flow, store, flags)
        print('Storing credentials to ' + credential_path)
    return credentials

def get_folder_contents(folder_id):
    """List the non-trashed files directly inside a Google Drive folder.

    Follows ``nextPageToken`` until it is absent, so folders holding more
    entries than fit in one response page are listed completely.

    Args:
        folder_id: ID of the Drive folder whose children are listed.

    Returns:
        A dict with one key, ``'files'``, holding the ``id``/``name`` entries
        collected from every response page.
    """
    query = "'" + folder_id + "' in parents and trashed=false"
    found = []
    page_token = None
    while True:
        response = service.files().list(
            q=query,
            spaces='drive',
            fields='nextPageToken, files(id, name)',
            pageToken=page_token).execute()
        found.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    return {'files': found}

# Authenticate once and build the Drive v3 client that the later cells use
# through the module-level name `service`.
credentials = get_credentials()
# Confirm success without printing the credentials object under a label that
# claims to be a file path.
print('Google Drive credentials loaded')
http = credentials.authorize(httplib2.Http())
service = discovery.build('drive', 'v3', http=http)

In [None]:
# Identify file to process
import sys

response = get_folder_contents(input_folder)
# Only plain-text files are candidates for translation.
files = [entry for entry in response['files'] if entry['name'].endswith('.txt')]
if files:
    # One file is handled per run: the first .txt entry returned by Drive.
    file = files[0]
else:
    # Say why the run stops instead of exiting without any output.
    print('No .txt files found in input folder', input_folder)
    sys.exit()

print(file)

In [None]:
# Download file
# Source: https://developers.google.com/drive/v3/web/manage-downloads
# Source: AdamAndersonFindSumerian.ipynb
import io, sys
def download_file(google_id, destination):
    """Download a file from Google Drive and save it to a local path.

    Args:
        google_id: Drive ID of the file to download.
        destination: Local path the downloaded bytes are written to.

    Returns:
        The downloaded file contents as bytes.

    Raises:
        errors.HttpError: If fetching a chunk fails. Nothing is written to
            ``destination`` in that case, so a partial download cannot be
            mistaken for a complete file.
    """
    request = service.files().get_media(fileId=google_id)
    with io.BytesIO() as buffer:
        downloader = MediaIoBaseDownload(buffer, request)
        done = False
        while not done:
            try:
                _, done = downloader.next_chunk()
            except errors.HttpError as error:
                # Report which download failed, then let the error propagate.
                print('Error file:', destination, '   id:', google_id)
                print('An error occurred pulling the next chunk:', error)
                raise
            # One dot per fetched chunk serves as a progress indicator.
            sys.stdout.write('.')
        contents = buffer.getvalue()
    with open(destination, 'wb') as out_file:
        out_file.write(contents)
    return contents

# Local working copies are named after the Drive file ID.
input_filename = '{}.txt'.format(file['id'])
output_filename = '{}_translated.txt'.format(file['id'])
download_file(file['id'], input_filename)

# Choose the tokenizer language.
# The punkt tokenizer finds sentence boundaries so the text can be cut into
# chunks, which means it has to match the language of the INPUT text.
# Reference: http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
# Languages with a punkt model: czech, dutch, estonian, french, greek,
#   norwegian, portuguese, spanish, turkish, danish, english, finnish, german,
#   italian, polish, slovene, swedish

# The first three characters of the Drive file name are taken as the language code.
language_code = file['name'][0:3]
def lookup_language(code):
    """Map a three-letter language code to the matching punkt language name.

    Args:
        code: Three-letter code such as 'fre' or 'ger'. Matching ignores case.

    Returns:
        The lower-case language name, e.g. 'french'.

    Raises:
        KeyError: If ``code`` has no entry in the table. The message lists
            the codes that are supported.
    """
    languages = {
        'cze': 'czech',
        'dut': 'dutch',
        'est': 'estonian',
        'fre': 'french',
        'gre': 'greek',
        'nor': 'norwegian',
        'por': 'portuguese',
        'spa': 'spanish',
        'tur': 'turkish',
        'dan': 'danish',
        'eng': 'english',
        'fin': 'finnish',
        'ger': 'german',
        'ita': 'italian',
        'pol': 'polish',
        'slv': 'slovene',
        'swe': 'swedish'
    }
    # File-name prefixes may be capitalised; the table keys are lower-case.
    key = code.lower()
    if key not in languages:
        raise KeyError('Unsupported language code {!r}; expected one of: {}'.format(
            code, ', '.join(sorted(languages))))
    return languages[key]
# Resource path of the punkt model for the input language; a later cell
# passes it to nltk.data.load().
punkt_tokenizer = 'tokenizers/punkt/{}.pickle'.format(lookup_language(language_code))

## Run Translation

In [None]:
# Load the downloaded input file into a single string
from html import escape

with open(input_filename, encoding='latin-1') as f:
    input_contents = f.read()

# Escape HTML-special characters first, then (optionally) mark line breaks
# with <br> tags so the only real tags in the text are the inserted ones.
input_contents = escape(input_contents)
if preserve_line_breaks:
    input_contents = input_contents.replace('\n', '<br>').replace('\r', '<br>')

In [None]:
# Function to combine sentences into larger chunks
def condense(lst, length):
    """Combine strings into space-joined chunks of at most ``length`` characters.

    Elements longer than ``length`` are first split at spaces. The resulting
    pieces are then joined back together in order, separated by single
    spaces, packing as many pieces into each chunk as the limit allows.

    Args:
        lst: List of strings (for example sentences) in document order.
        length: Maximum number of characters allowed in one chunk.

    Returns:
        A list of non-empty chunks, none longer than ``length``. An empty
        ``lst`` is returned unchanged.

    Raises:
        ValueError: If a single space-free word is longer than ``length``.
    """
    if len(lst) == 0:
        return lst

    # Split elements at spaces if they exceed length
    number_split = 0
    pieces = []
    for elem in lst:
        if len(elem) > length:
            number_split += 1
            pieces.extend(elem.split(' '))
        else:
            pieces.append(elem)

    if number_split > 0:
        print('WARNING! Had to split', number_split,
              'sentences because the sentence length exceeded', length, 'characters.')
    if max(len(piece) for piece in pieces) > length:
        # str() is required: length is a number and cannot be concatenated directly.
        raise ValueError('A single word exceeded ' + str(length) + ' characters')

    # Every piece now fits on its own; pack them greedily.
    chunks = []
    current_chunk = ''
    for piece in pieces:
        if not current_chunk:
            # Start a chunk without a leading space and without emitting an
            # empty chunk when the very first piece fills the limit exactly.
            current_chunk = piece
        elif len(current_chunk) + 1 + len(piece) <= length:
            # +1 accounts for the joining space.
            current_chunk += ' ' + piece
        else:
            chunks.append(current_chunk)
            current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

In [None]:
# Translate file contents
from google.cloud import translate

# Keep the Translate client under its own name. The Drive authentication cell
# imports oauth2client's `client` module, and get_credentials() calls
# client.flow_from_clientsecrets(), so `client` must not be rebound here.
translate_client = translate.Client.from_service_account_json(translate_secret)

def google_translate(text):
    """Translate ``text`` into ``target_language`` with Google Translate.

    Args:
        text: The string to send for translation.

    Returns:
        The 'translatedText' value from the API response.
    """
    return translate_client.translate(text, target_language=target_language)['translatedText']

In [None]:
# Split at sentences
# Uses nltk to identify sentence breaks (http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt)
import nltk
# Request the punkt resource before loading a tokenizer from it below.
nltk.download('punkt')

# Split into sentences
import nltk.data
# punkt_tokenizer is the resource path built in an earlier cell from the input
# file's language prefix; translate() calls tokenizer.tokenize() on the text.
tokenizer = nltk.data.load(punkt_tokenizer)

import time
def translate(text):
    """Translate ``text`` chunk by chunk and return the joined translation.

    The text is split into sentences with ``tokenizer``, the sentences are
    packed into chunks of at most ``max_length`` characters by ``condense``,
    and each chunk is sent to ``google_translate``. ``delay`` seconds are
    waited between consecutive requests.

    Args:
        text: The full input text.

    Returns:
        The translated chunks joined with single spaces.
    """
    sentences = tokenizer.tokenize(text)
    print('Identified', len(sentences), 'sentences')

    chunks = condense(sentences, max_length)
    print('Condensed into', len(chunks), 'chunks')

    translated_chunks = []
    last_index = len(chunks) - 1
    for index, chunk in enumerate(chunks):
        translated_chunks.append(google_translate(chunk))
        print('Translated chunk')
        # The delay only spaces out consecutive requests; no request follows
        # the final chunk, so waiting after it would be wasted time.
        if index < last_index:
            time.sleep(delay)

    return ' '.join(translated_chunks)

In [None]:
# Run the translation on the escaped input text. This cell can take a while:
# translate() sleeps for `delay` seconds as it works through the chunks.
translation = translate(input_contents)

In [None]:
# Translation contains escaped HTML characters such as '&#39;' for an apostrophe.
# To fix this, unescape HTML
from html import unescape

# Undo the input preparation in reverse order. The input was escaped first
# and had <br> tags inserted second, so the tags are turned back into line
# breaks first and the text is unescaped second. Unescaping first would turn
# a literal '&lt;br&gt;' from the source text into a tag and then a newline.
translation_text = translation
if preserve_line_breaks:
    translation_text = translation_text.replace('<br>', '\n')
translation_text = unescape(translation_text)

In [None]:
# Write output to output file
# The with-block closes the file even if the write raises.
with open(output_filename, 'w', encoding='utf8') as output_file:
    output_file.write(translation_text)

In [None]:
# @source Google Drive upload based on AdamAndersonFindSumerian.ipynb
def upload_txt_file(name, path, destination_folder=None):
    """Create a plain-text file on Google Drive from a local file.

    Args:
        name: Name the file gets on Drive.
        path: Local path of the file to upload.
        destination_folder: Optional ID of the Drive folder to use as the
            file's parent; omitted from the request when falsy.

    Prints the name and the ID reported by Drive; returns nothing.
    """
    metadata = dict(name=name)
    if destination_folder:
        metadata['parents'] = [destination_folder]
    text_media = MediaFileUpload(path, mimetype='text/plain')
    create_request = service.files().create(
        body=metadata,
        media_body=text_media,
        fields='id'
    )
    created = create_request.execute()
    print('Uploaded', name, '; ID:', created.get('id'))
    
# Wait ten seconds, then upload the translation to the results folder. The
# uploaded name swaps the input name's three-character language prefix for
# 'eng_translated'.
# NOTE(review): the reason for the fixed pause is not stated -- confirm it is still needed.
time.sleep(10)
upload_txt_file('eng_translated{}'.format(file['name'][3:]), output_filename, results_folder)

In [None]:
# Move input text file to completed folder so it is not processed again
# The move is a single update request that adds completed_folder as a parent
# and removes input_folder. As the last expression in the cell, the value
# returned by execute() is what the notebook displays.
service.files().update(fileId=file['id'],
                       addParents=completed_folder,
                       removeParents=input_folder).execute()