## 1. detect_labels_uri( ) from vision.py

In [3]:
# [START vision_label_detection_gcs]
def detect_labels_uri(uri):
    """Detects labels in the file located in Google Cloud Storage or on the
    Web."""
    from google.cloud import vision
    client = vision.ImageAnnotatorClient()
    image = vision.types.Image()
    image.source.image_uri = uri

    response = client.label_detection(image=image)
    labels = response.label_annotations
    print('Labels:')

    for label in labels:
        print(label.description)
# [END vision_label_detection_gcs]

## To be modified 
#     label_descriptions = []
#     for label in labels:
#         label_descriptions.append(label.description)
#     return label_descriptions

## 2. transcribe_gcs( ) from speech2text.py

In [4]:
# [START speech_transcribe_sync_gcs]
def transcribe_gcs(language, gcs_uri):
    """Transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    # [START speech_python_migration_config_gcs]
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        # encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        # sample_rate_hertz=16000,
        language_code=language)
    # [END speech_python_migration_config_gcs]

    response = client.recognize(config, audio)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
# [END speech_transcribe_sync_gcs]
    
    
    
## To be modified    
#     return response.results[0].alternatives[0].transcript


## 3. translate_text( ) from translate.py

In [5]:
from google.cloud import translate
import six

def translate_text(target, text):
    # [START translate_translate_text]
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    translate_client = translate.Client()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(
        text, target_language=target)

    print(u'Text: {}'.format(result['input']))
    print(u'Translation: {}'.format(result['translatedText']))
    print(u'Detected source language: {}'.format(
        result['detectedSourceLanguage']))
    # [END translate_translate_text]
    
## To be modified 
#     return result['translatedText']


## 4. entities_text( ) from natural_language.py

In [6]:
import sys

from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import six

# [START language_entities_text]
def entities_text(text):
    """Detects entities in the text."""
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    # [START language_python_migration_entities_text]
    # [START language_python_migration_document_text]
    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)
    # [END language_python_migration_document_text]

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    for entity in entities:
        entity_type = enums.Entity.Type(entity.type)
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type.name))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format('wikipedia_url',
              entity.metadata.get('wikipedia_url', '-')))
    # [END language_python_migration_entities_text]
# [END language_entities_text]

## To be modified   
#     entity_names = []
#     for entity in entities:
#         entity_names.append(entity.name)
#     return entity_names


## Put together

In [15]:
def compare_audio_to_image(language, audio, image):
    transcription = transcribe_gcs(language, audio)
    translation = translate_text('en', transcription)   
    entities = entities_text(translation)
    labels = detect_labels_uri(image)
    
    has_match = False
    for entity in entities:
        if entity in labels:
            print('The audio and image both contain: {}'.format(entity))
            has_match = True
    if not has_match:
        print('The audio and image do not appear to be related.')

In [16]:
compare_audio_to_image('de-DE', 'gs://ml-api-codelab/de-ostrich.wav', 'gs://ml-api-codelab/birds.jpg')

The audio and image do not appear to be related.


In [None]:
# [START language_syntax_text]
def syntax_text(text):
    """Detects syntax in the text."""
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    # [START language_python_migration_syntax_text]
    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    for token in tokens:
        part_of_speech_tag = enums.PartOfSpeech.Tag(token.part_of_speech.tag)
        print(u'{}: {}'.format(part_of_speech_tag.name,
                               token.text.content))
    # [END language_python_migration_syntax_text]
# [END language_syntax_text]


#     lemmas = ''
#     for token in tokens:
#         temp = token.lemma + ' '
#         lemmas += temp
#     return lemmas

### tr-TR speech samples:

gs://ml-api-codelab/tr-ball.wav

gs://ml-api-codelab/tr-bike.wav

gs://ml-api-codelab/tr-jacket.wav

gs://ml-api-codelab/tr-ostrich.wav
    

### de-DE speech samples:

gs://ml-api-codelab/de-ball.wav

gs://ml-api-codelab/de-bike.wav

gs://ml-api-codelab/de-jacket.wav

gs://ml-api-codelab/de-ostrich.wav
    

### Image samples:

gs://ml-api-codelab/bicycle.jpg

gs://ml-api-codelab/birds.jpg

gs://ml-api-codelab/coat_rack.jpg

gs://ml-api-codelab/football.jpg