In [1]:
from google.cloud import speech
import os

# set service account credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]='naep-orf-4ea5eacc76b4.json'

In [2]:
from pydub import AudioSegment
from pydub.utils import make_chunks
import wave

In [3]:
ORIG_DIR = './audio/orig'
CHUNK_DIR = './audio/chunks'

In [4]:
def frame_rate_channel(audio_file_name):
    with wave.open(audio_file_name, "rb") as wave_file:
        frame_rate = wave_file.getframerate()
        channels = wave_file.getnchannels()
    return frame_rate, channels

In [19]:
def create_chunks(filename):
    myaudio = AudioSegment.from_file('{}/{}'.format(ORIG_DIR, filename), "wav") 
    chunk_length_ms = 30000 # pydub calculates in millisec
    chunks = make_chunks(myaudio, chunk_length_ms) #Make chunks of one sec
    pref = filename.split('/')[-1].split('.wav')[0]
    chunk_lst = []
    
    #Export all of the individual chunks as wav files
    for i, chunk in enumerate(chunks):
        chunk_name = "{}_chunk{}.wav".format(pref, i)
        chunk_lst.append(chunk_name)
        print("exporting", chunk_name)
        chunk.export('{}/{}'.format(CHUNK_DIR, chunk_name), format="wav")
    return chunk_lst

In [20]:
def stereo_to_mono(audio_file_name):
    audio_file_name = '{}/{}'.format(CHUNK_DIR, audio_file_name)
    sound = AudioSegment.from_wav(audio_file_name)
    sound = sound.set_channels(1)
    sound.export(audio_file_name, format="wav")

In [21]:
def transcribe_file(filename):
    """Transcribe the given audio file asynchronously."""
    client = speech.SpeechClient()
    chunk_lst = create_chunks(filename)

    transcript = ''
    transcript = []
    for file in chunk_lst:
        
        frame_rate, channels = frame_rate_channel('{}/{}'.format(CHUNK_DIR, file))
    
        if channels > 1:
            stereo_to_mono(file)
        
        with open('{}/{}'.format(CHUNK_DIR, file), "rb") as audio_file:
            content = audio_file.read()

        audio = speech.RecognitionAudio(content=content)

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
            sample_rate_hertz=frame_rate,
            language_code="en-US",
        )

        operation = client.long_running_recognize(config=config, audio=audio)

        print("Waiting for operation to complete...")
        response = operation.result(timeout=90)
        
#         for result in response.results:
#             # The first alternative is the most likely one for this portion.
#             transcript += result.alternatives[0].transcript
        transcript.append(response)
    
    return transcript


In [8]:
res = []
for filename in os.listdir(ORIG_DIR):
    transcript = transcribe_file(filename)
    res.append((filename, transcript))

exporting Score2.1-161835_chunk0.wav
exporting Score2.1-161835_chunk1.wav
exporting Score2.1-161835_chunk2.wav
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
exporting Score3_168692_chunk0.wav
exporting Score3_168692_chunk1.wav
exporting Score3_168692_chunk2.wav
exporting Score3_168692_chunk3.wav
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
exporting Score1.2-155888_chunk0.wav
exporting Score1.2-155888_chunk1.wav
exporting Score1.2-155888_chunk2.wav
exporting Score1.2-155888_chunk3.wav
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...
exporting Score2.2-186808_chunk0.wav
exporting Score2.2-186808_chunk1.wav
exporting Score2.2-186808_chunk2.wav
exporting Score2.2-186808_chunk3.wav
Waiting for operation to complete...
Waiting f

In [10]:
with open('./google_results.txt', 'w') as f:
    for pair in res:
        f.write(pair[0])
        f.write('\n')
        f.write(pair[1])
        f.write('\n\n\n')
    

In [22]:
a = transcribe_file('Score2.2-153707.wav')
a

exporting Score2.2-153707_chunk0.wav
exporting Score2.2-153707_chunk1.wav
exporting Score2.2-153707_chunk2.wav
Waiting for operation to complete...
Waiting for operation to complete...
Waiting for operation to complete...


[results {
   alternatives {
     transcript: "Guy Talk"
     confidence: 0.5273598432540894
   }
   result_end_time {
     seconds: 19
     nanos: 170000000
   }
   language_code: "en-us"
 }
 results {
   alternatives {
     transcript: " Gothic me period just by 4:10 or talk to you there are time"
     confidence: 0.5334952473640442
   }
   result_end_time {
     seconds: 29
     nanos: 950000000
   }
   language_code: "en-us"
 }
 total_billed_time {
   seconds: 30
 },
 results {
   alternatives {
     transcript: "like person to do this do"
     confidence: 0.6437003016471863
   }
   result_end_time {
     seconds: 3
   }
   language_code: "en-us"
 }
 results {
   alternatives {
     transcript: " they must be smart as hell and very well trained left the post I thought it was a second offense positive Tyler was raised by LP the dogs are given to be seen everything or or when the dogs are about 42 or 600 a m n o"
     confidence: 0.5972380042076111
   }
   result_end_time {
     seco

In [23]:
len(a)

3

In [5]:
### another option is to use trascribe
### isn't working right now
bucketname = "naep_orf_files" #Name of the bucket created in the step before

import io
from google.cloud import storage

In [6]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

In [7]:
def delete_blob(bucket_name, blob_name):
    """Deletes a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)

    blob.delete()

In [25]:
def google_transcribe(audio_file_name):
    
    file_name = '{}/{}'.format(ORIG_DIR, audio_file_name)
    frame_rate, channels = frame_rate_channel(file_name)
    
    bucket_name = bucketname
    destination_blob_name = audio_file_name
    
    upload_blob(bucket_name, file_name, destination_blob_name)
    
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        speech_contexts=[{"phrases": ["guide dogs", 
                                      "lead very interesting lives", 
                                      "For ten or twelve years", 
                                      "in charge of guiding",
                                     "blind person",
                                      "To do this job",
                                     "smart gentle and very well trained"]
        }]
    )

    # Detects speech in the audio file
    operation = client.long_running_recognize(config=config,audio=audio)
    response = operation.result(timeout=10000)
    
    transcript = ''

    for result in response.results:
        transcript += result.alternatives[0].transcript
    
    delete_blob(bucket_name, destination_blob_name)
    print('Done,', audio_file_name)
    return transcript

In [26]:
l = ['Score2.1-161835.wav',
 'Score3_168692.wav',
 'Score1.2-155888.wav',
 'Score2.2-186808.wav',
 'Score2.1-178676.wav',
 'Score2.2-153707.wav',
 'Score2.2-168581.wav',
 '640000108-Proficient1_edited.wav']

In [27]:
res = []
for filename in l:
    try:
        transcript = google_transcribe(filename)
        res.append((filename, transcript))
    except:
        print('Boo', filename)

Done, Score2.1-161835.wav
Done, Score3_168692.wav
Done, Score1.2-155888.wav
Done, Score2.2-186808.wav
Done, Score2.1-178676.wav
Done, Score2.2-153707.wav
Done, Score2.2-168581.wav
Boo 640000108-Proficient1_edited.wav


In [28]:
with open('./google_results_phrase2.txt', 'w') as f:
    for pair in res:
        f.write(pair[0])
        f.write('\n')
        f.write(pair[1])
        f.write('\n\n\n')

In [17]:
res

[('Score2.1-161835.wav',
  results {
    alternatives {
      transcript: "guide dogs live very friendly 410 or 12 years they are in charge of buying it in person to do the shop they must be smart guy and Mary would try must guide dogs are born as a no signs dogs are in cannot win without a family dog this is family soon after they are four one 2000 about 14 months old he\'s return to the 10022 to be trained to talk train in March groups for about 3 months or at the end of the tandem most dogs everywhere but train over yet they\'re new monsters"
      confidence: 0.6194838285446167
    }
    result_end_time {
      seconds: 65
      nanos: 730000000
    }
    language_code: "en-us"
  }
  results {
    alternatives {
      transcript: "hotels day trying to get it to you for 4 months at the end of the time they are ready for the world guy dogs form strong bonds with your mother\'s name and they keep them company all the time"
      confidence: 0.628514289855957
    }
    result_end_time 