In [1]:
!pip install google.cloud.speech

Collecting google.cloud.speech
  Using cached google_cloud_speech-0.29.0-py2.py3-none-any.whl
Installing collected packages: google.cloud.speech
Successfully installed google.cloud.speech


In [2]:
!pip install google.cloud.core

Collecting google.cloud.core
  Using cached google_cloud_core-0.27.1-py2.py3-none-any.whl
Installing collected packages: google.cloud.core
Successfully installed google.cloud.core


In [3]:
import argparse
import io
import google.datalab.storage as storage

In [136]:
def short_transcribe_gcs(gcs_uri):
    """Transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    # [START migration_audio_config_gcs]
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        language_code='en-US')
    # [END migration_audio_config_gcs]

    response = client.recognize(config, audio)
    # return the first alternative of all the consecutive results.
    res = list()
    for result in response.results:
        res.append(result.alternatives[0].transcript)
    return '\n'.join(res)

In [137]:
def long_transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # return the first alternative of all the consecutive results.
    res = ""
    res = list()
    for result in response.results:
        res.append(result.alternatives[0].transcript)
    return '\n'.join(res)

In [6]:
def findWordsWithDistance(words, word1, word2, distance):
    """
    :type words: List[str]
    :type word1: str
    :type word2: str
    :rtype: int
    """
    m=len(words)
    p1=-len(words)
    p2=len(words)
    for i in range(0,len(words)):
        if word1[-1] == '!':
            if words[i] == word1:
                p1=i
        else:
            if word1 in words[i].lower():
                p1=i
        if word2[-1] == '!':
            if words[i] == word2:
                p2=i
        else:
            if word2 in words[i].lower():
                p2=i
        m=min(m,abs(p1-p2))
    if m <= distance:
        return True
    return False

In [7]:
def phrase_search(text, phrase):
    distance = int(phrase.split('~')[1])
    word1 = phrase.split('~')[0].split()[0]
    word2 = phrase.split('~')[0].split()[1]
    return findWordsWithDistance(text.split(), word1, word2, distance) 

In [8]:
def word_search(text, word):
    if word[-1] == '!':
        for i in text.split():
            if i == word:
                return True
        return False
    else:
        return word.lower() in text.lower()

In [9]:
def condition_search(text, terms):
    term_list_1 = terms.split(' AND ')[0].split(',')
    term_list_2 = terms.split(' AND ')[1].split(',')
    for i in term_list_1:
        first_check = False
        if '~' in i:
            first_check = phrase_search(text, i)
        else:
            first_check = word_search(text, i)
        if first_check:
            second_check = False
            for j in term_list_2:
                if '~' in j:
                    second_check = phrase_search(text, j)
                else:
                    second_check = word_search(text, j)
                if second_check:
                    return True
    return False           

In [138]:
def tag_text(text):
    tag_list = list()
    for i in range(0,len(ontology)):
        tag = True
        and_check = ['Keywords', 'And Words', 'And Words (2)']
        not_check = 'Not Words'
        continue_check = True
        for a in and_check:
            if str(ontology[a].values[i]) != 'nan':
                a_check = False
                for wds in ontology[a].values[i].split('|'):
                    print(wds)
                    if 'AND' in wds:
                        a_check = condition_search(text, wds)
                        if a_check:
                            break
                    else: 
                        for sub_wds in wds.split(','):
                            if '~' in sub_wds:
                                a_check = phrase_search(text, sub_wds)
                            elif len(sub_wds) != 0:
                                a_check = word_search(text, sub_wds)
                            if a_check:
                                break
            if not a_check:
                tag = False
                continue_check = False
                break
        if continue_check:
            if str(ontology[not_check].values[i]) != 'nan':
                n_check = False
                for wds in ontology[not_check].values[i].split('|'):
                    if 'AND' in wds:
                        n_check = condition_search(text, wds)
                        if n_check:
                                tag = False
                                break
                    else: 
                        for sub_wds in wds.split(','):
                            if '~' in sub_wds:
                                n_check = phrase_search(text, sub_wds)
                            elif len(sub_wds) != 0:
                                n_check = word_search(text, sub_wds)
                            if n_check:
                                tag = False
                                break
        if tag:
            tag_list.append(i)
    return ontology.iloc[tag_list,]

In [83]:
sample_bucket = storage.Bucket('cpc-speech-analytics-demo')
complaints = list()
for i in sample_bucket.objects():
    if str(i)[-3:] == 'wav' and 'long-audio' not in str(i):
        complaints.append(short_transcribe_gcs(str(i)[28:]))
    if str(i)[-3:] == 'wav' and 'long-audio' in str(i):
        complaints.append(long_transcribe_gcs(str(i)[28:]))

In [94]:
complaints

[u"I opened an account online but never received my account opening disclosures unless I am mistaken they never mailed emailed or stored them electronically on online banking I called the call center to find out where I could get them she told me I would receive them by mail in 10 business days will I still don't have to disclose your how am I supposed to know to avoid a fee if I don't have the disclosures",
 u"I have used this fence credit card check for 3% interest rate last year I remember that I need to pay you full in Maine Tucson 16 so I'm heading for much earlier than the due date for the fight but I will still be in charge for interest $96 I am a thorough care for small business person but I have seen being shipped to buy this cunning Bankers Banks and their politicians would like to get rid of cfpb so that they can suck our blood dry is greedy Bunch no longer American people hate them",
 u"I am continually being charged overdraft fees on my account and I'm somehow able to cont

In [86]:
import pandas as pd
import numpy as np

In [87]:
import datalab.data
import sys  
reload(sys)  
sys.setdefaultencoding('utf8')

In [130]:
csv = datalab.data.Csv('gs://cpc-speech-analytics-demo/ontology_demo.csv',delimiter=';')

In [131]:
ontology = csv.browse()

In [132]:
ontology.columns = ontology.iloc[0,]
ontology = ontology.iloc[1:,1:]

In [134]:
def tagging(complaints):
    tagged_cp = pd.DataFrame()
    columns = ['Complaint ID', 'Complaint', 'Regulation', 'Theme ID', 'Theme Name', 'Section', 'Plain Language Description']
    for c in complaints:
        res = tag_text(c)
        if res.empty:
            res = pd.DataFrame.transpose(pd.DataFrame.from_dict({'Complaint ID': complaints.index(c), 'Complaint': c, 'Regulation': None, 'Theme ID': None, 'Theme Name': None, 'Section': None, 'Plain Language Description': None}, 'index'))
        else:
            res['Complaint'] = c
            res['Complaint ID'] = complaints.index(c)
            res = res[columns]
        if tagged_cp.empty:
            tagged_cp = res
        else:
            tagged_cp = tagged_cp.append(res,ignore_index=True)
    return tagged_cp

In [135]:
tagging([i for i in complaints if i is not None])

Unnamed: 0,Complaint,Complaint ID,Plain Language Description,Regulation,Section,Theme ID,Theme Name
0,I opened an account online but never received ...,0,§ 1026.5 General disclosure requirements\nMake...,Reg Z: TILA,§ 1026.6 Account-opening disclosures,1.0,DISCLOSURES / NOTICES
1,I have used this fence credit card check for 3...,1,§ 1026.5 General disclosure requirements\nMake...,Reg Z: TILA,§ 1026.5 General disclosure requirements\n§ 10...,1.0,DISCLOSURES / NOTICES
2,I am continually being charged overdraft fees ...,2,"Consumer must opt in to overdraft services, an...",Reg E: EFTA,§1005.17 Requirements for overdraft services,11.0,OVERDRAFT FEES
3,my bank continuously refuses to honor my reque...,3,,,,,
4,I never opting to overdraft protection and hav...,4,,,,,
5,after By Number beds eligible product from thi...,5,§ 1026.5 General disclosure requirements\nMake...,Reg Z: TILA,§ 1026.5 General disclosure requirements\n§ 10...,1.0,DISCLOSURES / NOTICES
6,I requested International wire transfer at thi...,6,"Disclosure, notice, and accuracy requirements ...",Reg E: EFTA,§1005.36 Transfers scheduled before the date o...,13.0,REMITTANCE TRANSFERS
7,I requested International wire transfer at thi...,6,§ 1026.5 General disclosure requirements\nMake...,Reg Z: TILA,§ 1026.5 General disclosure requirements\n§ 10...,1.0,DISCLOSURES / NOTICES
8,I sent an international wire transfer through ...,7,"Disclosure, notice, and accuracy requirements ...",Reg E: EFTA,§1005.36 Transfers scheduled before the date o...,13.0,REMITTANCE TRANSFERS
9,I sent an international wire transfer through ...,7,§ 1026.5 General disclosure requirements\nMake...,Reg Z: TILA,§ 1026.9(c) Change in terms,1.0,DISCLOSURES / NOTICES
