# ASR-CL
## Lucida AI Evaluation

### Goal
Investigate the relationship between the errors of ASR (Automatic Speech Recognition) and the errors of CL (Classifier for query).

### Generate Queries from Original Data

100 sentences on QA (Generic QA; from a public dataset) and 100 sentences on CA (Calendar; hand-made).

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import re, string, sys, json
from random import randrange
from operator import itemgetter
import matplotlib.pyplot as plt
# Create an initial figure (the notebook echoes its repr as the cell output).
fig = plt.figure()
# Set the default font for subsequent plots: serif family, size 15.
plt.rc("font", family="serif")
plt.rc("font", size=15)

<matplotlib.figure.Figure at 0x109f46550>

In [2]:
def contains_any(str, set):
    """Return True if `str` contains at least one element of `set`.

    The parameter names shadow the builtins `str` and `set`; they are kept
    unchanged so that existing keyword callers keep working.

    Args:
        str: The string (or any container supporting `in`) to search.
        set: An iterable of candidate characters or substrings.

    Returns:
        bool: True as soon as one candidate is found, False otherwise
        (including when `set` is empty).
    """
    # any() short-circuits on the first hit and avoids building a list.
    return any(c in str for c in set)

def generate_100_queires(num_queries=100, data_path='dev-v1.1.json'):
    """Print randomly sampled, distinct questions from a SQuAD JSON file.

    Data source: https://rajpurkar.github.io/SQuAD-explorer/.

    A question is eligible only if it contains a '?', has no punctuation
    anywhere except its last character, has at least 7 whitespace-separated
    words and contains no double space. Questions are drawn by picking a
    random article, then a random paragraph, then a random question, and
    retrying until an eligible, not-yet-printed question is found.

    Args:
        num_queries: Number of distinct questions to print (default 100).
        data_path: Path to the SQuAD-format JSON file.

    Raises:
        ValueError: If the file holds fewer than `num_queries` distinct
            eligible questions (the sampling loop could never finish).
    """
    with open(data_path) as json_data:
        articles = json.load(json_data)['data']

    def is_eligible(query):
        return ('?' in query
                and not contains_any(query[:-1], string.punctuation)
                and len(query.split()) >= 7
                and '  ' not in query)

    # Count the distinct eligible questions up front so that an undersized
    # dataset fails fast instead of looping forever below.
    eligible = set()
    for article in articles:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                if is_eligible(qa['question']):
                    eligible.add(qa['question'])
    if len(eligible) < num_queries:
        raise ValueError(
            'Only {} eligible queries in {}, but {} were requested'.format(
                len(eligible), data_path, num_queries))

    # `seen` must outlive the loop iterations; re-creating it on every pass
    # would make the duplicate check a no-op.
    seen = set()
    while len(seen) < num_queries:
        article = articles[randrange(len(articles))]
        paragraph = article['paragraphs'][randrange(len(article['paragraphs']))]
        qa = paragraph['qas'][randrange(len(paragraph['qas']))]
        query = qa['question']
        if query in seen or not is_eligible(query):
            continue
        seen.add(query)
        print(query)

def label_transcript(transcript_file_path, first_part):
    """Print the transcript file as a two-column CSV with QA/CA labels.

    The output starts with the header `query,answer`. Each following line
    is one line of the transcript file followed by its label: `QA` for the
    first `first_part` lines and `CA` for the rest.

    Args:
        transcript_file_path: Path to a text file with one query per line.
        first_part: Number of leading lines to label as `QA`.
    """
    print('query,answer')
    with open(transcript_file_path) as transcript_file:
        lines = transcript_file.readlines()
    for i, line in enumerate(lines):
        line = line.rstrip('\n')
        # A field containing a comma or a double quote must be quoted, and
        # embedded double quotes doubled, or CSV readers will mis-split it.
        if ',' in line or '"' in line:
            line = '"' + line.replace('"', '""') + '"'
        print(line + ',' + ('QA' if i < first_part else 'CA'))
    
# generate_100_queires() # save the result to 'speech/transcript.txt'
# label_transcript('speech/transcript.txt', 100)

### Generate Raw Data by Sending Queries to the Pipeline

### Generate DataFrame from ASR Results

In [3]:
def get_df_from_asr_result(file_path, header):
    """Load ASR transcripts from a result file into a one-column DataFrame.

    Expected file format (the first line must be a header and is skipped):

        <audio_path>,<transcript>

    where the audio path looks like "path/to/file/<query_id>_<text>.wav",
    for example:

        ../asr_cl/speech/0_Helloworld.wav,"hello world."

    Rows are sorted by query id. The returned frame has a fresh positional
    index (the ids themselves are not kept), so it lines up with another
    frame only if that frame is in query-id order too.

    Args:
        file_path: Path to the ASR result file.
        header: Name to give the single transcript column.

    Returns:
        pandas.DataFrame: One column named `header` holding the transcripts.

    Raises:
        ValueError: If a non-blank line has no comma-separated transcript
            field or no query id can be found in it.
    """
    import csv  # local import: the notebook's import cell does not load csv

    with open(file_path, 'r') as asr_file:
        lines = asr_file.readlines()[1:]  # skip the header line
    proc_lines = []
    for line in lines:
        line = line.lstrip().rstrip('\r\n')
        if not line:
            continue  # tolerate blank lines, e.g. at the end of the file
        # Parse with csv so that commas inside a quoted transcript survive.
        fields = next(csv.reader([line]))
        if len(fields) < 2:
            raise ValueError('Malformed ASR result line: ' + line)
        # An unquoted transcript containing commas is split into several
        # fields; stitch it back together instead of truncating it.
        transcript = ','.join(fields[1:])
        # Read the id from the start of the file name so that digits in the
        # directory part of the path are not mistaken for it. Fall back to
        # the first number in the line when the file name has no leading id.
        file_name = fields[0].replace('\\', '/').rsplit('/', 1)[-1]
        id_match = re.match(r'\d+', file_name) or re.search(r'\d+', line)
        if id_match is None:
            raise ValueError('Cannot find the query id in line: ' + line)
        proc_lines.append((int(id_match.group()), transcript))
    proc_lines.sort(key=itemgetter(0))  # sort data by query id
    return pd.DataFrame([transcript for _, transcript in proc_lines],
                        columns=[header])

# Start from the labelled queries, then add one transcript column per ASR
# model. DataFrame.join aligns the frames on their row index.
data = pd.read_csv('text/query.txt')

asr_labels = ['fisher', 'librispeech', 'tedlium']

for asr_label in asr_labels:
    data = data.join(
        get_df_from_asr_result(
            'asr_result_{}.txt'.format(asr_label),
            'transcript_{}'.format(asr_label),
        )
    )

In [4]:
def get_df_from_cl_result(cl_label, data, match_col):
    """Look up the classifier (CL) result for every row of `data`.

    The CL results are read from `cl_result_<cl_label>.txt`, a CSV with the
    columns `query` (the text given to the classifier) and `result`. Each
    value in `data[match_col]` is matched against `query` by exact equality;
    when a query occurs several times, its first row wins.

    Args:
        cl_label: Label identifying the CL result file and output column.
        data: DataFrame holding the text that was sent to the classifier.
        match_col: Name of the column in `data` to match against `query`.

    Returns:
        pandas.DataFrame: One column named `answer_<cl_label>` with one row
        per row of `data`, in the same order and with a fresh positional
        index. The value is None where the text is empty or missing.

    Raises:
        RuntimeError: If a non-empty text is not found in the CL results.
    """
    cl_data = pd.read_csv('cl_result_{}.txt'.format(cl_label))
    # Index the results once instead of rescanning the file for every row;
    # setdefault keeps the first occurrence of a repeated query.
    result_by_query = {}
    for cl_input, cl_result in zip(cl_data['query'], cl_data['result']):
        result_by_query.setdefault(cl_input, cl_result)

    results = []
    for asr_output in data[match_col]:
        if pd.isnull(asr_output) or asr_output == '':
            # ASR did not return any result, so CL must be wrong! NaN is
            # treated like '' because a CSV round-trip turns '' into NaN.
            results.append(None)
        elif asr_output in result_by_query:
            results.append(result_by_query[asr_output])
        else:
            raise RuntimeError('Cannot find the query {}'.format(asr_output))
    return pd.DataFrame({'answer_{}'.format(cl_label): results})
    
# Each pair is (CL result label, column of `data` holding the text that was
# classified). 'regular_svc' is matched on the original text query.
cl_labels = [
    ('fisher_svc', 'transcript_fisher'),
    ('librispeech_svc', 'transcript_librispeech'),
    ('tedlium_svc', 'transcript_tedlium'),
    ('regular_svc', 'query'),
]

# Append one answer column per classifier run.
for cl_label, match_col in cl_labels:
    data = data.join(
        get_df_from_cl_result(cl_label, data, match_col),
        lsuffix='',
    )

# Reorder the columns so each transcript sits next to its classifier answer.
data = data[[
    'query', 'answer',
    'answer_regular_svc',
    'transcript_fisher',
    'answer_fisher_svc',
    'transcript_librispeech',
    'answer_librispeech_svc',
    'transcript_tedlium',
    'answer_tedlium_svc',
]]
data.to_csv('data.txt')  # save to disk

### Observation

In [5]:
from IPython.display import display
# Do not truncate long cell text when rendering the table.
# NOTE(review): newer pandas releases expect None rather than -1 for this
# option -- confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1) # display full text
# Reload the table saved by the previous cell; the first CSV column is the
# row index.
data = pd.read_csv('data.txt', index_col=0)
pd.set_option('display.max_columns', data.shape[1]) # display all columns
# Last expression of the cell: the notebook renders the whole table.
data[:]

Unnamed: 0,query,answer,answer_regular_svc,transcript_fisher,answer_fisher_svc,transcript_librispeech,answer_librispeech_svc,transcript_tedlium,answer_tedlium_svc
0,What party had a victory in the 2015 UK election?,QA,"['QA', '0'], and starting indices: [0]",what party had a victory and she'll and says it's election.,"['QA', '0'], and starting indices: [0]",WHAT PART HE HAD A VICTORY AND THE TWO L ONE FIVE OF COLLECTION.,"['QA', '0'], and starting indices: [0]",what party had a vegetarian that show and five election.,"['QA', '0'], and starting indices: [0]"
1,What is the main difference between online pharmacies and community pharmacies?,QA,"['QA', '0'], and starting indices: [0]",what is the main difference between online pharmacies and community pharmacies.,"['QA', '0'], and starting indices: [0]",WHAT IS THE MAIN DIFFERENCE BETWEEN ONLINE PHARMACIES AND COMMUNITY PHARMACIES.,"['QA', '0'], and starting indices: [0]",what is the main difference between online pharmacies and community pharmacies.,"['QA', '0'], and starting indices: [0]"
2,What type of treatment are pharmacists important for?,QA,"['QA', '0'], and starting indices: [0]",what type of treatments are pharmacists important for.,"['QA', '0'], and starting indices: [0]",WHAT TYPE OF TREATMENT ARE PHARMACISTS IMPORTANT FOR.,"['QA', '0'], and starting indices: [0]",what type of treatment are pharmacists important for.,"['QA', '0'], and starting indices: [0]"
3,Who did Genghis Khan unite before he began conquering the rest of Eurasia?,QA,"['QA', '0'], and starting indices: [0]",it changes fun unite before he began time during the rest of your asia.,"['QA', '0'], and starting indices: [0]",THEIR GENGHIS KHAN UNITE BEFORE HE BEGAN CONQUERING THE REST OF EURASIA.,"['QA', '0'], and starting indices: [0]",the chain has been unite before he began found carrying the rest of your asia.,"['QA', '0'], and starting indices: [0]"
4,In what year was HMS Dreadnought launched?,QA,"['QA', '0'], and starting indices: [0]",right yeah lessons trying not launched.,"['QA', '0'], and starting indices: [0]",AND WHAT YEAR WAS HIS DREAD NOT LAUNCHED.,"['QA', '0'], and starting indices: [0]",what year was instead not launched.,"['QA', '0'], and starting indices: [0]"
5,What cytokines are responsible for communication between white blood cells?,QA,"['QA', '0'], and starting indices: [0]",let's say looking you sorry sponsel for communication between like blood cells.,"['QA', '0'], and starting indices: [0]",WHAT SAID OPINIONS ARE RESPONSIBLE FOR COMMUNICATION BETWEEN WHITE BLOOD SELVES.,"['QA', '0'], and starting indices: [0]",what set of genes are responsible for communication between white blood cells.,"['QA', '0'], and starting indices: [0]"
6,What are some proposals to connect campuses?,QA,"['QA', '0'], and starting indices: [0]",what are some proposals to connect campuses.,"['QA', '0'], and starting indices: [0]",WHAT ARE SOME PROPOSALS TO CONNECT CANVASES.,"['QA', '0'], and starting indices: [0]",but are some proposals to connect campuses.,"['QA', '0'], and starting indices: [0]"
7,What compounds can be masked with the molecules of the host cell in order for a virus to evade detection?,QA,"['QA', '0'], and starting indices: [0]",well sometimes can be messed with the knowledge ills of the hosts cell in order for virus to invade detection.,"['QA', '0'], and starting indices: [0]",WHAT COMPOUNDS CONVENE ASKED FOR THE MOLECULES OF THE HOST SELLIN ORDER FOR A VIRUS TO EVADE DETECTION.,"['QA', '0'], and starting indices: [0]",what from fans can be messed with the molecules of the host cell in order for a virus to evade detection.,"['QA', '0'], and starting indices: [0]"
8,Where did the residents of Antioch flee to?,QA,"['QA', '0'], and starting indices: [0]",where did the residents of and she actually too.,"['QA', '0'], and starting indices: [0]",WHERE DID THE RESIDENCE OF ANTIOCH FLEE TO.,"['QA', '0'], and starting indices: [0]",where did the residents of antioch fleets to.,"['QA', '0'], and starting indices: [0]"
9,Who designed the garden for the University Library?,QA,"['QA', '0'], and starting indices: [0]",design the garden for the university library.,"['QA', '0'], and starting indices: [0]",IT IS EYEING THE GARDEN FOR THE UNIVERSITY LIBRARY.,"['QA', '0'], and starting indices: [0]",he designed the garden for the university library.,"['QA', '0'], and starting indices: [0]"


### Collect Statistics and Plot

### Plot Google MT Performance

1. Baseline is Google MT with the original text query

2. Score is relative to the baseline:

    - If the baseline is more correct, the score is -1

    - If the baseline is more wrong, the score is +1

    - Otherwise, the score is 0


In [6]:
sys.path.append('../../asr')
from wer import error_rate

def analyze(data, asr_name, mt_name, error_rate_fn=None):
    """Compute ASR error rates and baseline-relative scores for one ASR model.

    Columns read from `data`:
        query                         -- the original text query
        transcript_<asr_name>         -- the ASR transcript of that query
        score_<asr_name>_<mt_name>    -- score obtained from the transcript
        score_regular_<mt_name>       -- baseline score from the text query

    The relative score of a row follows the notebook's definition (higher
    score = more correct, as the scores are averaged into an "accuracy"):
        -1 if the baseline is more correct (baseline score is higher),
        +1 if the baseline is more wrong (baseline score is lower),
         0 if both scores are equal.
    A missing (NaN) score yields a NaN relative score.

    Args:
        data: DataFrame holding the columns listed above.
        asr_name: ASR model name used in the column names.
        mt_name: Name of the downstream system used in the column names.
        error_rate_fn: Optional callable `(reference, hypothesis) -> float`.
            Defaults to the module-level `error_rate` imported from `wer`.

    Returns:
        dict: `asr_name`, `mt_name`, `asr_errors` (per-row array),
        `avg_asr_error`, `relative_scores` (per-row array),
        `avg_mt_accuracy` and `avg_regular_mt_accuracy`.
    """
    if error_rate_fn is None:
        error_rate_fn = error_rate
    transcripts = data['_'.join(['transcript', asr_name])]
    # Pair the columns positionally so that a non-default row index cannot
    # break the lookup.
    asr_errors = np.array(
        [error_rate_fn(query, transcript)
         for query, transcript in zip(data['query'], transcripts)],
        dtype=float)
    with_asr_scores = data['_'.join(['score', asr_name, mt_name])]
    regular_scores = data['_'.join(['score', 'regular', mt_name])]
    # Sign of (with-ASR score - baseline score): negative when the baseline
    # did better, positive when it did worse, zero on a tie.
    relative_scores = np.sign(
        np.asarray(with_asr_scores, dtype=float)
        - np.asarray(regular_scores, dtype=float))
    avg_mt_accuracy = np.mean(with_asr_scores)
    avg_regular_mt_accuracy = np.mean(regular_scores)
    print('avg_asr_error:', asr_errors.mean(), \
          'avg_mt_accuracy:', avg_mt_accuracy, \
          'avg_regular_mt_accuracy', avg_regular_mt_accuracy)
    return {'asr_name': asr_name, 'mt_name': mt_name,
            'asr_errors': asr_errors,
            'avg_asr_error': asr_errors.mean(),
            'relative_scores': relative_scores,
            'avg_mt_accuracy': avg_mt_accuracy,
            'avg_regular_mt_accuracy': avg_regular_mt_accuracy}

def plot_score_vs_error(color, asr_name, mt_name, asr_errors, relative_scores, **extras):
    """Scatter-plot relative scores against ASR error rates on the current axes.

    The signature mirrors the keys of the statistics dict so that the dict
    can be splatted in with `**`; keys that are not needed for the plot land
    in `extras` and are ignored.

    Args:
        color: Marker color.
        asr_name: ASR model name; used as the series label and in the title.
        mt_name: Name of the downstream system; used in the title.
        asr_errors: x values, one per query.
        relative_scores: y values, one per query.
        **extras: Ignored.
    """
    axis_labels = ('ASR Error Rate', 'MT Relative Score')
    plt.xlabel(axis_labels[0])
    plt.ylabel(axis_labels[1])
    title_parts = [asr_name, mt_name]
    plt.title(' '.join(title_parts))
    plt.scatter(asr_errors, relative_scores, color=color, label=asr_name)

In [7]:
# Collect the statistics of every ASR model for the 'google' columns,
# starting with fisher, and plot the fisher scores.
# NOTE(review): the recorded output of this cell is
# KeyError: 'score_fisher_google' -- `data` as assembled above has no
# `score_*` columns.
google_stats_list = []
google_stats_list.append(analyze(data, 'fisher', 'google'))
plot_score_vs_error('lightcoral', **google_stats_list[-1])

KeyError: 'score_fisher_google'

In [None]:
# Same analysis and plot for the librispeech transcripts.
google_stats_list.append(analyze(data, 'librispeech', 'google'))
plot_score_vs_error('lightskyblue', **google_stats_list[-1])

In [None]:
# Same analysis and plot for the tedlium transcripts.
google_stats_list.append(analyze(data, 'tedlium', 'google'))
plot_score_vs_error('yellowgreen', **google_stats_list[-1])

In [None]:
def gather_data_for_mt_vs_asr(stats_list):
    """Turn a list of per-ASR statistics dicts into parallel lists.

    Args:
        stats_list: One dict per ASR model, each with the keys `asr_name`,
            `avg_asr_error`, `avg_mt_accuracy` and `avg_regular_mt_accuracy`.

    Returns:
        tuple: Six lists in the order of `stats_list`:
            ASR names,
            average ASR accuracies (1 - average ASR error),
            average accuracies with ASR,
            average baseline ("regular") accuracies,
            accuracy changes (with ASR minus baseline; negative = a drop),
            accuracy changes as a percentage of the baseline.
    """
    names = []
    asr_accuracies = []
    mt_accuracies = []
    regular_accuracies = []
    accuracy_drops = []
    drop_percentages = []
    for stats in stats_list:
        regular_accuracy = stats['avg_regular_mt_accuracy']
        accuracy_drop = stats['avg_mt_accuracy'] - regular_accuracy
        names.append(stats['asr_name'])
        asr_accuracies.append(1 - stats['avg_asr_error'])
        mt_accuracies.append(stats['avg_mt_accuracy'])
        regular_accuracies.append(regular_accuracy)
        accuracy_drops.append(accuracy_drop)
        drop_percentages.append((accuracy_drop / regular_accuracy) * 100)
    return (names, asr_accuracies, mt_accuracies,
            regular_accuracies, accuracy_drops, drop_percentages)

def plot_mt_vs_asr(stats_list, mt_name, color):
    """Plot one system's average accuracy against average ASR accuracy.

    Draws one point per ASR model on the current axes and annotates it with
    the ASR name, its coordinates, and the accuracy change relative to the
    baseline (absolute and in percent).

    Args:
        stats_list: Non-empty list of statistics dicts, one per ASR model.
        mt_name: Display name of the system, used in the title.
        color: Marker color.
    """
    assert len(stats_list) != 0
    plt.xlabel('Average ASR Accuracy')
    plt.ylabel('Average MT Accuracy')
    plt.title('Performance of {}\n under the influence of ASR'.format(mt_name))
    # Gather data.
    (asr_name_list, avg_asr_accuracy_list, avg_mt_accuracy_list,
     _avg_regular_mt_accuracy_list,
     avg_mt_accuracy_drop_list, avg_mt_accuracy_drop_percentage_list) = \
        gather_data_for_mt_vs_asr(stats_list)
    # Plot.
    plt.scatter(avg_asr_accuracy_list, avg_mt_accuracy_list, color=color)
    # The backslash of the mathtext command is doubled: '\D' on its own is
    # an invalid escape sequence in a normal string literal.
    label_format = '{} ({:.2f},{:.3f})\n$\\Delta$Accuracy={:.3f}\n({:.2f}%)'
    points = zip(asr_name_list, avg_asr_accuracy_list, avg_mt_accuracy_list,
                 avg_mt_accuracy_drop_list,
                 avg_mt_accuracy_drop_percentage_list)
    for asr_name, asr_accuracy, mt_accuracy, drop, drop_percentage in points:
        txt = label_format.format(asr_name, asr_accuracy, mt_accuracy,
                                  drop, drop_percentage)
        plt.annotate(txt, (asr_accuracy, mt_accuracy), fontsize=10)

def plot_mts_vs_asr(list_of_stats_list, list_of_mt_name, list_of_colors):
    """Plot several systems' average accuracy against average ASR accuracy.

    For every system this draws one point per ASR model, labelled with the
    ASR name, plus a dashed horizontal line at the system's baseline
    ("regular") accuracy. The baseline is taken from the system's first
    statistics dict.

    Args:
        list_of_stats_list: Non-empty list with one non-empty list of
            statistics dicts per system.
        list_of_mt_name: Display name of each system, in the same order.
        list_of_colors: Plot color of each system, in the same order.
    """
    assert len(list_of_stats_list) != 0
    # Every system needs at least one entry: its baseline is read from [0].
    assert all(len(stats_list) != 0 for stats_list in list_of_stats_list)
    assert len(list_of_stats_list) == len(list_of_mt_name)
    assert len(list_of_mt_name) == len(list_of_colors)
    plt.xlabel('Average ASR Accuracy')
    plt.ylabel('Average MT Accuracy')
    plt.title('Performance of {}\nunder the influence of ASR'.
              format(', '.join(list_of_mt_name)))
    systems = zip(list_of_stats_list, list_of_mt_name, list_of_colors)
    for stats_list, mt_name, color in systems:
        # Gather data.
        (asr_name_list, avg_asr_accuracy_list, avg_mt_accuracy_list,
         avg_regular_mt_accuracy_list, _, _) = \
            gather_data_for_mt_vs_asr(stats_list)
        # Plot.
        plt.scatter(avg_asr_accuracy_list, avg_mt_accuracy_list,
                    label='{} with ASR'.format(mt_name),
                    color=color)
        # The baseline does not depend on ASR accuracy, so draw it across
        # the whole axes rather than over a hard-coded x-range.
        plt.axhline(y=avg_regular_mt_accuracy_list[0],
                    linestyle='--', color=color,
                    label='{} Regular'.format(mt_name))
        points = zip(asr_name_list, avg_asr_accuracy_list,
                     avg_mt_accuracy_list)
        for asr_name, asr_accuracy, mt_accuracy in points:
            plt.annotate('{}'.format(asr_name), (asr_accuracy, mt_accuracy),
                         fontsize=10)
    # Build the legend once, after every series has been added.
    plt.legend(loc='best', fontsize=9)

In [None]:
# Summarize the 'google' statistics of all ASR models in one plot.
plot_mt_vs_asr(google_stats_list, 'Google MT', 'red')

### Plot Microsoft MT Performance

In [None]:
# Collect the statistics of every ASR model for the 'microsoft' columns,
# starting with fisher, and plot the fisher scores.
microsoft_stats_list = []
microsoft_stats_list.append(analyze(data, 'fisher', 'microsoft'))
plot_score_vs_error('lightcoral', **microsoft_stats_list[-1])

In [None]:
# Same analysis and plot for the librispeech transcripts.
microsoft_stats_list.append(analyze(data, 'librispeech', 'microsoft'))
plot_score_vs_error('lightskyblue', **microsoft_stats_list[-1])

In [None]:
# Same analysis and plot for the tedlium transcripts.
microsoft_stats_list.append(analyze(data, 'tedlium', 'microsoft'))
plot_score_vs_error('yellowgreen', **microsoft_stats_list[-1])

In [None]:
# Summarize the 'microsoft' statistics of all ASR models in one plot.
plot_mt_vs_asr(microsoft_stats_list, 'Microsoft MT', 'blue')

### Summary

In [None]:
# Overlay both systems in a single plot; the three lists are parallel
# (statistics, display name and color of each system).
plot_mts_vs_asr([google_stats_list, microsoft_stats_list], \
                ['Google MT', 'Microsoft MT'], ['red', 'blue'])