# ASR-MT
## Lucida AI Evaluation

### Goal
Investigate the relationship between the errors of ASR (Automatic Speech Recognition) and the errors of MT (Machine Translation).

### Generate Queries from Original Data

100 sentences.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import re, string, sys
from operator import itemgetter
import matplotlib.pyplot as plt
# Create a figure and set the font family and size used by matplotlib
# in a single rc call.
fig = plt.figure()
plt.rc("font", family="serif", size=15)

<matplotlib.figure.Figure at 0x1046d45f8>

In [2]:
# Return the English text corresponding to the specific sentence,
# or None if no linked sentence is in English.
# id: sentence id to look up in the 'id' column of the links table.
# links_df: DataFrame with the columns 'id' and 'translation_id'.
# sentences_df: DataFrame with the columns 'id', 'lang' and 'text'.
# When a table is not passed, the module-level names `links` / `data`
# are used instead, which keeps old one-argument calls working.
def get_English_text(id, links_df=None, sentences_df=None):
    link_table = links if links_df is None else links_df
    sentence_table = data if sentences_df is None else sentences_df
    translation_ids = link_table.loc[link_table['id'] == id, 'translation_id']
    # Walk the translations in link order and return the first English one.
    for translation_id in translation_ids:
        is_english_translation = (
            (sentence_table['id'] == translation_id)
            & (sentence_table['lang'] == 'eng')
        )
        matches = sentence_table.loc[is_english_translation, 'text']
        if not matches.empty:
            return matches.iloc[0]
    return None


# Return True if the English text is acceptable as a query:
# a string of at least 10 words without '.', ';', '?', or '!'
# anywhere except possibly as its last character.
def _is_good_query(english):
    # Missing translations are None; missing text may also be a non-string
    # value, which must not reach the substring tests below.
    if not isinstance(english, str):
        return False
    body = english[0:-1]
    if any(mark in body for mark in '.;?!'):
        return False
    return len(english.split()) >= 10


# Generate 100 queries from the original data files
# 'sentences.csv' and 'links.csv'.
# Format: 'English text','Chinese text'
# Save the queries to 'text/query.txt'.
# Data source: http://tatoeba.org/eng/downloads.
# Raises RuntimeError if fewer than 100 suitable sentences are found.
def generate_100_queires():
    data = pd.read_csv('sentences.csv',
                       names=["id", "lang", "text"], delimiter='\t')
    links = pd.read_csv('links.csv',
                        names=["id", "translation_id"], delimiter='\t')
    chinese_rows = data.loc[data['lang'] == 'cmn']
    num_queries = 100
    header = ['query', 'chinese']
    # Randomly select up to 1000 distinct rows, and pick 100 from them as
    # queries. Sampling without replacement avoids duplicate queries.
    pool_size = min(1000, len(chinese_rows))
    sampled_index = np.random.choice(chinese_rows.index, pool_size,
                                     replace=False)
    chinese_rows = chinese_rows.loc[sampled_index]
    selected = []
    for _, row in chinese_rows.iterrows():
        if len(selected) >= num_queries:
            break
        # Pass the tables explicitly: they are local to this function.
        english = get_English_text(row['id'], links, data)
        # Prefer long sentences without '.', ';', '?', or '!'.
        if _is_good_query(english):
            selected.append((english, row['text']))
    if len(selected) < num_queries:
        raise RuntimeError('Too few data!')
    query_data = pd.DataFrame(selected, columns=header)
    query_data.to_csv('text/query.txt')
    
# generate_100_queires()

In [3]:
# Generate the transcript with all the English sentences of
# 'text/query.txt' in it, one per line,
# stored in 'speech/transcript.txt'.
def generate_transcript():
    query_data = pd.read_csv('text/query.txt')
    # The query file stores the English text under 'query'
    # (its header is 'query', 'chinese'); an 'english' column is still
    # honoured if a file happens to provide one.
    column = 'english' if 'english' in query_data.columns else 'query'
    # The context manager guarantees the transcript is flushed and closed.
    with open('speech/transcript.txt', 'w', encoding='utf-8') as file:
        file.writelines(english + '\n' for english in query_data[column])
    print('Use the transcript to generate audio files!')
        
# generate_transcript()

### Convert Traditional Chinese to Simplified Chinese

[This tool](https://www.branah.com/traditional-to-simplified) could help.

### Generate Raw Data by Sending Queries to the Pipeline

### Generate DataFrame from ASR Results

In [7]:
# Return a DataFrame from the file with the following format:
# <audio_path>,<transcript>
# in which the audio path is of the following format:
# "path/to/file/<query_id>_<text>.wav"
# An example file:
# ../asr_sa/speech/0_Helloworld.wav,"hello world."
# Its first line must be the header!
# The DataFrame has only one column: transcript
# with the specified header
# and sorted by the query id.
# Blank lines are skipped; a line without a ',' separator or without
# any number to use as the query id raises ValueError.
def get_df_from_asr_result(file_path, header):
    proc_lines = []
    with open(file_path, 'r') as file:
        next(file, None) # skip the header (tolerates an empty file)
        for line in file:
            line = line.lstrip().rstrip('\n')
            if not line:
                continue
            # Split on the first comma only, so that commas inside the
            # transcript are kept.
            audio_path, separator, transcript = line.partition(',')
            if not separator:
                raise ValueError('Malformed ASR result line: ' + repr(line))
            # The query id is the number at the start of the file name.
            # Fall back to the first number in the line for paths that
            # do not follow the "<query_id>_<text>.wav" convention.
            file_name = re.split(r'[/\\]', audio_path)[-1]
            id_match = re.match(r'\d+', file_name) or re.search(r'\d+', line)
            if id_match is None:
                raise ValueError('No query id in ASR result line: ' + repr(line))
            if transcript.startswith('"') and transcript.endswith('"'):
                transcript = transcript[1:-1]
            proc_lines.append((int(id_match.group()), transcript))
    proc_lines.sort(key=itemgetter(0)) # sort data by ID
    return pd.DataFrame([transcript for _, transcript in proc_lines],
                        columns=[header])

# Load the queries, then append one transcript column per ASR model.
# DataFrame.join aligns on the index, so row i of each ASR result is
# paired with row i of the query table.
data = pd.read_csv('text/query.txt')

asr_labels = ['fisher', 'librispeech', 'tedlium']

for asr_label in asr_labels:
    data = data.join(
        get_df_from_asr_result(
            'asr_result_{}.txt'.format(asr_label),
            'transcript_{}'.format(asr_label),
        )
    )

In [8]:
# Display every row of the joined table (notebook cell output).
data[:]

Unnamed: 0.1,Unnamed: 0,query,chinese,transcript_fisher,transcript_librispeech,transcript_tedlium
0,0,"Life being very short, we ought not to waste t...",人生十分短暂，我们不应该浪费时间。,i've been very shortly out not to waste time.,I'VE BEEN VERY SHORTLY OUGHT NOT TO WASTE TIME.,i've been very short we are not to waste time.
1,1,It's a pity that you can't buy miracles like y...,可惜我们不能像买土豆一样买奇迹。,it's it's pretty that you can't buy miracles l...,IT'S A PITY THAT YOU CAN'T BY MIRACLES LIKE YO...,it's a pity that you can buy americans like yo...
2,2,My father has been living in Nagoya for 30 years.,我父亲住在名古屋有30年了。,my son has been living in the glass source rad...,MY FATHER HAS BEEN LIVING IN NAGOYA FOR FRIO Y...,my father has been living in nagoya force rio ...
3,3,The average man fails not because he lacks abi...,通常人们失败并非能力不足，而是不够专注。,the average man sells not because she lets abi...,THAT VARIES MAN FAILS NOT BECAUSE HE LACKS ABI...,that average man fails not because he laughs a...
4,4,Would the girls please bring the donation boxe...,我们请女孩带着募捐箱来到舞台。,but the girls please bring the donations house...,BUT THE GIRLS PLEASE BRING THE DONATION BUFF A...,but the girls please bring the donation of say...
5,5,She told me that she would go to Paris in August.,她跟我说八月份她会去巴黎。,she told me that she would go to paris in august.,SHE TOLD ME THAT SHE WOULD GO TO PARIS IN AUGUST.,she told me that she would go to paris and hon...
6,6,Since then he had put his whole soul into his ...,从那时起，他就全心全意地投入到他的工作。,since then he had put his whole cylinder has w...,SINCE THEN HE HAD PUT HIS WHOLE SOUL INTO HIS ...,since then he had put his halsall into his work.
7,7,A man came over and asked if I was OK.,一个男人过来问我怎么了。,and then came over and asked if i was okay.,A MAN CAME OVER AND ASKED IF I WAS OKAY.,a man came over and asked if i was ok.
8,8,I didn't get to spend a lot of time with you o...,你这次来，我都没跟你在一起待多久。,i didn't get to spend a lot of time with you o...,I DIDN'T GET TO SPEND A LOT OF TIME WITH YOU O...,i didn't get to spend a lot of time with you o...
9,9,Do you know the concert schedule of London Sym...,你知道伦敦交响乐团的演奏会行程吗？,do you know the concerts schedule of london ci...,DO YOU KNOW THE CONCERT SCHEDULE OF LONDON SYM...,i am now the concert schedule of london sympho...


In [6]:
# Make the project-local 'bleu' module importable from '../../mt'.
sys.path.append('../../mt')
from bleu import Evaluator

# Score one pair of Chinese sentences that differ in a single word.
# NOTE(review): which argument is the candidate and which is the
# reference is defined by Evaluator.evaluate -- confirm in bleu.py.
e = Evaluator()
score = e.evaluate(
    u'人生十分短促，我们不应该浪费时间。',
    u'人生十分短暂，我们不应该浪费时间。',
)
print('Score:', score)


['人生', '十分', '短促', '，', '我们', '不', '应该', '浪费时间', '。']
['人生', '十分', '短暂', '，', '我们', '不', '应该', '浪费时间', '。']
Score: 0.6606328636027614
