# Analyze MTurk MOS Scores

This repository analyzes MOS scores from MTurk.

TODO:
* Compute a confidence interval via the crowdMOS algorithm. The current confidence algorithm is more naive.

In [None]:
# NOTE: This CSV can be downloaded from MTurk after receiving results from a batch.
PATH = '/Users/michaelp/Code/Text-to-Speech/disk/other/Batch_3829940_batch_results.csv'

In [None]:
%matplotlib inline

from IPython.display import display
from IPython.display import Markdown

import pandas

# Load the MTurk batch results from the CSV at `PATH`.
data_frame = pandas.read_csv(PATH)

# Show each headline count under its own Markdown heading.
for heading, count in (
        ('### Number Of Workers', len(data_frame['WorkerId'].unique())),
        ('### Number Of Hits', len(data_frame)),
):
    display(Markdown(heading))
    display(Markdown(str(count)))

display(Markdown('### Data Denominations'))
# NOTE: Kept as the final expression of the cell so the notebook renders the number of rows per `Input.name`.
data_frame.groupby(['Input.name']).size()

In [None]:
import random
import numpy

from IPython.display import Audio
from IPython.display import Markdown

from IPython.display import FileLink

def random_sample(data_frame, *args, num_samples=100, sort_by='Input.name'):
    """ Display a random sample of audio clips from the data frame.

    Every sampled row is rendered as a Markdown header (index, process name, speaker), its text, the
    value of each extra column in `args` and an audio player for `Input.audio_path`.

    Args:
        data_frame (pandas.DataFrame): Rows to sample from; the columns 'Input.name', 'Input.speaker',
            'Input.text' and 'Input.audio_path' are read.
        *args (str): Names of additional columns to display for every sampled row.
        num_samples (int): Maximum number of rows to display; capped at `len(data_frame)`.
        sort_by (str): Column the sampled rows are sorted by before they are displayed.

    Returns:
        None; nothing is displayed if `data_frame` is empty or `num_samples` is zero.
    """
    if len(data_frame) == 0 or num_samples == 0:
        return

    display(Markdown('### Random Sample'))
    sample = data_frame.sample(n=min(num_samples, len(data_frame))).sort_values(by=[sort_by])
    for i, row in sample.iterrows():
        # NOTE: Every cell is passed through `str`, like the extra columns below, because a cell that is
        # not a string (for example, an empty CSV cell is read as a float NaN) cannot be concatenated.
        display(Markdown('**Index:** ' + str(i) +
                         '  |  **Process Name:** ' + str(row['Input.name']) +
                         '  |  **Speaker:** ' + str(row['Input.speaker'])))
        display(Markdown('**Text:** "' + str(row['Input.text']) + '"'))
        for key in args:
            display(Markdown('**%s:** ' % key + str(row[key])))
        display(Audio(str(row['Input.audio_path'])))
        display(Markdown('\n\n ___'))
        
# Preview up to three randomly chosen clips, sorted by the default 'Input.name' column.
random_sample(data_frame, num_samples=3)

# Add MOS Column

We need to first convert the label to an MOS score.

In [None]:
import math

def label_to_score(label):
    """ Get the MOS score from the MOS label.

    The label is matched on its lower-cased, whitespace-separated words; for example,
    'Poor - Mostly unnatural speech' maps to 2.0.

    Args:
        label (str): Rating label selected by the worker.

    Returns:
        (float): Score from 1.0 ('bad') to 5.0 ('excellent').

    Raises:
        ValueError: If `label` is not a string or contains none of the rating words.
    """
    if not isinstance(label, str):
        # NOTE: For example, pandas reads an empty CSV cell as a float NaN.
        raise ValueError('Expected the label to be a string but got %r.' % (label,))

    tokens = label.lower().split()
    # NOTE: The rating words are checked from best to worst; the first word found decides the score.
    for word, score in (('excellent', 5.0), ('good', 4.0), ('fair', 3.0), ('poor', 2.0), ('bad', 1.0)):
        if word in tokens:
            return score
    raise ValueError('Unable to find a rating word in the label %r.' % (label,))

# Convert every worker's 'Answer.audio-naturalness.label' into a numeric score stored in the 'MOS' column.
data_frame['MOS'] = data_frame['Answer.audio-naturalness.label'].apply(label_to_score)

# Task Completion Time

It's useful to gauge the time it takes to complete a task to determine fair worker pay.

In [None]:
# NOTE: This may be useful for determining the time taken to complete a task.
# NOTE: The `WorkTimeInSecondsPerAudioSecond` column read below is assigned in the "Filter By Audio Length"
# section further down in this notebook, so that cell has to be run before this one.
display(Markdown('### Density of time taken to complete a task:'))
# NOTE: Remove outliers.
# For example, it doesn't make sense to take 250 seconds (about 4 minutes) to review a 10 second clip. Workers
# that take so long to complete a task may be working on multiple tasks at a time.
# NOTE (Michael 07-25-2019): From a previous analysis it looks like many workers are able to complete a task in
# 3 - 4x the audio time.
# Only hits that took less than 6x the audio length are kept for the density plot.
most_time = data_frame['WorkTimeInSecondsPerAudioSecond'][data_frame['WorkTimeInSecondsPerAudioSecond'] < 6]
most_time.plot.kde(bw_method=0.2)

# Remove Bad Data

Before we analyze our MOS scores, we are first going to filter out any bad data submitted to us.  

## Filter By Audio Length

This removes any data submitted by workers that didn't listen to the entire audio clip based on Amazon's 
`WorkTimeInSeconds` metric. This filter was inspired by the "crowdMOS" paper.

In [None]:
import numpy

# Temporary addition to combine 'Input.audio_length' and 'Input.audio_length_in_seconds' together.
def combine_columns(row):
    """ Get the first value of `row`, falling back to the second value if the first is NaN.

    NOTE: The values are read by iterating over `row` instead of via `row[0]` and `row[1]`. On a pandas
    `Series` with string labels, an integer key is a deprecated positional lookup.

    Args:
        row (iterable of float): Values to combine, for example one row of a two column data frame.

    Returns:
        (float): The first value unless it is NaN, otherwise the second value.
    """
    values = list(row)
    return values[1] if numpy.isnan(values[0]) else values[0]

# Overwrite 'Input.audio_length_in_seconds' row by row: use 'Input.audio_length' unless it is NaN, in which case
# the existing 'Input.audio_length_in_seconds' value is kept.
data_frame['Input.audio_length_in_seconds'] = data_frame[
    ['Input.audio_length', 'Input.audio_length_in_seconds']].apply(combine_columns, axis=1)

In [None]:
# Ratio of the time a worker spent on a hit to the length of the audio clip of that hit.
work_time_in_seconds = data_frame['WorkTimeInSeconds']
audio_length_in_seconds = data_frame['Input.audio_length_in_seconds']
data_frame['WorkTimeInSecondsPerAudioSecond'] = work_time_in_seconds / audio_length_in_seconds

In [None]:
# Flag every worker that submitted at least one hit in less time than the length of its audio clip.
too_fast = data_frame['WorkTimeInSecondsPerAudioSecond'] < 1.0
workers_to_remove = list(set(data_frame.loc[too_fast, 'WorkerId']))
# NOTE: This selects all hits of a flagged worker, not only the hits that were too fast.
rows_to_remove = data_frame['WorkerId'].isin(workers_to_remove)

display(Markdown('### Results'))
summary = ('%d workers had a mininum `WorkTimeInSecondsPerAudioSecond` ' % len(workers_to_remove) +
           'less than the audio clip length and completed %d hits.' % rows_to_remove.sum())
display(Markdown(summary))
random_sample(data_frame[too_fast], 'WorkTimeInSecondsPerAudioSecond', 'Input.audio_length_in_seconds',
              num_samples=3)

In [None]:
# Drop every hit selected by `rows_to_remove`, i.e. all hits of the workers flagged by the audio length filter.
data_frame = data_frame[~rows_to_remove]

## Filter By Ground Truth

The ground truth audio should receive a high score. A worker misunderstood the task if they graded our ground truth 
with a low score such as 'Poor - Mostly unnatural speech'.

In [None]:
# Flag every worker that rated a gold (ground truth) clip 'Poor - Mostly unnatural speech'.
# NOTE(review): Only this exact label is matched; any other low rating of a gold clip is not flagged.
rated_poor = data_frame['Answer.audio-naturalness.label'] == 'Poor - Mostly unnatural speech'
is_gold = data_frame['Input.type'] == 'gold'
criteria = rated_poor & is_gold
workers_to_remove = list(set(data_frame.loc[criteria, 'WorkerId']))
rows_to_remove = data_frame['WorkerId'].isin(workers_to_remove)

display(Markdown('#### Results'))
summary = ('%d workers rated a ground truth clip `Poor - Mostly unnatural speech` ' % len(workers_to_remove) +
           'and completed %d hits.' % rows_to_remove.sum())
display(Markdown(summary))
random_sample(data_frame[criteria], 'WorkTimeInSecondsPerAudioSecond', 'Answer.audio-naturalness.label',
              num_samples=3)

In [None]:
# Drop every hit selected by `rows_to_remove`, i.e. all hits of the workers flagged by the ground truth filter.
data_frame = data_frame[~rows_to_remove]

## Filter By Past Experience

Amazon provides us a "Life Time Approval Rate" for each worker. We can filter out any workers we have rejected in the past.

In [None]:
# Maps each worker id to the counts parsed from the worker's 'LifetimeApprovalRate' value.
worker_past_approval = {}
# NOTE: `Series.iteritems` was removed in pandas 2.0; `Series.items` yields the same (index, value) pairs.
for worker_id, rates in data_frame.groupby('WorkerId')['LifetimeApprovalRate'].unique().items():
    assert len(rates) == 1, 'There must be only one "LifetimeApprovalRate" per worker.'
    percent, counts = str(rates[0]).split('% (')  # Example Value: "0% (0/0)"
    approval_rate = float(percent)
    # NOTE: `[:-1]` drops the closing parenthesis before splitting "approved/total".
    num_approved, total_hits = (float(count) for count in counts[:-1].split('/'))
    if total_hits == 0:
        # NOTE: Checked separately so a worker without past hits never triggers a division by zero.
        assert approval_rate == 0, 'The approval rate must be zero without any past hits.'
    else:
        # NOTE: Exact float equality fails as soon as the percentage is not a whole number (for example,
        # 2 of 3 hits is 66.67%). Presumably MTurk reports a whole-number percentage, so one point of
        # rounding error is tolerated — TODO confirm how MTurk rounds.
        assert abs(num_approved / total_hits * 100 - approval_rate) < 1, (
            'The approval rate must match the approved and total hit counts.')
    worker_past_approval[worker_id] = {
        'num_approved': num_approved,
        'total_hits': total_hits,
        'approval_rate': approval_rate,
    }

# A worker was rejected in the past if not all of their past hits were approved.
workers_to_remove = [k for k, v in worker_past_approval.items() if v['total_hits'] != v['num_approved']]
rows_to_remove = data_frame['WorkerId'].isin(workers_to_remove)
display(Markdown('#### Results'))
display(Markdown('%d workers were rejected in the past ' % len(workers_to_remove) +
                 'and completed %d hits.' % rows_to_remove.sum()))

In [None]:
# Drop every hit selected by `rows_to_remove`, i.e. all hits of the workers that were rejected in the past.
data_frame = data_frame[~rows_to_remove]

## Filter By Experience

We know that this task may take some time to get used to; therefore, it makes sense to filter out workers that have 
only graded a small number of clips.

In [None]:
# Workers with fewer total hits than this are removed.
cutoff = 5


def combine_hits(row):
    """ Add the `total_hits` parsed from the worker's 'LifetimeApprovalRate' to the worker's hit count in `row`.
    """
    hits_in_data_frame = row[0]
    return hits_in_data_frame + worker_past_approval[row.name]['total_hits']


# Count the hits per worker in `data_frame`; the single column is labelled `0`.
worker_total_hits = data_frame.groupby('WorkerId').size().to_frame(0)
worker_total_hits[0] = worker_total_hits.apply(combine_hits, axis=1)

below_cutoff = worker_total_hits[0] < cutoff
workers_to_remove = list(worker_total_hits[below_cutoff].index.unique())
rows_to_remove = data_frame['WorkerId'].isin(workers_to_remove)

display(Markdown('#### Results'))
summary = ('%d workers completed less than %d hits ever ' % (len(workers_to_remove), cutoff) +
           'and completed %d hits in total' % rows_to_remove.sum())
display(Markdown(summary))

In [None]:
# Drop every hit selected by `rows_to_remove`, i.e. all hits of the workers below the experience cutoff.
data_frame = data_frame[~rows_to_remove]

## Filter by Speaker

We only want to include the speakers that'll be used by our users on the website for analysis. 

In [None]:
# Hits for these speakers are excluded from the analysis.
speakers_to_remove = ['Judy Bieber', 'Mary Ann', 'Linda Johnson']
filter_ = data_frame['Input.speaker'].isin(speakers_to_remove)
num_removed = filter_.sum()
data_frame = data_frame.loc[~filter_]

display(Markdown('#### Results'))
display(Markdown('Removing %d hits.' % num_removed))

## Filter by Rejection

Amazon provides a method for us to "Reject" a hit. We should filter out rejected hits.

In [None]:
# A hit is removed if its 'Reject' cell holds any value.
filter_ = data_frame['Reject'].notna()
num_removed = filter_.sum()
data_frame = data_frame.loc[~filter_]

display(Markdown('#### Results'))
display(Markdown('Removing %d hits.' % num_removed))

## Manual Ground Truth vs Synthetic Filter

We can filter out workers that score the ground truth samples lower than the synthetic samples, potentially.

In [None]:
display(Markdown('#### Ground Truth vs Synthetic MOS'))

# Per worker MOS statistics, computed separately for the synthetic and the gold (ground truth) clips.
statistics = ['mean', 'count', 'std']
synthetic_rows = data_frame[data_frame['Input.type'] != 'gold']
ground_truth_rows = data_frame[data_frame['Input.type'] == 'gold']
synthetic_stats = synthetic_rows.groupby('WorkerId')['MOS'].describe()[statistics]
ground_truth_stats = ground_truth_rows.groupby('WorkerId')['MOS'].describe()[statistics]

merged = pandas.merge(synthetic_stats, ground_truth_stats, on='WorkerId', suffixes=('_synthetic', '_ground_truth'))
# A negative gap means the worker scored the ground truth clips lower than the synthetic clips on average.
merged['gap'] = merged['mean_ground_truth'] - merged['mean_synthetic']
# NOTE: Kept as the final expression of the cell so the notebook renders the table, smallest gap first.
merged.sort_values(by=['gap'])

In [None]:
# Fill this in manually for workers to remove based on the `gap` column with consideration for other statistics.
workers_to_remove = []  

In [None]:
# Remove every hit of the workers listed in `workers_to_remove`; an empty list removes nothing.
filter_ = data_frame['WorkerId'].isin(workers_to_remove)
num_removed = filter_.sum()
data_frame = data_frame.loc[~filter_]

display(Markdown('#### Results'))
display(Markdown('Removing %d hits.' % num_removed))

## Manual Filter

From a random sample of the scores provided, are there any workers that are submitting poor results consistently?

In [None]:
# Number of random hits to review manually; with 0, `random_sample` returns without displaying anything.
num_samples = 0 # Set this appropriately.

In [None]:
# Show the sampled hits sorted by worker, with the worker id and the label they selected.
random_sample(data_frame, 'WorkerId', 'Answer.audio-naturalness.label', num_samples=num_samples, sort_by='WorkerId')

In [None]:
# Fill this in manually with the ids of the workers to remove; an empty list removes nothing.
workers_to_remove = []
filter_ = data_frame['WorkerId'].isin(workers_to_remove)
num_removed = filter_.sum()
data_frame = data_frame.loc[~filter_]

display(Markdown('#### Results'))
display(Markdown('Removing %d hits.' % num_removed))

# Select Data Subset

You'll want to select a subset of the data to analyze from here on.

In [None]:
display(Markdown('### Stats'))

# Show each count of the remaining data under its own Markdown heading.
for heading, count in (
        ('#### Number Of Workers', len(data_frame['WorkerId'].unique())),
        ('#### Number Of Hits', len(data_frame)),
):
    display(Markdown(heading))
    display(Markdown(str(count)))

display(Markdown('#### Data Denominations'))
# NOTE: Kept as the final expression of the cell so the notebook renders the number of rows per `Input.name`.
data_frame.groupby(['Input.name']).size()

In [None]:
# Select your data subset here by listing the `Input.name` values to keep; the confidence interval below is
# computed from the 'MOS' column of `subset`.
subset = data_frame[data_frame['Input.name'].isin(['ground-truth'])]

In [None]:
import numpy as np 
import scipy.stats

def sample_mean_confidence_interval(data, confidence=0.95):
    """ Compute the sample mean of `data` and the half-width of its Student's t confidence interval.

    NOTE: This is a similar approach to computing a confidence interval as the Tacotron 2 approach.
    Inspired by: https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data

    Args:
        data (array-like of float): Sample values.
        confidence (float): Confidence level of the interval.

    Returns:
        (tuple): The sample mean and half of the width of the interval around it, such that the interval is
            `mean ± half_width`.
    """
    samples = np.array(data)
    mean = np.mean(samples)
    # The interval is centered on the sample mean and scaled by the standard error of the mean.
    lower, upper = scipy.stats.t.interval(
        confidence, len(samples) - 1, loc=mean, scale=scipy.stats.sem(samples))
    half_width = (upper - lower) / 2
    return mean, half_width

# NOTE: The `%` operator unpacks the returned (mean, half-width) tuple into the two `%f` fields.
display(Markdown('### 95 Percent Confidence Interval \n%f ± %f' % 
    sample_mean_confidence_interval(subset["MOS"].tolist())))

In [None]:
display(Markdown('### MOS Distribution'))
# NOTE: The top-level `pandas.value_counts` is deprecated; `Series.value_counts` returns the same counts.
# NOTE(review): This plots every hit left in `data_frame`, not only `subset` — confirm that is intended.
data_frame['MOS'].value_counts().plot.bar()

In [None]:
display(Markdown('### Speaker MOS Score Distribution'))
# Summary statistics of the 'MOS' column per speaker, ordered from the lowest to the highest mean score.
data_frame.groupby('Input.speaker')['MOS'].describe().sort_values(by=['mean'])