This notebook goes through the corpus and records which snippets are predicted as which letter and phoneme.  
(Previously, all snippets were saved; to save memory, only a mapping is now written to a file.)  
Information about each snippet is written to a line (row) in an npy file containing:  
`start_position,letter,phoneme,phoneme_full\n`

In [1]:
import sys
import librosa
import string
import numpy as np
import pickle
import matplotlib.pyplot as plt
import tensorflow as tf
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
sys.path.append("/project/asr_introspection/jens_w2l/")
from w2l.estimator_main import run_asr
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner, LocalSequenceAligner
from itertools import chain

# Checkpoint directory plus the model/data configurations handed to run_asr.
model_dir = "/data/johannsm/introspection_data/model/"
model_config = '/data/johannsm/introspection_data/w2l_mel_extralayer'
data_config = "/data/johannsm/introspection_data/mel_new"

# Character table: space, apostrophe, a-z, the markers '1' and '2', and a
# trailing space entry (31 entries in total).
char_list = [" ", "'"] + list(string.ascii_lowercase) + ["1", "2", " "]

def remove_duplicates(s):
    """Collapse every run of identical consecutive characters into one.

    Implemented iteratively: the previous recursive version used one stack
    frame (and one string slice) per character, so it raised RecursionError
    on inputs longer than the interpreter's recursion limit and ran in
    quadratic time.

    Args:
        s: The string to process. Any falsy value (e.g. "" or None) is
            treated as empty.

    Returns:
        The string with consecutive duplicates removed; "" for falsy input.
    """
    if not s:
        return ""
    kept = [s[0]]
    for ch in s[1:]:
        # Only start a new output character when the run changes.
        if ch != kept[-1]:
            kept.append(ch)
    return "".join(kept)

def flatten(listOfLists):
    """Lazily chain the items of each inner iterable into one flat iterator.

    Only a single level of nesting is removed.
    """
    flattened = chain.from_iterable(listOfLists)
    return flattened

In [2]:
# Read both text files fully into memory, one list entry per line with the
# trailing newline removed. The `with` blocks close the file handles right
# after reading instead of leaving them open for the kernel's lifetime.
with open('/data/johannsm/introspection_data/predictions.phon.txt') as mappings_file:
    mappings_lines = [line.rstrip('\n') for line in mappings_file]
with open('/data/johannsm/introspection_data/valid_predictions.txt') as transcription_file:
    transcription_lines = [line.rstrip('\n') for line in transcription_file]

In [3]:
# Run the model in "predict" mode; `preds` is consumed below with next(), one
# item per sample.
preds = run_asr("predict",data_config, model_config, model_dir)

# First sample index to process; earlier samples are still pulled from `preds`
# but otherwise skipped.
# NOTE(review): `line_id` is only advanced for processed samples, so a non-zero
# `sample_start` would leave the cursor into `mappings_lines` out of sync
# unless `line_id` is set to match.
sample_start = 0
# Cursor into `mappings_lines` (the word -> phoneme mapping file).
line_id = 0


# NOTE(review): 5559 is hard-coded; it matches the "5559 entries remaining"
# line in this cell's logged output -- confirm if the data subset changes.
for sample_id in range(5559):
    # Always advance the prediction iterator, even for skipped samples.
    decoded = next(preds)
    if(sample_id>=sample_start):
    
        # Crop the second axis to the sample's input length and store the
        # result as sampleNNNN.npy.
        spectrogram = decoded['all_layers'][0][1][:,:decoded['input_length']]
        np.save('/data/asr_introspection/spectrogram_input/sample'+str(sample_id).zfill(4)+'.npy',spectrogram)
        

        transcription = transcription_lines[sample_id]

        # Treat the characters '1' and '2' like spaces, then remember the
        # positions of all remaining (non-space) characters in that line.
        for c in '12':
            transcription = transcription.replace(c," ")
        letter_ids = np.where(np.array(list(transcription))!=' ')[0]

        # Drop all spaces and pad with 50 spaces on each side before aligning.
        transcription = transcription.replace(' ',"")
        transcription = ''.join([' ' for i in range(50)]) + transcription + ''.join([' ' for i in range(50)])
#         print(transcription)

        # Collapse repeated spaces in the decoded text (two passes) and strip
        # leading spaces.
        # NOTE(review): an empty or all-space decoding makes `pred[0]` raise
        # IndexError.
        pred = decoded['decoding'][0].replace('  ',' ')
        pred = pred.replace('  ',' ')
        while(pred[0]==' '):
            pred = pred[1:]
#         print(pred)

        # parse mapping for prediction
        pred_split = pred.split(' ')
        pred_mapping = []
        # Skip one line of the mapping file before this sample's words
        # (presumably a per-sample header/separator line -- TODO confirm).
        line_id = line_id+1
        for pred_word in pred_split:
            # Each mapping line must split into exactly three space-separated
            # fields; the first is the word and the third its '.'-separated
            # phoneme string.
            word,_,phonemes = mappings_lines[line_id].split(' ')
            if(pred_word.lower() != word.lower()):
                print("Word mismatch",pred_word.lower(), word.lower())
            # Appending ". " yields one extra " " entry after each word's
            # phoneme tokens.
            pred_mapping.append((phonemes+". ").split('.'))
            line_id = line_id + 1
        # Flatten to a single list and drop the " " entry after the last word.
        pred_mapping = list(flatten(pred_mapping))[:-1]

#         print(str(sample_id).zfill(4),'transcr len:',str(len(transcription)-100).zfill(5))

        v = Vocabulary()

        aEncoded = v.encodeSequence(Sequence(transcription))
        bEncoded = v.encodeSequence(Sequence(pred))

        # Create a scoring and align the sequences using global aligner.
        scoring = SimpleScoring(2, -1)
        aligner = GlobalSequenceAligner(scoring, -1)
        score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

        # Decode only the first returned alignment.
        alignment = v.decodeSequenceAlignment(encodeds[0])
#         print(''.join(alignment[:117][0]))
#         print(''.join(alignment[:117][1]))
#         print('Alignment score:', alignment.score)
#         print('Percent identity:', alignment.percentIdentity())

        if(len(alignment)>0):
            # Build a 5-column object array, one row per alignment position:
            #   0: ' ' placeholder, later the character's index from letter_ids
            #   1: aligned transcription character
            #   2: ' ' placeholder, later the phoneme with digits 0/1/2 removed
            #   3: ' ' placeholder, later the phoneme entry from pred_mapping
            #   4: aligned prediction character (dropped before saving)
            alignment_array = np.array(alignment,dtype='object')
            alignment_array = np.concatenate((
                    np.reshape(np.repeat(' ',alignment_array.shape[0]),(alignment_array.shape[0],1)),
                    alignment_array[:,0:1],    
                    np.reshape(np.repeat(' ',alignment_array.shape[0]),(alignment_array.shape[0],1)),                    
                    np.reshape(np.repeat(' ',alignment_array.shape[0]),(alignment_array.shape[0],1)),
                    alignment_array[:,1:2]),
                axis=1)
            


            pred_id=0
            transcr_id = 0
            for c_id, c in enumerate(alignment):
                # Every non-gap prediction character consumes the next entry
                # of pred_mapping.
                if(c[1]!='-'):
                    alignment_array[c_id,3] = pred_mapping[pred_id]
                    pred_id = pred_id + 1

                # Every real transcription character (neither gap nor padding
                # space) consumes the next stored position.
                if(c[0]!='-' and c[0]!=' '):
                    alignment_array[c_id,0] = letter_ids[transcr_id]
                    transcr_id = transcr_id + 1
            
            # Column 2 is column 3 with the digits 0, 1 and 2 removed
            # (presumably stress markers -- TODO confirm).
            alignment_array[:,2] = [s.replace("0","") for s in alignment_array[:,3]]
            alignment_array[:,2] = [s.replace("1","") for s in alignment_array[:,2]]
            alignment_array[:,2] = [s.replace("2","") for s in alignment_array[:,2]]

            # Keep only rows that received a position, and only the first four
            # columns, then store the mapping next to the spectrogram.
            alignment_array = alignment_array[alignment_array[:,0]!=' ',:4]
            np.save('/data/asr_introspection/spectrogram_input/sample'+str(sample_id).zfill(4)+'_groupmapping.npy',alignment_array)

#             print("- success - nextlineid", str(line_id+1))
#         else:
#             print("- skipped - nextlineid", str(line_id+1))

        # Skip one more mapping-file line after this sample's words
        # (presumably a trailing separator -- TODO confirm).
        line_id = line_id+1
    
    # Progress report every 250 samples.
    if((sample_id+1) % 250==0):
        print("finished " + str(sample_id+1) + " nextlineid " + str(line_id+1))
    
  

INFO:tensorflow:Using config: {'_tf_random_seed': None, '_save_checkpoints_steps': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff94632c0f0>, '_service': None, '_log_step_count_steps': 100, '_keep_checkpoint_max': 5, '_train_distribute': None, '_task_type': 'worker', '_global_id_in_cluster': 0, '_num_ps_replicas': 0, '_evaluation_master': '', '_num_worker_replicas': 1, '_session_config': None, '_master': '', '_model_dir': '/data/johannsm/introspection_data/model/', '_save_checkpoints_secs': 600, '_task_id': 0, '_keep_checkpoint_every_n_hours': 1, '_save_summary_steps': None}
Building input function for ['test-clean', 'test-other', 'test'] set using file /data/johannsm/corpus_new.csv...
	292367 entries found.
	Filtering requested subset...
	5559 entries remaining.
	Creating the dataset...
	Building iterator...
INFO:tensorflow:Calling model_fn.
Reading, building and applying model...
	Creating layer layer0 with 1573376 paramete



finished 250 nextlineid 4519
finished 500 nextlineid 9430
finished 750 nextlineid 15323
finished 1000 nextlineid 20355
finished 1250 nextlineid 25451
finished 1500 nextlineid 32153
finished 1750 nextlineid 38352
finished 2000 nextlineid 43713
finished 2250 nextlineid 49594
finished 2500 nextlineid 55684
finished 2750 nextlineid 60378
finished 3000 nextlineid 65338
finished 3250 nextlineid 70136
finished 3500 nextlineid 74036
finished 3750 nextlineid 78510
finished 4000 nextlineid 83283
finished 4250 nextlineid 88422
finished 4500 nextlineid 94472
finished 4750 nextlineid 100129
finished 5000 nextlineid 105052
finished 5250 nextlineid 110630
finished 5500 nextlineid 114788
