In [1]:
import os
import re
import glob
import json
import random
import argparse
import pandas as pd
import numpy as np

import librosa
import librosa.display
import soundfile as sf

from tqdm import tqdm
import subprocess
from functools import partial
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

import MeCab
import cutlet

from sklearn.model_selection import train_test_split, KFold

import tensorflow as tf
import tensorflow_io as tfio
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.python.client import device_lib
from tensorflow.keras.mixed_precision import Policy, set_global_policy

from transformers import (
    Wav2Vec2CTCTokenizer,
    TFWav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2FeatureExtractor)

def seed_everything(SEED):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        SEED: Seed value handed unchanged to each of the three generators.
    """
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(SEED)
    print("Random seed set.")

# Seed every random number generator before any data or model code runs.
seed_everything(42)
# Raise the TensorFlow logger threshold to FATAL.
tf.get_logger().setLevel('FATAL')
# NOTE(review): TensorFlow is already imported when this variable is set;
# presumably it still takes effect because no GPU has been initialised yet
# at this point -- confirm, or move it above the TensorFlow import.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Random seed set.


# Preprocessing

In [2]:
class Dataset:
    """Assemble the ASR training table from KOKORO, JSUT and Common Voice.

    Instantiating the class:

    * converts the JSUT and Common Voice audio into ``<main_dir>/wav_cleaned``
      with ffmpeg (files that already exist there are skipped),
    * cleans the transcripts, tokenises them with MeCab and romanises them
      with cutlet,
    * keeps sentences of at least 5 characters and clips of 48000 to 80000
      samples,
    * draws a fixed-seed sample of at most 5000 rows, sorts it by clip length
      and writes ``<main_dir>/ASRDataset.csv``.

    The resulting frame is available as ``self.data`` with the columns
    ``path``, ``sentence``, ``corpus``, ``romaji`` and ``length``.
    """

    def __init__(self):
        self.main_dir = "E://Datasets/ASR-dataset"
        self.sample_rate = 16000
        self.n_shards = 10
        self.data = pd.concat([
            self.get_kokoro(),
            self.get_jsut(),
            self.get_commonvoice()
            ],
            ignore_index=True)
        self.katsu = cutlet.Cutlet()
        self.wakati = MeCab.Tagger("-Owakati")

        tqdm.pandas()
        # Rows without a transcript would make re.sub() raise in clean_kanji,
        # so they are dropped before any text processing.
        data = self.data.dropna(subset=['sentence']).copy()
        data['sentence'] = data['sentence'].progress_apply(self.clean_kanji)
        # Filter on sentence length first so that audio lengths are only read
        # for rows that can still be kept.
        data = data[data['sentence'].str.len() >= 5].copy()
        data['romaji'] = (
            data['sentence']
            .progress_apply(self.katsu.romaji)
            .progress_apply(self.clean_romaji)
            .str.lower())
        data['length'] = data['path'].progress_apply(self.get_length)
        data = data.query("(length >= 48000) & (length <= 80000)")
        data = data.dropna()
        # sample() raises when asked for more rows than exist, so cap n.
        data = data.sample(
            n=min(5000, len(data)), random_state=42, ignore_index=True)
        data = data.sort_values(by="length", ascending=True, ignore_index=True)
        self.data = data
        self.data.to_csv(f"{self.main_dir}/ASRDataset.csv", encoding="utf-8", index=False)

    @staticmethod
    def _parse_jsut_line(line):
        """Split one ``<file stem>:<sentence>`` transcript line.

        Only the first colon separates the two fields, so sentences that
        contain a colon themselves are kept intact.
        """
        stem, sentence = line.split(":", 1)
        return stem, sentence.strip("\n")

    @staticmethod
    def _wav_name(path):
        """Return the bare file name of ``path`` with a ``.wav`` extension."""
        basename = path.replace("\\", "/").rsplit("/", 1)[-1]
        return os.path.splitext(basename)[0] + ".wav"

    def _convert_to_wav(self, in_path, filename):
        """Convert ``in_path`` to 16-bit PCM at ``self.sample_rate`` with ffmpeg.

        The output goes to ``<main_dir>/wav_cleaned/<filename>``; nothing is
        done when that file already exists. Returns ``filename``.
        """
        out_dir = os.path.join(self.main_dir, "wav_cleaned")
        out_path = os.path.join(out_dir, filename)
        if not os.path.exists(out_path):
            os.makedirs(out_dir, exist_ok=True)
            subprocess.call([
                "ffmpeg", "-i", in_path.replace("\\", "/"), "-acodec", "pcm_s16le",
                "-ar", str(self.sample_rate), out_path])
        return filename

    def get_kokoro(self):
        """Read the KOKORO ``*.metadata.txt`` transcripts.

        Returns:
            DataFrame with ``path`` (``<text_id>.wav``), ``sentence`` and
            ``corpus`` (always ``'kokoro'``).
        """
        in_dir = os.path.join("Datasets", "KOKORO-dataset")

        rows = []
        transcript_path = os.path.join(in_dir, "transcripts", "*.metadata.txt")
        for transcript in glob.glob(transcript_path):
            with open(transcript, "r", encoding="utf-8") as f:
                for line in f:
                    if line.strip():
                        rows.append(line.split("|"))

        data = pd.DataFrame(
            rows, columns=[
                'text_id', 'path', 'start_idx',
                'end_idx', 'sentence', 'phonemes'])

        # One-off extraction of the per-sentence clips, kept for reference:
        # paths = data['path'].unique()
        # for path in tqdm(paths, total=len(paths)):
        #     folder_name = path.split("_", 1)[0]
        #     in_path = os.path.join(in_dir, folder_name, path)
        #     y, sr = librosa.load(in_path, sr=None)
        #     for text_id in data.loc[data['path']==path, 'text_id']:
        #         out_path = os.path.join(self.main_dir, 'wav_cleaned', text_id) + ".wav"
        #         if not os.path.exists(out_path):
        #             start_idx = int(data.loc[data['text_id']==text_id, 'start_idx'].item())
        #             end_idx = int(data.loc[data['text_id']==text_id, 'end_idx'].item())
        #             y_slice = librosa.resample(
        #                 y[start_idx:end_idx], orig_sr=sr, target_sr=self.sample_rate)
        #             sf.write(out_path, y_slice, samplerate=self.sample_rate, subtype='PCM_16')

        # Build a fresh frame instead of writing into a slice of `data`.
        return pd.DataFrame({
            'path': data['text_id'] + ".wav",
            'sentence': data['sentence'],
            'corpus': ['kokoro'] * len(data),
        })

    def get_jsut(self):
        """Read the JSUT transcripts and convert the referenced audio.

        Returns:
            DataFrame with ``path`` (bare wav file name), ``sentence`` and
            ``corpus`` (always ``'jsut'``).
        """
        pattern = os.path.join("Datasets", "JSUT-dataset", "*", "transcript_utf8.txt")
        sources, sentences = [], []
        for transcript in glob.glob(pattern):
            wav_dir = os.path.join(os.path.dirname(transcript), "wav")
            with open(transcript, "r", encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue
                    stem, sentence = self._parse_jsut_line(line)
                    sources.append(os.path.join(wav_dir, stem + ".wav"))
                    sentences.append(sentence)

        filenames = [
            self._convert_to_wav(src, self._wav_name(src))
            for src in tqdm(sources, total=len(sources))]
        return pd.DataFrame({
            'path': filenames,
            'sentence': sentences,
            'corpus': ['jsut'] * len(filenames),
        })

    def get_commonvoice(self):
        """Read Common Voice ``validated.tsv`` and convert the mp3 clips.

        Returns:
            DataFrame with ``path`` (bare wav file name), ``sentence`` and
            ``corpus`` (always ``'common_voice'``).
        """
        data = pd.read_csv(r"Datasets/CommonVoice-dataset/validated.tsv", sep="\t")
        data = data[['path', 'sentence']].copy()
        data['corpus'] = ['common_voice'] * len(data)
        data['path'] = [
            self._convert_to_wav(
                r"Datasets/CommonVoice-dataset/mp3/" + name, self._wav_name(name))
            for name in tqdm(data['path'], total=len(data))]
        return data

    def clean_kanji(self, sentence):
        """Strip the listed symbols and return the MeCab wakati tokenisation."""
        symbols = r"[（.*?）！-～.,;..._。、-〿・■（）：ㇰ-ㇿ㈠-㉃㊀-㋾㌀-㍿「」『』→ー -~‘–※π—ゐ’“”]"
        sentence = re.sub(symbols, "", sentence)
        sentence = self.wakati.parse(sentence).strip("\n")
        return sentence

    def clean_romaji(self, sentence):
        """Remove ``. , " ' / ?`` from a romanised sentence."""
        return re.sub(r'[.,"\'\/?]', "", sentence)

    def get_length(self, path):
        """Return the number of samples of ``<main_dir>/wav_cleaned/<path>``."""
        path = os.path.join(self.main_dir, 'wav_cleaned', path)
        y, sr = librosa.load(path, sr=None)
        return len(y)

# data = Dataset().data
# data

In [3]:
# fig, ax = plt.subplots(1,1,figsize=(10, 4))
# sns.histplot(x=data['length'], hue=data['corpus'], ax=ax, palette="bright")
# plt.show()

# Arguments

In [4]:
def ArgParser():
    """Build the run configuration as an ``argparse.Namespace``.

    Every option has a default and an explicit ``type`` so that values given
    on the command line are converted to numbers instead of staying strings.
    Unknown arguments (such as the ones a notebook kernel is started with)
    are ignored via ``parse_known_args``.

    Besides the declared options the namespace carries values derived from
    the files in ``main_dir``: ``vocab_size`` (entries in ``vocab.json``),
    ``n_samples`` (rows of ``ASRDataset.csv``), ``n_train`` / ``n_val`` and
    the matching ``train_steps`` / ``val_steps`` per epoch.

    Returns:
        argparse.Namespace with all options and derived values.
    """
    parser = argparse.ArgumentParser()

    # DataLoader
    parser.add_argument("--main_dir", type=str, default="E://Datasets/ASR-dataset")
    parser.add_argument("--sample_rate", type=int, default=16000)
    parser.add_argument("--test_size", type=float, default=0.1)
    parser.add_argument("--random_state", type=int, default=42)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--n_shards", type=int, default=10)
    parser.add_argument("--buffer_size", type=int, default=512)

    # Trainer
    parser.add_argument("--model_name", type=str, default="facebook/wav2vec2-base")
    parser.add_argument("--epochs", type=int, default=30)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--beam_width", type=int, default=20)
    parser.add_argument("--top_paths", type=int, default=1)

    # Scheduler
    # NOTE(review): lr_min and lr_max share the same default, which leaves the
    # cosine phase no range to decay over -- confirm this is intended.
    parser.add_argument("--lr_start", type=float, default=5e-5)
    parser.add_argument("--lr_min", type=float, default=1e-4)
    parser.add_argument("--lr_max", type=float, default=1e-4)
    parser.add_argument("--n_cycles", type=float, default=0.5)
    parser.add_argument("--warmup_epochs", type=int, default=4)
    parser.add_argument("--sustain_epochs", type=int, default=2)

    args = parser.parse_known_args()[0]

    with open(os.path.join(args.main_dir, "vocab.json"), "r", encoding="utf-8") as f:
        vocab_size = len(json.load(f))

    n_samples = len(pd.read_csv(os.path.join(args.main_dir, "ASRDataset.csv")))
    n_val = int(n_samples * args.test_size)
    # Derive the training count from the validation count so the two always
    # add up to n_samples regardless of floating point rounding.
    n_train = n_samples - n_val
    train_steps = int(np.ceil(n_train / args.batch_size))
    val_steps = int(np.ceil(n_val / args.batch_size))

    # Registered as options so they show up in (and can be overridden
    # through) the same namespace as everything else.
    parser.add_argument("--vocab_size", type=int, default=vocab_size)
    parser.add_argument("--n_samples", type=int, default=n_samples)
    parser.add_argument("--n_train", type=int, default=n_train)
    parser.add_argument("--n_val", type=int, default=n_val)
    parser.add_argument("--train_steps", type=int, default=train_steps)
    parser.add_argument("--val_steps", type=int, default=val_steps)

    return parser.parse_known_args()[0]

# Parse the configuration once; the cells below read it through `args`.
args = ArgParser()
args

Namespace(batch_size=4, beam_width=20, buffer_size=512, epochs=30, learning_rate=5e-05, lr_max=0.0001, lr_min=0.0001, lr_start=5e-05, main_dir='E://Datasets/ASR-dataset', model_name='facebook/wav2vec2-base', n_cycles=0.5, n_samples=5000, n_shards=10, n_train=4500, n_val=500, random_state=42, sample_rate=16000, sustain_epochs=2, test_size=0.1, top_paths=1, train_steps=1125, val_steps=125, vocab_size=37, warmup_epochs=4)

# Data Loading

In [5]:
class Config:
    """Holds the ``Wav2Vec2Processor`` used for tokenising and decoding.

    The processor pairs a CTC tokenizer read from ``<main_dir>/vocab.json``
    with a single-channel feature extractor running at ``args.sample_rate``.
    It is exposed as ``self.processor``.
    """

    def __init__(self, args):
        tokenizer_options = dict(
            vocab_file=f"{args.main_dir}/vocab.json",
            unk_token="<unk>",
            pad_token="<pad>",
            bos_token="<s>",
            eos_token="</s>",
            word_delimiter_token=" ",
            do_lower_case=False,
        )
        ctc_tokenizer = Wav2Vec2CTCTokenizer(**tokenizer_options)

        extractor_options = dict(
            feature_size=1,
            sampling_rate=args.sample_rate,
            padding_value=0.0,
            do_normalize=True,
            return_attention_mask=False,
        )
        extractor = Wav2Vec2FeatureExtractor(**extractor_options)

        self.processor = Wav2Vec2Processor(
            feature_extractor=extractor, tokenizer=ctc_tokenizer)

In [6]:
class TFRWriter():
    """Serialise the dataset listed in ``ASRDataset.csv`` into TFRecord shards.

    Each record stores two serialised tensors: ``input_values`` (the float32
    waveform) and ``labels`` (the int32 token ids of the romaji transcript
    wrapped in the tokenizer's bos/eos tokens). Shards are contiguous slices
    of the CSV, written to ``<main_dir>/wav2vec2_tfrec/shard_<k>.tfrec``.
    """

    def __init__(self, args):
        self.data = pd.read_csv(os.path.join(args.main_dir, "ASRDataset.csv"))
        self.args = args
        self.tokenizer = Config(args).processor.tokenizer

    def _bytes_feature(self, value):
        """Returns a bytes_list from a string / byte."""
        if isinstance(value, type(tf.constant(0))):
            # BytesList cannot unpack an EagerTensor, so take its bytes value.
            value = value.numpy()
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64_feature(self, value):
        """Returns an int64_list from a bool / enum / int / uint."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _float_feature(self, value):
        """Returns a float_list from a float / double."""
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    def serialize_example(self, *args):
        """Pack ``(input_values, labels)`` byte strings into a serialised Example."""
        feature = {
            'input_values': self._bytes_feature(args[0]),
            'labels': self._bytes_feature(args[1]),
            }

        example_proto = tf.train.Example(
            features=tf.train.Features(feature=feature))
        return example_proto.SerializeToString()

    def get_labels(self, sample):
        """Return the int32 token ids for the transcript of file ``sample``."""
        labels = self.data.loc[self.data['path']==sample, "romaji"].item()
        labels = (self.tokenizer.bos_token + labels +
            self.tokenizer.eos_token)
        labels = self.tokenizer(labels)['input_ids']
        return tf.convert_to_tensor(labels, dtype=tf.int32)

    def get_audio(self, sample):
        """Load ``<main_dir>/wav_cleaned/<sample>`` as a float32 tensor."""
        path = os.path.join(self.args.main_dir, "wav_cleaned", sample)
        audio = librosa.load(path, sr=None)[0]
        return tf.convert_to_tensor(audio, dtype=tf.float32)

    def get_shards(self):
        """Split the file names into ``args.n_shards`` contiguous chunks.

        The CSV order is preserved. When the rows do not divide evenly, the
        leading shards receive one extra file each.

        Returns:
            List of ``n_shards`` lists of file names.
        """
        paths = self.data['path'].tolist()
        chunks = np.array_split(np.arange(len(paths)), self.args.n_shards)
        return [[paths[i] for i in chunk] for chunk in chunks]

    def get_shard_data(self, samples):
        """Yield the serialised audio and label tensors for each file name."""
        for sample in samples:
            audio = self.get_audio(sample)
            labels = self.get_labels(sample)
            yield {
                'input_values': tf.io.serialize_tensor(audio),
                'labels': tf.io.serialize_tensor(labels),
            }

    def write(self):
        """Write every shard to ``<main_dir>/wav2vec2_tfrec``."""
        out_dir = f"{self.args.main_dir}/wav2vec2_tfrec"
        # TFRecordWriter does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)
        for shard, samples in tqdm(enumerate(self.get_shards()), total=self.args.n_shards):
            with tf.io.TFRecordWriter(f"{out_dir}/shard_{shard+1}.tfrec") as f:
                for sample in self.get_shard_data(samples):
                    example = self.serialize_example(
                        sample['input_values'],
                        sample['labels'],
                        )
                    f.write(example)

# TFRWriter(args).write()

In [7]:
class DataLoader:
    """tf.data pipelines over the TFRecord shards in ``<main_dir>/wav2vec2_tfrec``.

    The shard files (not the individual examples) are split into a training
    and a validation set with ``train_test_split``. Both pipelines pad every
    batch to its longest member: waveforms with ``0.0`` and labels with
    ``-100``.

    The resulting datasets are exposed as ``self.train`` and ``self.val``.
    """

    def __init__(self, args):
        # glob gives no ordering guarantee; sort so that the seeded split
        # below picks the same shards on every machine and run.
        self.files = sorted(glob.glob(args.main_dir + "/wav2vec2_tfrec/*.tfrec"))
        if not self.files:
            raise FileNotFoundError(
                "No .tfrec shards found in " + args.main_dir + "/wav2vec2_tfrec")
        self.args = args
        self.AUTOTUNE = tf.data.AUTOTUNE
        self.train_files, self.val_files = train_test_split(
            self.files, test_size=args.test_size, shuffle=True,
            random_state=args.random_state)
        self.train = self.get_train()
        self.val = self.get_val()

    def read_tfrecord(self, example):
        """Parse one serialised Example into float32 audio and int32 labels."""
        feature_description = {
            'input_values': tf.io.FixedLenFeature([], tf.string),
            'labels': tf.io.FixedLenFeature([], tf.string),
            }

        example = tf.io.parse_single_example(example, feature_description)
        example['input_values'] = tf.io.parse_tensor(
            example['input_values'], out_type=tf.float32)
        example['labels'] = tf.io.parse_tensor(
            example['labels'], out_type=tf.int32)
        return example

    def load_dataset(self, files):
        """Return a dataset of parsed examples read from ``files``."""
        ignore_order = tf.data.Options()
        ignore_order.experimental_deterministic = False
        dataset = tf.data.TFRecordDataset(files)
        dataset = dataset.with_options(ignore_order)
        dataset = dataset.map(self.read_tfrecord, num_parallel_calls=self.AUTOTUNE)
        return dataset

    def _padded_batch(self, dataset):
        """Batch ``dataset``, padding audio with 0.0 and labels with -100."""
        return dataset.padded_batch(
            self.args.batch_size,
            padded_shapes={
                'input_values': [None],
                'labels': [None]
            },
            padding_values={
                'input_values': tf.constant(0, dtype=tf.float32),
                'labels': tf.constant(-100, dtype=tf.int32)
            })

    def get_train(self):
        """Build the training pipeline: batch, shuffle, prefetch.

        Shuffling is applied after batching, so whole batches are reordered
        while the composition of each batch follows the record order.
        """
        dataset = self._padded_batch(self.load_dataset(self.train_files))
        dataset = dataset.shuffle(self.args.buffer_size)
        dataset = dataset.prefetch(self.AUTOTUNE)
        return dataset

    def get_val(self):
        """Build the validation pipeline: batch, cache, prefetch."""
        dataset = self._padded_batch(self.load_dataset(self.val_files))
        dataset = dataset.cache()
        dataset = dataset.prefetch(self.AUTOTUNE)
        return dataset

# Build the pipelines and pull a single training batch to inspect it.
train = DataLoader(args).train
next(iter(train))

{'input_values': <tf.Tensor: shape=(4, 66240), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 'labels': <tf.Tensor: shape=(4, 55), dtype=int32, numpy=
 array([[   1,   15,    5,   12,    6,    8,    4,   19,    6,    4,   15,
            5,    9,   24,    8,    8,    4,   18,    8,   13,    5,   14,
            8,    4,   12,    6,   16,    7,    6,    4,   19,    6,    4,
           24,    8,    8,    4,   18,    6,    4,   18,    8,   13,    5,
           14,    8,    4,    7,   13,   10,   14,    5,   15,    8,    2],
        [   1,   29,    6,    8,    4,   19,    5,    4,    5,   15,    5,
            4,   11,    5,   13,    5,    4,   20,    8,    8,   18,    5,
           12,    5,    4,   14,    5,   17,   10,    4,   24,    8,   31,
            6,    8,    4,   18,    5,    4,    5,   13,    7,   14,    5,
           16,    7,   

# Prepare Model

In [8]:
class PER(tf.keras.metrics.Metric):
    """Phone Error Rate

    This metric calculates the normalized error rate based on phonemes: the
    logits are decoded with CTC beam search and compared with the targets by
    length-normalised edit distance, averaged over all samples seen since the
    last reset.

    Args:
        beam_width: Beam width handed to the CTC beam search decoder.
        top_paths: Number of paths the decoder returns; only the best one is
            scored.
        name: (Optional) string name of the metric instance
        blank_index: (Optional) class id the model uses as CTC blank.
            TensorFlow's beam search decoder always treats the LAST class as
            blank, so the logits are re-ordered accordingly and the decoded
            ids are mapped back. This id is also removed from the targets.
            Pass ``None`` when the model already uses the last class.

    """
    def __init__(self, beam_width, top_paths, name="PER", blank_index=0, **kwargs):
        super(PER, self).__init__(name=name,  **kwargs)
        if blank_index is not None and blank_index < 0:
            raise ValueError("blank_index must be None or a non-negative class id")
        self.beam_width = beam_width
        self.top_paths = top_paths
        self.blank_index = blank_index
        self.per_accumulator = self.add_weight(name="total_per", initializer="zeros")
        self.counter = self.add_weight(name="per_count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        """
        Function takes in model output logits and target labels and updates
        accumulator globally.

        Args:
            y_true shape: [batch_size, sequence_length], padded with -100
            y_pred shape: [batch_size, sequence_length, num_features]
            sample_weight: accepted for API compatibility; not used.

        Returns:
            None

        """
        logits = tf.cast(y_pred, tf.float32)
        logits_shape = tf.shape(logits)
        batch_size, n_frames = logits_shape[0], logits_shape[1]

        # The decoder expects time-major input. The axes have to be swapped
        # with a transpose: a reshape to the same target shape keeps the
        # memory order and would interleave frames of different samples.
        logits = tf.transpose(logits, perm=[1, 0, 2])

        blank = self.blank_index
        if blank is not None:
            # Move the blank class to the last position, where the decoder
            # looks for it.
            logits = tf.concat(
                [logits[..., :blank], logits[..., blank + 1:],
                 logits[..., blank:blank + 1]], axis=-1)

        # Every sample is decoded over the full padded length because the
        # true frame counts are not available here.
        sequence_length = tf.fill([batch_size], n_frames)

        # Decode logits into sparse tensor using beam search decoder
        decoded = tf.nn.ctc_beam_search_decoder(
            logits, sequence_length=sequence_length, beam_width=self.beam_width,
            top_paths=self.top_paths)[0][0]
        decoded_ids = tf.cast(decoded.values, dtype=tf.int32)
        if blank is not None:
            # Undo the class re-ordering: ids at or above the blank slot were
            # shifted down by one.
            decoded_ids = tf.where(decoded_ids >= blank, decoded_ids + 1, decoded_ids)
        hypothesis = tf.SparseTensor(
            decoded.indices, decoded_ids, decoded.dense_shape)

        # Build the sparse targets, leaving out padding (-100) and blanks
        y_true = tf.cast(y_true, dtype=tf.int32)
        is_label = tf.not_equal(y_true, -100)
        if blank is not None:
            is_label = tf.logical_and(is_label, tf.not_equal(y_true, blank))
        label_positions = tf.where(is_label)
        truth = tf.SparseTensor(
            label_positions, tf.gather_nd(y_true, label_positions),
            tf.shape(y_true, out_type=tf.int64))

        # Calculate Levenshtein distance
        distance = tf.edit_distance(hypothesis, truth, normalize=True)

        # Add distance and number of samples to variables
        self.per_accumulator.assign_add(tf.reduce_sum(distance))
        self.counter.assign_add(tf.cast(batch_size, self.counter.dtype))

    def result(self):
        # Divides accumulated distance scores against number of samples passed,
        # mimics mean reduction over batch
        return tf.math.divide_no_nan(self.per_accumulator, self.counter)

    def reset_state(self):
        """Zero the accumulated distance and the sample counter."""
        self.per_accumulator.assign(0.0)
        self.counter.assign(0.0)

    def reset_states(self):
        """Older spelling of ``reset_state``; kept for existing callers."""
        self.reset_state()

class CosineDecayWithWarmup(LearningRateSchedule):
    """Linear warm-up, constant plateau, then cosine decay -- per epoch.

    * ``epoch < warmup_epochs``: linear ramp from ``lr_start`` to ``lr_max``.
    * the next ``sustain_epochs`` epochs: constant ``lr_max``.
    * afterwards: cosine decay from ``lr_max`` towards ``lr_min`` over the
      remaining epochs, with ``n_cycles`` full cosine periods (0.5 is a
      single half-wave that ends at ``lr_min``).

    Args:
        args: Namespace providing ``lr_start``, ``lr_max``, ``lr_min``,
            ``n_cycles``, ``warmup_epochs``, ``sustain_epochs`` and
            ``epochs``. ``lr_min`` may be ``None``, which is treated as 0.

    NOTE(review): ``__call__`` interprets its argument as an epoch index,
    whereas Keras optimizers call a ``LearningRateSchedule`` with their step
    counter -- confirm how the caller drives this schedule.
    """

    def __init__(self, args):
        self.args = args

    def __call__(self, epoch):
        """Return the learning rate for ``epoch`` (number or scalar tensor)."""
        cfg = self.args
        if tf.is_tensor(epoch):
            # Integer tensors cannot be mixed with the float arithmetic below.
            epoch = tf.cast(epoch, tf.float32)

        warmup_end = cfg.warmup_epochs
        sustain_end = cfg.warmup_epochs + cfg.sustain_epochs

        if epoch < warmup_end:
            slope = (cfg.lr_max - cfg.lr_start) / cfg.warmup_epochs
            return slope * epoch + cfg.lr_start
        if epoch < sustain_end:
            return cfg.lr_max

        floor = 0.0 if cfg.lr_min is None else cfg.lr_min
        # Guard against a zero-length decay phase.
        decay_epochs = max(cfg.epochs - sustain_end, 1)
        progress = (epoch - sustain_end) / decay_epochs
        cosine = 0.5 * (1.0 + tf.math.cos(np.pi * cfg.n_cycles * 2.0 * progress))
        # Offset by the floor so the decay starts exactly at lr_max and
        # bottoms out at lr_min.
        return floor + (cfg.lr_max - floor) * cosine

In [9]:
class Trainer:
    """Fine-tunes ``TFWav2Vec2ForCTC`` with a custom eager training loop.

    The learning rate is set once per epoch from ``CosineDecayWithWarmup``,
    which is written in terms of epochs. Handing the schedule to the
    optimizer directly would make Keras evaluate it with the optimizer step
    count instead.

    ``self.history`` collects, per epoch, the mean training loss, the
    training PER, the mean validation loss and the validation PER as floats.
    """

    def __init__(self, args):
        self.args = args
        self.config = Config(args)
        # Build the pipelines once and share them.
        loader = DataLoader(args)
        self.train_dataset = loader.train
        self.val_dataset = loader.val
        self.schedule = CosineDecayWithWarmup(args)
        # Start from args.learning_rate; fit() overwrites it every epoch.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate)
        self.metrics = PER(beam_width=args.beam_width, top_paths=args.top_paths)
        self.model = TFWav2Vec2ForCTC.from_pretrained(
            args.model_name,
            from_pt=True,
            ctc_loss_reduction="mean",
            pad_token_id=self.config.processor.tokenizer.pad_token_id,
            vocab_size=len(self.config.processor.tokenizer))
        self.model.freeze_feature_extractor()
        self.history = {
            "loss": [],
            "per": [],
            "val_loss": [],
            "val_per": []
        }

    def train_step(self, batch):
        """Run one optimisation step and update the PER metric.

        Returns:
            Tuple ``(loss, logits)`` as returned by the model.
        """
        X_train = batch['input_values']
        y_train = batch['labels']
        with tf.GradientTape() as tape:
            loss, logits = self.model(
                input_values=X_train, labels=y_train, training=True)[:2]
        gradients = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_weights))
        self.metrics.update_state(y_train, logits)
        return loss, logits

    def val_step(self, batch):
        """Run the model on one validation batch without updating weights."""
        X_val = batch['input_values']
        y_val = batch['labels']
        loss, logits = self.model(
            input_values=X_val, labels=y_val, training=False)[:2]
        return loss, logits

    def _print_transcripts(self, labels, logits):
        """Print target and greedy-decoded prediction for every sample."""
        labels = tf.where(labels == -100, x=0, y=labels)
        targets = self.config.processor.batch_decode(labels)
        predictions = self.config.processor.batch_decode(tf.argmax(logits, axis=-1))
        for y_true, y_pred in zip(targets, predictions):
            print(f"Target:    {y_true}")
            print(f"Predicted: {y_pred}")

    def display(self, epoch, t_labels, t_logits, v_labels, v_logits):
        """Print sample transcriptions on every fifth epoch (0, 5, 10, ...)."""
        if epoch % 5 != 0:
            return

        print("-" * 129)
        print("Training")
        self._print_transcripts(t_labels, t_logits)

        print("\nValidation")
        self._print_transcripts(v_labels, v_logits)
        print("-" * 129)

    def fit(self):
        """Train for ``args.epochs`` epochs.

        After every epoch the weights are saved to
        ``<main_dir>/checkpoints/model<k>k_<epoch>of<epochs>.h5``.

        Returns:
            The ``self.history`` dict with one float per epoch and key.
        """
        checkpoint_dir = os.path.join(self.args.main_dir, "checkpoints")
        os.makedirs(checkpoint_dir, exist_ok=True)
        stateful_metrics = ["loss", "per", "val_loss", "val_per"]

        for epoch in range(self.args.epochs):
            learning_rate = float(self.schedule(epoch))
            self.optimizer.learning_rate = learning_rate
            print(f"Epoch {epoch+1}/{self.args.epochs}: Learning rate @ {learning_rate:.2e}")
            progbar = tf.keras.utils.Progbar(
                self.args.train_steps, interval=0.05,
                stateful_metrics=stateful_metrics)

            # Training loop
            self.metrics.reset_states()
            t_batch = t_logits = None
            loss_sum, n_batches = 0.0, 0
            t_loss = t_per = 0.0
            for step, t_batch in enumerate(self.train_dataset, start=1):
                batch_loss, t_logits = self.train_step(t_batch)
                loss_sum += float(tf.reduce_mean(batch_loss))
                n_batches += 1
                t_loss = loss_sum / n_batches
                t_per = float(self.metrics.result())
                progbar.update(
                    step, values=[("loss", t_loss), ("per", t_per)], finalize=False)
            self.metrics.reset_states()

            # Validation loop
            v_batch = v_logits = None
            loss_sum, n_batches = 0.0, 0
            for v_batch in self.val_dataset:
                batch_loss, v_logits = self.val_step(v_batch)
                loss_sum += float(tf.reduce_mean(batch_loss))
                n_batches += 1
                self.metrics.update_state(v_batch['labels'], v_logits)

            v_loss = loss_sum / n_batches if n_batches else 0.0
            v_per = float(self.metrics.result())
            v_values = [
                ("loss", t_loss), ("per", t_per), ("val_loss", v_loss),
                ("val_per", v_per)]
            progbar.update(self.args.train_steps, values=v_values, finalize=True)
            self.metrics.reset_states()

            # Print sample transcriptions for both loops
            if t_batch is not None and v_batch is not None:
                self.display(
                    epoch, t_batch['labels'], t_logits, v_batch['labels'], v_logits)

            # Checkpointing
            self.model.save_weights(os.path.join(
                checkpoint_dir, "model{}k_{}of{}.h5".format(
                    self.args.n_samples//1000, epoch+1, self.args.epochs)))

            # Logging
            self.history['loss'].append(t_loss)
            self.history['per'].append(t_per)
            self.history['val_loss'].append(v_loss)
            self.history['val_per'].append(v_per)

        return self.history

# Build the trainer and run the full training loop, keeping what `fit` returns.
history = Trainer(args).fit()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2ForCTC: ['project_q.weight', 'quantizer.weight_proj.weight', 'project_q.bias', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'project_hid.weight', 'project_hid.bias']
- This IS expected if you are initializing TFWav2Vec2ForCTC from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2ForCTC from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFWav2Vec2ForCTC were not initialized from the PyTorch model and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferenc

Epoch 1/30: Learning rate @ 5.00e-05
---------------------------------------------------------------------------------------------------------------------------------
Training
Target:    <s>shouyu wotsukete sashimi wo tabemasu</s>
Predicted: <s>shouyuotsukete sashimi n sabemasuu</s>
Target:    <s>yamadasan wa ochichisan ga isha desu</s>
Predicted: <s>kamuzasan wa otou san gae isha desu</s>
Target:    <s>kore hodo touitsu no aru haigou mo nai kore hodoshizen de</s>
Predicted: <s>kore hodo touitsu no aru haigou mo naikore hodoshimende</s>
Target:    <s>sona koto wa ikan yo kimi da teshikata ga nai sa</s>
Predicted: <s>sonna koto wa ikai wo kimi dat deshikata ga naisa</s>

Validation
Target:    <s>sensei wa yamadasan no namae wo yobimashita</s>
Predicted: <s>senseu wa yamatasan no namahae wo yobimashita</s>
Target:    <s>saudi no seiji jousei no hoka no bunsekisha wa isou utagai wo idaite imashita</s>
Predicted: <s>kauji no seijijou sei no hokarobunsekisha wa issou daga y wu iraiteimashit