In [2]:
#importing essential modules
import pandas as pd
import numpy as np
import os,sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
import tensorflow as tf

In [3]:
# Make the project's ../scripts directory importable so the local modules
# imported below can be found. The path is resolved against the kernel's
# current working directory.
scripts_dir = os.path.abspath('../scripts')
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from clean import Clean
from utils import vocab
from deep_learner import DeepLearn
from modeling import Modeler
from evaluator import CallbackEval

In [4]:
# Set the TF_CPP_MIN_LOG_LEVEL environment variable to '3' for this process
# (presumably to silence TensorFlow's native log output - confirm against the
# TensorFlow documentation).
# NOTE(review): tensorflow has already been imported in an earlier cell; this
# variable is usually expected to be set before that import - verify that it
# still takes effect here.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})

In [5]:
# Character set used to build the Amharic vocabulary: the Ethiopic-script
# characters first, followed by the Latin characters a, u, i, ā, e, ə, o.
# The two adjacent literals are concatenated into a single string.
AM_ALPHABET = (
    'ሀለሐመሠረሰቀበግዕዝተኀነአከወዐዘየደገጠጰጸፀፈፐቈኈጐኰፙፘፚ'
    'auiāeəo'
)
# Lower-case Latin letters a-z.
EN_ALPHABET = "abcdefghijklmnopqrstuvwxyz"

In [6]:
# Instantiate the project's Clean helper, then build the two lookup objects
# for the Amharic character set: one mapping characters to numbers and one
# mapping numbers back to characters.
cleaner = Clean()
(char_to_num, num_to_char) = vocab(AM_ALPHABET)

2022-06-06 19:38:32,213:logger:Successfully initialized clean class


The vocabulary is: ['', 'ሀ', 'ለ', 'ሐ', 'መ', 'ሠ', 'ረ', 'ሰ', 'ቀ', 'በ', 'ግ', 'ዕ', 'ዝ', 'ተ', 'ኀ', 'ነ', 'አ', 'ከ', 'ወ', 'ዐ', 'ዘ', 'የ', 'ደ', 'ገ', 'ጠ', 'ጰ', 'ጸ', 'ፀ', 'ፈ', 'ፐ', 'ቈ', 'ኈ', 'ጐ', 'ኰ', 'ፙ', 'ፘ', 'ፚ', 'a', 'u', 'i', 'ā', 'e', 'ə', 'o'] (size =44)


# Deep Learning 

In [9]:
# Read each language's CSV into its own dataframe, then build a combined frame
# in which every row carries a 'type' column naming its language.
swahili_df = pd.read_csv('../data/swahili.csv')
# Derive the labelled Swahili frame from the copy already in memory instead of
# parsing the same CSV a second time; swahili_df itself stays unlabelled.
lang = swahili_df.assign(type='swahili')

amharic_df = pd.read_csv("../data/amharic.csv")
# Labelled in place on purpose: later cells receive amharic_df with this column.
amharic_df['type'] = 'amharic'

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat produces the same stacked frame with a fresh 0..n-1 index.
language_df = pd.concat([lang, amharic_df], ignore_index=True)

In [10]:
# Instantiate the project's Modeler class; the cells below call its
# preprocessing_learn method on each language's dataframe.
pre_model = Modeler()

In [11]:
# Run the deep-learning preprocessing step on the Swahili dataframe, passing
# 'key' and 'text' as the column names to use.
swahili_preprocessed = pre_model.preprocessing_learn(
    swahili_df,
    'key',
    'text',
)

In [12]:
# Run the deep-learning preprocessing step on the Amharic dataframe, passing
# 'key' and 'text' as the column names to use.
amharic_preprocessed = pre_model.preprocessing_learn(
    amharic_df,
    'key',
    'text',
)

In [15]:
# Unpack the preprocessed Amharic data into its three parts, in the order
# train, validation, test.
(train_df, val_df, test_df) = amharic_preprocessed

In [14]:
batch_size = 3


def build_padded_dataset(frame, batch_size):
    """Build a batched tf.data pipeline from a dataframe's "file" and "text" columns.

    Each (file, text) pair is passed through ``cleaner.encode_single_sample``,
    then the samples are grouped with ``padded_batch`` and prefetched.

    Args:
        frame: dataframe providing the "file" and "text" columns.
        batch_size: number of samples per padded batch.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    pairs = tf.data.Dataset.from_tensor_slices(
        (list(frame["file"]), list(frame["text"]))
    )
    return (
        pairs.map(cleaner.encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
        .padded_batch(batch_size)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )


# Define the training dataset
train_dataset = build_padded_dataset(train_df, batch_size)

# Define the validation dataset (same pipeline, built from the validation split)
validation_dataset = build_padded_dataset(val_df, batch_size)

## Deep Learning Architecture

In [16]:
# fft_length determines the model's input width below:
# fft_length // 2 + 1, which evaluates to 2 here.
fft_length = 2

# Configure the project's DeepLearn helper with the three data splits.
learn = DeepLearn(
    input_width=1,
    label_width=1,
    shift=1,
    epochs=5,
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    label_columns=['mfcc-0'],
)

# Build the ASR model; the output width is the size of the character vocabulary.
model = learn.build_asr_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=1,
)
model.summary(line_length=110)

Model: "DeepSpeech_2"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 input (InputLayer)                              [(None, None, 2)]                           0                
                                                                                                              
 expand_dim (Reshape)                            (None, None, 2, 1)                          0                
                                                                                                              
 conv_1 (Conv2D)                                 (None, None, 1, 2)                          4                
                                                                                                              
 conv_1_bn (BatchNormalization)                  (None, None, 1, 2)                       