## Preprocessing

In [50]:
import wave
import os
import librosa 
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import librosa
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
from IPython.display import Image 
import plotly.graph_objects as go

In [52]:
sns.set()
plt.style.use('ggplot')
%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
pd.set_option('display.float_format', '{:.2f}'.format)


# Loading the data

In [2]:
Path_to_train = "C:/Users/DESMOND/NLP/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav"
subfolders = os.listdir(Path_to_train)
data = []
for s in subfolders:
    files = os.listdir(Path_to_train + "/" +s)
    data.extend([Path_to_train + "/" + s+ "/" + f for f in files])
data[:10]

['C:/Users/DESMOND/NLP/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-05-20101106/SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part10.wav',
 'C:/Users/DESMOND/NLP/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-05-20101106/SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part100.wav',
 'C:/Users/DESMOND/NLP/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-05-20101106/SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part101.wav',
 'C:/Users/DESMOND/NLP/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-05-20101106/SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part102.wav',
 'C:/Users/DESMOND/NLP/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-05-20101106/SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part103.wav',
 'C:/Users/DESMOND/NLP/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-05-20101106/SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part104.wav',
 'C:/Users/DESMOND/NLP/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SW

In [12]:
#read text from every transcription audio
text_path = "C:/Users/DESMOND/NLP/ALFFA_PUBLIC/ASR/SWAHILI/data/train/text"
#read text from every transcription audio
def read_text( text_path):
    text = []
    with open(text_path) as fp:
        line = fp.readline()
        while line:
        # TODO: fix spaces in in amharic text
            text.append(line)
            line = fp.readline()
    return text

In [37]:
text

['SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part10\trais wa tanzania jakaya mrisho kikwete\n',
 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part100\tyanayo andaliwa nami pendo pondo idhaa ya kiswahili\n',
 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part101\tinayokutangazia moja kwa moja kutoka jijini dar es salaam tanzania\n',
 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part102\tjuma hili bara la afrika limeshuhudia raia wa nchi za niger\n',
 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part103\twakipiga kura ya maoni ilikufanya mabadiliko ya\n',
 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part104\tkule abidjan raia wa jiji hilo\n',
 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part105\twalipata fursa ya kutumia haki yao ya msingi\n',
 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part106\twaziri mkuu wa zamani alasane watara\n'

In [20]:
#extract the transcription and the label 
label=[]
transcriptions = []
for t in text:
    sp = t.split("\t")
    sp = [i.strip('\n') for i in sp]
    if len(sp) > 1:
        label.append(sp[0])
        transcriptions.append(sp[1])
transcriptions[:10]

['rais wa tanzania jakaya mrisho kikwete',
 'yanayo andaliwa nami pendo pondo idhaa ya kiswahili',
 'inayokutangazia moja kwa moja kutoka jijini dar es salaam tanzania',
 'juma hili bara la afrika limeshuhudia raia wa nchi za niger',
 'wakipiga kura ya maoni ilikufanya mabadiliko ya',
 'kule abidjan raia wa jiji hilo',
 'walipata fursa ya kutumia haki yao ya msingi',
 'waziri mkuu wa zamani alasane watara',
 'na rais aliyetangulia henry konan berdi',
 'walichuana vikali na rais lauren bagbo']

# Clean the data 

In [22]:
words = [t.split() for t in text]

print(words[:100])

[['SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part10', 'rais', 'wa', 'tanzania', 'jakaya', 'mrisho', 'kikwete'], ['SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part100', 'yanayo', 'andaliwa', 'nami', 'pendo', 'pondo', 'idhaa', 'ya', 'kiswahili'], ['SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part101', 'inayokutangazia', 'moja', 'kwa', 'moja', 'kutoka', 'jijini', 'dar', 'es', 'salaam', 'tanzania'], ['SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part102', 'juma', 'hili', 'bara', 'la', 'afrika', 'limeshuhudia', 'raia', 'wa', 'nchi', 'za', 'niger'], ['SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part103', 'wakipiga', 'kura', 'ya', 'maoni', 'ilikufanya', 'mabadiliko', 'ya'], ['SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part104', 'kule', 'abidjan', 'raia', 'wa', 'jiji', 'hilo'], ['SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part105', 'walipata', 'fursa', 'ya', '

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [35]:
data = transcriptions
data[10]

'matokeo ya uchaguzi mkuu wa nchi ya cote de ivoire inayoongoza kwa uzalishaji wa kakao duniani'

In [36]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [38]:
def remove_punctuation(text):
    text_nopunctuation = [c for c in text if c not in string.punctuation]
    return text_nopunctuation


In [46]:
data = [''.join(c for c in text if c not in string.punctuation) for s in text]

MemoryError: 

In [47]:
data

['rais wa tanzania jakaya mrisho kikwete',
 'yanayo andaliwa nami pendo pondo idhaa ya kiswahili',
 'inayokutangazia moja kwa moja kutoka jijini dar es salaam tanzania',
 'juma hili bara la afrika limeshuhudia raia wa nchi za niger',
 'wakipiga kura ya maoni ilikufanya mabadiliko ya',
 'kule abidjan raia wa jiji hilo',
 'walipata fursa ya kutumia haki yao ya msingi',
 'waziri mkuu wa zamani alasane watara',
 'na rais aliyetangulia henry konan berdi',
 'walichuana vikali na rais lauren bagbo',
 'matokeo ya uchaguzi mkuu wa nchi ya cote de ivoire inayoongoza kwa uzalishaji wa kakao duniani',
 'kuiongoza taifa hilo kwa awamu ya pili',
 'nina furaha kubwa baada ya kuona raia wa cote de ivoire',
 'wamepiga kura kwa amanii na utulivu',
 'ninaridhishwa na kila linaloendelea sasa kwani mambo ni shwari kabisa',
 'changamoto inayo tukabili kwa sasa',
 'ni kutangaza matokeo ya uchaguzi huu',
 'inabidi zoezi hilo lifanyike kwa amani pia',
 'nimewaomba viongozi mbalimbali wa dini',
 '<UNK>',
 'jeje

## loading the transcription files 