## Checking the dataset 

## Checking if all audio files are available

In [53]:
import os

In [54]:
# Names of every entry in the local 'wavs' directory; the cells below compare
# the filenames listed in the CSV against this listing.
totalfilesindir = os.listdir('wavs')

In [55]:
# Open the extracted CSV as UTF-8 text; its rows are iterated by the next cell.
excelsheet = open('extracteddata.csv',encoding='utf-8',mode='r')

In [56]:
# Collect the fourth comma-separated field of every CSV row; the cell below
# treats these values as the audio filenames expected in 'wavs'.
totalfilenames = [record.split(',')[3] for record in excelsheet]
# The handle has been read to the end here, so release it instead of leaking
# it (the later cells that read this CSV close their handles the same way).
excelsheet.close()

In [None]:
# Print every filename referenced by the CSV that is absent from 'wavs'.
# The directory listing is turned into a set once so each lookup is O(1)
# instead of a linear scan of the whole listing per filename.
available = set(totalfilesindir)
for filename in totalfilenames:
    if filename not in available:
        print(filename)

## Generation of Metadata

### Generation of speaker details

In [1]:
# Give every distinct (email, gender) pair a numeric speaker id, numbered from
# 0 in order of first appearance in the CSV, and write the mapping to
# 'speakerdetails.csv' as "email,id,gender" lines.
emailswithid = {}
with open('extracteddata.csv', encoding='utf-8', mode='r') as excelsheet:
    for record in excelsheet:
        tok = record.split(',')
        # Column 2 is the email, column 1 the gender.  setdefault() keeps the
        # id assigned on first sight; len() is the next unused id.  The dict
        # lookup replaces a linear scan of a list for every row.
        emailswithid.setdefault((tok[2], tok[1]), len(emailswithid))

# Distinct speakers in first-appearance order (kept for any later use).
emails = list(emailswithid)

with open('speakerdetails.csv', encoding='utf-8', mode='w') as ff3:
    for (email, gender), speaker_id in emailswithid.items():
        ff3.write(f'{email},{speaker_id},{gender}\n')

### Structuring data as VCTK

In [2]:
# Re-open the extracted CSV from the start; the loop further down in this cell
# iterates its rows and closes the handle afterwards.
excelsheet = open('extracteddata.csv',encoding='utf-8',mode='r')

def findnth(haystack, needle, n):
    """Return the index of the n-th occurrence of ``needle`` in ``haystack``.

    Occurrences are counted from zero, left to right, without overlap
    (``n=0`` is the first occurrence).

    Args:
        haystack: String to search in.
        needle: Non-empty substring to look for.
        n: Zero-based occurrence number.

    Returns:
        The start index of the requested occurrence, or -1 when ``haystack``
        holds fewer than ``n + 1`` occurrences or ``n`` is negative.

    Raises:
        ValueError: If ``needle`` is empty.
    """
    if not needle:
        # Same error the split-based implementation raised for an empty needle.
        raise ValueError('empty separator')
    if n < 0:
        # A negative occurrence number can never be satisfied.
        return -1
    step = len(needle)
    position = -step  # so the first search starts at index 0
    for _ in range(n + 1):
        # Resume just past the previous match so matches never overlap.
        position = haystack.find(needle, position + step)
        if position == -1:
            return -1
    return position

# Turn each CSV row into a "wavs/<filename>|<speaker id>|<sentence>" record.
items = []
for row in excelsheet:
    fields = row.split(',')
    # Speaker ids are keyed by (email, gender), i.e. columns 2 and 1.
    speaker_key = (fields[2], fields[1])
    # The sentence is everything after the sixth comma, so commas inside the
    # sentence itself are preserved.
    # NOTE(review): if a row has fewer than six commas findnth returns -1 and
    # the whole row becomes the "sentence" - confirm such rows cannot occur.
    sentence_start = findnth(row, ',', 5) + 1
    items.append('wavs/{}|{}|{}'.format(fields[3],
                                        emailswithid[speaker_key],
                                        row[sentence_start:]))

excelsheet.close()

### Removing sentences with English words

In [3]:
import re
import os 

# Keep only the records whose sentence has no Latin-script word.  A "word"
# here is an ASCII letter followed by at least one more non-whitespace
# character, so a lone single letter does not disqualify a sentence.
# The pattern is a raw string: '\S' inside a plain literal is an invalid
# escape sequence that newer Python versions warn about.
english_word = re.compile(r'[a-zA-Z]\S+')

englishcleaned = []
for item in items:
    tokens = item.split('|')  # [audio path, speaker id, sentence, ...]
    if english_word.search(tokens[2]) is None:
        englishcleaned.append(item)
    elif os.path.isfile(tokens[0]):
        # WARNING: destructive - the audio file of a rejected sentence is
        # permanently deleted from disk.
        os.remove(tokens[0])


### Replacing English digits with Urdu digits

In [4]:
# Replace ASCII digits with their Urdu counterparts in the sentence field
# (third '|'-separated field) of every record, in place.
# One translation table applied in a single pass replaces ten chained
# .replace() calls.
western_to_urdu_digits = str.maketrans({
    '0': '۰',
    '1': '۱',
    '2': '۲',
    '3': '۳',
    '4': '۴',
    '5': '۵',
    '6': '۶',
    '7': '۷',
    '8': '۸',
    '9': '۹',
})
for i, record in enumerate(englishcleaned):
    tok = record.split('|')
    # Only field 2 is translated; the path and speaker id keep ASCII digits.
    tok[2] = tok[2].translate(western_to_urdu_digits)
    englishcleaned[i] = '|'.join(tok)


# Write the cleaned records verbatim: no separator is added, so each record
# relies on the line ending it carried over from the source CSV row.
# The with-block closes the file even if a write fails part-way.
with open('urdumetadatauncleaned.csv', encoding='utf-8', mode='w') as ff2:
    ff2.writelines(englishcleaned)

### Splitting and checking Dataset

In [3]:
import os 

# Cross-check the metadata file against the 'wavs' directory in both
# directions and print every filename that appears on only one side.
filelist = os.listdir('wavs')
with open('urdumetadatauncleaned.csv', encoding='utf-8', mode='r') as excelsheet:
    # Field 0 is the audio path; [5:] drops its five-character 'wavs/' prefix.
    totalfilenames = [record.split('|')[0][5:] for record in excelsheet]

# Sets give O(1) membership tests; the lists are still the ones iterated so
# the printed order is unchanged.
files_on_disk = set(filelist)
files_in_metadata = set(totalfilenames)

# Listed in the metadata but missing on disk.
for filename in totalfilenames:
    if filename not in files_on_disk:
        print(filename)

# Present on disk but not referenced by the metadata.
for filename in filelist:
    if filename not in files_in_metadata:
        print(filename)

### Train Test Split

In [1]:
# Open the metadata file one directory up as UTF-8 text; the next cell reads
# it line by line.
file = open('../metadata.csv',mode='r',encoding='utf-8')

In [2]:
# One single-element list per metadata line, with newline characters removed.
# The wrapping list is kept because the split cell reads each entry as item[0].
totaldata = [[item.replace('\n', '')] for item in file]
# The file has been read to the end; close it instead of leaking the handle.
file.close()

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the records as the validation list and write both lists.
# NOTE(review): no random_state is passed, so the split differs on every run -
# confirm whether a reproducible split is wanted.
train, test = train_test_split(totaldata, test_size=0.01)

# Every record is written followed by a '.' and a newline.  The with-block
# closes both files even if a write fails part-way.
with open('../filelists/trainfile.txt', mode='w', encoding='utf-8') as tf, \
        open('../filelists/valfile.txt', mode='w', encoding='utf-8') as ttf:
    tf.writelines(item[0] + '.\n' for item in train)
    ttf.writelines(item[0] + '.\n' for item in test)

## Generation of symbols

In [4]:
# Collect the distinct characters used in the third '|'-separated field of
# the cleaned training list and print them as one string.
symbols = []
with open('../filelists/trainfile.txt.cleaned', encoding='utf-8', mode='r') as file:
    for item in file:
        phon = item.split('|')[2].replace('\n', '')
        # dict.fromkeys() de-duplicates while keeping first-appearance order
        # (dicts are insertion-ordered from Python 3.7), so the result is the
        # same on every run; iterating a set of strings is not.
        symbols.extend(dict.fromkeys(phon))

# Global de-duplication, again in first-appearance order, in linear time.
uniqsymbols = list(dict.fromkeys(symbols))

# Same output as printing each symbol with end="": no separators, no newline.
print(''.join(uniqsymbols), end="")

akəː.ɟbxtrhmɛpẽuʌʂsi loʋndɪfjqʃcɡʰʊwʈzɔɖʐɣðŋʒɹœɒθɐʔ

In [None]:
from typing import FrozenSet

# Padding symbol; always placed first in the exported symbol list.
_pad        = '_'

# Each set below is built by splitting a space-separated literal, so every
# whitespace-delimited token becomes one member.
URDU_ALPHABETS: FrozenSet[str] = frozenset("آ أ ا ب پ ت ٹ ث ج چ ح خ د ڈ ذ ر ڑ ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن ں و ؤ ﺅ ہ ۂ ۃ ھ ء ی ئ ے ۓ".split())

URDU_DIGITS: FrozenSet[str] = frozenset("۰ ۱ ۲ ۳ ۴ ۵ ۶ ۷ ۸ ۹".split())

# NOTE(review): because the literal is split on whitespace, the space
# character itself cannot be a member - confirm that is intended.
URDU_PUNCTUATIONS: FrozenSet[str] = frozenset("؟ ٪ ! ، ’ ‘ ' . , - ؛ :".split())

URDU_DIACRITICS: FrozenSet[str] = frozenset("\u064e \u064B \u0670 \u0650 \u064F \u064d \u0651 \u0654".split())

URDU_EXTRA_CHARACTERS: FrozenSet[str] = frozenset("ﷲ ﷺ  ؓ  ؑ  ؒ  ؐ  ۖ".split())

URDU_ALL_CHARACTERS: FrozenSet[str] = frozenset().union(URDU_ALPHABETS, URDU_DIGITS, URDU_PUNCTUATIONS,  # type: ignore
                                                        URDU_DIACRITICS, URDU_EXTRA_CHARACTERS)  # type: ignore
# Export all symbols:
# The characters are sorted because the iteration order of a frozenset of
# strings changes between interpreter runs (hash randomization); an unsorted
# list would give each symbol a different index on every run.
symbols = [_pad] + sorted(URDU_ALL_CHARACTERS)


print(len(symbols))


## Checking audio details

In [1]:
from scipy.io.wavfile import read
# Load the test clip: read() returns the sample rate and the sample array.
# NOTE(review): the path ends in '.aac' but scipy's wavfile.read parses WAV
# data - presumably this file is a WAV despite its extension; confirm.
sampling_rate, data = read('testaudio.aac')

In [2]:
# Display the sample rate returned by read().
sampling_rate

16000

In [3]:
# Display the sample array returned by read().
data

array([  0,   0,   0, ..., 329, 303, 277], dtype=int16)

In [4]:
# Number of entries along the first axis of the sample array.
len(data)

81920

In [5]:
import sounddevice as sd

In [6]:
# Play the loaded samples at the sample rate reported for the file.
sd.play(data, sampling_rate)

### Phoneme