# Part 1. Clean data
Here we clean the text data and encode the audio, then save both to disk.

In [1]:
import numpy as np
import pandas as pd
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the training manifest and discard exact duplicate rows.
# The frame needs at least a transcript column (text) and an audio-path column.
train_df = pd.read_csv('csv/train_data.csv').drop_duplicates()
print(len(train_df))

48827


In [3]:
# Load the test manifest and discard exact duplicate rows.
test_df = pd.read_csv('csv/test_data.csv').drop_duplicates()
print(len(test_df))
test_df.head()

3298


Unnamed: 0,path,text,duration
0,data/commonvoice/clips/common_voice_ur_2897662...,"یہی تناسب ""یوتھ"" کا بھی ہے۔",2.8
1,data/commonvoice/clips/common_voice_ur_3109379...,آئی ایم ایف کے ساتھ کن شرائط پر بات ہو رہی ہے؟,4.9
2,data/commonvoice/clips/common_voice_ur_3109379...,"اور پھر سپاہی سے کہا ""ارے ہاں۔",4.0
3,data/commonvoice/clips/common_voice_ur_3201441...,اب صرف اور صرف انتظار ہے اگلے ماہ,3.6
4,data/commonvoice/clips/common_voice_ur_3203519...,تو میں پی ٹی وی میں اینکر تھا۔,3.1


In [4]:
import warnings
# Silence every warning from here on: no category or message restriction is
# given, so the 'ignore' action applies to all of them.
warnings.filterwarnings('ignore')

In [5]:
# Wrap each pandas frame in a Hugging Face Dataset (one per split).
train_data = datasets.Dataset.from_pandas(train_df)
test_data = datasets.Dataset.from_pandas(test_df)

In [6]:
# Bundle both splits so later steps can be applied to them together.
urdata = datasets.DatasetDict(
    {"train": train_data, "test": test_data}
)
urdata

DatasetDict({
    train: Dataset({
        features: ['text', 'path', 'duration', '__index_level_0__'],
        num_rows: 48827
    })
    test: Dataset({
        features: ['path', 'text', 'duration', '__index_level_0__'],
        num_rows: 3298
    })
})

In [8]:
import re
import unicodedata
from urduhack.normalization import *
from urduhack.preprocessing import *
# Punctuation (Urdu and Latin) stripped from every transcript.
# Written as a raw string: the backslashes are regex escapes, and sequences
# such as "\!" or "\:" are invalid Python string escapes that newer
# interpreters warn about when the literal is not raw.
chars_to_ignore_regex = r"""[\!\؛\،\٫\؟\۔\٪\"\'\:\-\‘\’\`]"""

# Invisible code points deleted outright:
# U+200C ZERO WIDTH NON-JOINER, U+200F RIGHT-TO-LEFT MARK, U+FEFF BOM.
_INVISIBLE_CHARS = str.maketrans("", "", "\u200c\u200f\ufeff")

# Manual unification of letter variants, applied after NFKC normalization.
# No replacement target is itself a source, so one translate pass gives the
# same result as applying the substitutions one after another.
_CHAR_UNIFICATION = str.maketrans({
    "\u0623": "\u0627",  # ALEF WITH HAMZA ABOVE       -> ALEF
    "\u06C3": "\u062A",  # TEH MARBUTA GOAL            -> TEH
    "\u0647": "\u06C1",  # HEH                         -> HEH GOAL
    "\u06C2": "\u0621",  # HEH GOAL WITH HAMZA ABOVE   -> HAMZA
    "\u0624": "\u0648",  # WAW WITH HAMZA ABOVE        -> WAW
    "\u06D3": "\u06D2",  # YEH BARREE WITH HAMZA ABOVE -> YEH BARREE
    "\u0649": "\u06cc",  # ALEF MAKSURA                -> FARSI YEH
    "\u0626": "\u06cc",  # YEH WITH HAMZA ABOVE        -> FARSI YEH
    "\u064a": "\u06cc",  # YEH                         -> FARSI YEH
})


def clean_data(batch):
    """Normalize the transcript of a single example.

    Intended for ``Dataset.map`` in its default per-example mode.

    Args:
        batch: mapping with a ``"text"`` entry holding the raw transcript.

    Returns:
        ``{"text": cleaned}``, which replaces the example's ``text`` column.

    Steps, in order: the urduhack preprocessing/normalization helpers (their
    exact behaviour is defined by that library), removal of Latin letters and
    invisible formatting characters, Unicode NFKC normalization, removal of
    the punctuation in ``chars_to_ignore_regex``, and finally the manual
    letter unification in ``_CHAR_UNIFICATION``. Digits are kept.
    """
    text = remove_punctuation(batch["text"])
    text = normalize_whitespace(text)
    text = remove_accents(text)
    text = normalize_characters(text)
    text = normalize_combine_characters(text)
    text = remove_diacritics(text)
    text = normalize(text)
    # Latin letters are dropped before NFKC, exactly as before, so any ASCII
    # letters that NFKC itself produces are left untouched.
    text = re.sub('[a-zA-Z]+', '', text)
    text = text.translate(_INVISIBLE_CHARS)
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(chars_to_ignore_regex, "", text)
    text = text.translate(_CHAR_UNIFICATION)
    return {"text": text}


# Apply clean_data to every example of both splits; the returned "text"
# value replaces the existing text column.
urdata=urdata.map(clean_data)

2022-08-13 17:04:53.902357: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
100%|██████████████████████████████████| 48827/48827 [00:03<00:00, 13209.55ex/s]
100%|████████████████████████████████████| 3298/3298 [00:00<00:00, 15210.47ex/s]


48827

In [21]:
# Persist the cleaned training transcripts; they are used later for the language model.
text = list(urdata['train']['text'])
pd.DataFrame({'text': text}).to_csv('text4lm.csv', index=False)
# Persist the cleaned test transcripts; they are used later to evaluate the model.
text = list(urdata['test']['text'])
pd.DataFrame({'text': text}).to_csv('tex-test.csv', index=False)
# Both CSV files are consumed in part 3.

In [22]:
# Sanity check: `text` was last assigned the test-split transcripts.
len(text)

3298

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters found in a batch of transcripts.

    Args:
        batch: mapping whose ``"text"`` entry is a list of strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` wraps the list of
        unique characters (in no particular order) and ``"all_text"`` wraps
        the transcripts joined with single spaces.
    """
    joined = " ".join(batch["text"])
    unique_chars = list({ch for ch in joined})
    return {"vocab": [unique_chars], "all_text": [joined]}

In [None]:
# Run extract_all_chars in batched mode with batch_size=-1 and drop the
# original columns, leaving only the "vocab" and "all_text" columns it returns.
vocabs = urdata.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=urdata.column_names["train"],
)

In [None]:
# Union of the characters seen in either split.
# Sorted so that character -> id assignments are reproducible: iteration order
# of a set of strings can differ between interpreter sessions, which would
# otherwise give a different vocab.json on every fresh run.
vocab_list = sorted(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
# Map each character to its position in the sorted list.
vocab_dict = {char: idx for idx, char in enumerate(vocab_list)}
print(len(vocab_dict))

In [None]:
# Use "|" as the visible word-delimiter token in place of the space character,
# keeping the id the space had. Guarded so that re-running the cell is a
# no-op instead of raising KeyError once the space entry is gone.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")
vocab_dict

In [None]:
# Append the unknown and padding tokens with the next free ids.
# setdefault keeps the cell idempotent: with plain assignment, a second run
# would give both tokens the same id (the current dict length).
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
print(len(vocab_dict))

In [None]:
import json

# Serialize the character -> id mapping to vocab.json in the working directory.
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

In [None]:
from transformers import Wav2Vec2CTCTokenizer

# Build the CTC tokenizer from the vocabulary this notebook writes, which is
# 'vocab.json' in the working directory. The earlier path,
# "./cache/saved/vocab.json", is never created by this notebook, so a fresh
# run could not find the file.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [None]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor configured for single-channel features at 16 kHz, with
# normalization enabled and attention masks returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

In [None]:
from transformers import Wav2Vec2Processor

# Combine the feature extractor and the tokenizer into a single processor object.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
# Display the splits and their columns after text cleaning.
urdata

In [None]:
# Inspect the first training example.
urdata['train'][0]

In [None]:
# Re-type the "path" column as an Audio feature with a 16 kHz sampling rate,
# so that indexing it yields audio data rather than a plain path string.
urdata = urdata.cast_column(
    "path",
    datasets.Audio(sampling_rate=16_000),
)
urdata

In [None]:
import IPython.display as ipd
import numpy as np
import random

# Pick a random training example, print its transcript and play its audio.
rand_int = random.randrange(len(urdata['train']))
sample = urdata['train'][rand_int]  # fetch the row once and reuse it below

print(sample["text"])
ipd.Audio(data=sample["path"]["array"], autoplay=True, rate=16000)

In [None]:
# List the column names of each split.
urdata.column_names

In [None]:
def prepare_dataset(batch):
    """Turn one example into model inputs and label ids.

    Args:
        batch: a single example whose ``"path"`` entry exposes ``"array"``
            and ``"sampling_rate"`` and whose ``"text"`` entry is the
            transcript.

    Returns:
        The same mapping, mutated in place, with three added keys:
        ``"input_values"`` (first item of the processor output for the
        audio), ``"input_length"`` (its length) and ``"labels"`` (the
        ``input_ids`` the processor produces for the transcript while in
        target-processor mode).
    """
    clip = batch["path"]
    encoded = processor(clip["array"], sampling_rate=clip["sampling_rate"])
    # The processor returns a batch; keep its single item.
    batch["input_values"] = encoded.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Inside this context the processor handles the transcript instead of audio.
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

# Encode every example with 8 worker processes and drop the original columns,
# keeping only what prepare_dataset adds.
urdudata = urdata.map(
    prepare_dataset,
    remove_columns=urdata.column_names["train"],
    num_proc=8,
)

In [None]:
# Write the encoded dataset to cache/saved.
urdudata.save_to_disk("cache/saved")