In [1]:
from datasets import load_dataset, load_metric

import pandas as pd
import numpy as np
import librosa
import json

import hazm
from num2fawords import words, ordinal_words
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

import os
import string
import six
import re
import glob
from pandarallel import pandarallel

pandarallel.initialize(nb_workers=10)
tqdm.pandas()

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
_normalizer = hazm.Normalizer()

chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬",'ٔ', ",", "?", 
    ".", "!", "-", ";", ":",'"',"“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š',
#     "ء",
]

# In case of farsi
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)

chars_to_mapping = {
    'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
    'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
    "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
    "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
    'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
    'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
        
    # "ها": "  ها", "ئ": "ی",
    "۱۴ام": "۱۴ ام",
        
    "a": " ای ", "b": " بی ", "c": " سی ", "d": " دی ", "e": " ایی ", "f": " اف ",
    "g": " جی ", "h": " اچ ", "i": " آی ", "j": " جی ", "k": " کی ", "l": " ال ",
    "m": " ام ", "n": " ان ", "o": " او ", "p": " پی ", "q": " کیو ", "r": " آر ",
    "s": " اس ", "t": " تی ", "u": " یو ", "v": " وی ", "w": " دبلیو ", "x": " اکس ",
    "y": " وای ", "z": " زد ",
    "\u200c": " ", "\u200d": " ", "\u200e": " ", "\u200f": " ", "\ufeff": " ",
}


def multiple_replace(text, chars_to_mapping):
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))

def remove_special_characters(text, chars_to_ignore_regex):
    text = re.sub(chars_to_ignore_regex, '', text).lower() + " "
    return text

def normalizer(text, chars_to_ignore=chars_to_ignore, chars_to_mapping=chars_to_mapping):
    chars_to_ignore_regex = f"""[{"".join(chars_to_ignore)}]"""
    text = text.lower().strip()

    text = _normalizer.normalize(text)
    text = multiple_replace(text, chars_to_mapping)
    text = remove_special_characters(text, chars_to_ignore_regex)
    text = re.sub(" +", " ", text)
    _text = []
    for word in text.split():
        try:
            word = int(word)
            _text.append(words(word))
        except:
            _text.append(word)
            
    text = " ".join(_text) + " "
    text = text.strip()

    if not len(text) > 0:
        return None
    
    return text + " "

## general and medical

In [16]:
path = '/media/data/soroosh/dataset/ASR/medical_data/'
df = pd.read_csv(path + 'fanap_FANAP_1st_SERIES.csv')
print(df.shape)
df.head()

(9910, 3)


Unnamed: 0,text,file_name,user_id
0,چه چیزی بیشتر شما را پیر نشان می دهد,fanap_FANAP_1st_SERIES/1.wav,bqgRVXP0g7b8xzpB+4vTsxa8YX7UDJBsUrb2ZOed6eTE7v...
1,با کمال تعجب ممکن است چین و چروک یا موی سفید ش...,fanap_FANAP_1st_SERIES/2.wav,bqgRVXP0g7b8xzpB+4vTs/2jyYI8VS+jci1tDAl7OvY3oS...
2,اگرچه تیرگی زیر چشم می تواند شما را مسن بیمار ...,fanap_FANAP_1st_SERIES/3.wav,bqgRVXP0g7b8xzpB+4vTsxa8YX7UDJBsUrb2ZOed6eTE7v...
3,تیرگی زیر چشم هر دو جنس مرد و زن را تحت تاثیر ...,fanap_FANAP_1st_SERIES/4.wav,bqgRVXP0g7b8xzpB+4vTsxa8YX7UDJBsUrb2ZOed6eTE7v...
4,اگر شما زیاد می خوابید و همچنان زیر چشم هایتان...,fanap_FANAP_1st_SERIES/5.wav,bqgRVXP0g7b8xzpB+4vTs/2jyYI8VS+jci1tDAl7OvY3oS...


In [17]:
df["sentence"] = df["text"].progress_apply(lambda t: normalizer(t))

HBox(children=(FloatProgress(value=0.0, max=9910.0), HTML(value='')))




In [18]:
df['path'] = df['file_name'].progress_apply(lambda p: path + p)

HBox(children=(FloatProgress(value=0.0, max=9910.0), HTML(value='')))




In [9]:
# from joblib import Parallel, delayed

# save_dir = path + 'np/'
# os.makedirs(save_dir , exist_ok=True)

# def save2np(file):
#     try:
#         x, _ = librosa.load(file, sr=16000)
#         np.save(save_dir + file.split('/')[-1][:-4], x)
#     except Exception as e:
#         print(file, ':', e)

# _ = Parallel(n_jobs=10)(delayed(save2np)(file) for file in tqdm(df['path']))

HBox(children=(FloatProgress(value=0.0, max=9910.0), HTML(value='')))




In [19]:
df['np_path'] = df['path'].progress_apply(lambda path: save_dir + path.split('/')[-1][:-4] + '.npy')
df["status"] = df["np_path"].apply(lambda path: True if os.path.exists(path) else None)

df.dropna(subset=['status'], inplace=True)
df.drop(['status'], axis=1, inplace=True)
print(df.shape)

df['duration'] = df['np_path'].progress_apply(lambda p: np.load(p).shape[0]/ 16000)
df.head()

HBox(children=(FloatProgress(value=0.0, max=9910.0), HTML(value='')))


(9901, 7)


HBox(children=(FloatProgress(value=0.0, max=9901.0), HTML(value='')))




Unnamed: 0,text,file_name,user_id,sentence,path,np_path,status,duration
0,چه چیزی بیشتر شما را پیر نشان می دهد,fanap_FANAP_1st_SERIES/1.wav,bqgRVXP0g7b8xzpB+4vTsxa8YX7UDJBsUrb2ZOed6eTE7v...,چه چیزی بیشتر شما را پیر نشان می دهد,/media/data/soroosh/dataset/ASR/medical_data/f...,/media/data/soroosh/dataset/ASR/medical_data/n...,True,3.584
1,با کمال تعجب ممکن است چین و چروک یا موی سفید ش...,fanap_FANAP_1st_SERIES/2.wav,bqgRVXP0g7b8xzpB+4vTs/2jyYI8VS+jci1tDAl7OvY3oS...,با کمال تعجب ممکن است چین و چروک یا موی سفید ش...,/media/data/soroosh/dataset/ASR/medical_data/f...,/media/data/soroosh/dataset/ASR/medical_data/n...,True,9.642687
2,اگرچه تیرگی زیر چشم می تواند شما را مسن بیمار ...,fanap_FANAP_1st_SERIES/3.wav,bqgRVXP0g7b8xzpB+4vTsxa8YX7UDJBsUrb2ZOed6eTE7v...,اگرچه تیرگی زیر چشم می تواند شما را مسن بیمار ...,/media/data/soroosh/dataset/ASR/medical_data/f...,/media/data/soroosh/dataset/ASR/medical_data/n...,True,9.813375
3,تیرگی زیر چشم هر دو جنس مرد و زن را تحت تاثیر ...,fanap_FANAP_1st_SERIES/4.wav,bqgRVXP0g7b8xzpB+4vTsxa8YX7UDJBsUrb2ZOed6eTE7v...,تیرگی زیر چشم هر دو جنس مرد و زن را تحت تاثیر ...,/media/data/soroosh/dataset/ASR/medical_data/f...,/media/data/soroosh/dataset/ASR/medical_data/n...,True,11.776
4,اگر شما زیاد می خوابید و همچنان زیر چشم هایتان...,fanap_FANAP_1st_SERIES/5.wav,bqgRVXP0g7b8xzpB+4vTs/2jyYI8VS+jci1tDAl7OvY3oS...,اگر شما زیاد می خوابید و همچنان زیر چشم هایتان...,/media/data/soroosh/dataset/ASR/medical_data/f...,/media/data/soroosh/dataset/ASR/medical_data/n...,True,5.888


In [21]:
df = df[['sentence', 'path', 'np_path', 'duration']]
train_df, val_df = train_test_split(df, test_size=0.1, random_state=101)

train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

print(train_df.shape)
print(val_df.shape)

df.to_csv(f"{path}meta.csv", encoding="utf-8", index=False)
train_df.to_csv(f"{path}train.csv", encoding="utf-8", index=False)
val_df.to_csv(f"{path}val.csv", encoding="utf-8", index=False)

(8910, 4)
(991, 4)


## Common voice

In [6]:
cv_path = '/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-corpus-6.1-2020-12-11/fa/'
valid = pd.read_csv(cv_path + 'validated.csv')
# valid = pd.read_table(cv_path + 'validated.tsv')
# train = pd.read_table(cv_path + 'train.tsv')
# dev = pd.read_table(cv_path + 'dev.tsv')
# test = pd.read_table(cv_path + 'test.tsv')

valid.shape#, train.shape, dev.shape, test.shape

(251659, 2)

In [7]:
# print(f"Step 0: {len(valid)}")

# valid["status"] = valid["path"].apply(lambda path: True if os.path.exists(path) else None)
# valid = valid.dropna(subset=["path"])
# valid = valid.drop("status", 1)
# print(f"Step 1: {len(valid)}")

# # df["prev_sentence"] = df["sentence"]
# valid["sentence"] = valid["sentence"].apply(lambda t: normalizer(t))
# valid = valid.dropna(subset=["sentence"])
# print(f"Step 2: {len(valid)}")

# valid['path'] = valid['path'].apply(lambda path: cv_path + 'clips/' + path)
# valid = valid[["sentence", "path"]]
# valid = valid.sample(frac=1)
# valid = valid.reset_index(drop=True)

valid.head()

Unnamed: 0,sentence,path
0,آیا این شراب شیرین است,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...
1,می توانم دوستم را بیاورم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...
2,که شامل بسیاری از رفتارهای متفاوت میشود,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...
3,طرف چپ شکمم درد میکنه شبها نمیتونم روش بخوابم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...
4,اسلایدسازم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...


In [13]:
from joblib import Parallel, delayed

save_dir = '/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-corpus-6.1-2020-12-11/np/'

def save2np(file):
    x, _ = librosa.load(file, sr=16000)
    np.save(save_dir + file.split('/')[-1][:-4], x)

_ = Parallel(n_jobs=20)(delayed(save2np)(file) for file in tqdm(valid['path']))

HBox(children=(FloatProgress(value=0.0, max=251659.0), HTML(value='')))




In [15]:
valid['np_path'] = valid['path'].progress_apply(lambda path: save_dir + path.split('/')[-1][:-4] + '.npy')
valid.head()

HBox(children=(FloatProgress(value=0.0, max=251659.0), HTML(value='')))




Unnamed: 0,sentence,path,np_path
0,آیا این شراب شیرین است,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...
1,می توانم دوستم را بیاورم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...
2,که شامل بسیاری از رفتارهای متفاوت میشود,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...
3,طرف چپ شکمم درد میکنه شبها نمیتونم روش بخوابم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...
4,اسلایدسازم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...


In [25]:
valid['duration'] = valid['np_path'].progress_apply(lambda p: np.load(p).shape[0]/ 16000)
valid.head()

HBox(children=(FloatProgress(value=0.0, max=251659.0), HTML(value='')))




Unnamed: 0,sentence,path,np_path,duration
0,آیا این شراب شیرین است,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,2.592
1,می توانم دوستم را بیاورم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,2.904
2,که شامل بسیاری از رفتارهای متفاوت میشود,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,4.416
3,طرف چپ شکمم درد میکنه شبها نمیتونم روش بخوابم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,5.664
4,اسلایدسازم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,2.736


In [26]:
train_df, test_df = train_test_split(valid, test_size=0.1, random_state=101)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=101)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
val_df = test_df.reset_index(drop=True)

print(train_df.shape)
print(test_df.shape)
print(val_df.shape)

(203843, 4)
(25166, 4)
(25166, 4)


In [27]:
valid.to_csv(f"{cv_path}validated.csv", encoding="utf-8", index=False)
train_df.to_csv(f"{cv_path}train.csv", encoding="utf-8", index=False)
test_df.to_csv(f"{cv_path}test.csv", encoding="utf-8", index=False)
val_df.to_csv(f"{cv_path}val.csv", encoding="utf-8", index=False)

In [11]:
sents = valid['sentence']
a = set()
for s in sents:
    a.update(list(s))
vocab_list = list(a)
len(vocab_list)

36

In [14]:
special_vocab = ["<pad>", "<s>", "</s>", "<unk>", "|"]
vocab_list = [vocab for vocab in vocab_list if vocab not in [" ", "\u0307"]]
vocab_dict = {v: k for k, v in enumerate(special_vocab + vocab_list)}
print('vocab:')
print(len(vocab_dict))
print(vocab_dict)

with open('fa-vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

vocab:
40
{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, 'ب': 5, 'ی': 6, 'غ': 7, 'ش': 8, 'آ': 9, 'ت': 10, 'خ': 11, 'پ': 12, 'ض': 13, 'ژ': 14, 'ئ': 15, 'ر': 16, 'ن': 17, 'ل': 18, 'س': 19, 'ء': 20, 'چ': 21, 'ح': 22, 'ط': 23, 'ظ': 24, 'گ': 25, 'ک': 26, 'و': 27, 'ذ': 28, 'د': 29, 'ف': 30, 'ق': 31, 'ج': 32, 'ا': 33, 'ث': 34, 'ص': 35, 'ه': 36, 'ز': 37, 'م': 38, 'ع': 39}


## SHEMO

In [20]:
abs_path_to_data = '/media/data/soroosh/dataset/ASR/shemo-fa/'
data = []
for txtfile in tqdm(glob.glob(f"{abs_path_to_data}/transcript/final text/*.ort")):
    with open(txtfile, "r", encoding="utf-8") as f:
        text = f.read()
        _id = txtfile.split("/")[-1].split(".")[0]
        male_path = f"{abs_path_to_data}/male/{_id}.wav"
        female_path = f"{abs_path_to_data}/female/{_id}.wav"
        
        if "M" in _id and os.path.exists(male_path):
            data.append({
                "_id": _id,
                "sentence": text,
                "path": male_path.strip(),
                "gender": "M"
            })
        
        if "F" in _id and os.path.exists(female_path):
            data.append({
                "_id": _id,
                "sentence": text.strip(),
                "path": female_path.strip(),
                "gender": "F"
            })

    
df = pd.DataFrame(data)
print(df.shape)
df.head()

100%|██████████| 3000/3000 [00:00<00:00, 28412.39it/s]

(2838, 4)





Unnamed: 0,_id,sentence,path,gender
0,F21W13,دورانت؟,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,F
1,F24A49,عزیزم تو از همه بهتر می‌نویسی و همه آماتورهایی...,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,F
2,F03A03,به فکر کارهای خودتی.,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,F
3,M12A44,تو نمی‎تونی منو شستشوی مغزی بدی\n,/media/data/soroosh/dataset/ASR/shemo-fa//male...,M
4,M03A22,جیسون کامسون، نکنه گوشات کره؟\n,/media/data/soroosh/dataset/ASR/shemo-fa//male...,M


In [21]:
print(f"Step 0: {len(df)}")

df["status"] = df["path"].apply(lambda path: True if os.path.exists(path) else None)
df = df.dropna(subset=["path"])
df = df.drop("status", 1)
print(f"Step 1: {len(df)}")

# df["prev_sentence"] = df["sentence"]
df["sentence"] = df["sentence"].apply(lambda t: normalizer(t))
df = df.dropna(subset=["sentence"])
print(f"Step 2: {len(df)}")


# df = df[["sentence", "path"]]
df = df.sample(frac=1)
df = df.reset_index(drop=True)
df.head()

Step 0: 2838
Step 1: 2838
Step 2: 2838


Unnamed: 0,_id,sentence,path,gender
0,M29A12,فقط آن ها اسرار قصر مرا می دونن,/media/data/soroosh/dataset/ASR/shemo-fa//male...,M
1,F24N08,مشکل زیاده,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,F
2,M02H03,بفرمایید تو رو خدا بفرمایید خوش اومدید بفرمایی...,/media/data/soroosh/dataset/ASR/shemo-fa//male...,M
3,M16A13,من یه عمره آرزو دارم صاحاب دکون بشم,/media/data/soroosh/dataset/ASR/shemo-fa//male...,M
4,F07S16,نه جورج اون نمرد من دروغ گفتم ویکی هم جلوی دهن...,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,F


In [17]:
main_vocab = ["ح", "چ", "ج", "ث", "ت", "پ", "ب", "آ", "ا", "ش", "س", "ژ", "ز", "ر", "ذ", "د", "خ", "ق", "ف", "غ", "ع", "ظ", "ط", "ض", "ص", "ی", "ه", "و", "ن", "م", "ل", "گ", "ک"]
text = " ".join(df["sentence"].values.tolist())
vocab = list(sorted(set(text)))

for v in main_vocab:
    if v not in vocab:
        print("v", v)

print(len(main_vocab), len(vocab))
print(vocab)

33 36
[' ', 'ء', 'آ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'ه', 'و', 'پ', 'چ', 'ژ', 'ک', 'گ', 'ی']


In [18]:
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np

idx = 0#np.random.randint(0, len(df))
print(idx)
sample = df.iloc[idx]

path = sample["path"]
print(sample["sentence"], "\n")
speech, sr = torchaudio.load(path)
speech = speech[0].numpy().squeeze()

speech = librosa.resample(np.asarray(speech), sr, 16_000)
ipd.Audio(data=np.asarray(speech), autoplay=True, rate=16000)

0
تو بیمار شدی پل دوست نداری با هیچ کس معاشرت کنی از صبح تا شب کنج این خونه لعنتی چپیدی  



In [22]:
train_df, test_df = train_test_split(df, test_size=0.1, random_state=101, stratify=df["gender"])

train_df = train_df[["path", "sentence"]]
train_df = train_df.reset_index(drop=True)

test_df = test_df[["path", "sentence"]]
test_df = test_df.reset_index(drop=True)

print(train_df.shape)
print(test_df.shape)

(2554, 2)
(284, 2)


In [None]:
save_path = "/".join(abs_path_to_data.split('/'))
print(save_path)

train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)

print(train_df.shape)
print(test_df.shape)

In [3]:
path = '/media/data/soroosh/dataset/ASR/shemo-fa/'
train = pd.read_csv(path +'test.csv', delimiter='\t')
train.head()

Unnamed: 0,path,sentence
0,/media/data/soroosh/dataset/ASR/shemo-fa//male...,من به احترام شما خواستم اون تابلو رو از اتاق د...
1,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,دکتر جهانگیری برای امیر از ماجرای یک شب سرد بر...
2,/media/data/soroosh/dataset/ASR/shemo-fa//male...,من روان شناس تیم پیوندم
3,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,من کسانی رو نگه می دارم که دوسشون دارم و می خو...
4,/media/data/soroosh/dataset/ASR/shemo-fa//male...,ببرمش بزارمش تو کوچه


In [4]:
from joblib import Parallel, delayed

save_dir = '/media/data/soroosh/dataset/ASR/shemo-fa/np/'

def save2np(file):
    x, _ = librosa.load(file, sr=16000)
    np.save(save_dir + file.split('/')[-1][:-4], x)

_ = Parallel(n_jobs=20)(delayed(save2np)(file) for file in tqdm(train['path']))

HBox(children=(FloatProgress(value=0.0, max=2554.0), HTML(value='')))




In [5]:
train['np_path'] = train['path'].progress_apply(lambda path: save_dir + path.split('/')[-1][:-4] + '.npy')
train.head()

HBox(children=(FloatProgress(value=0.0, max=2554.0), HTML(value='')))




Unnamed: 0,path,sentence,np_path
0,/media/data/soroosh/dataset/ASR/shemo-fa//male...,من به احترام شما خواستم اون تابلو رو از اتاق د...,/media/data/soroosh/dataset/ASR/shemo-fa/np/M3...
1,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,دکتر جهانگیری برای امیر از ماجرای یک شب سرد بر...,/media/data/soroosh/dataset/ASR/shemo-fa/np/F0...
2,/media/data/soroosh/dataset/ASR/shemo-fa//male...,من روان شناس تیم پیوندم,/media/data/soroosh/dataset/ASR/shemo-fa/np/M0...
3,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,من کسانی رو نگه می دارم که دوسشون دارم و می خو...,/media/data/soroosh/dataset/ASR/shemo-fa/np/F2...
4,/media/data/soroosh/dataset/ASR/shemo-fa//male...,ببرمش بزارمش تو کوچه,/media/data/soroosh/dataset/ASR/shemo-fa/np/M0...


In [6]:
train['duration'] = train['np_path'].progress_apply(lambda p: np.load(p).shape[0]/ 16000)
train.head()

HBox(children=(FloatProgress(value=0.0, max=2554.0), HTML(value='')))




Unnamed: 0,path,sentence,np_path,duration
0,/media/data/soroosh/dataset/ASR/shemo-fa//male...,من به احترام شما خواستم اون تابلو رو از اتاق د...,/media/data/soroosh/dataset/ASR/shemo-fa/np/M3...,3.010125
1,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,دکتر جهانگیری برای امیر از ماجرای یک شب سرد بر...,/media/data/soroosh/dataset/ASR/shemo-fa/np/F0...,27.983125
2,/media/data/soroosh/dataset/ASR/shemo-fa//male...,من روان شناس تیم پیوندم,/media/data/soroosh/dataset/ASR/shemo-fa/np/M0...,2.473875
3,/media/data/soroosh/dataset/ASR/shemo-fa//fema...,من کسانی رو نگه می دارم که دوسشون دارم و می خو...,/media/data/soroosh/dataset/ASR/shemo-fa/np/F2...,4.574063
4,/media/data/soroosh/dataset/ASR/shemo-fa//male...,ببرمش بزارمش تو کوچه,/media/data/soroosh/dataset/ASR/shemo-fa/np/M0...,1.477688


In [7]:
train.to_csv(path + 'train.csv', index=False)