<a href="https://colab.research.google.com/github/zakaria-kabir/Research_Bengali-Dialect-Detection/blob/main/Audio_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Mounting**

In [1]:
import os                                     
import google.colab                                                             
import sys

# Google Drive is attached to the Colab VM only when the mount point is not
# already a directory, so re-running this cell does not mount it again.
drive_mount_point = '/content/drive'

if not os.path.isdir(drive_mount_point):
    google.colab.drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
# Make the project folder on the mounted Drive the current working directory.
os.chdir('/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Development')

In [3]:
# Show the contents of the current working directory (set in the previous cell).
!ls

Audio_Preprocessing.ipynb  input_directory  playlist_collaboration_link.gdoc


#**Installation & Imports**

**Standard Imports**

In [None]:
import os
import glob
import json
import re
import csv

**Other Imports**

In [None]:
# Import status of every third-party package the notebook needs.  Each entry
# starts as False and is switched to True by the import cell once the
# corresponding package has been imported.
modules = dict.fromkeys(
    ('librosa', 'pandas', 'audio_metadata', 'yt_dlp', 'numpy', 'soundfile', 'pydub'),
    False,
)

In [None]:
while True:

  try:

    import librosa
    modules['librosa'] = True

    import pandas as pd
    modules['pandas'] = True

    import audio_metadata
    modules['audio_metadata'] = True

    import yt_dlp
    modules['yt_dlp'] = True

    import numpy as np
    modules['numpy'] = True

    import soundfile as sf
    modules['soundfile'] = True

    from pydub import AudioSegment 
    from pydub.utils import make_chunks
    modules['pydub'] = True

    print('Successfully Imported!!')

    break

  except Exception:

    if modules['librosa'] == False:

      print('Installing librosa')

      !pip install librosa

      continue

    if modules['pandas'] == False:

      print('Installing pandas')

      !pip install pandas
      
      continue
    
    if modules['audio_metadata'] == False:

      print('Installing audio_metadata')

      !pip install -U audio_metadata
      
      continue

    if modules['yt_dlp'] == False:

      print('Installing yt_dlp')

      !python3 -m pip install -U yt-dlp
      
      continue

    if modules['numpy'] == False:

      print('Installing numpy')

      !pip install numpy
      
      continue

    if modules['soundfile'] == False:

      print('Installing soundfile')

      !pip install soundfile
      
      continue

    if modules['pydub'] == False:

      print('Installing pydub')

      !pip install pydub
      
      continue
    print('Packages Installed Successfully')

Successfully Imported!!


#**Initialization**

In [None]:
# Project root on the mounted Drive; every other working folder lives below it.
root_directory = '/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Development'

# input_directory holds the three stage folders used by the rest of the notebook.
input_directory = os.path.join(root_directory, 'input_directory')
source_directory = os.path.join(input_directory, 'source_directory')
trimmed_output = os.path.join(input_directory, 'trimmed_output')
segmented_output = os.path.join(input_directory, 'segmented_output')

# Create the folders parent-first; exist_ok makes the cell safe to re-run.
for working_directory in (root_directory,
                          input_directory,
                          source_directory,
                          trimmed_output,
                          segmented_output):
  os.makedirs(working_directory, exist_ok=True)
# dialect_list = ['bogura', 'chittagong', 'dinajpur', 'dhaka_old', 'comilla', 'chapai_nawabganj',
#  'barishal', 'bagherhat', 'rajshahi', 'jessore', 'khulna', 'kolkata', 'kushtia',
#  'manikganj', 'mymensingh', 'noakhali', 'rangpur', 'shatkhira', 'sirajgonj',
#  'sylhet']

In [None]:
# Switches for the individual pipeline steps.  Each flag is passed to the
# function named next to it, which does its work only when the flag is True.
download_files_process = False     # download_files
update_trim_file_process = True    # write_trim_points_into_file
trim_audio_data_process = False    # trim_audio_dataset
segment_process = False            # split_audio_into_segments
audio_info_process = False         # get_audio_info
to_csv_process = False             # export_to_csv

#**Utility Functions**

### Audio Description 

In [None]:
def get_audio_info(dialect_list, dir, audio_info_process):
  """Collect per-file metadata for every ``*.wav`` file of each dialect.

  Args:
    dialect_list: Names of the dialect sub-folders to scan inside ``dir``.
    dir: Directory that contains one sub-folder per dialect.
    audio_info_process: When falsy, no file is read.

  Returns:
    A ``pandas.DataFrame`` with one row per audio file and the columns
    ``dialect``, ``filename``, ``sampling_rate``, ``duration``,
    ``audio_shape``, ``n_channels``, ``filesize``, ``bit_rate``,
    ``bit_depth`` and ``filepath``.  The frame is empty when the step is
    disabled or when no file is found.
  """

  # Created before the flag check so the disabled path returns an empty frame
  # instead of failing on an unassigned local.
  audio_info = []

  if audio_info_process:

    for dialect in dialect_list:

      print('Current Dialect: {0}'.format(dialect))

      current_dialect_path = os.path.join(dir, dialect)

      # Sorted so the row order does not depend on directory listing order.
      for audio_path in sorted(glob.glob(os.path.join(current_dialect_path, '*.wav'))):

        # sr=None keeps the file's own sampling rate instead of resampling.
        data, sampling_rate = librosa.load(audio_path, sr = None, mono=True)

        metadata = audio_metadata.load(audio_path)

        audio_info.append({'dialect': dialect,
                          'filename': os.path.basename(audio_path),
                          'sampling_rate': sampling_rate,
                          'duration' : librosa.get_duration(y = data, sr = sampling_rate),
                          'audio_shape' : data.shape,
                          # number of axes of the decoded array (loaded with mono=True above)
                          'n_channels': len(data.shape),
                          'filesize': f'{metadata.filesize/1024:.2f} KiB',
                          'bit_rate': f'{metadata.streaminfo.bitrate/1000:.1f} Kbps',
                          'bit_depth': metadata.streaminfo.bit_depth,
                          'filepath' : audio_path
                      })

  return pd.DataFrame(audio_info)

# **Audio download**

In [None]:
# Maps '#<dialect>' to the YouTube playlist that holds the recordings of that
# dialect.  The next cell writes every key and its URL on consecutive lines of
# playlist.txt, which is handed to yt-dlp through `-a`; the dialect names are
# later recovered by removing the '#'.
# NOTE(review): the leading '#' presumably makes yt-dlp treat the key lines as
# comments in the batch file - confirm against the yt-dlp documentation.
audio_url = {
            '#bogura':'https://www.youtube.com/playlist?list=PLh79TXh5wUA2sESaJxMUA-yKB753mnLpV',
            '#chittagong':'https://youtube.com/playlist?list=PLh79TXh5wUA1JJdT_o9fNuTuwRcW11wuJ',
            '#dinajpur':'https://www.youtube.com/playlist?list=PLh79TXh5wUA2iTmaHihqGJJXIPVrEvhtB',
            '#dhaka_old':'https://www.youtube.com/playlist?list=PLh79TXh5wUA2oivE_qqlZnhAdbZhJJ8nc',
            '#comilla':'https://www.youtube.com/playlist?list=PLh79TXh5wUA0W1NGAHkRxyHUSB9pb7K8O',
            '#chapai_nawabganj':'https://www.youtube.com/playlist?list=PLh79TXh5wUA3fNAJ9mGUVBF9gaPsYuCwb',
            '#barishal':'https://www.youtube.com/playlist?list=PLh79TXh5wUA1a9gvYcvnGDU2zT56OfzGj',
            '#bagherhat':'https://www.youtube.com/playlist?list=PLh79TXh5wUA0Uw-OvaVis7smYIxua_Q10',
            '#rajshahi':'https://www.youtube.com/playlist?list=PLh79TXh5wUA2dTraJg928KGAKNSaaY-2G',
            '#jessore':'https://www.youtube.com/playlist?list=PLh79TXh5wUA0RGss5T1EvrpJRxHXrr0_S',
            '#khulna':'https://www.youtube.com/playlist?list=PLh79TXh5wUA3tgiCsMq0ADkk4qYtLYMZT',
            '#kolkata':'https://www.youtube.com/playlist?list=PLh79TXh5wUA1X_brXuh6dCrB0Dd3USmwm',
            '#kushtia':'https://www.youtube.com/playlist?list=PLh79TXh5wUA2KFsndexCHzpxvOapGcyKe',
            '#manikganj':'https://www.youtube.com/playlist?list=PLh79TXh5wUA2irbH3RgbTO0dx4oxcsKPF',
            '#mymensingh':'https://www.youtube.com/playlist?list=PLh79TXh5wUA0-zv6ZZHMTyGwx0_iRrnWD',
            '#noakhali':'https://www.youtube.com/playlist?list=PLh79TXh5wUA1xJCfr-3YmZKKKpGQqopKG',
            '#rangpur':'https://www.youtube.com/playlist?list=PLh79TXh5wUA0xBXftNohSOmK6aDJG2DOn',
            '#shatkhira':'https://www.youtube.com/playlist?list=PLh79TXh5wUA3LjmFCWymquAm0ER037AYu',
            '#sirajgonj':'https://www.youtube.com/playlist?list=PLh79TXh5wUA0mfmyXNRkQRggLneN73cM0',
            '#sylhet':'https://www.youtube.com/playlist?list=PLh79TXh5wUA22CAiJs8p_p7MssS4kueRe',
             }

In [None]:
# playlist.txt is the batch file later given to yt-dlp: for every entry of
# audio_url it holds the key on one line and the playlist URL on the next.
filepath = os.path.join(source_directory, 'playlist.txt')

playlist_lines = ['{0}\n{1}\n'.format(playlist_tag, playlist_link)
                  for playlist_tag, playlist_link in audio_url.items()]

with open(filepath, 'w+') as playlist_file:
  playlist_file.writelines(playlist_lines)

In [None]:
def download_files(download_files_process):

  if download_files_process:

    !yt-dlp -f 'ba' -x --audio-format wav --download-archive '{source_directory}/downloaded_list.txt' -o '{source_directory}/%(playlist)s/%(playlist)s%(playlist_index)05d.%(ext)s' -a '{filepath}'

In [None]:
# Runs the download step; the function does nothing unless download_files_process is True.
download_files(download_files_process)

# **Get Dialect List**

In [None]:
# Dialect names are the audio_url keys with every '#' removed; the other cells
# use them as sub-folder names.
dialect_list = [playlist_tag.replace('#', '') for playlist_tag in audio_url]

# **Audio Trimming**

**Initializing Trimming Points** 

In [None]:
def initialize_trimming_points(process):
  """Build an empty trimming table for every downloaded ``*.wav`` file.

  Scans ``source_directory/<dialect>`` for each entry of the notebook-level
  ``dialect_list``.

  Args:
    process: When falsy, nothing is scanned.

  Returns:
    A dict that maps each audio file name (without its folder) to ``None``,
    meaning "no trimming points noted yet".  The dict is empty when the step
    is disabled or no file is found.
  """

  # Created before the flag check so the disabled path returns an empty dict
  # instead of failing on an unassigned local.
  trimming_points = {}

  if process:

    for dialect in dialect_list:

      current_dialect_path = os.path.join(source_directory, dialect)

      for audio_path in glob.glob(os.path.join(current_dialect_path, '*.wav')):

        trimming_points[os.path.basename(audio_path)] = None

  return trimming_points

**Write into file to note down the points**

In [None]:
def write_trim_points_into_file(trim_points_filename, update_trim_file_process):
  """Create or refresh the JSON file in which trimming points are noted.

  A missing or empty file is filled with one ``null`` entry per audio file
  reported by ``initialize_trimming_points``.  An existing file is rewritten
  only when its set of file names differs from the current one; entries that
  are already in the file keep their values.

  Args:
    trim_points_filename: Path of the JSON file.  The same path is used for
      reading and for writing.
    update_trim_file_process: When falsy, the file is left untouched.
  """

  if not update_trim_file_process:
    return

  trimming_points = initialize_trimming_points(update_trim_file_process)

  file_has_content = (os.path.isfile(trim_points_filename)
                      and os.stat(trim_points_filename).st_size > 0)

  if file_has_content:

    with open(trim_points_filename) as trim_points_file:
      trim_points_dict = json.load(trim_points_file)

    if trim_points_dict.keys() == trimming_points.keys():
      # Same set of files as before: nothing to add.
      return

    # Values already noted in the file win over the fresh None placeholders.
    trimming_points.update(trim_points_dict)

  with open(trim_points_filename, 'w') as trim_points_file:
    json.dump(trimming_points, trim_points_file, indent=2, sort_keys=True, separators=(',', ': '))

**Read the trimming points from the file, trim each audio file, and save the result to the trimmed directory using the same file and folder names**

In [None]:
def trim_audio_dataset(trim_points_filename, trimmed_output, trim_audio_data_process):
  """Cut the noted time ranges out of every audio file and save the result.

  The JSON file maps an audio file name to ``null`` (keep the file as it is)
  or to a list of ``[start_second, end_second]`` pairs measured on the
  original recording; a negative ``end_second`` means "up to the end of the
  file".  Pairs are expected in ascending order.  The dialect folder is the
  part of the file name before its first digit.

  Args:
    trim_points_filename: Path of the JSON file with the trimming points.
    trimmed_output: Directory that receives one sub-folder per dialect.
    trim_audio_data_process: When falsy, nothing is read or written.
  """

  if not trim_audio_data_process:
    return

  with open(trim_points_filename) as trim_points_file:
    trim_points_dict = json.load(trim_points_file)

  for key, points in trim_points_dict.items():

    current_dialect = re.split(r'\d+', key)[0]

    current_filepath = os.path.join(source_directory, current_dialect, key)

    current_file_output_directory = os.path.join(trimmed_output, current_dialect)

    os.makedirs(current_file_output_directory, exist_ok=True)

    # sr=None keeps the file's own sampling rate instead of resampling.
    audio, sr = librosa.load(current_filepath, sr = None, mono = True)

    # Seconds already cut out; later marks are shifted left by this amount
    # because they refer to positions in the original recording.
    removed_seconds = 0

    if points is not None:

      for start_second, end_second in points:

        # np.delete needs integer slice bounds, so second marks (which may be
        # fractional) are converted to whole sample indices.
        start_sample = max(int(round((start_second - removed_seconds) * sr)), 0)

        if end_second < 0:

          # Open-ended cut: remove everything up to and including the last sample.
          stop_sample = len(audio)

        else:

          stop_sample = int(round((end_second - removed_seconds) * sr))

          removed_seconds += end_second - start_second

        audio = np.delete(audio, slice(start_sample, stop_sample))

    output_path = os.path.join(current_file_output_directory, key)

    sf.write(output_path, audio, sr)

    print(output_path)

In [None]:
# The trimming points are kept in trim_points.json inside source_directory.
trim_points_filename = os.path.join(source_directory, 'trim_points.json')
# Each step is gated by its own flag: first refresh the points file, then trim.
write_trim_points_into_file(trim_points_filename, update_trim_file_process)
trim_audio_dataset(trim_points_filename, trimmed_output, trim_audio_data_process)

/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Development/input_directory/trimmed_output/barishal/barishal00002.wav
/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Development/input_directory/trimmed_output/barishal/barishal00003.wav
/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Development/input_directory/trimmed_output/barishal/barishal00004.wav
/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Development/input_directory/trimmed_output/barishal/barishal00005.wav
/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Development/input_directory/trimmed_output/barishal/barishal00006.wav
/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Development/input_directory/trimmed_output/barishal/barishal00007.wav
/content/drive/MyDrive/Research 2021/G9 

# **Audio Splitting**

*Error: "non-default argument follows default argument".
Correct example:*

def example(a, b, c=None, r="w", d=[], *ae,  **ab):

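(a, b) are required positional-or-keyword parameters (they have no default)

(c=None) is an optional parameter with a default value

(r="w") is also an optional parameter with a default value

(d=[]) is an optional parameter whose default is a list (a mutable default is shared between calls, so `d=None` is usually safer)

(*ae) is the var-positional parameter (it collects extra positional arguments)

(**ab) is the var-keyword parameter (it collects extra keyword arguments)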

In [None]:
def split_audio_into_segments(dialect_list, dir, segment_process, segment_time=10):
  """Cut every ``*.wav`` file of each dialect into fixed-length segments.

  Segments are written to ``segmented_output/<dialect>`` and numbered
  consecutively per dialect (``<dialect>00001.wav``, ...).  Source files are
  processed in sorted order so the same input always produces the same
  numbering.

  Args:
    dialect_list: Names of the dialect sub-folders inside ``dir``.
    dir: Directory that holds the audio to split, one sub-folder per dialect.
    segment_process: When falsy, nothing is read or written.
    segment_time: Length of one segment in seconds.
  """

  if not segment_process:
    return

  chunk_length_ms = segment_time*1000

  # A chunk more than two seconds shorter than requested (the leftover tail of
  # a file) is dropped.
  shortest_kept_ms = (segment_time-2)*1000

  for dialect in dialect_list:

    print('Current Dialect: {0}'.format(dialect))

    current_dialect_path = os.path.join(dir, dialect)

    current_file_output_directory = os.path.join(segmented_output, dialect)

    os.makedirs(current_file_output_directory, exist_ok=True)

    # Running number of the segments written for this dialect.
    count = 0

    # glob returns files in arbitrary order; sorting keeps segment names stable
    # between runs.
    for audio_path in sorted(glob.glob(os.path.join(current_dialect_path, '*.wav'))):

      audio = AudioSegment.from_file(audio_path, "wav")

      for chunk in make_chunks(audio, chunk_length_ms):

        if len(chunk) < shortest_kept_ms:
          continue

        count += 1

        chunk_name = os.path.join(current_file_output_directory, dialect+"{:05}.wav".format(count))

        chunk.export(chunk_name, format="wav")

In [None]:
# Split the trimmed audio with the default segment_time; gated by segment_process.
split_audio_into_segments(dialect_list, trimmed_output, segment_process)

Current Dialect: bogura
Current Dialect: chittagong
Current Dialect: dinajpur
Current Dialect: dhaka_old
Current Dialect: comilla
Current Dialect: chapai_nawabganj
Current Dialect: barishal
Current Dialect: bagherhat
Current Dialect: rajshahi
Current Dialect: jessore
Current Dialect: khulna
Current Dialect: kolkata
Current Dialect: kushtia
Current Dialect: manikganj
Current Dialect: mymensingh
Current Dialect: noakhali
Current Dialect: rangpur
Current Dialect: shatkhira
Current Dialect: sirajgonj
Current Dialect: sylhet


In [None]:
# Gather the metadata of the segmented clips; gated by audio_info_process.
audio_info_df = get_audio_info(dialect_list, segmented_output, audio_info_process)

Current Dialect: bogura
Current Dialect: chittagong
Current Dialect: dinajpur
Current Dialect: dhaka_old
Current Dialect: comilla
Current Dialect: chapai_nawabganj
Current Dialect: barishal
Current Dialect: bagherhat
Current Dialect: rajshahi
Current Dialect: jessore
Current Dialect: khulna
Current Dialect: kolkata
Current Dialect: kushtia
Current Dialect: manikganj
Current Dialect: mymensingh
Current Dialect: noakhali
Current Dialect: rangpur
Current Dialect: shatkhira
Current Dialect: sirajgonj
Current Dialect: sylhet


In [None]:
# Display the metadata table built in the previous cell.
audio_info_df

Unnamed: 0,dialect,filename,sampling_rate,duration,audio_shape,n_channels,filesize,bit_rate,bit_depth,filepath
0,bogura,bogura00001.wav,48000,10.0,"(480000,)",1,937.54 KiB,768.0 Kbps,16,/content/drive/MyDrive/Research 2021/G9 192039...
1,bogura,bogura00002.wav,48000,10.0,"(480000,)",1,937.54 KiB,768.0 Kbps,16,/content/drive/MyDrive/Research 2021/G9 192039...
2,bogura,bogura00003.wav,48000,10.0,"(480000,)",1,937.54 KiB,768.0 Kbps,16,/content/drive/MyDrive/Research 2021/G9 192039...
3,bogura,bogura00004.wav,48000,10.0,"(480000,)",1,937.54 KiB,768.0 Kbps,16,/content/drive/MyDrive/Research 2021/G9 192039...
4,bogura,bogura00005.wav,48000,10.0,"(480000,)",1,937.54 KiB,768.0 Kbps,16,/content/drive/MyDrive/Research 2021/G9 192039...
...,...,...,...,...,...,...,...,...,...,...
6330,sylhet,sylhet00829.wav,48000,10.0,"(480000,)",1,937.54 KiB,768.0 Kbps,16,/content/drive/MyDrive/Research 2021/G9 192039...
6331,sylhet,sylhet00830.wav,48000,10.0,"(480000,)",1,937.54 KiB,768.0 Kbps,16,/content/drive/MyDrive/Research 2021/G9 192039...
6332,sylhet,sylhet00831.wav,48000,10.0,"(480000,)",1,937.54 KiB,768.0 Kbps,16,/content/drive/MyDrive/Research 2021/G9 192039...
6333,sylhet,sylhet00832.wav,48000,10.0,"(480000,)",1,937.54 KiB,768.0 Kbps,16,/content/drive/MyDrive/Research 2021/G9 192039...


# **To  CSV**

In [None]:
def export_to_csv(dialect_list, dir, to_csv_process):
  """Write ``data.csv`` with one row per ``*.wav`` file found under ``dir``.

  On the first run the file gets the columns ``filename`` and ``dialect``
  plus the empty annotation columns ``gender``, ``no of speaker`` and
  ``annotation``.  When ``data.csv`` already has content, the current file
  list is left-joined with it on ``filename``, so values already entered for a
  file are kept and newly found files get empty cells.

  Args:
    dialect_list: Names of the dialect sub-folders inside ``dir``.
    dir: Directory that holds the audio and receives ``data.csv``.
    to_csv_process: When falsy, nothing is read or written.
  """

  if not to_csv_process:
    return

  audio_info = []

  for dialect in dialect_list:

    current_dialect_path = os.path.join(dir, dialect)

    # Sorted so the row order does not depend on directory listing order.
    for audio_path in sorted(glob.glob(os.path.join(current_dialect_path, '*.wav'))):

      audio_info.append({'filename': os.path.basename(audio_path),
                        'dialect': dialect,})

  # Explicit columns keep 'filename' and 'dialect' present even when no audio
  # file was found, so the merge below always has its key.
  audio_info_df = pd.DataFrame(audio_info, columns=['filename', 'dialect'])

  csv_file_path = os.path.join(dir, 'data.csv')

  if os.path.exists(csv_file_path) and os.stat(csv_file_path).st_size != 0:

    existing_df = pd.read_csv(csv_file_path)

    # The dialect always comes from the folder scan, so the stored column is
    # dropped before merging (tolerating a file that has no such column).
    annotations_df = existing_df.drop(columns=['dialect'], errors='ignore')

    output_df = audio_info_df.merge(annotations_df, on='filename', how='left')

  else:

    annotation_columns = ['gender', 'no of speaker', 'annotation']

    output_df = audio_info_df.assign(**dict.fromkeys(annotation_columns, ''))

  output_df.to_csv(csv_file_path, index=False)

In [None]:
# Write data.csv into the segmented output folder; gated by to_csv_process.
export_to_csv(dialect_list, segmented_output, to_csv_process)

# **Trial**

In [None]:
# Scratch experiment for the merge used in export_to_csv: df1 carries values
# for a few keys, df2 lists every key with an empty value.
df1 = pd.DataFrame({"F": ["B0", "B1", "C1", "C2","S1"],
                    "G": ["M", "B", "F", "M","B"]})

df2 = pd.DataFrame({"F": ["B0", "B1", "B2", "C1", "C2","C3", "S1", "S2"],
                    "G": [""] * 8})

frames = [df1, df2]

# df_merged = pd.concat([df1,df2]).drop_duplicates().reset_index(drop=False)
# Left join on "F": every key of df2 is kept, the value comes from df1 where a
# match exists, and df2's own empty column (suffixed G_x) is discarded.
left_joined = df2.merge(df1, on='F', how='left')
df_merged = left_joined.drop(columns=['G_x'])
df_merged

Unnamed: 0,F,G_y
0,B0,M
1,B1,B
2,B2,
3,C1,F
4,C2,M
5,C3,
6,S1,B
7,S2,


In [None]:
# sir_root_dir = "/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Abujar Sir's Regional Speech Dataset/regionData"
# barisal = "/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Abujar Sir's Regional Speech Dataset/regionData/Barishal"
# temp_combined = np.array([])
# count = 0
# test_path = "/content/drive/MyDrive/Research 2021/G9 1920392_Zannat Chowdhury_1921371_Md Zakaria Kabir/Abujar Sir's Regional Speech Dataset/Test/"
# os.makedirs(test_path, exist_ok=True) 
# def numericalSort(value):
#   return int(re.findall(r'\d+', value)[-1])
  
# for audio_path in sorted(glob.glob(os.path.join(barisal, '*.wav')),  key=numericalSort):
#   # audio = AudioSegment.from_wav(audio_path)
#   audio, sr = librosa.load(audio_path, sr = None, mono=True)


#   if len(temp_combined)/sr<10.0:
#     print(audio_path)
#     temp_combined = np.concatenate((temp_combined, audio), axis=None)
#   else:
#     count+=1
#     sf.write(f"{test_path}/combined_file{count}.wav", temp_combined, sr)
#     temp_combined = audio
  

In [None]:
# import csv

# def test_write_trim_points_into_file(trim_points_filename, update_trim_file_process=True):

#   if update_trim_file_process:

#     trimming_points=initialize_trimming_points(update_trim_file_process)

#     if not os.path.isfile(trim_points_filename) :

#       with open(trim_points_filename, 'w+') as f:
#           w = csv.DictWriter(f, trimming_points.keys())
#           w.writeheader()
#           w.writerow(trimming_points)


#     # else:

#     #   trim_points_dict = json.load(open(os.path.join(source_directory, trim_points_filename)))

#     #   if not trim_points_dict.keys()==trimming_points.keys():

#     #     trimming_points.update(trim_points_dict)

#     #     json.dump(trimming_points, open(trim_points_filename,'w'), indent=2, sort_keys=True)

In [None]:
# trim_points_filename = os.path.join(source_directory, 'test_trim_points.csv')
# test_write_trim_points_into_file(trim_points_filename)

In [None]:
# import csv

# somedict = {"test1": [(1,2),(3,4)], "testing2": [(1,2),(3,4)], "testing3": [(1,2),(3,4)], "testing4": '', "testing5": 5}
# df=pd.DataFrame(somedict.items())
# df.to_csv('mycsvfile.csv', index=False)
# # with open('mycsvfile.csv','w+') as f:

# #     w = csv.writer(f)
    
# #     w.writerows(somedict.items())
    
# points_dict = pd.read_csv("mycsvfile.csv", index_col=False)
# print(points_dict)
# points_dict= points_dict.to_dict('dict')
# print(points_dict)
# # for key, points in points_dict.items():
# #   print(key, points)