In [None]:
# https://www.kaggle.com/code/sagniksanyal/birdclef-2022-torchaudio-audiomentations-skimpy


<h1 style = "font-size:45px;font-family: Comic Sans MS;text-align: center;background-color:#800000;color:#FFFFFF">Audio Albumentations</h1>

<h3 style="font-family:Comic Sans MS">The importance of augmentations for improving performance in computer vision is well known. The same holds when we work with audio data: augmentations and audio transforms play an important role here too. In this notebook, I have tried to cover nearly all the augmentations that can be applied to audio data. The purpose is to provide a basic intuition for audio augmentations by letting you listen to the change in the audio yourself and visualize the difference in the waveform.<br>
I have used both torchaudio transforms and the audiomentations library to cover these augmentations.<br><br>
    I hope you like this and that it helps you with this competition.</h3>


In [None]:
!pip install audiomentations

In [None]:
import os
import pandas as pd
import numpy
import torch
import math
import torchaudio
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import plotly.express as px
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
import librosa
import librosa.display
import IPython.display as ipd
from IPython.display import Audio, display
import sklearn
import warnings
import gc 
import torchaudio.functional as F
import torchaudio.transforms as T
# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



<h1 style = "font-family: Comic Sans MS">Some functions we will use</h1>

In [None]:
def play_audio(waveform, sample_rate):
    """Render an inline audio player for a mono or stereo waveform.

    Args:
        waveform: 2-D ``(channels, frames)`` signal, either a ``numpy.ndarray``
            or a ``torch.Tensor``. Tensors may live on any device and may
            track gradients; they are detached and copied to the CPU first.
        sample_rate: Playback rate, forwarded to ``IPython.display.Audio``.

    Raises:
        ValueError: If the waveform has a channel count other than 1 or 2.
    """
    if not isinstance(waveform, numpy.ndarray):
        # A bare ``.numpy()`` raises for CUDA tensors and for tensors that
        # require grad, so detach and move to host memory before converting.
        waveform = waveform.detach().cpu().numpy()
    num_channels, _ = waveform.shape  # unpacking also enforces a 2-D input
    if num_channels == 1:
        display(Audio(waveform[0], rate=sample_rate))
    elif num_channels == 2:
        # A (left, right) tuple is rendered as a stereo player.
        display(Audio((waveform[0], waveform[1]), rate=sample_rate))
    else:
        raise ValueError("Waveform with more than 2 channels are not supported.")
        
def print_stats(waveform, sample_rate=None, src=None):
    """Print summary statistics of a waveform.

    Args:
        waveform: Array-like signal exposing ``shape``, ``dtype``, ``max``,
            ``min``, ``mean`` and ``std`` (the notebook passes torch tensors).
        sample_rate: Optional sample rate; printed only when given.
        src: Optional label of the signal's origin; printed as a banner when given.
    """
    if src:
        print("-" * 10)
        print("Source:", src)
        print("-" * 10)
    if sample_rate:
        print("Sample Rate:", sample_rate)
    # The statistics describe the waveform itself, so they are printed whether
    # or not a sample rate was supplied.
    print("Shape:", tuple(waveform.shape))
    print("Dtype:", waveform.dtype)
    print(f" - Max:     {waveform.max().item():6.3f}")
    print(f" - Min:     {waveform.min().item():6.3f}")
    print(f" - Mean:    {waveform.mean().item():6.3f}")
    print(f" - Std Dev: {waveform.std().item():6.3f}")
    print()
    print(waveform)
    print()
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None):
    """Plot every channel of a waveform against time in seconds.

    Args:
        waveform: 2-D ``(channels, frames)`` signal, either a ``numpy.ndarray``
            or a ``torch.Tensor`` (on any device, with or without grad).
        sample_rate: Samples per second; converts frame indices to seconds.
        title: Figure title.
        xlim: Optional x-axis limits applied to every channel plot.
        ylim: Optional y-axis limits applied to every channel plot.
    """
    if not isinstance(waveform, numpy.ndarray):
        # A bare ``.numpy()`` raises for CUDA tensors and for tensors that
        # require grad, so detach and move to host memory before converting.
        waveform = waveform.detach().cpu().numpy()
    num_channels, num_frames = waveform.shape
    time_axis = numpy.arange(num_frames) / sample_rate
    figure, axes = plt.subplots(num_channels, figsize=(12, 6))
    if num_channels == 1:
        # With a single subplot, matplotlib returns a bare Axes instead of an array.
        axes = [axes]
    for c in range(num_channels):
        axes[c].plot(time_axis, waveform[c], linewidth=0.1, color="#A300F9")
        axes[c].grid(True)
        if num_channels > 1:
            axes[c].set_ylabel(f'Channel {c+1}')
        if xlim:
            axes[c].set_xlim(xlim)
        if ylim:
            axes[c].set_ylim(ylim)
    figure.suptitle(title)
    plt.show(block=False)
def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    """Draw a matplotlib spectrogram for every channel of a waveform.

    Args:
        waveform: 2-D ``(channels, frames)`` signal, either a ``numpy.ndarray``
            or a ``torch.Tensor`` (on any device, with or without grad).
        sample_rate: Samples per second, passed to ``Axes.specgram`` as ``Fs``.
        title: Figure title.
        xlim: Optional x-axis limits applied to every channel plot.
    """
    if not isinstance(waveform, numpy.ndarray):
        # Accept tensors as well as arrays, like the other plotting helpers;
        # detach and move to host memory so the conversion cannot raise.
        waveform = waveform.detach().cpu().numpy()

    num_channels, _ = waveform.shape  # unpacking also enforces a 2-D input

    figure, axes = plt.subplots(num_channels, 1, figsize=(12, 6))
    if num_channels == 1:
        # With a single subplot, matplotlib returns a bare Axes instead of an array.
        axes = [axes]
    for c in range(num_channels):
        axes[c].specgram(waveform[c], Fs=sample_rate)
        if num_channels > 1:
            axes[c].set_ylabel(f'Channel {c+1}')
        if xlim:
            axes[c].set_xlim(xlim)
    figure.suptitle(title)
    plt.show(block=False)
def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None):
    """Show a power spectrogram as an image on a decibel scale.

    Args:
        spec: 2-D power spectrogram; converted with ``librosa.power_to_db``.
        title: Axes title; ``'Spectrogram (db)'`` is used when it is falsy.
        ylabel: Label of the vertical axis.
        aspect: Aspect setting forwarded to ``imshow``.
        xmax: Optional upper frame limit; the x-axis then spans ``(0, xmax)``.
    """
    figure, axis = plt.subplots(1, 1)
    heading = title if title else 'Spectrogram (db)'
    axis.set_title(heading)
    axis.set_ylabel(ylabel)
    axis.set_xlabel('frame')
    spec_db = librosa.power_to_db(spec)
    # origin='lower' puts the first row of the array at the bottom of the image.
    image = axis.imshow(spec_db, origin='lower', aspect=aspect)
    if xmax:
        axis.set_xlim((0, xmax))
    figure.colorbar(image, ax=axis)
    plt.show(block=False)

# <h1 style = "font-family: Comic Sans MS">Load The Data</h1>

In [None]:
# Load the BirdCLEF 2021 training metadata and preview its first rows.
train_csv=pd.read_csv('../input/birdclef-2021/train_metadata.csv')
train_csv.head()

# <h1 style = "font-family: Comic Sans MS">GETTING A FEW SAMPLES</h1>

In [None]:
# train_csv[train_csv['primary_label'] =='astfly'].sample(1, random_state = 33)
# DataFrame.sample(...) options:
# n = 1: draw exactly one row
# frac = 0.5: draw that fraction of the rows instead of a fixed count
# replace = True: sample with replacement, so the same row can be drawn more than once


In [None]:
# Create the full path of every recording so the audio files are easy to access.
base_dir = '../input/birdclef-2021/train_short_audio'
# Layout: <base_dir>/<primary_label>/<filename>
train_csv['full_path'] = base_dir+ '/' + train_csv['primary_label'] + '/' + train_csv['filename']


def sample_audio_path(label, random_state=33):
    """Return the path of one randomly drawn recording of species ``label``.

    Args:
        label: Value of ``primary_label`` to filter ``train_csv`` by.
        random_state: Seed for ``DataFrame.sample`` so the draw is repeatable.

    Returns:
        The ``full_path`` string of the sampled row.
    """
    recordings = train_csv[train_csv['primary_label'] == label]
    # .values is a one-element object array here; [0] extracts the path string.
    return recordings.sample(1, random_state=random_state)['full_path'].values[0]


# Now let's sample a few audio files, one per species.
astfly = sample_audio_path("astfly")
casvir = sample_audio_path("casvir")
subfly = sample_audio_path("subfly")
wilfly = sample_audio_path("wilfly")
verdin = sample_audio_path("verdin")
solsan = sample_audio_path("solsan")


birds= ["astfly", "casvir", "subfly", "wilfly", "verdin",'solsan']

  # Loading audio data into Tensor

In [None]:
# Decode the sampled astfly recording into a waveform tensor plus its sample rate.
waveform, sample_rate = torchaudio.load(astfly)
print(waveform, sample_rate)

In [None]:
waveform, sample_rate = torchaudio.load(astfly)
# The waveform is kept on the CPU on purpose: `Tensor.to(device)` returns a new
# tensor instead of moving the existing one, so a bare call has no effect, and
# the playback helper converts the tensor to NumPy, which needs host memory.
print_stats(waveform, sample_rate=sample_rate)
play_audio(waveform, sample_rate)

# SAVING AUDIO TO FILE

Saved the astfly audio (ogg format) that we loaded above in mp3 format 

In [None]:
# Write the loaded waveform back to disk. No explicit format is passed, so the
# MP3 encoding is presumably chosen from the file extension — TODO confirm.
path = "./audio.mp3"
torchaudio.save(path, waveform, sample_rate)  # save audio 

In [None]:
# Reload the recording and display the shape of its tensor (the helpers above
# unpack it as (channels, frames)).
waveform, sample_rate = torchaudio.load(astfly)
waveform.shape

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align:center">RESAMPLE</h1>

In [None]:
# The sample rate is the number of samples taken per second to represent the sound digitally.
# Floor division keeps the new rate an integer: sample rates are whole numbers
# of samples per second, and true division would always produce a float.
new_sample_rate = sample_rate // 10
transformed = torchaudio.transforms.Resample(sample_rate, new_sample_rate)(waveform)
print("Shape of transformed waveform: {}".format(transformed.size()))
play_audio(transformed, new_sample_rate)

In [None]:
# Compare the waveform before and after resampling; each plot is given its own
# sample rate so both time axes are in seconds.
plot_waveform(waveform,sample_rate,title='Original')
plot_waveform(transformed,new_sample_rate,title='resampled')

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center"> APPLYING EFFECTS(SOX)</h1>
<h3 style="font-family:Comic Sans MS">
1. Speed changing<br>
2. Reverberation

In [None]:
waveform1, sample_rate1 = torchaudio.load(astfly)
# Both waveforms stay on the CPU: `Tensor.to(device)` returns a new tensor
# rather than moving the existing one, so a bare call changes nothing, and the
# plotting helpers convert the tensors to NumPy, which needs host memory.
effects = [
  ["speed", "1.2"],  # increase the speed
                     # This only changes the sample rate, so it is necessary to
                     # add the `rate` effect with the original sample rate after this.
  ["rate", f"{sample_rate1}"],
  ["reverb", "-w"],  # Reverberation gives some dramatic feeling
]
waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects)
# The clips are longer than the plotted window; xlim zooms in on (-0.1 s, 3.2 s).
plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-.1, 3.2))
plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-.1, 3.2))
print_stats(waveform1, sample_rate=sample_rate1, src="Original")
print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied")
gc.collect()

<h3 style="font-family:Comic Sans MS">See the effect for yourself

In [None]:
# Listen to the original clip first, then to the version with the effects applied.
play_audio(waveform1, sample_rate1)
play_audio(waveform2, sample_rate2)

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center">Adding background noise</h1>

<h3 style="font-family:Comic Sans MS">To add background noise to audio data, you can simply add audio Tensor and noise Tensor. A commonly used way to adjust the intensity of noise is to change Signal-to-Noise Ratio (SNR).

In [None]:
def _get_sample(path, resample=None):
    """Load an audio file through SoX effects.

    The ``remix 1`` effect is always applied; a ``rate`` effect targeting
    ``resample`` is appended when ``resample`` is truthy.

    Args:
        path: Location of the audio file to load.
        resample: Optional target sample rate.

    Returns:
        Whatever ``torchaudio.sox_effects.apply_effects_file`` returns; callers
        in this notebook unpack it into two values (the waveform and one more).
    """
    rate_effect = [["rate", f'{resample}']] if resample else []
    return torchaudio.sox_effects.apply_effects_file(
        path, effects=[["remix", "1"], *rate_effect]
    )

def get_noise_sample(*, resample=None):
    """Load the recording used as background noise (the sampled casvir clip).

    Args:
        resample: Optional target sample rate, forwarded to ``_get_sample``.
    """
    # casvir = /kaggle/input/birdclef-2021/train_short_audio/casvir/XC128912.ogg
    noise_path = casvir
    return _get_sample(noise_path, resample=resample)

def get_speech_sample(*, resample=None):
    """Load the recording used as the foreground ("speech") signal.

    Args:
        resample: Optional target sample rate, forwarded to ``_get_sample``.
    """
    # NOTE(review): this is the same casvir clip that get_noise_sample loads, so
    # the "speech" and "noise" signals are identical — confirm whether a
    # different recording was intended here.
    speech_path = casvir
    return _get_sample(speech_path, resample=resample)

In [None]:
sample_rate = 6000
# Both clips stay on the CPU: `Tensor.to(device)` returns a new tensor rather
# than moving the existing one, so a bare call changes nothing, and the
# plotting helpers convert the tensors to NumPy, which needs host memory.
speech, _ = get_speech_sample(resample=sample_rate)
noise, _ = get_noise_sample(resample=sample_rate)
# Trim the noise so it is never longer than the speech it will be mixed with.
noise = noise[:, :speech.shape[1]]
plot_waveform(noise, sample_rate, title="Background noise")
plot_specgram(noise, sample_rate, title="Background noise")
# how to read specgram: https://analyticsindiamag.com/hands-on-tutorial-on-visualizing-spectrograms-in-python/

In [None]:
# Listen to the clip that serves as background noise.
play_audio(noise, sample_rate)


In [None]:
# L2 norms of the two signals (amplitude-like quantities, not squared power).
speech_power = speech.norm(p=2)
noise_power = noise.norm(p=2)
for snr_db in [20]:
    # Decibels are base-10, and a ratio of L2 norms is an amplitude ratio, so
    # the linear factor is 10 ** (dB / 20).
    snr = 10 ** (snr_db / 20)
    scale = snr * noise_power / speech_power
    noisy_speech = (scale * speech + noise) / 2
    # speech and noise were both resampled to `sample_rate`, so the mix must be
    # plotted and played at that rate to keep the correct speed and pitch.
    plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")  # SNR: Signal-to-noise ratio
    play_audio(noisy_speech, sample_rate)
gc.collect()

# <h1 style = "font-size:30px;font-family: Comic Sans MS">SpecAugment</h1>
<h4 style="font-family:Comic Sans MS">SpecAugment is a popular augmentation technique applied to spectrograms. Torchaudio implements TimeStretch, TimeMasking and FrequencyMasking.</h4>

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center">Time Masking</h1>

In [None]:
waveform, sample_rate = torchaudio.load(subfly)
# Kept on the CPU: `Tensor.to(device)` returns a new tensor rather than moving
# the existing one, so a bare call changes nothing, and the plotting helpers
# convert the tensor to NumPy, which needs host memory.
print(waveform.shape)
# play_audio(waveform, sample_rate)
plot_waveform(waveform, sample_rate, title="Original")
plot_specgram(waveform, sample_rate, title="Original")

In [None]:
torch.random.manual_seed(4)

# Short-time Fourier transform (STFT) settings:
#   n_fft      - length of the FFT applied to each frame
#   win_length - window size; None falls back to n_fft
#   hop_length - distance between two adjacent sliding-window frames (the frame shift)
n_fft, win_length, hop_length = 2048, None, 400

stft_settings = {"n_fft": n_fft, "win_length": win_length, "hop_length": hop_length}
spectrogram = T.Spectrogram(**stft_settings)  # T is torchaudio.transforms

# Perform the transformation. The result was a torch tensor of shape
# [1, 1025, 8259] for the clip this notebook was written with.
spec = spectrogram(waveform)

print(spec.shape)
# Plot channel 0, i.e. [1025, 8259]: x = frame (8259), y = frequency bin (1025).
# The values are converted to dB by the plotting helper, which treats them as
# power (T.Spectrogram's default output).
plot_spectrogram(spec[0], title="Original")

In [None]:

# Mask one randomly placed block of consecutive frames along the time axis.
masking = T.TimeMasking(time_mask_param=1300) # max length of time mask; uniformly sample from frame axis
# spec is overwritten with its masked version, so re-running this cell masks it again.
spec = masking(spec)
plot_spectrogram(spec[0], title="Masked along time axis")


In [None]:
# The Griffin-Lim algorithm uses the phase constraints between frames to converge
# iteratively, so it can reconstruct a speech signal from a spectrogram even
# without the original phase information.
griffin_lim = T.GriffinLim(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
)
# Turn the time-masked spectrogram back into a waveform so the mask can be heard.
waveform_n=griffin_lim(spec)
plot_spectrogram(spec[0], title="Masked along time axis")

play_audio(waveform_n, sample_rate)

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center">Frequency Masking</h1>

In [None]:
torch.random.manual_seed(4)
waveform, sample_rate = torchaudio.load(subfly)
# Kept on the CPU: `Tensor.to(device)` returns a new tensor rather than moving
# the existing one, so a bare call changes nothing, and the playback helper
# converts the tensor to NumPy, which needs host memory.
n_fft = 2048
win_length = None
hop_length = 400

spectrogram = T.Spectrogram(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length
)
# Perform transformation
spec = spectrogram(waveform)
plot_spectrogram(spec[0], title="Original")
# freq_mask_param is the maximum length of the frequency mask; the masked band
# is chosen uniformly among the frequency bins.
masking = T.FrequencyMasking(freq_mask_param=1000)
spec = masking(spec)
# Reconstruct a waveform from the masked spectrogram so the mask can be heard.
griffin_lim = T.GriffinLim(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
)
waveform_n=griffin_lim(spec)
plot_spectrogram(spec[0], title="Masked along frequency axis")
# Play the original first, then the reconstruction of the masked spectrogram.
play_audio(waveform, sample_rate)
play_audio(waveform_n, sample_rate)
gc.collect()

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center">FADE</h1>

In [None]:
# Show and play the untouched recording first.
waveform, sample_rate = torchaudio.load(astfly)
plot_waveform(waveform, sample_rate, title='original')
play_audio(waveform, sample_rate)

# Apply a linear fade-in and fade-out, then show and play the result.
# NOTE(review): the fade lengths look like frame counts, which would make both
# fades only a few milliseconds long at typical sample rates — confirm.
fade = T.Fade(fade_in_len=200, fade_out_len=100, fade_shape='linear')
waveform1 = fade(waveform)
plot_waveform(waveform1, sample_rate, title='fade')
play_audio(waveform1, sample_rate)
gc.collect()

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center">VOLUME TRANSFORM</h1>

In [None]:
# Apply a gain of 29 dB and play the clip before and after the change.
# NOTE(review): a gain this large presumably drives many samples into clipping — confirm.
vol=T.Vol(gain=29, gain_type='db')
waveform, sample_rate = torchaudio.load(subfly)
play_audio(waveform, sample_rate)
waveform1=vol(waveform)
play_audio(waveform1, sample_rate)
gc.collect()


# <h1 style = "font-size:30px;font-family: Comic Sans MS"> AUDIOMENTATIONS</h1>

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center">Time Stretch & Clipping</h1>

In [None]:
from audiomentations import ClippingDistortion, Compose, TimeStretch

# Both transforms use p=1.0, so each is applied on every call.
clipping = ClippingDistortion(
    min_percentile_threshold=20, max_percentile_threshold=40, p=1.0
)
stretching = TimeStretch(min_rate=0.8, max_rate=0.9, leave_length_unchanged=False, p=1.0)
augmenter = Compose([clipping, stretching])

waveform, sample_rate = torchaudio.load(subfly)
play_audio(waveform, sample_rate)
# audiomentations works on NumPy arrays, so convert there and back.
augmented = augmenter(samples=waveform.numpy(), sample_rate=sample_rate)
waveform1 = torch.from_numpy(augmented)
play_audio(waveform1, sample_rate)

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center">PITCH SHIFT & POLARITY INVERSION</h1>

In [None]:
from audiomentations import PitchShift,PolarityInversion
# Compose comes from the import in the "Time Stretch & Clipping" cell above.
# Shift the pitch by -2 to -1 semitones, then invert the polarity; p=1.0 applies both every time.
augmenter = Compose([PitchShift(min_semitones=-2, max_semitones=-1, p=1.0),PolarityInversion(p=1.0)])
waveform, sample_rate = torchaudio.load(subfly)
play_audio(waveform, sample_rate)
# audiomentations works on NumPy arrays, so convert there and back.
waveform1 = augmenter(samples=waveform.numpy(), sample_rate=sample_rate)
waveform1=torch.from_numpy(waveform1)
play_audio(waveform1, sample_rate)
plot_waveform(waveform, sample_rate, title='original')
plot_waveform(waveform1, sample_rate, title='Augmented')

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center">FORWARD SHIFT</h1>

In [None]:
from audiomentations import Shift
# min_fraction == max_fraction pins the shift to exactly 0.5 (the section title
# calls the positive direction "forward").
# NOTE(review): newer audiomentations releases may have renamed these arguments
# (min_shift / max_shift) — confirm against the installed version.
forward_augmenter = Compose([Shift(min_fraction=0.5, max_fraction=0.5, p=1.0)])
waveform, sample_rate = torchaudio.load(subfly)
play_audio(waveform, sample_rate)
# audiomentations works on NumPy arrays, so convert there and back.
waveform1 = forward_augmenter(samples=waveform.numpy(), sample_rate=sample_rate)
waveform1=torch.from_numpy(waveform1)
play_audio(waveform1, sample_rate)
plot_waveform(waveform, sample_rate, title='original')
plot_waveform(waveform1, sample_rate, title='Forward Shift')

# <h1 style = "font-size:30px;font-family: Comic Sans MS;text-align: center">BACKWARD SHIFT</h1>

In [None]:
# Shift and Compose come from the audiomentations imports in earlier cells.
# A negative fraction shifts in the opposite direction; here it is pinned to exactly -0.25.
backward_augmenter = Compose([Shift(min_fraction=-0.25, max_fraction=-0.25, p=1.0)])
waveform, sample_rate = torchaudio.load(subfly)
play_audio(waveform, sample_rate)
# audiomentations works on NumPy arrays, so convert there and back.
waveform1 = backward_augmenter(samples=waveform.numpy(), sample_rate=sample_rate)
waveform1=torch.from_numpy(waveform1)
play_audio(waveform1, sample_rate)
plot_waveform(waveform, sample_rate, title='original')
plot_waveform(waveform1, sample_rate, title='Backward Shift')

# Skimpy

In [None]:
!pip -q install skimpy --user

In [None]:

from plotly.offline import init_notebook_mode,iplot
# Set up Plotly's offline rendering inside the notebook
# (connected=True presumably loads plotly.js from the web instead of embedding it — confirm).
init_notebook_mode(connected=True)
import numpy as np 
import pandas as pd 
import skimpy 
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go

In [None]:
# All BirdCLEF 2022 inputs live in one dataset directory.
BIRDCLEF_2022_DIR = "../input/birdclef-2022"

taxo = pd.read_csv(f"{BIRDCLEF_2022_DIR}/eBird_Taxonomy_v2021.csv")
ss = pd.read_csv(f"{BIRDCLEF_2022_DIR}/sample_submission.csv")
train = pd.read_csv(f"{BIRDCLEF_2022_DIR}/train_metadata.csv")
test = pd.read_csv(f"{BIRDCLEF_2022_DIR}/test.csv")
scored = pd.read_json(f"{BIRDCLEF_2022_DIR}/scored_birds.json")
# train_meta holds the same table as `train`; copy the parsed frame instead of
# reading and parsing the same CSV a second time.
train_meta = train.copy()

In [None]:
# Summarise the eBird taxonomy table.
skimpy.skim(taxo)

In [None]:
# Summarise the BirdCLEF 2022 training metadata.
skimpy.skim(train)

In [None]:
# train_meta is loaded from the same train_metadata.csv as `train`, so this
# summary is expected to match the previous one.
skimpy.skim(train_meta)

In [None]:
# Summarise the test table.
skimpy.skim(test)

In [None]:
# Plot every training recording on a world map, coloured by its primary label.
fig = px.scatter_geo(train ,lat = "latitude", lon = "longitude", color = "primary_label")
fig.show()

In [None]:
# Same map, coloured by recording rating; hovering shows the rating and the label.
layout_options = {
    "title": "rating with primary_labels",
    "font": {"family": "Courier New, monospace", "size": 10, "color": "RebeccaPurple"},
    "margin": {"l": 40, "r": 40, "t": 100, "b": 80},
}
fig = px.scatter_geo(
    train,
    lat="latitude",
    lon="longitude",
    color="rating",
    hover_data=["rating", "primary_label"],
)
fig.update_layout(**layout_options)
fig.show()