In [18]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
import torchaudio
import numpy as np
from tqdm import tqdm
import soundfile

In [19]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

Using cpu


In [20]:
epochs=50
lr=0.001
batch_size=32

In [21]:
class AudioDataset(Dataset):
  def __init__(self,root):
    self.dir_path=root
    self.classes=os.listdir(self.dir_path)
    print(self.classes)

    self.data_paths=[]
    self.labels=[]

    for root ,dirs,files in os.walk(self.dir_path):
      for file in files:

        label=os.path.basename(root)
        data_path=os.path.join(root,file)
        self.data_paths.append(data_path)
        self.labels.append(self.classes.index(label))
    print(f"{len(self.labels)} datas loaded from {len(set(self.labels))} classes")


  def __len__(self):
    return len(self.labels)
  

  def __getitem__(self, index):
    data_path = self.data_paths[index]
    label = self.labels[index]
    signal, sample_rate = torchaudio.load(data_path)
    signal_mono = torch.mean(signal, dim=0, keepdim=True)

    new_sample_rate = 8000
    transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
    signal_mono_transformed = transform(signal_mono)

    return signal_mono_transformed, label

In [22]:
dataset = AudioDataset("/content/drive/MyDrive/MusicClasiification/Dataset")
print(len(dataset))
train_size = int(len(dataset)*0.8)
test_size  = len(dataset)-train_size

train_dataset,test_dataset=torch.utils.data.random_split(dataset,[train_size,test_size])

train_data_loader=torch.utils.data.DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
test_data_loader=torch.utils.data.DataLoader(test_dataset,batch_size=batch_size,shuffle=True)

['sajjad', 'zeynab', 'amir', 'hossein', 'parisa', 'nahid', 'maryam', 'alireza', 'zahra', 'morteza', 'mohammadali']
1384 datas loaded from 11 classes
1384


##Model

In [23]:
class M5(nn.Module):
    def __init__(self, n_input=1, n_output=35, stride=8, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = torch.flatten(x, start_dim=1)
        x = self.fc1(x)
        x = F.softmax(x, dim=1)
        return x

    def accuracy(self, preds, labels):
        maxs, indices = torch.max(preds, 1)
        acc = torch.sum(indices == labels) / len(preds)
        return acc.cpu()

In [24]:


model = M5(n_output=11).to(device)
print(model)

# count_parameters
n = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: %s" % n)



M5(
  (conv1): Conv1d(1, 32, kernel_size=(80,), stride=(8,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64, out_features=11, bias=True)
)
Number

In [25]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

In [26]:
# train
model.train()

for epoch in range(epochs):
    train_loss = 0.0
    train_acc = 0.0
    for audios, labels in tqdm(train_data_loader):
        audios, labels = audios.to(device), labels.to(device)
        labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=11).type(torch.FloatTensor).to(device)
        optimizer.zero_grad()

        preds = model(audios)
        loss = loss_function(preds, labels_one_hot)
        
        loss.backward()
        optimizer.step()

        train_loss += loss
        train_acc += model.accuracy(preds, labels)
    
    total_loss = train_loss / len(train_data_loader)
    total_acc = train_acc / len(train_data_loader)

    print(f"Epoch: {epoch}, Loss: {total_loss}, Acc: {total_acc}")



100%|██████████| 35/35 [00:09<00:00,  3.55it/s]


Epoch: 0, Loss: 2.293881416320801, Acc: 0.37128758430480957


100%|██████████| 35/35 [00:08<00:00,  4.29it/s]


Epoch: 1, Loss: 2.149981737136841, Acc: 0.5320488810539246


100%|██████████| 35/35 [00:09<00:00,  3.51it/s]


Epoch: 2, Loss: 2.040191173553467, Acc: 0.589191734790802


100%|██████████| 35/35 [00:07<00:00,  4.64it/s]


Epoch: 3, Loss: 1.9612280130386353, Acc: 0.6618421077728271


100%|██████████| 35/35 [00:07<00:00,  4.76it/s]


Epoch: 4, Loss: 1.8687164783477783, Acc: 0.7808740139007568


100%|██████████| 35/35 [00:08<00:00,  4.26it/s]


Epoch: 5, Loss: 1.800991415977478, Acc: 0.824953019618988


100%|██████████| 35/35 [00:07<00:00,  4.72it/s]


Epoch: 6, Loss: 1.7708051204681396, Acc: 0.8380168676376343


100%|██████████| 35/35 [00:07<00:00,  4.90it/s]


Epoch: 7, Loss: 1.7469348907470703, Acc: 0.8487311601638794


100%|██████████| 35/35 [00:07<00:00,  4.60it/s]


Epoch: 8, Loss: 1.722741961479187, Acc: 0.8678101897239685


100%|██████████| 35/35 [00:07<00:00,  4.72it/s]


Epoch: 9, Loss: 1.7201939821243286, Acc: 0.8734492659568787


100%|██████████| 35/35 [00:07<00:00,  4.84it/s]


Epoch: 10, Loss: 1.6984659433364868, Acc: 0.8907424807548523


100%|██████████| 35/35 [00:07<00:00,  4.96it/s]


Epoch: 11, Loss: 1.68107271194458, Acc: 0.9109962582588196


100%|██████████| 35/35 [00:07<00:00,  4.86it/s]


Epoch: 12, Loss: 1.6626230478286743, Acc: 0.9228853583335876


100%|██████████| 35/35 [00:07<00:00,  4.69it/s]


Epoch: 13, Loss: 1.6549856662750244, Acc: 0.9297462701797485


100%|██████████| 35/35 [00:07<00:00,  4.54it/s]


Epoch: 14, Loss: 1.6395279169082642, Acc: 0.9359962344169617


100%|██████████| 35/35 [00:07<00:00,  4.91it/s]


Epoch: 15, Loss: 1.6374763250350952, Acc: 0.9410714507102966


100%|██████████| 35/35 [00:07<00:00,  4.69it/s]


Epoch: 16, Loss: 1.635820984840393, Acc: 0.9389567971229553


100%|██████████| 35/35 [00:07<00:00,  4.75it/s]


Epoch: 17, Loss: 1.634590744972229, Acc: 0.9359962344169617


100%|██████████| 35/35 [00:06<00:00,  5.06it/s]


Epoch: 18, Loss: 1.6194764375686646, Acc: 0.9532424807548523


100%|██████████| 35/35 [00:07<00:00,  4.88it/s]


Epoch: 19, Loss: 1.6176259517669678, Acc: 0.9493891000747681


100%|██████████| 35/35 [00:07<00:00,  4.60it/s]


Epoch: 20, Loss: 1.6113109588623047, Acc: 0.956250011920929


100%|██████████| 35/35 [00:07<00:00,  4.91it/s]


Epoch: 21, Loss: 1.6050565242767334, Acc: 0.9574248194694519


100%|██████████| 35/35 [00:07<00:00,  4.72it/s]


Epoch: 22, Loss: 1.6013597249984741, Acc: 0.9618890881538391


100%|██████████| 35/35 [00:07<00:00,  4.52it/s]


Epoch: 23, Loss: 1.5992989540100098, Acc: 0.9616071581840515


100%|██████████| 35/35 [00:07<00:00,  4.49it/s]


Epoch: 24, Loss: 1.5998594760894775, Acc: 0.9580357074737549


100%|██████████| 35/35 [00:07<00:00,  4.49it/s]


Epoch: 25, Loss: 1.6010105609893799, Acc: 0.9645676612854004


100%|██████████| 35/35 [00:07<00:00,  4.79it/s]


Epoch: 26, Loss: 1.594930648803711, Acc: 0.9633928537368774


100%|██████████| 35/35 [00:07<00:00,  4.66it/s]


Epoch: 27, Loss: 1.5939571857452393, Acc: 0.969924807548523


100%|██████████| 35/35 [00:07<00:00,  4.86it/s]


Epoch: 28, Loss: 1.5907865762710571, Acc: 0.969924807548523


100%|██████████| 35/35 [00:09<00:00,  3.61it/s]


Epoch: 29, Loss: 1.5889170169830322, Acc: 0.9678571224212646


100%|██████████| 35/35 [00:07<00:00,  4.86it/s]


Epoch: 30, Loss: 1.5907667875289917, Acc: 0.9672462344169617


100%|██████████| 35/35 [00:07<00:00,  4.64it/s]


Epoch: 31, Loss: 1.5844792127609253, Acc: 0.9714285731315613


100%|██████████| 35/35 [00:07<00:00,  4.91it/s]


Epoch: 32, Loss: 1.5846906900405884, Acc: 0.9705356955528259


100%|██████████| 35/35 [00:07<00:00,  4.92it/s]


Epoch: 33, Loss: 1.5854365825653076, Acc: 0.968139111995697


100%|██████████| 35/35 [00:07<00:00,  4.99it/s]


Epoch: 34, Loss: 1.5806952714920044, Acc: 0.9743890762329102


100%|██████████| 35/35 [00:07<00:00,  4.82it/s]


Epoch: 35, Loss: 1.587461233139038, Acc: 0.9672462344169617


100%|██████████| 35/35 [00:07<00:00,  4.73it/s]


Epoch: 36, Loss: 1.579916000366211, Acc: 0.9723214507102966


100%|██████████| 35/35 [00:07<00:00,  4.87it/s]


Epoch: 37, Loss: 1.5799757242202759, Acc: 0.968139111995697


100%|██████████| 35/35 [00:07<00:00,  4.73it/s]


Epoch: 38, Loss: 1.5820120573043823, Acc: 0.9717105031013489


100%|██████████| 35/35 [00:07<00:00,  4.95it/s]


Epoch: 39, Loss: 1.6041148900985718, Acc: 0.9553571343421936


100%|██████████| 35/35 [00:07<00:00,  4.82it/s]


Epoch: 40, Loss: 1.5988109111785889, Acc: 0.9603853225708008


100%|██████████| 35/35 [00:07<00:00,  4.89it/s]


Epoch: 41, Loss: 1.5963910818099976, Acc: 0.9621710777282715


100%|██████████| 35/35 [00:07<00:00,  4.96it/s]


Epoch: 42, Loss: 1.5913654565811157, Acc: 0.9627819657325745


100%|██████████| 35/35 [00:07<00:00,  4.81it/s]


Epoch: 43, Loss: 1.5782586336135864, Acc: 0.9734962582588196


100%|██████████| 35/35 [00:07<00:00,  4.66it/s]


Epoch: 44, Loss: 1.5763903856277466, Acc: 0.9750000238418579


100%|██████████| 35/35 [00:07<00:00,  4.89it/s]


Epoch: 45, Loss: 1.5833202600479126, Acc: 0.9669643044471741


100%|██████████| 35/35 [00:07<00:00,  4.75it/s]


Epoch: 46, Loss: 1.5844258069992065, Acc: 0.9660714268684387


100%|██████████| 35/35 [00:07<00:00,  4.88it/s]


Epoch: 47, Loss: 1.5827075242996216, Acc: 0.96875


100%|██████████| 35/35 [00:07<00:00,  4.90it/s]


Epoch: 48, Loss: 1.5820802450180054, Acc: 0.9717105031013489


100%|██████████| 35/35 [00:07<00:00,  4.97it/s]

Epoch: 49, Loss: 1.5841917991638184, Acc: 0.9669643044471741





In [27]:
model.eval()

test_loss = 0.0
test_acc = 0.0
for audios, labels in tqdm(test_data_loader):
    audios, labels = audios.to(device), labels.to(device)
    labels_one_hot = torch.nn.functional.one_hot(labels, num_classes=11).type(torch.FloatTensor).to(device)

    preds = model(audios)
    loss = loss_function(preds, labels_one_hot)

    test_loss += loss
    test_acc += model.accuracy(preds, labels)

total_loss = test_loss / len(test_data_loader)
total_acc = test_acc / len(test_data_loader)

print(f"Loss: {total_loss}, Acc: {total_acc}")

100%|██████████| 9/9 [00:01<00:00,  5.26it/s]

Loss: 1.628161907196045, Acc: 0.9426257014274597





In [28]:
torch.save(model.state_dict(), "weights.pth")

In [29]:
# Inference

signal, sample_rate = torchaudio.load("/content/zeynab_1.ogg")

# preprocess
signal = torch.mean(signal, dim=0, keepdim=True)
new_sample_rate = 8000
transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=new_sample_rate)
signal = transform(signal)
signal = signal[:, 32000:40000]
signal = signal.unsqueeze(0).to(device)

# process
preds = model(signal)

# postprocess
preds = preds.cpu().detach().numpy()
output = np.argmax(preds)
print(output)

1


# Inference

In [30]:
!pip install telebot
!pip install pyTelegramBotAPI

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
import telebot

In [37]:
ID=['sajjad', 'zeynab', 'amir', 'hossein', 'parisa', 'nahid', 'maryam', 'alireza', 'zahra', 'morteza', 'mohammadali']

In [42]:
ID[1]

'zeynab'

In [44]:
bot = telebot.TeleBot("")

In [46]:
@bot.message_handler(commands=['start'])
def say_hi(messages):
    bot.send_message(
        messages.chat.id, f'Hi {messages.from_user.first_name} Dear😎 ')
    bot.send_message(
        messages.chat.id, f' Now send me your voice  ☺...')
@bot.message_handler(content_types=['voice'])
def voice(message):
    audio_info = bot.get_file(message.voice.file_id)
    downloaded_file = bot.download_file(audio_info.file_path)
    src = audio_info.file_path

    with open(src, 'wb') as audio_file:
        audio_file.write(downloaded_file)
    
    signal, sample_rate = torchaudio.load(src)
    
    signal = torch.mean(signal, dim=0, keepdim=True)
    transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=8000)
    signal = transform(signal)
    signal = signal.unsqueeze(0).to(device)
     
    preds = model(signal)
   
    preds = preds.cpu().detach().numpy()
    output = np.argmax(preds)
    
    bot.reply_to(message, ID[output])
    print(output)

bot.polling()

6
0
10
