In [1]:
import os
import cv2
import itertools
import librosa
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import soundfile as sf
import warnings
import zipfile
from io import BytesIO
from tqdm import tqdm
from glob import glob
from PIL import Image
from google.colab import files
from google.cloud import storage
warnings.filterwarnings('ignore')

# Pytorch
import torch
import torchaudio
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from torchaudio.transforms import MelSpectrogram, MFCC, SpectralCentroid

# Hugging Face Transformers
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer,  Wav2Vec2Config, Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Hardware report packages
import gc
import types
import pkg_resources

# Seed for reproducibility
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed(10)

In [2]:
device = 'cpu'

In [3]:
!find /content/sample_data/ -type f -exec rm {} +

### Import Data

In [4]:
firts_time = True

In [5]:
if firts_time:
    uploaded = files.upload()

    for fn in uploaded.keys():
        print('User uploaded file "{name}" with length {length} bytes'.format(
            name=fn, length=len(uploaded[fn])))


Saving projetos-aleatorios-379913-61df4a1c249e.json to projetos-aleatorios-379913-61df4a1c249e.json
User uploaded file "projetos-aleatorios-379913-61df4a1c249e.json" with length 2372 bytes


In [6]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = f'/content/projetos-aleatorios-379913-61df4a1c249e.json'

In [7]:
storage_client = storage.Client()

In [8]:
bucket_name = 'projeto_musical'
rar_file_name = 'train_tone_labels.csv'
local_rar_path = '/content/' + rar_file_name

# Define o bucket e o blob
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(rar_file_name)

#Baixa o arquivo RAR para o ambiente local do Colab
blob.download_to_filename(local_rar_path)

In [9]:
tone_labels = pd.read_csv('train_tone_labels.csv')
tone_labels.drop(columns=['Unnamed: 0'], inplace=True)

In [10]:
tone_labels[tone_labels['tone_idx'] == 4]

Unnamed: 0,file_path,chord_idx,tone_idx
800000,chords/variation_chord_audio_white_noise/t_DsG...,232,4
800001,chords/variation_chord_audio_white_noise/t_DGC...,243,4
800002,chords/variation_chord_audio_white_noise/t_DAD...,227,4
800003,chords/variation_chord_audio_white_noise/t_DsG...,248,4
800004,chords/variation_chord_audio_white_noise/t_EAD...,244,4
...,...,...,...
999995,chords/variation_chord_audio_white_noise/t_EAD...,242,4
999996,chords/variation_chord_audio_white_noise/t_DAD...,244,4
999997,chords/variation_chord_audio_world_noise/t_DsG...,252,4
999998,chords/variation_chord_audio_white_noise/t_EAD...,241,4


In [11]:
bucket_name = 'projeto_musical'
rar_file_name = 'base_dimensoes.xlsx'
local_rar_path = '/content/' + rar_file_name

# Define o bucket e o blob
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(rar_file_name)

#Baixa o arquivo RAR para o ambiente local do Colab
blob.download_to_filename(local_rar_path)

In [12]:
dicionario_dimensoes = pd.read_excel('base_dimensoes.xlsx', sheet_name='dicionario')

In [13]:
dicionario_dimensoes['tone_idx'] = pd.Categorical(dicionario_dimensoes['tom']).codes

In [14]:
bucket_name = 'projeto_musical'
rar_file_name = 'model_wave2seq_e_freeze_2_v4_tone.pth'
local_rar_path = '/content/' + rar_file_name

# Define o bucket e o blob
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(rar_file_name)

#Baixa o arquivo RAR para o ambiente local do Colab
blob.download_to_filename(local_rar_path)

### Inicialização do Projeto

In [15]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [16]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)


class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [17]:
def inicializa_modelo_wav2vec(num_classes, use_pretrained=True, freeze_feature_extractor=True):
    # Carregar a configuração pré-treinada
    config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base-960h")
    config.num_labels = num_classes  # Ajustar o número de classes na última camada

    # Adicionar um atributo personalizado para pooling_mode, se necessário
    config.pooling_mode = "mean"  # Ou qualquer outro valor padrão relevante para o seu caso

    # Criar uma nova instância do modelo personalizado
    model = Wav2Vec2ForSpeechClassification(config)

    if use_pretrained:
        # Certifique-se de carregar os pesos pré-treinados apenas para o modelo base Wav2Vec2
        base_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        model.wav2vec2 = base_model
        model.init_weights()  # Inicializar apenas os pesos da nova cabeça de classificação

    # Congelar as camadas do extrator de características, se necessário
    if freeze_feature_extractor:
        model.freeze_feature_extractor()

    # Carregar o processor
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

    return model, processor

In [18]:
from sklearn.metrics import classification_report
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, AutoConfig
import torch

In [19]:
num_classes = 7
model_wav, processor = inicializa_modelo_wav2vec(num_classes, True, True)
model = model_wav.to(device)

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

### Carregar Modelo Treinado

In [20]:
checkpoint_path = 'model_wave2seq_e_freeze_2_v4_tone.pth'
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [21]:
pip install pytube moviepy

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytube
Successfully installed pytube-15.0.0


In [22]:
from pytube import YouTube
from moviepy.editor import AudioFileClip

def download_and_cut_youtube_audio(url, start_time, end_time, output_filename):
    """
    Baixa o áudio de um vídeo do YouTube e corta um segmento especificado.

    Args:
    - url (str): URL do vídeo do YouTube.
    - start_time (int): Início do corte em segundos.
    - end_time (int): Fim do corte em segundos.
    - output_filename (str): Nome do arquivo de saída para o áudio cortado.
    """
    # Baixar o áudio do vídeo do YouTube
    yt = YouTube(url)
    stream = yt.streams.filter(only_audio=True).first()
    stream.download(output_path='.', filename='temp_audio.mp4')

    # Carregar o arquivo de áudio baixado e cortar o segmento desejado
    audio_clip = AudioFileClip('temp_audio.mp4').subclip(start_time, end_time)

    # Salvar o segmento de áudio cortado em um arquivo
    audio_clip.write_audiofile(output_filename)

    # Fechar o clip para liberar recursos
    audio_clip.close()

In [23]:
# Cria um dicionário mapeando acordes para índices únicos
chord_to_idx = dicionario_dimensoes.drop_duplicates(subset=['tom']).set_index('tom')['tone_idx'].to_dict()

# Inverte o dicionário para mapear índices para acordes
idx_to_chord = {idx: chord for chord, idx in chord_to_idx.items()}

# Cria a lista de nomes de acordes ordenados pelos índices de acordes
label_names = [idx_to_chord[idx] for idx in sorted(idx_to_chord)]

# Confirma se o label_names está completo e corretamente ordenado
# assert len(label_names) == len(idx_to_chord) == max(idx_to_chord.keys()) + 1

In [24]:
idx_to_chord

{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G'}

In [25]:
from IPython.display import Audio, display
import torch.nn.functional as F

In [26]:
def process_audio_and_predict(audio_path, target_sampling_rate):
    # Carregar e processar o áudio
    #y, sr = librosa.load(audio_path, sr=target_sampling_rate)
    y, sr = torchaudio.load(audio_path)
    if sr != target_sampling_rate:
      resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sampling_rate)
      y = resampler(y)

    y, _ = librosa.effects.trim(y, top_db=20, frame_length=512, hop_length=64)

    if y.ndim > 1 and y.size(0) > 1:  # Se há mais de uma dimensão e mais de um canal
      y = torch.mean(y, dim=0, keepdim=True)

    y = y.unsqueeze(0) if y.ndim == 1 else y

    if y.size(-1) > target_sampling_rate:  # Checando o tamanho da segunda dimensão
      y = y[:, :target_sampling_rate]
    elif y.size(-1) < target_sampling_rate:
      padding = target_sampling_rate - y.size(-1)
      y = torch.nn.functional.pad(y, (0, padding), 'constant')


    # Processa com wav2vec 2.0
    inputs = processor(y.squeeze(0), sampling_rate=target_sampling_rate, return_tensors="pt", padding=True)
    inputs = inputs.input_values.to(device)

    # Obter logits do modelo
    with torch.no_grad():
      logits = model(inputs).logits

    # Obter as top 5 probabilidades e índices
    probs = F.softmax(logits, dim=-1)
    top_probs, top_indices = probs.topk(7)

    # Converter para rótulos e pontuações
    predicted_labels = [label_names[idx] for idx in top_indices[0].cpu().numpy()]
    scores = top_probs[0].cpu().numpy() * 100  # Convertendo para porcentagem

    # Criar DataFrame para visualização
    predictions = pd.DataFrame({
        'Label': predicted_labels,
        'Score': [f"{score:.1f}%" for score in scores]
    })

    # Exibir áudio e previsões
    display(Audio(y, rate=target_sampling_rate))
    display(predictions)

    return predictions.to_dict('records')

In [31]:
def process_audio_and_predict_samples(audio_path, target_sampling_rate, rank=1):
    y, sr = torchaudio.load(audio_path)
    if sr != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sampling_rate)
        y = resampler(y)

    if y.ndim > 1 and y.size(0) > 1:
        y = torch.mean(y, dim=0, keepdim=True)

    # Dividindo o áudio em segmentos de 1 segundo
    one_sec_frames = target_sampling_rate
    segments = [y[:, i:i + one_sec_frames] for i in range(0, y.size(1), one_sec_frames)]

    predictions = []
    for seg in segments:
        if seg.size(-1) < one_sec_frames:
            padding = one_sec_frames - seg.size(-1)
            seg = torch.nn.functional.pad(seg, (0, padding), 'constant')

        inputs = processor(seg.squeeze(0), sampling_rate=target_sampling_rate, return_tensors="pt", padding=True)
        inputs = inputs.input_values.to(device)

        with torch.no_grad():
            logits = model(inputs).logits

        probs = F.softmax(logits, dim=-1)
        top_probs, top_indices = probs.topk(7)

        # Selecionando o label baseado no rank
        predicted_label = label_names[top_indices[0][rank - 1].cpu().numpy()]
        predictions.append(predicted_label)

    return predictions

In [None]:
def marcar_mudancas_notas(lista_notas):
    mudancas = []
    nota_anterior = None

    for i, nota in enumerate(lista_notas):
        if nota != nota_anterior:
            mudancas.append({'acorde': nota, 'sec': i})
            nota_anterior = nota

    return mudancas

In [28]:
download_and_cut_youtube_audio(
    url='https://www.youtube.com/watch?v=fWtbrrmha-k',
    start_time=6,
    end_time=7,
    output_filename='yt_eu_sei.mp3'
)

MoviePy - Writing audio in yt_eu_sei.mp3


                                                       

MoviePy - Done.




In [29]:
import librosa
import torch
import torchaudio
import pandas as pd
from IPython.display import Audio
import torch.nn.functional as F

In [30]:
path_to_audio_file = '/content/yt_eu_sei.mp3'
target_sampling_rate = 16000  # Ou qualquer que seja a taxa de amostragem esperada pelo seu modelo

# Obtenha as logits do modelo
logits = process_audio_and_predict_samples(path_to_audio_file, target_sampling_rate,1)

NameError: name 'process_audio_and_predict_samples' is not defined

In [None]:
logits

In [None]:
acordes = marcar_mudancas_notas(logits)

In [None]:
Audio('yt_eu_sei.mp3',rate=22500)

In [None]:
acordes

In [None]:
path_to_audio_file = 'yt_Am_bck_singin.mp3'
target_sampling_rate = 16000  # Ou qualquer que seja a taxa de amostragem esperada pelo seu modelo

# Obtenha as logits do modelo
logits = process_audio_and_predict(path_to_audio_file, target_sampling_rate)

In [None]:
path_to_audio_file = 'yt_d_fast.mp3'
target_sampling_rate = 16000  # Ou qualquer que seja a taxa de amostragem esperada pelo seu modelo

# Obtenha as logits do modelo
logits = process_audio_and_predict(path_to_audio_file, target_sampling_rate)

Unnamed: 0,Label,Score
0,D,94.0%
1,G,1.7%
2,C,1.3%
3,E,1.1%
4,F,0.8%
5,A,0.7%
6,B,0.3%


In [None]:
path_to_audio_file = '/content/yt_C9_fast.mp3'
target_sampling_rate = 16000  # Ou qualquer que seja a taxa de amostragem esperada pelo seu modelo

# Obtenha as logits do modelo
logits = process_audio_and_predict(path_to_audio_file, target_sampling_rate)

Unnamed: 0,Label,Score
0,D,51.1%
1,C,30.0%
2,E,10.6%
3,G,3.2%
4,B,2.5%
5,F,1.5%
6,A,1.1%


In [None]:
path_to_audio_file = '/content/yt_c9_strum.mp3'
target_sampling_rate = 16000  # Ou qualquer que seja a taxa de amostragem esperada pelo seu modelo

# Obtenha as logits do modelo
logits = process_audio_and_predict(path_to_audio_file, target_sampling_rate)

Unnamed: 0,Label,Score
0,G,54.9%
1,A,29.8%
2,F,10.4%
3,B,2.7%
4,D,1.0%
5,C,1.0%
6,E,0.2%
