In [16]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
!pip install torch torchaudio transformers datasets




In [18]:
from transformers import Wav2Vec2Processor, HubertForSequenceClassification
import torch
import numpy as np

# Model ve işlemci yükleniyor
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertForSequenceClassification.from_pretrained("facebook/hubert-large-ls960-ft", num_labels=3)  # 3 sınıf için
model.to("cuda")  # GPU kullanımı


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HubertForSequenceClassification(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertLayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (pro

In [19]:
import glob

# Ses dosyalarının yollarını listele
agresif_files = glob.glob("/content/drive/MyDrive/Yazgel_Muzik_Secilen_1000/agresif/*.wav")
huzunlu_files = glob.glob("/content/drive/MyDrive/Yazgel_Muzik_Secilen_1000/huzunlu/*.wav")
neseli_files = glob.glob("/content/drive/MyDrive/Yazgel_Muzik_Secilen_1000/neseli/*.wav")

# Tüm dosyaları ve etiketlerini birleştir
audio_paths = agresif_files + huzunlu_files + neseli_files
labels = [0] * len(agresif_files) + [1] * len(huzunlu_files) + [2] * len(neseli_files)

print(f"Toplam {len(audio_paths)} dosya bulundu.")


Toplam 6010 dosya bulundu.


In [20]:
import torchaudio
from transformers import Wav2Vec2Processor

# HuBERT için işlemci
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")

def preprocess_audio(audio_path):
    # Ses dosyasını yükle
    waveform, sample_rate = torchaudio.load(audio_path)
    # 16kHz'e yeniden örnekleme
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    # Modelin beklediği şekilde işleme
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    return inputs


In [21]:
# İlk ses dosyasını test etmek için:
inputs = preprocess_audio(audio_paths[0])
print(inputs)


{'input_values': tensor([[-0.5919, -1.9172, -1.8174,  ..., -0.2172,  0.8590, -0.1859],
        [-0.7631, -2.0116, -1.8164,  ...,  2.1737,  1.2146,  0.8881]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}


**Veri Kümesini İşleme**

In [22]:
from torch.utils.data import Dataset

class AudioDataset(Dataset):
    def __init__(self, audio_paths, labels):
        self.audio_paths = audio_paths
        self.labels = labels

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        # Ses dosyasını işleme
        inputs = preprocess_audio(self.audio_paths[idx])
        # Etiket
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        # İşlenmiş girişleri ve etiketi döndür
        return inputs, label


**DataLoader ile Verileri Hazırlama**

In [23]:
from torch.utils.data import DataLoader

# Dataset oluştur
dataset = AudioDataset(audio_paths, labels)

# DataLoader oluştur
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

# İlk batch'i test et
for batch in data_loader:
    inputs, labels = batch
    print(inputs.keys())  # input_values ve attention_mask
    print(labels)         # Sınıf etiketleri
    break


dict_keys(['input_values', 'attention_mask'])
tensor([0, 0, 0, 2, 1, 2, 2, 0, 0, 2, 1, 0, 1, 1, 0, 2])


**Model Eğitimine Geçiş**

16000*15sn=240000 : sabit uzunluğa getirme



In [24]:
import torch

def pad_or_truncate(waveform, target_length=240000):
    """
    Ses dalgasını sabit bir uzunluğa getirir.
    - waveform: Ses dalgası tensörü
    - target_length: Hedef uzunluk (ör. 15 saniye için 16kHz * 15 = 240000)
    """
    current_length = waveform.size(-1)
    if current_length > target_length:
        # Kırpma
        waveform = waveform[:, :target_length]
    elif current_length < target_length:
        # Doldurma
        padding = target_length - current_length
        waveform = torch.nn.functional.pad(waveform, (0, padding))
    return waveform


In [25]:
def preprocess_audio(audio_path, target_length=240000):
    # Ses dosyasını yükle
    waveform, sample_rate = torchaudio.load(audio_path)

    # 16kHz'e yeniden örnekleme
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Sabit uzunluğa getir
    waveform = pad_or_truncate(waveform, target_length=target_length)

    # Modelin beklediği şekilde işleme
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    return inputs


In [26]:
# İlk batch'i test et
for batch in data_loader:
    inputs, labels = batch
    print(inputs["input_values"].shape)  # Sabit uzunlukta tensör
    print(labels)
    break


torch.Size([16, 2, 240000])
tensor([0, 0, 1, 2, 2, 0, 2, 0, 1, 1, 2, 0, 1, 2, 0, 1])


In [27]:
# Mono'ya dönüştür: Kanalları ortalama alarak birleştirme
input_values = inputs["input_values"].mean(dim=1).to("cuda")  # Şekil: [batch_size, sequence_length]
attention_mask = inputs["attention_mask"][:, 0, :].to("cuda")  # İlk kanalın maskesi



**Model Eğitim Döngüsü**

In [28]:
print(f"input_values.shape: {input_values.shape}")  # Beklenen: [batch_size, sequence_length]
print(f"attention_mask.shape: {attention_mask.shape}")  # Beklenen: [batch_size, sequence_length]


input_values.shape: torch.Size([16, 240000])
attention_mask.shape: torch.Size([16, 240000])


In [30]:
from transformers import HubertForSequenceClassification, Wav2Vec2Processor
from torch.utils.data import DataLoader, Dataset
from torch.amp import GradScaler, autocast
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_score,
    recall_score, f1_score, accuracy_score
)
import torch
import glob
import numpy as np
import matplotlib.pyplot as plt
import time

# Eğitim ve çıkarım zamanı hesaplama için timer
start_time = time.time()

# 1. Verilerin Hazırlanması
class AudioDataset(Dataset):
    def __init__(self, audio_paths, labels):
        self.audio_paths = audio_paths
        self.labels = labels

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        inputs = preprocess_audio(self.audio_paths[idx])
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return inputs, label

processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
def preprocess_audio(audio_path, target_length=240000):  # 15 saniye = 16kHz * 15
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    elif waveform.size(1) < target_length:
        padding = target_length - waveform.size(1)
        waveform = torch.nn.functional.pad(waveform, (0, padding))
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    return inputs

# 2. Verilerin Tanımlanması
agresif_files = glob.glob("/content/drive/MyDrive/Yazgel_Muzik_Secilen_1000/agresif/*.wav")
huzunlu_files = glob.glob("/content/drive/MyDrive/Yazgel_Muzik_Secilen_1000/huzunlu/*.wav")
neseli_files = glob.glob("/content/drive/MyDrive/Yazgel_Muzik_Secilen_1000/neseli/*.wav")

audio_paths = agresif_files + huzunlu_files + neseli_files
labels = [0] * len(agresif_files) + [1] * len(huzunlu_files) + [2] * len(neseli_files)

from sklearn.model_selection import train_test_split
train_paths, val_paths, train_labels, val_labels = train_test_split(audio_paths, labels, test_size=0.2, random_state=42)

train_dataset = AudioDataset(train_paths, train_labels)
val_dataset = AudioDataset(val_paths, val_labels)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# 3. Model, Optimizasyon ve Kayıp Fonksiyonu
model = HubertForSequenceClassification.from_pretrained("facebook/hubert-large-ls960-ft", num_labels=3)
model.to("cuda")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()
scaler = GradScaler()

# 4. Eğitim ve Validasyon
train_losses = []
val_losses = []

for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs, labels = batch
        input_values = inputs["input_values"].mean(dim=1).to("cuda")
        attention_mask = inputs["attention_mask"][:, 0, :].to("cuda")
        labels = labels.to("cuda")

        with autocast(device_type="cuda"):
            outputs = model(input_values=input_values, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
    train_losses.append(total_loss / len(train_loader))
    print(f"Epoch {epoch + 1}, Training Loss: {total_loss / len(train_loader)}")

    # Validasyon
    model.eval()
    val_loss = 0
    correct = 0
    true_labels = []
    predictions = []

    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            input_values = inputs["input_values"].mean(dim=1).to("cuda")
            attention_mask = inputs["attention_mask"][:, 0, :].to("cuda")
            labels = labels.to("cuda")

            outputs = model(input_values=input_values, attention_mask=attention_mask)
            val_loss += loss_fn(outputs.logits, labels).item()
            preds = torch.argmax(outputs.logits, dim=1)

            correct += (preds == labels).sum().item()
            true_labels.extend(labels.cpu().numpy())
            predictions.extend(preds.cpu().numpy())

    val_losses.append(val_loss / len(val_loader))
    print(f"Validation Loss: {val_loss / len(val_loader)}, Accuracy: {100 * correct / len(val_loader.dataset):.2f}%")

# 5. Performans Metrikleri ve Çıktılar
conf_matrix = confusion_matrix(true_labels, predictions)
print(f"Confusion Matrix:\n{conf_matrix}")

accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average='macro')
recall = recall_score(true_labels, predictions, average='macro')
f1 = f1_score(true_labels, predictions, average='macro')
specificity = np.mean([
    conf_matrix[i, i] / (np.sum(conf_matrix[:, i]) - conf_matrix[i, i] + conf_matrix[i, i])
    for i in range(3)
])
auc = roc_auc_score(label_binarize(true_labels, classes=[0, 1, 2]), label_binarize(predictions, classes=[0, 1, 2]), multi_class="ovr")

print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall (Sensitivity): {recall:.2f}, F1-Score: {f1:.2f}, Specificity: {specificity:.2f}, AUC: {auc:.2f}")

# ROC Eğrisi ve Loss Grafiği
for i in range(3):
    fpr, tpr, _ = roc_curve(label_binarize(true_labels, classes=[0, 1, 2])[:, i], label_binarize(predictions, classes=[0, 1, 2])[:, i])
    plt.plot(fpr, tpr, label=f"Class {i}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

plt.plot(range(1, len(train_losses) + 1), train_losses, label="Train Loss")
plt.plot(range(1, len(val_losses) + 1), val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss vs Epoch")
plt.legend()
plt.show()

print(f"Training Time: {time.time() - start_time:.2f} seconds")


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Training Loss: 0.8731312045638454
Validation Loss: 0.7502613568622805, Accuracy: 67.47%
Epoch 2, Training Loss: 0.6607314028874808
Validation Loss: 0.5401880948052454, Accuracy: 80.12%
Epoch 3, Training Loss: 0.5401198153884558
Validation Loss: 0.5519181924850441, Accuracy: 78.04%
Confusion Matrix:
[[352  15  63]
 [ 27 295  65]
 [ 79  15 291]]


NameError: name 'label_binarize' is not defined