# **LABORATORIO 1 - DEEP LEARNING**

Integrantes:

a) Arturo Magno Barrantes Chuquimia 

b) Ricardo Amiel Acuña Villogas

## **Librerías**

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# **Explorando datasets de train**

In [27]:
# Load the training label table (one row per audio clip) and preview it.
audios = pd.read_csv(filepath_or_buffer='train-audios.csv')
audios.head(n=5)

Unnamed: 0,filename,SPHSUR,BOABIS,SCIPER,DENNAH,LEPLAT,RHIICT,BOALEP,BOAFAB,PHYCUV,...,SCINAS,LEPNOT,ADEMAR,BOAALM,PHYDIS,RHIORN,LEPFLA,SCIRIZ,DENELE,SCIALT
0,INCT20955_20190909_050000_0_3.wav,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,INCT20955_20190909_050000_1_4.wav,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,INCT20955_20190909_050000_2_5.wav,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,INCT20955_20190909_050000_3_6.wav,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,INCT20955_20190909_050000_4_7.wav,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Extraer una muestra con los primeros 30 audios, porque probaremos el flujo completo con ellos.

In [28]:
# Keep only the first 30 rows and persist them (without the index column)
# so the dataset class can read the subset back from disk.
audios_30 = audios.iloc[:30]
audios_30.to_csv(
    "train-audios-sample-first-30.csv",
    index=False,
)
audios_30

Unnamed: 0,filename,SPHSUR,BOABIS,SCIPER,DENNAH,LEPLAT,RHIICT,BOALEP,BOAFAB,PHYCUV,...,SCINAS,LEPNOT,ADEMAR,BOAALM,PHYDIS,RHIORN,LEPFLA,SCIRIZ,DENELE,SCIALT
0,INCT20955_20190909_050000_0_3.wav,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,INCT20955_20190909_050000_1_4.wav,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,INCT20955_20190909_050000_2_5.wav,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,INCT20955_20190909_050000_3_6.wav,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,INCT20955_20190909_050000_4_7.wav,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,INCT20955_20190909_050000_5_8.wav,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,INCT20955_20190909_050000_6_9.wav,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,INCT20955_20190909_050000_7_10.wav,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,INCT20955_20190909_050000_8_11.wav,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,INCT20955_20190909_050000_9_12.wav,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Dataset: Audios a espectrogramas**

In [29]:
class AudioDataset(Dataset):
    """Dataset that loads ``.wav`` clips and returns log-mel spectrograms.

    With ``csv_path`` the first CSV column is read as the file name and every
    remaining column as a float32 label; items are ``(spec, label)``.
    Without it, every ``.wav`` file found in ``audio_dir`` is listed in sorted
    order and items are ``(spec, filename)``.

    Each spectrogram has shape ``[1, n_mels, time]``: clips with more than one
    channel are averaged down to mono so every item has a single channel.

    Args:
        audio_dir: Directory containing the audio files.
        csv_path: Optional CSV with file names and multi-label targets.
        transform: Optional callable applied to the spectrogram tensor.
        sample_rate: Rate in Hz every clip is resampled to before the mel
            transform (default 16000).
        n_mels: Number of mel bands in the spectrogram (default 128).
    """

    def __init__(self, audio_dir, csv_path=None, transform=None, sample_rate=16000, n_mels=128):
        self.audio_dir = audio_dir
        self.transform = transform
        self.sample_rate = sample_rate
        self.mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels)

        if csv_path:
            self.data = pd.read_csv(csv_path)
            self.files = self.data.iloc[:, 0].values
            self.labels = self.data.iloc[:, 1:].values.astype(np.float32)
            self.has_labels = True
        else:
            self.files = sorted([f for f in os.listdir(audio_dir) if f.endswith(".wav")])
            self.has_labels = False

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.files)

    @staticmethod
    def _to_mono(waveform):
        """Average the channels of a ``[channels, time]`` waveform into ``[1, time]``."""
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        return waveform

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(spec, label)`` or ``(spec, filename)``."""
        filepath = os.path.join(self.audio_dir, self.files[idx])
        waveform, sr = torchaudio.load(filepath)

        # Multi-channel clips would otherwise produce a [channels, n_mels, time]
        # spectrogram that does not match the model's 1-channel input.
        waveform = self._to_mono(waveform)
        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(sr, self.sample_rate)(waveform)

        spec = self.mel_spec(waveform)
        spec = torchaudio.functional.amplitude_to_DB(spec, multiplier=10.0, amin=1e-10, db_multiplier=0)
        # spec is [1, n_mels, time] at this point.

        if self.transform:
            spec = self.transform(spec)

        if self.has_labels:
            label = torch.tensor(self.labels[idx], dtype=torch.float32)
            return spec, label
        else:
            # Return the file name so predictions can be matched to clips later.
            return spec, self.files[idx]

## **Dataset: frames/videos**

In [None]:
# Disabled video-frame dataset. The whole class is wrapped in a string literal,
# so nothing below is executed and `VideoFrameDataset` is NOT defined at runtime.
# As written it would read a whitespace-delimited CSV, treat every column other
# than the listed id/path/type columns as a float32 label, and return
# (RGB image, label tensor) pairs.
'''
class VideoFrameDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform=None):
        self.data = pd.read_csv(csv_path, delim_whitespace=True)
        self.img_dir = img_dir
        self.transform = transform
        self.label_cols = [col for col in self.data.columns if col not in ['original_vido_id', 'video_id', 'frame_id', 'path', 'type']]
        self.labels = self.data[self.label_cols].values.astype(np.float32)
        self.paths = self.data['path'].values

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.paths[idx])
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        label = self.labels[idx]
        return image, torch.tensor(label, dtype=torch.float32)'''

### **Modelo: ResNet18**

In [30]:
# Modelo CNN
class MultiLabelCNN(nn.Module):
    """ResNet-18 backbone adapted to 1-channel inputs with a multi-label head.

    The forward pass returns per-class probabilities: the head ends in
    ``nn.Sigmoid``, so the matching loss is ``nn.BCELoss`` (pairing it with
    ``nn.BCEWithLogitsLoss`` would apply a second sigmoid).

    Args:
        n_classes: Number of independent binary labels to predict.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.base = models.resnet18(pretrained=True)

        # Swap the 3-channel stem for a 1-channel one without discarding what
        # the pretrained stem learned: summing the kernels over the colour axis
        # gives the same response as feeding the single channel replicated
        # three times to the original layer.
        rgb_stem = self.base.conv1
        self.base.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        with torch.no_grad():
            self.base.conv1.weight.copy_(rgb_stem.weight.sum(dim=1, keepdim=True))

        # Replace the ImageNet classifier with a small multi-label head.
        self.base.fc = nn.Sequential(
            nn.Linear(self.base.fc.in_features, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, n_classes),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return per-class probabilities for a batch of 1-channel inputs."""
        return self.base(x)

#### **Training**

In [31]:
def train_model(model, dataloader, criterion, optimizer, device):
    """Run one training epoch and return the mean loss per sample.

    Each batch loss is weighted by its batch size, so a smaller final batch
    does not skew the epoch average. This assumes ``criterion`` returns the
    batch mean (the default ``reduction='mean'``).

    Args:
        model: Module to train; switched to training mode.
        dataloader: Iterable of ``(inputs, labels)`` tensor batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer stepping the model's parameters.
        device: Device the batches are moved to.

    Returns:
        float: Sample-weighted mean training loss for the epoch.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_samples = 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = inputs.size(0)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size

    if n_samples == 0:
        raise ValueError("train_model received an empty dataloader")
    return loss_sum / n_samples

# Evaluación
def eval_model(model, dataloader, criterion, device):
    """Compute the mean loss per sample over a labelled dataloader.

    Runs in evaluation mode with gradients disabled. Each batch loss is
    weighted by its batch size, so a smaller final batch does not skew the
    average. This assumes ``criterion`` returns the batch mean (the default
    ``reduction='mean'``).

    Args:
        model: Module to evaluate; switched to evaluation mode.
        dataloader: Iterable of ``(inputs, labels)`` tensor batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        float: Sample-weighted mean loss.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_samples = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            batch_size = inputs.size(0)
            loss_sum += loss.item() * batch_size
            n_samples += batch_size

    if n_samples == 0:
        raise ValueError("eval_model received an empty dataloader")
    return loss_sum / n_samples

# Predicción para set sin etiquetas
def predict(model, dataloader, device, threshold=0.5):
    """Binarise model outputs for an unlabelled dataloader.

    Args:
        model: Module whose outputs are compared against ``threshold``.
        dataloader: Iterable of ``(inputs, filenames)`` batches.
        device: Device the inputs are moved to.
        threshold: Outputs strictly greater than this become 1.0, else 0.0.

    Returns:
        list: ``(filename, [0.0 | 1.0, ...])`` tuples in iteration order.
    """
    model.eval()
    results = []
    with torch.no_grad():
        for batch, names in dataloader:
            scores = model(batch.to(device))
            binary = (scores > threshold).float().cpu().numpy()
            results.extend((name, row.tolist()) for name, row in zip(names, binary))
    return results

#### **Métricas (validación)**

In [32]:
def evaluate_multilabel(model, dataloader, device, threshold=0.5):
    """Compute macro and micro precision, recall and F1 on a labelled dataloader.

    Model outputs strictly greater than ``threshold`` count as positive
    predictions. Metrics with an undefined denominator are reported as 0
    (``zero_division=0``).

    Args:
        model: Module to evaluate; switched to evaluation mode.
        dataloader: Iterable of ``(inputs, labels)`` tensor batches.
        device: Device the inputs are moved to.
        threshold: Decision threshold applied to the model outputs.

    Returns:
        dict: Keys ``precision_macro``, ``recall_macro``, ``f1_macro``,
        ``precision_micro``, ``recall_micro`` and ``f1_micro``.
    """
    model.eval()
    pred_batches, label_batches = [], []

    with torch.no_grad():
        for inputs, labels in dataloader:
            scores = model(inputs.to(device))
            pred_batches.append((scores > threshold).float().cpu().numpy())
            label_batches.append(labels.cpu().numpy())

    y_pred = np.vstack(pred_batches)
    y_true = np.vstack(label_batches)

    def _prf(average):
        # Precision, recall and F1 for a single averaging mode.
        return tuple(
            scorer(y_true, y_pred, average=average, zero_division=0)
            for scorer in (precision_score, recall_score, f1_score)
        )

    macro_p, macro_r, macro_f1 = _prf('macro')
    micro_p, micro_r, micro_f1 = _prf('micro')

    return {
        "precision_macro": macro_p,
        "recall_macro": macro_r,
        "f1_macro": macro_f1,
        "precision_micro": micro_p,
        "recall_micro": micro_r,
        "f1_micro": micro_f1
    }

In [None]:
# --- Configuration ---------------------------------------------------------
n_batch = 16   # small batch for the 30-clip sample; 64 was used for the full set
n_epochs = 5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Data --------------------------------------------------------------------
# Training data: the first 30 labelled clips.
# For the full training set use csv_path='train-audios.csv' instead.
dataset = AudioDataset(csv_path="train-audios-sample-first-30.csv", audio_dir='audios_train/train/')

# Unlabelled test data: every .wav file found in the test directory.
testset = AudioDataset(audio_dir='audios_test/test/')

dataloader = DataLoader(dataset, batch_size=n_batch, shuffle=True)
testloader = DataLoader(testset, batch_size=n_batch, shuffle=False)

# --- Model, loss, optimizer ------------------------------------------------------
# Take the number of outputs from the label matrix so the model head always
# matches the CSV (a hard-coded value mismatches the label width in the loss).
n_classes = dataset.labels.shape[1]
model = MultiLabelCNN(n_classes=n_classes).to(device)

# The model head already ends in nn.Sigmoid, so it outputs probabilities:
# use plain binary cross entropy. BCEWithLogitsLoss would apply a second sigmoid.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# TODO: validation. AudioDataset only accepts a CSV path, so the frames from
# train_test_split(audios, test_size=0.2, random_state=42) must be written to
# CSV first; then report eval_model(model, valloader, criterion, device) per
# epoch and evaluate_multilabel(model, valloader, device) after training.

# --- Training ------------------------------------------------------------------
for epoch in range(n_epochs):
    loss = train_model(model, dataloader, criterion, optimizer, device)
    print(f"Época {epoch+1} - Pérdida Train: {loss:.4f}")



RuntimeError: Couldn't find appropriate backend to handle uri audios_train/train/INCT20955_20190909_050000_11_14.wav and format None.

In [36]:
# Sanity check: load one training clip directly to see whether torchaudio can
# decode it outside the dataset class, then show its tensor shape and sample rate.
import torchaudio

waveform, sr = torchaudio.load(
    "audios_train/train/INCT20955_20190909_050000_11_14.wav"
)
print(waveform.shape, sr)

RuntimeError: Couldn't find appropriate backend to handle uri audios_train/train/INCT20955_20190909_050000_11_14.wav and format None.

In [None]:
# Inference: binarised predictions for every clip in the test loader.
results = predict(model, testloader, device)

# Save to CSV, one row per file, label columns in the same order as the training CSV.
columns = ['filename', *dataset.data.columns[1:]]
df_results = pd.DataFrame([[filename, *preds] for filename, preds in results])
df_results.columns = columns
df_results.to_csv("audios_predicciones_test.csv", index=False)

# Show the first 5 results.
for name, preds in results[:5]:
    print(f"{name} → {preds}")