In [141]:
from datasets import load_dataset

ds = load_dataset("luli0034/music-tags-to-spectrogram", split='train')

In [142]:
import numpy as np
import torch
from torch import nn
from torchvision import models, transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.data.dataloader import default_collate

In [143]:
BATCH_SIZE = 256

In [None]:
subset_size = int(len(ds) * 0.1)
subset = ds.select(range(subset_size))
print(subset)
ds = subset

Dataset({
    features: ['image', 'text'],
    num_rows: 771
})


In [145]:
ds = ds.train_test_split(test_size=0.2)

In [146]:
ds_train, ds_test = ds["train"], ds["test"]

In [147]:
class MusicDataset(Dataset):
    def __init__(self, ds, transform=None):
        self.transform = transform
        self.data_frame = ds
    
    def __len__(self):
        return len(self.data_frame)
    
    def __getitem__(self, index):
        try:
            genres = self.data_frame[index]["text"]
            if self.transform:
                image = self.transform(self.data_frame[index]["image"])
            return image, genres
        except Exception as e:
            print(e)
            return np.ones((256, 256, 3)), self.data_frame[index]["text"]


In [148]:
# def my_collate(batch):
#     batch = list(filter(lambda x: x is not None, batch))
#     return default_collate(batch)

In [149]:
device = torch.device("cuda")

In [150]:
def extract_image_features(dataloader, model):
    model.eval()
    features = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            output = model(inputs)
            features.append(output.cpu().numpy())
    return np.vstack(features)


In [151]:
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        return self.linear(x)

### Поэтому я превращу их в квадрат

In [152]:
image_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [153]:
train_dataset = MusicDataset(ds_train, transform=image_transforms)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [154]:
test_dataset = MusicDataset(ds_test, transform=image_transforms)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [155]:
model = models.resnet50(pretrained=True)
model.fc = nn.Identity()
model.to(device) 
resnet = model



In [173]:
all_genres = ds_train.remove_columns('image')
all_genres_test = ds_test.remove_columns('image')

In [174]:
all_genres_list = [genre['text'] for genre in all_genres]
all_genres_list_test = [genre['text'] for genre in all_genres_test]

In [175]:
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(all_genres)
y_test_encoder = mlb.transform(all_genres_list_test)



In [159]:
train_features = extract_image_features(train_loader, model)

In [160]:
len(train_features)

616

In [161]:
feature_tensor = torch.tensor(train_features, dtype=torch.float32).to(device)
labels_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)

In [162]:
model = LogisticRegressionModel(input_size=feature_tensor.shape[1], num_classes=labels_tensor.shape[1]).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

### Baseline CV Train

In [163]:
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(feature_tensor)
    loss = criterion(outputs, labels_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch + 1} /100], Loss: {loss.item():.4f}')

Epoch [1 /100], Loss: 0.7837
Epoch [2 /100], Loss: 0.1548
Epoch [3 /100], Loss: 0.0968
Epoch [4 /100], Loss: 0.0714
Epoch [5 /100], Loss: 0.0569
Epoch [6 /100], Loss: 0.0474
Epoch [7 /100], Loss: 0.0407
Epoch [8 /100], Loss: 0.0357
Epoch [9 /100], Loss: 0.0318
Epoch [10 /100], Loss: 0.0287
Epoch [11 /100], Loss: 0.0261
Epoch [12 /100], Loss: 0.0240
Epoch [13 /100], Loss: 0.0222
Epoch [14 /100], Loss: 0.0207
Epoch [15 /100], Loss: 0.0193
Epoch [16 /100], Loss: 0.0182
Epoch [17 /100], Loss: 0.0171
Epoch [18 /100], Loss: 0.0162
Epoch [19 /100], Loss: 0.0154
Epoch [20 /100], Loss: 0.0146
Epoch [21 /100], Loss: 0.0140
Epoch [22 /100], Loss: 0.0133
Epoch [23 /100], Loss: 0.0128
Epoch [24 /100], Loss: 0.0123
Epoch [25 /100], Loss: 0.0118
Epoch [26 /100], Loss: 0.0114
Epoch [27 /100], Loss: 0.0109
Epoch [28 /100], Loss: 0.0106
Epoch [29 /100], Loss: 0.0102
Epoch [30 /100], Loss: 0.0099
Epoch [31 /100], Loss: 0.0096
Epoch [32 /100], Loss: 0.0093
Epoch [33 /100], Loss: 0.0090
Epoch [34 /100], Lo

In [164]:
test_features = extract_image_features(test_loader, resnet)

In [165]:
feature_tensor_test = torch.tensor(test_features, dtype=torch.float32).to(device)
labels_tensor_test = torch.tensor(y_test_encoder, dtype=torch.float32).to(device)

In [176]:
model.eval()
with torch.no_grad():
    test_ouptputs = model(feature_tensor_test)
    test_predictions = torch.sigmoid(test_ouptputs).cpu().numpy()
    test_predictions = (test_predictions > 0.5).astype(int)

In [177]:
print(classification_report(y_test_encoder, test_predictions, target_names=mlb.classes_))

ValueError: Number of classes, 2, does not match size of target_names, 1. Try specifying the labels parameter