In [None]:
from datasets import load_dataset

# Download (or load from the local cache) the "train" split of the
# tags-to-spectrogram dataset from the Hugging Face Hub.
ds = load_dataset(
    path="luli0034/music-tags-to-spectrogram",
    split="train",
)

In [None]:
import numpy as np
import torch
from torch import nn
from torchvision import models, transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.data.dataloader import default_collate

In [None]:
# Number of samples per batch used by the DataLoaders in this notebook.
BATCH_SIZE = 256

In [None]:
# Keep only the first half of the rows to make experimentation cheaper.
# NOTE: this cell rebinds `ds`, so running it twice halves the data again.
subset_size = len(ds) // 2
first_half_indices = range(subset_size)
subset = ds.select(first_half_indices)
print(subset)
ds = subset

Dataset({
    features: ['image', 'text'],
    num_rows: 154
})


In [None]:
# 80/20 train/test split. The seed is fixed so that Restart & Run All
# reproduces the same split (and therefore the same metrics) every time.
ds = ds.train_test_split(test_size=0.2, seed=42)

In [None]:
# Pull the two splits out of the DatasetDict under explicit names.
ds_train = ds["train"]
ds_test = ds["test"]

In [None]:
class MusicDataset(Dataset):
    """Map-style dataset yielding ``(image, text)`` pairs.

    Args:
        ds: Indexable collection whose items are mappings with ``"image"`` and
            ``"text"`` keys (in this notebook, a Hugging Face dataset split).
        transform: Optional callable applied to the image before it is
            returned. When omitted, the raw image is returned unchanged.
        fallback_shape: Shape of the all-ones float32 placeholder tensor that
            is returned when a sample cannot be loaded or transformed. It
            should match the shape produced by ``transform`` so that the
            placeholder can be batched together with real samples.
    """

    def __init__(self, ds, transform=None, fallback_shape=(3, 256, 256)):
        self.transform = transform
        self.data_frame = ds
        self.fallback_shape = tuple(fallback_shape)

    def __len__(self):
        """Return the number of rows in the underlying collection."""
        return len(self.data_frame)

    def __getitem__(self, index):
        """Return ``(image, text)`` for row ``index``.

        Loading is best-effort: if reading or transforming the row raises,
        the error is printed and a placeholder image is returned instead,
        paired with the row's text.
        """
        try:
            # Look the row up once instead of once per field.
            row = self.data_frame[index]
            genres = row["text"]
            image = row["image"]
            # Previously `image` was only bound when a transform was set, so
            # transform=None always fell through to the placeholder branch.
            if self.transform:
                image = self.transform(image)
            return image, genres
        except Exception as e:
            print(e)
            # A float32 tensor (rather than a float64 HWC numpy array) keeps
            # the placeholder stackable with transformed image tensors.
            placeholder = torch.ones(self.fallback_shape, dtype=torch.float32)
            return placeholder, self.data_frame[index]["text"]


In [None]:
# def my_collate(batch):
#     batch = list(filter(lambda x: x is not None, batch))
#     return default_collate(batch)

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def extract_image_features(dataloader, model, device=None):
    """Run ``model`` over every batch of ``dataloader`` and stack the outputs.

    Args:
        dataloader: Iterable yielding ``(inputs, labels)`` batches. The labels
            are ignored.
        model: ``nn.Module`` used as the feature extractor. It is switched to
            eval mode and evaluated without gradient tracking.
        device: Device the inputs are moved to before the forward pass.
            Defaults to the device of the model's first parameter (CPU when
            the model has no parameters), so the function does not depend on
            a notebook-level global.

    Returns:
        np.ndarray: The per-batch outputs stacked vertically, in the order the
        dataloader yielded them. If the rows are later paired with labels that
        were computed separately, the dataloader must not shuffle.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    features = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            output = model(inputs.to(device))
            # Move to host memory right away so batches do not pile up on the device.
            features.append(output.cpu().numpy())
    return np.vstack(features)


In [None]:
class LogisticRegressionModel(nn.Module):
    """Single linear layer mapping feature vectors to one logit per class.

    No activation is applied in ``forward``; the raw logits are returned.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=num_classes)

    def forward(self, x):
        """Return the logits for the batch of feature vectors ``x``."""
        logits = self.linear(x)
        return logits

### Resize the spectrograms to a 256×256 square

In [None]:
# Preprocessing applied to every image before it reaches the backbone:
# resize to a fixed square, convert to a tensor, then normalise per channel.
preprocessing_steps = [
    transforms.Resize(size=(256, 256)),
    transforms.ToTensor(),
    # These are the standard ImageNet channel mean/std values.
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
image_transforms = transforms.Compose(preprocessing_steps)

In [None]:
train_dataset = MusicDataset(ds_train, transform=image_transforms)
# Shuffling must stay off. The loader is only used to extract features, and
# the labels are built separately from `ds_train` in its original order.
# With shuffle=True the feature rows come out in random order and no longer
# line up with their labels.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
test_dataset = MusicDataset(ds_test, transform=image_transforms)
# Shuffling must stay off. The test labels are built separately from
# `ds_test` in its original order, so the extracted feature rows have to
# come out in that same order for the evaluation to be meaningful.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# ImageNet-pretrained ResNet-50 used as a frozen feature extractor: replacing
# the final fully connected layer with Identity makes the network output the
# pooled features instead of class logits.
# NOTE(review): `pretrained=True` is deprecated in recent torchvision releases
# in favour of the `weights=` argument — confirm the installed version.
resnet = models.resnet50(pretrained=True)
resnet.fc = nn.Identity()
resnet.to(device)
# `model` is kept as a second name for the same network.
model = resnet

In [None]:
# Drop the image column from both splits so that only the text remains.
all_genres, all_genres_test = (
    split.remove_columns("image") for split in (ds_train, ds_test)
)

In [None]:
# Collect the raw tag text of every row, in dataset order.
# The lists are derived from the splits themselves rather than from the
# previous value of `all_genres`, so this cell can be re-run safely.
all_genres = [row["text"] for row in ds_train.remove_columns("image")]
all_genres_test = [row["text"] for row in ds_test.remove_columns("image")]

In [None]:
# MultiLabelBinarizer expects one *collection of labels* per sample. Passing
# the raw strings makes it iterate over characters, so the "classes" become
# single letters and the space character. Split each text into tags first.
# NOTE(review): assumes tags are separated by whitespace — confirm against
# the dataset card; adjust the split if tags can contain spaces.
train_tag_lists = [text.split() for text in all_genres]
test_tag_lists = [text.split() for text in all_genres_test]

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_tag_lists)
# Tags that never occur in the training split are ignored here (with a warning).
y_test_encoder = mlb.transform(test_tag_lists)

In [None]:
# Preview only the first few rows instead of dumping the whole label matrix.
y_test_encoder[:5]

array([[1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
        0, 0, 0, 0],
       [1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
        0, 0, 1, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 0],
       [1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0],
       [1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 0],
       [1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
        0, 0, 1, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 1],
       [1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
        0, 0, 0, 0],
       [1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 

In [None]:
# Use the `resnet` alias rather than `model`: `model` is rebound to the
# logistic-regression classifier further down, so re-running this cell after
# that point would push images through the wrong network.
train_features = extract_image_features(train_loader, resnet)

In [None]:
# Number of feature rows extracted (one per training sample).
train_features.shape[0]

123

In [None]:
# Copy the numpy features and label matrix into float32 tensors on `device`.
feature_tensor = torch.tensor(train_features, dtype=torch.float32, device=device)
labels_tensor = torch.tensor(y_train, dtype=torch.float32, device=device)

In [None]:
# Linear classifier on top of the extracted features: one logit per label.
# NOTE: from here on `model` refers to this classifier, not the ResNet backbone.
n_features, n_labels = feature_tensor.shape[1], labels_tensor.shape[1]
model = LogisticRegressionModel(input_size=n_features, num_classes=n_labels)
model = model.to(device)

# Sigmoid + binary cross-entropy applied independently to every label.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

### Baseline CV Train

In [None]:
# Full-batch gradient descent: every epoch is a single optimisation step
# over the entire training feature matrix.
model.train()
for epoch in range(100):
    # Forward pass.
    outputs = model(feature_tensor)
    loss = criterion(outputs, labels_tensor)

    # Backward pass and parameter update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch [{epoch + 1} /100], Loss: {loss.item():.4f}')

Epoch [1 /100], Loss: 0.7156
Epoch [2 /100], Loss: 0.7018
Epoch [3 /100], Loss: 0.6889
Epoch [4 /100], Loss: 0.6768
Epoch [5 /100], Loss: 0.6656
Epoch [6 /100], Loss: 0.6551
Epoch [7 /100], Loss: 0.6453
Epoch [8 /100], Loss: 0.6362
Epoch [9 /100], Loss: 0.6277
Epoch [10 /100], Loss: 0.6197
Epoch [11 /100], Loss: 0.6123
Epoch [12 /100], Loss: 0.6053
Epoch [13 /100], Loss: 0.5988
Epoch [14 /100], Loss: 0.5928
Epoch [15 /100], Loss: 0.5871
Epoch [16 /100], Loss: 0.5817
Epoch [17 /100], Loss: 0.5768
Epoch [18 /100], Loss: 0.5721
Epoch [19 /100], Loss: 0.5677
Epoch [20 /100], Loss: 0.5635
Epoch [21 /100], Loss: 0.5597
Epoch [22 /100], Loss: 0.5560
Epoch [23 /100], Loss: 0.5526
Epoch [24 /100], Loss: 0.5494
Epoch [25 /100], Loss: 0.5463
Epoch [26 /100], Loss: 0.5435
Epoch [27 /100], Loss: 0.5408
Epoch [28 /100], Loss: 0.5382
Epoch [29 /100], Loss: 0.5358
Epoch [30 /100], Loss: 0.5335
Epoch [31 /100], Loss: 0.5314
Epoch [32 /100], Loss: 0.5293
Epoch [33 /100], Loss: 0.5274
Epoch [34 /100], Lo

In [None]:
# Extract backbone features for the test split with the ResNet extractor.
test_features = extract_image_features(dataloader=test_loader, model=resnet)

In [None]:
# Copy the test features and test label matrix into float32 tensors on `device`.
feature_tensor_test = torch.tensor(test_features, dtype=torch.float32, device=device)
labels_tensor_test = torch.tensor(y_test_encoder, dtype=torch.float32, device=device)

In [None]:
# Score the test features and threshold the per-label probabilities at 0.5.
model.eval()
with torch.no_grad():
    test_logits = model(feature_tensor_test)
    test_probabilities = torch.sigmoid(test_logits).cpu().numpy()
test_predictions = (test_probabilities > 0.5).astype(int)

In [None]:
# Per-label precision / recall / F1 on the test split.
report = classification_report(
    y_test_encoder,
    test_predictions,
    target_names=mlb.classes_,
)
print(report)

              precision    recall  f1-score   support

                   0.94      1.00      0.97        29
           a       0.84      1.00      0.91        26
           b       0.00      0.00      0.00        11
           c       0.84      1.00      0.91        26
           d       0.00      0.00      0.00        15
           e       0.71      1.00      0.83        22
           f       0.00      0.00      0.00         3
           g       0.00      0.00      0.00         8
           h       0.00      0.00      0.00        10
           i       0.77      1.00      0.87        24
           j       0.00      0.00      0.00         1
           k       0.00      0.00      0.00        17
           l       0.84      1.00      0.91        26
           m       0.00      0.00      0.00         9
           n       0.90      1.00      0.95        28
           o       0.87      1.00      0.93        27
           p       0.00      0.00      0.00        10
           r       0.81    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Persist only the classifier's learned parameters (not the whole module).
classifier_state = model.state_dict()
torch.save(classifier_state, "multiclass_model_simple.pth")