In [317]:
from datasets import load_dataset

ds = load_dataset("luli0034/music-tags-to-spectrogram", split='train')

In [318]:
import numpy as np
import torch
from torch import nn
from torchvision import models, transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.data.dataloader import default_collate

In [319]:
BATCH_SIZE = 256

In [320]:
subset_size = int(len(ds) * 0.5)
subset = ds.select(range(subset_size))
print(subset)
ds = subset

Dataset({
    features: ['image', 'text'],
    num_rows: 771
})


In [321]:
ds = ds.train_test_split(test_size=0.2)

In [322]:
ds_train, ds_test = ds["train"], ds["test"]

In [323]:
class MusicDataset(Dataset):
    def __init__(self, ds, transform=None):
        self.transform = transform
        self.data_frame = ds
    
    def __len__(self):
        return len(self.data_frame)
    
    def __getitem__(self, index):
        try:
            genres = self.data_frame[index]["text"]
            if self.transform:
                image = self.transform(self.data_frame[index]["image"])
            return image, genres
        except Exception as e:
            print(e)
            return np.ones((256, 256, 3)), self.data_frame[index]["text"]


In [324]:
# def my_collate(batch):
#     batch = list(filter(lambda x: x is not None, batch))
#     return default_collate(batch)

In [325]:
device = torch.device("cuda")

In [326]:
def extract_image_features(dataloader, model):
    model.eval()
    features = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            output = model(inputs)
            features.append(output.cpu().numpy())
    return np.vstack(features)


In [327]:
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        return self.linear(x)

### Поэтому я превращу их в квадрат

In [328]:
image_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [329]:
train_dataset = MusicDataset(ds_train, transform=image_transforms)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [330]:
test_dataset = MusicDataset(ds_test, transform=image_transforms)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [331]:
model = models.resnet50(pretrained=True)
model.fc = nn.Identity()
model.to(device) 
resnet = model



In [332]:
all_genres = ds_train.remove_columns('image')
all_genres_test = ds_test.remove_columns('image')

In [333]:
all_genres = [genre["text"].split(" ") for genre in all_genres]
all_genres_test = [genre["text"].split(" ")  for genre in all_genres_test]

In [334]:
all_genres[0]

['atmospheric', 'ambient', 'darkambient']

In [335]:
all_genres_test[0]

['easylistening', 'soundtrack', 'ambient', 'chillout']

In [336]:
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(all_genres)
y_test_encoder = mlb.transform(all_genres_test)



In [337]:
mlb.classes_

array(['60s', '70s', '80s', '90s', 'accordion', 'acidjazz',
       'acousticbassguitar', 'acousticguitar', 'african', 'alternative',
       'alternativerock', 'ambient', 'atmospheric', 'bass', 'beat',
       'blues', 'bossanova', 'brass', 'breakbeat', 'cello', 'celtic',
       'chanson', 'chillout', 'choir', 'clarinet', 'classical',
       'classicalguitar', 'club', 'computer', 'contemporary', 'country',
       'dance', 'darkambient', 'deephouse', 'doublebass', 'downtempo',
       'drummachine', 'drumnbass', 'drums', 'dubstep', 'easylistening',
       'edm', 'electricguitar', 'electricpiano', 'electronic',
       'electronica', 'electropop', 'ethno', 'eurodance', 'experimental',
       'flute', 'folk', 'funk', 'fusion', 'gothic', 'grunge', 'guitar',
       'hard', 'hardrock', 'harp', 'hiphop', 'house', 'improvisation',
       'indie', 'industrial', 'instrumentalpop', 'instrumentalrock',
       'jazz', 'jazzfunk', 'keyboard', 'latin', 'lounge', 'medieval',
       'metal', 'minimal', 'ne

In [None]:
train_features = extract_image_features(train_loader, model)

In [None]:
len(train_features)

616

In [None]:
feature_tensor = torch.tensor(train_features, dtype=torch.float32).to(device)
labels_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)

In [None]:
model = LogisticRegressionModel(input_size=feature_tensor.shape[1], num_classes=labels_tensor.shape[1]).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

### Baseline CV Train

In [None]:
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(feature_tensor)
    loss = criterion(outputs, labels_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch + 1} /100], Loss: {loss.item():.4f}')

Epoch [1 /100], Loss: 0.6932
Epoch [2 /100], Loss: 0.6809
Epoch [3 /100], Loss: 0.6692
Epoch [4 /100], Loss: 0.6583
Epoch [5 /100], Loss: 0.6479
Epoch [6 /100], Loss: 0.6382
Epoch [7 /100], Loss: 0.6290
Epoch [8 /100], Loss: 0.6204
Epoch [9 /100], Loss: 0.6122
Epoch [10 /100], Loss: 0.6045
Epoch [11 /100], Loss: 0.5972
Epoch [12 /100], Loss: 0.5903
Epoch [13 /100], Loss: 0.5838
Epoch [14 /100], Loss: 0.5777
Epoch [15 /100], Loss: 0.5719
Epoch [16 /100], Loss: 0.5664
Epoch [17 /100], Loss: 0.5612
Epoch [18 /100], Loss: 0.5563
Epoch [19 /100], Loss: 0.5516
Epoch [20 /100], Loss: 0.5472
Epoch [21 /100], Loss: 0.5430
Epoch [22 /100], Loss: 0.5391
Epoch [23 /100], Loss: 0.5353
Epoch [24 /100], Loss: 0.5317
Epoch [25 /100], Loss: 0.5283
Epoch [26 /100], Loss: 0.5251
Epoch [27 /100], Loss: 0.5220
Epoch [28 /100], Loss: 0.5190
Epoch [29 /100], Loss: 0.5162
Epoch [30 /100], Loss: 0.5136
Epoch [31 /100], Loss: 0.5110
Epoch [32 /100], Loss: 0.5086
Epoch [33 /100], Loss: 0.5063
Epoch [34 /100], Lo

In [None]:
test_features = extract_image_features(test_loader, resnet)

In [None]:
feature_tensor_test = torch.tensor(test_features, dtype=torch.float32).to(device)
labels_tensor_test = torch.tensor(y_test_encoder, dtype=torch.float32).to(device)

In [None]:
model.eval()
with torch.no_grad():
    test_ouptputs = model(feature_tensor_test)
    test_predictions = torch.sigmoid(test_ouptputs).cpu().numpy()
    test_predictions = (test_predictions > 0.5).astype(int)

In [None]:
print(classification_report(y_test_encoder, test_predictions, target_names=mlb.classes_))

              precision    recall  f1-score   support

                   0.77      1.00      0.87       119
           0       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         2
           a       0.77      1.00      0.87       120
           b       0.00      0.00      0.00        36
           c       0.69      1.00      0.82       107
           d       1.00      0.02      0.03        66
           e       0.68      1.00      0.81       105
           f       0.00      0.00      0.00        17
           g       0.00      0.00      0.00        41
           h       0.00      0.00      0.00        60
           i       0.75      1.00      0.86       117
           j       0.00      0.00      0.00         2
           k       0.00      0.00      0.00        64
           l       0.67    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
torch.save(model.state_dict(), "multiclass_model_simple.pth")