In [56]:
from datasets import load_dataset

ds = load_dataset("luli0034/music-tags-to-spectrogram", split='train')

In [80]:
import numpy as np
import torch
from torch import nn
from torchvision import models, transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.data.dataloader import default_collate

## Возьму только 10 процентов датасета, чтобы просто проверить, что модель может обучиться

In [58]:
ds = ds.train_test_split(test_size=0.2)

In [59]:
ds_train, ds_test = ds["train"], ds["test"]

In [None]:
class MusicDataset(Dataset):
    def __init__(self, ds, transform=None):
        self.transform = transform
        self.data_frame = ds
    
    def __len__(self):
        return len(self.data_frame)
    
    def __getitem__(self, index):
        try:
            genres = self.data_frame[index]["text"]
            if self.transform:
                image = self.transform(self.data_frame[index]["image"])
            return image, genres
        except Exception as e:
            print(e)
            # return np.ones((256, 256, 3)), ["classical"] # TODO: 


In [101]:
def my_collate(batch):
    batch = list(filter(lambda x: x is not None, batch))
    return default_collate(batch)

In [102]:
device = torch.device("cuda")

In [103]:
def extract_image_features(dataloader, model):
    model.eval()
    features = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            inputs = inputs.to(device)
            output = model(inputs)
            features.append(output.cpu().numpy())
    return np.vstack(features)


In [104]:
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x):
        return self.linear(x)

### Поэтому я превращу их в квадрат

In [105]:
image_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [106]:
train_dataset = MusicDataset(ds_train, transform=image_transforms)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=my_collate)

In [107]:
test_dataset = MusicDataset(ds_test, transform=image_transforms)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=my_collate)

In [108]:
model = models.resnet50(pretrained=True)
model.fc = nn.Identity()
model.to(device) 
resnet = model



In [109]:
all_genres = ds_train.remove_columns('image')

In [110]:
all_genres_list = [genre['text'] for genre in all_genres]

In [111]:
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(all_genres)
y_test_encoder = mlb.transform(y_train)



In [None]:
train_features = extract_image_features(train_loader, model)

unrecognized data stream contents when reading image file


In [None]:
feature_tensor = torch.tensor(train_features, dtype=torch.float32).to(device)
labels_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)

In [None]:
model = LogisticRegressionModel(input_size=feature_tensor.shape[1], num_classes=labels_tensor.shape[1]).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

### Baseline CV Train

In [None]:
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(feature_tensor)
    loss = criterion(outputs.squeeze(), labels_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch + 1} /100], Loss: {loss.item():.4f}')

Epoch [1 /100], Loss: 0.7226
Epoch [2 /100], Loss: 0.7077
Epoch [3 /100], Loss: 0.6937
Epoch [4 /100], Loss: 0.6805
Epoch [5 /100], Loss: 0.6682
Epoch [6 /100], Loss: 0.6566
Epoch [7 /100], Loss: 0.6457
Epoch [8 /100], Loss: 0.6355
Epoch [9 /100], Loss: 0.6258
Epoch [10 /100], Loss: 0.6168
Epoch [11 /100], Loss: 0.6083
Epoch [12 /100], Loss: 0.6004
Epoch [13 /100], Loss: 0.5929
Epoch [14 /100], Loss: 0.5858
Epoch [15 /100], Loss: 0.5792
Epoch [16 /100], Loss: 0.5730
Epoch [17 /100], Loss: 0.5671
Epoch [18 /100], Loss: 0.5616
Epoch [19 /100], Loss: 0.5563
Epoch [20 /100], Loss: 0.5514
Epoch [21 /100], Loss: 0.5468
Epoch [22 /100], Loss: 0.5424
Epoch [23 /100], Loss: 0.5382
Epoch [24 /100], Loss: 0.5343
Epoch [25 /100], Loss: 0.5306
Epoch [26 /100], Loss: 0.5271
Epoch [27 /100], Loss: 0.5238
Epoch [28 /100], Loss: 0.5206
Epoch [29 /100], Loss: 0.5176
Epoch [30 /100], Loss: 0.5148
Epoch [31 /100], Loss: 0.5121
Epoch [32 /100], Loss: 0.5096
Epoch [33 /100], Loss: 0.5071
Epoch [34 /100], Lo

In [None]:
test_features = extract_image_features(test_loader, resnet)

In [None]:
feature_tensor_test = torch.tensor(test_features, dtype=torch.float32).to(device)
labels_tensor_test = torch.tensor(y_test_encoder, dtype=torch.float32).to(device)

In [None]:
model.eval()
with torch.no_grad():
    test_ouptputs = model(feature_tensor_test)
    test_predictions = torch.sigmoid(test_ouptputs).cpu().numpy()
    test_predictions = (test_predictions > 0.5).astype(int)

## Напомню взял только 10 процентов

In [None]:
print(classification_report(y_test_encoder, test_features, target_names=mlb.classes_))

NameError: name 'y_test_encoder' is not defined