# Here we use prompt tuning (p-tuning) as a baseline method for genre classification


In [2]:
import enum
import logging

class DatasetTypes(enum.Enum):
    """Selects how much of the dataset is loaded."""

    # Use every row of the dataset.
    whole = 0
    # Use only the first 1000 rows of the dataset.
    small = 1

class Params:
    """Plain container for the experiment hyper-parameters."""

    def __init__(self, exp_name='genre_classification', random_seed=1337, n_epoch=10, batch_size=8, dataset_type=DatasetTypes.whole, 
                 learning_rate=1e-4, weight_decay=1e-5):
        # The assignment order defines the field order printed by __str__.
        self.random_seed = random_seed
        self.exp_name = exp_name
        self.n_epoch = n_epoch
        self.batch_size = batch_size
        self.dataset_type = dataset_type
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay

    def __str__(self):
        """Return all attributes as a single 'name: value, name: value' line."""
        rendered = []
        for name, value in vars(self).items():
            rendered.append(f"{name}: {value}")
        return ", ".join(rendered)
    
# Configure logging for the notebook: INFO level, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by the helpers defined in this notebook.
logger = logging.getLogger(__name__)

In [3]:
import pandas as pd
import logging
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data._utils.collate import default_collate

# NOTE(review): this repeats the logging setup of the earlier cell verbatim.
# Per the logging docs, basicConfig() does nothing when the root logger already
# has handlers, so the repetition only matters if this cell is run on its own.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Module-level logger used by the dataset helpers below.
logger = logging.getLogger(__name__)

class LyricsGenreDataset(Dataset):
    """Torch dataset that pairs tokenized lyrics with genre labels and raw row features.

    Each item is a dict with:
        'input_ids'      - token ids with the leading batch dimension squeezed out
        'attention_mask' - attention mask, squeezed the same way
        'labels'         - float tensor built from the label entry at that index
        'features'       - the matching DataFrame row converted to a plain dict
    """

    def __init__(self, lyrics_list, features, labels, tokenizer, max_length=512):
        self.lyrics = lyrics_list
        self.features = features  # pandas DataFrame, indexed positionally
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.lyrics)

    def __getitem__(self, idx):
        lyric_text = self.lyrics[idx]
        target = self.labels[idx]
        # Convert the row to a dict so that it can be collated as a plain Python object.
        row_features = self.features.iloc[idx].to_dict()

        # Pad/truncate every example to exactly max_length tokens.
        tokenized = self.tokenizer(
            lyric_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # The tokenizer returns [1, seq_len] tensors; drop the batch dimension.
        sample = {
            key: tokenized[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.float)
        sample['features'] = row_features
        return sample


def one_hot_encoded_to_genre_list(predictions, idx2genre: dict = None):
    """Convert a multi-hot prediction vector into the list of active genres.

    Params:
        predictions - iterable of length n_genres; an entry equal to 1 means the
            lyrics belong to the genre at that position, anything else means they do not.
        idx2genre - optional mapping from position to genre name. When omitted
            (None), the positions themselves are returned instead of names.
    Returns:
        list of genre names (or positions) for every entry equal to 1, in order.
    """
    active_indices = [i for i, value in enumerate(predictions) if value == 1]

    # The parameter defaults to None, so the default call must not try to index it.
    if idx2genre is None:
        return active_indices

    return [idx2genre[i] for i in active_indices]


def get_datasets(df_path, tokenizer, dataset_type=DatasetTypes.whole, debug=False, train_size=0.7, test_size=0.15, val_size=0.15, random_seed=1337):
    """Load a lyrics CSV and build train/val/test datasets for multi-label genre classification.

    Params:
        df_path - path to a .csv file. It must have 'lyrics' and 'genre' columns, where
            'genre' is a comma-separated list of genres. Every column of a row is exposed
            through the 'features' entry of the dataset items.
        tokenizer - tokenizer handed to each LyricsGenreDataset.
        dataset_type - DatasetTypes.small keeps only the first 1000 rows; otherwise all rows are used.
        debug - when True, log a data preview, genre statistics and the split sizes.
        train_size, test_size, val_size - split fractions; they must sum to 1.0.
        random_seed - seed used for the shuffle and for both splits.
    Returns:
        dict with keys 'train_dataset', 'val_dataset', 'test_dataset', 'genres'
        (class names in label-column order), 'idx2genre' and 'genre2idx'.
    """
    from collections import Counter

    assert abs(train_size + val_size + test_size - 1.0) < 1e-6, "Sizes must sum to 1.0"

    df = pd.read_csv(df_path)
    if dataset_type == DatasetTypes.small:
        # Copy the slice so that adding a column below does not write into a view.
        df = df.head(1000).copy()  # For experiments use

    if debug:
        logger.info(str(df.head()))
        logger.info(str(sorted(df['genre'].unique())))

    # Split the comma-separated genre string once; names are stripped so that
    # "rock, pop" and "rock,pop" yield the same genres.
    df['genre_list'] = df['genre'].apply(lambda x: [g.strip() for g in x.split(',')])

    if debug:
        all_genres_list = sorted({g for genre_list in df['genre_list'] for g in genre_list})
        logger.info(str(all_genres_list))
        logger.info(f'Genres length: {len(all_genres_list)}')

    df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Multi-hot encode the genre lists; the label columns follow mlb.classes_.
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df_shuffled['genre_list'])
    genres = mlb.classes_

    if debug:
        logger.info(f"Unique genres number: {len(genres)}")

    idx2genre = {i: genre for i, genre in enumerate(genres)}
    genre2idx = {genre: i for i, genre in enumerate(genres)}

    if debug:
        # Count each genre at most once per row.
        genres_count = Counter(
            genre for genre_list in df['genre_list'] for genre in set(genre_list)
        )
        logger.info('Genres count')
        for genre in genres:
            logger.info(f"{genre}: {genres_count[genre]}")

    X_temp, X_test, y_temp, y_test = train_test_split(
        df_shuffled, y, test_size=test_size, random_state=random_seed
    )

    # Fraction of the remaining (train + val) part that goes to validation.
    val_ratio_of_temp = val_size / (train_size + val_size)

    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_ratio_of_temp, random_state=random_seed
    )

    if debug:
        logger.info('Dataset sizes:')
        # Sizes go through %s placeholders; extra positional args without a
        # placeholder make logging report a formatting error instead.
        logger.info('Train size: %s', len(X_train))
        logger.info('Val size: %s', len(X_val))
        logger.info('Test size: %s', len(X_test))

    train_dataset = LyricsGenreDataset(X_train['lyrics'].tolist(), X_train, y_train, tokenizer)
    val_dataset = LyricsGenreDataset(X_val['lyrics'].tolist(), X_val, y_val, tokenizer)
    test_dataset = LyricsGenreDataset(X_test['lyrics'].tolist(), X_test, y_test, tokenizer)

    return {
        'train_dataset': train_dataset,
        'val_dataset': val_dataset,
        'test_dataset': test_dataset,
        'genres': genres,
        'idx2genre': idx2genre,
        'genre2idx': genre2idx}
    
    
def get_dataloaders(train_dataset: LyricsGenreDataset, val_dataset: LyricsGenreDataset, test_dataset: LyricsGenreDataset, batch_size):
    """Wrap the three datasets in DataLoaders; only the training loader shuffles.

    Returns:
        (train_loader, val_loader, test_loader)
    """

    def custom_collate_fn(batch):
        # A custom collate function is used because the per-row 'features' dicts
        # should stay in the batch as they are: everything except 'features'
        # goes through default_collate.
        without_features = []
        for sample in batch:
            without_features.append(
                {key: value for key, value in sample.items() if key != 'features'}
            )
        collated = default_collate(without_features)

        # Features are gathered separately and kept as a plain list of dicts.
        if 'features' in batch[0]:
            collated['features'] = [sample['features'] for sample in batch]

        return collated

    shared_kwargs = dict(batch_size=batch_size, collate_fn=custom_collate_fn)

    train_loader = DataLoader(train_dataset, shuffle=True, **shared_kwargs)
    val_loader = DataLoader(val_dataset, **shared_kwargs)
    test_loader = DataLoader(test_dataset, **shared_kwargs)

    return train_loader, val_loader, test_loader


In [None]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

from abc import ABC, abstractmethod

# GenrePredictorInterface - interface for model evaluation. A model receives a collated batch
# and returns a binary prediction matrix (one row per example, one column per genre).
class GenrePredictorInterface(ABC):
    @abstractmethod
    def predict(self, batch_features: dict) -> np.ndarray:
        """
        Get batched input that contains 'input_ids', 'labels', 'features'
        Returns prediction in binary format [batch_size, num_classes] as numpy array
        """
        pass

def evaluate_model(model: GenrePredictorInterface, dataloader, device='cpu'):
    """Evaluate a genre predictor over a dataloader with macro-averaged multi-label metrics.

    Params:
        model - object whose predict(batch) returns binary predictions of shape
            [batch_size, num_classes].
        dataloader - iterable of batches; each batch must contain a 'labels' tensor.
        device - kept for interface compatibility; not used by this function.
    Returns:
        dict with 'precision', 'recall', 'f1' (macro averages, zero_division=0) and
        'error_matrix', where error_matrix[p, t] counts the examples in which class p
        was predicted while class t was among the true labels.
    Raises:
        ValueError - if the dataloader yields no batches.
    """
    all_preds = []
    all_targets = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        targets = batch['labels']
        preds = model.predict(batch)

        # Accept a torch tensor as well as the documented numpy array.
        if hasattr(preds, 'detach'):
            preds = preds.detach().cpu().numpy()

        all_preds.append(np.asarray(preds))
        all_targets.append(targets.cpu().numpy())

    if not all_preds:
        raise ValueError("evaluate_model received an empty dataloader")

    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_targets)

    prec = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

    # sklearn's confusion_matrix is for single-label multiclass problems, so an
    # aggregated confusion-like matrix is built for the multi-label case instead:
    # every (predicted class, true class) pair of one example adds one count.
    # The matrix product of the two indicator matrices sums exactly those pairs.
    pred_hits = (y_pred == 1).astype(int)
    true_hits = (y_true == 1).astype(int)
    error_matrix = pred_hits.T @ true_hits

    metrics = {
        'precision': prec,
        'recall': recall,
        'f1': f1,
        'error_matrix': error_matrix
    }

    return metrics

In [5]:
import numpy as np
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import re

In [6]:
# Simplest model for the demonstration scenario.
model_name = "Qwen/Qwen3-0.6B"

# Load the tokenizer and the causal LM by name; the model is placed on `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map=device
)

tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

2025-05-25 15:12:41.193201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748185961.408085      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748185961.470340      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

## Get dataset with all genres and 1,294,054 examples

In [None]:
# Build the datasets from the CSV; DatasetTypes.small limits loading to the first 1000 rows.
path_to_csv = '../data/all_genres_downsampled.csv'
data_dict = get_datasets(path_to_csv, tokenizer, dataset_type=DatasetTypes.small)

train_dataset = data_dict['train_dataset']
val_dataset = data_dict['val_dataset']
test_dataset = data_dict['test_dataset']
idx2genre = data_dict['idx2genre']
genre2idx = data_dict['genre2idx']
# Genre names in the insertion order of genre2idx.
genres = list(genre2idx)

batch_size = 16
train_loader, val_loader, test_loader = get_dataloaders(train_dataset, val_dataset, test_dataset, batch_size)

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    PromptTuningConfig,
    get_peft_model,
    PromptTuningInit
)

# Prompt-tuning configuration: 50 virtual tokens initialised from the text below.
# The f-string inserts the genre names taken from the dataset.
# NOTE(review): the trailing .replace() chain doubles every brace in the finished
# text, so the placeholder ends up as the literal "{{input_lyrics}}" — confirm
# that this is intended.
prompt_config = PromptTuningConfig(
    task_type="CAUSAL_LM",
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text=f"""    You are an expert music genre classifier. Analyze these song lyrics features:
    1. Themes (love, party, rebellion)
    2. Language style (slang, poetic)
    3. Rhythmic patterns
    4. Cultural context
    
    Possible genres: {', '.join(genres)}
    
    Example 1:
    Lyrics: "We’re rolling down the street with the bass turned up, neon lights flashing..."
    Genre: Hip-Hop
    
    Example 2:
    Lyrics: "Sweet child o' mine, you're the only one that's on my mind"
    Genre: Rock
    
    Note: Be concise. Classify the following:
    Lyrics: {{input_lyrics}}
    Genre:""".replace("{", "{{").replace("}", "}}"),  # Real genres from the dataset are substituted into the text
    num_virtual_tokens=50,
    tokenizer_name_or_path=model_name,
    inference_mode=False
)

# Wrap the base model with the prompt-tuning adapter.
model = get_peft_model(model, prompt_config)
model.gradient_checkpointing_enable()
model.print_trainable_parameters()    # only the soft prompts are trained
model.to(device)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_scheduler

num_epochs = 100
# NOTE(review): warmup_steps is not used below; the scheduler warms up for 10% of all steps.
warmup_steps = 500
learning_rate = 5e-5

# Optimizer: only prompt embeddings are trainable
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

# The keyword is `label_smoothing`; `smoothing` is not accepted by CrossEntropyLoss.
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.1)

# Cosine schedule over the whole run, stepped once per batch.
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="cosine", optimizer=optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps
)

In [None]:
# Map each genre to the id of the first token of " <genre>"; the model is trained to
# emit that token right after the lyrics.
# NOTE(review): genres whose names start with the same token share an id — verify
# against the tokenizer.
label_token_ids = {g: tokenizer.encode(' ' + g, add_special_tokens=False)[0] for g in idx2genre.values()}


def _last_real_token_logits(logits, input_ids, attention_mask):
    """Return the logits at the last non-padding position of every sequence."""
    seq_len = input_ids.size(1)
    # Number of padded positions after the last attended token (0 for left padding).
    pad_tail = attention_mask.flip(dims=[1]).argmax(dim=1)
    # The logits can be longer than input_ids when the PEFT wrapper prepends
    # virtual tokens; shift the positions by that prefix length.
    prefix_len = logits.size(1) - seq_len
    positions = (prefix_len + seq_len - 1 - pad_tail).to(logits.device)
    rows = torch.arange(logits.size(0), device=logits.device)
    return logits[rows, positions, :]


def _first_genre_token_ids(labels, target_device):
    """Turn multi-hot label rows into the label-token id of the first active genre."""
    # The dataset yields multi-hot float rows; argmax returns the first active genre.
    genre_indices = labels.argmax(dim=1).tolist()
    return torch.tensor(
        [label_token_ids[idx2genre[i]] for i in genre_indices],
        device=target_device
    )


for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)  # multi-hot [batch, num_labels]

        # No `labels` argument: the LM loss is not used, only the logits are.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        lm_logits = outputs.logits  # [batch, seq_len (+ virtual tokens), vocab]

        # Predict the genre token from the last real token, not from padding.
        last_logit = _last_real_token_logits(lm_logits, input_ids, attention_mask)
        target_ids = _first_genre_token_ids(labels, last_logit.device)

        loss = loss_fn(last_logit.float(), target_ids)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    torch.cuda.empty_cache()
    print(f"Epoch {epoch+1}/{num_epochs} - Train loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict=True
            )
            logits = _last_real_token_logits(outputs.logits, input_ids, attention_mask)
            preds = logits.argmax(dim=-1)
            target_ids = _first_genre_token_ids(labels, preds.device)
            correct += (preds == target_ids).sum().item()
            total += labels.size(0)
    torch.cuda.empty_cache()
    # Guard against an empty validation loader.
    val_accuracy = correct / total if total else 0.0
    print(f"Epoch {epoch+1}/{num_epochs} - Val accuracy: {val_accuracy:.4f}")


In [None]:
# Genre -> id of the first token of " <genre>", and the reverse map for decoding.
# NOTE(review): genres that share a first token collapse to one entry in the reverse map.
label_token_ids = {g: tokenizer.encode(' ' + g, add_special_tokens=False)[0] for g in idx2genre.values()}
id2label_token = {v: k for k, v in label_token_ids.items()}

model.eval()
print("\nTesting on test set:")
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)  # multi-hot [batch, num_labels]
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        full_logits = outputs.logits

        # Read the prediction at the last real token rather than at a padded position.
        # The logits can be longer than input_ids when the PEFT wrapper prepends
        # virtual tokens, hence the prefix offset.
        seq_len = input_ids.size(1)
        pad_tail = attention_mask.flip(dims=[1]).argmax(dim=1)
        prefix_len = full_logits.size(1) - seq_len
        positions = (prefix_len + seq_len - 1 - pad_tail).to(full_logits.device)
        rows = torch.arange(full_logits.size(0), device=full_logits.device)
        preds = full_logits[rows, positions, :].argmax(dim=-1)

        for row_labels, pred_token in zip(labels.tolist(), preds.tolist()):
            # A row is multi-hot, so list every genre that is switched on.
            true_label = ", ".join(
                idx2genre[j] for j, flag in enumerate(row_labels) if flag == 1
            )
            pred_label = id2label_token.get(pred_token, 'UNKNOWN')
            print(f"Predicted genre is: {pred_label:<15} | Actual genre is: {true_label}")
