In [None]:
import pandas as pd

url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

In [None]:
df.head()

In [None]:
df["genre"].unique()

In [None]:
engineered_features = ['dating', 'violence', 'world/life', 'night/time','shake the audience','family/gospel', 'romantic', 'communication','obscene', 'music', 'movement/places', 'light/visual perceptions','family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability','loudness', 'acousticness', 'instrumentalness', 'valence', 'energy', "genre"]      
df_ef = df[engineered_features]
df_lyrics = df[['lyrics', 'genre']]

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

from sklearn.preprocessing import StandardScaler

def prepare_df(df):
    y = df["genre"]
    y = le.fit_transform(y)
    X = df.drop(columns=["genre"])

    # Use StandardScaler for more stable training
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    return X, y

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE

# Load the data
url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

# Select only the engineered features
engineered_features = ['dating', 'violence', 'world/life', 'night/time', 'shake the audience', 'family/gospel', 
                       'romantic', 'communication', 'obscene', 'music', 'movement/places', 
                       'light/visual perceptions', 'family/spiritual', 'like/girls', 'sadness', 
                       'feelings', 'danceability', 'loudness', 'acousticness', 'instrumentalness', 
                       'valence', 'energy', 'genre']
df_ef = df[engineered_features]

# Encode the target labels
le = LabelEncoder()
df_ef.loc[:, "genre"] = le.fit_transform(df_ef["genre"])
# Separate features and labels
X = df_ef.drop(columns=["genre"]).values
y = df_ef["genre"].values

# Use MinMax scaling for more consistent gradients
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Oversample the minority classes
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test, dtype=torch.long)

# Create DataLoader for training and validation sets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

# Define the DNN model with more aggressive regularization
class DeepDNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(DeepDNN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
input_size = X_train.shape[1]
num_classes = len(le.classes_)
model = DeepDNN(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.002, weight_decay=1e-4)

# Use a more aggressive learning rate scheduler
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30)

# Training loop with early stopping
num_epochs = 100
best_val_acc = 0
patience = 10
early_stop_counter = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    val_acc = 100 * correct / total
    scheduler.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, "
          f"Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {val_acc:.2f}%")

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        early_stop_counter = 0
        print(f"New best validation accuracy: {val_acc:.2f}%")
    else:
        early_stop_counter += 1

    if early_stop_counter >= patience:
        print("Early stopping triggered.")
        break

print(f"Best Validation Accuracy: {best_val_acc:.2f}%")

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
num_classes = len(le.classes_)
print("Number of classes:", num_classes)

In [None]:
print(pd.Series(y_train).value_counts())

In [78]:
!pip install imblearn

