In [1]:
# Future imports
from __future__ import annotations

# Standard library imports
import csv
import os
import sys
import warnings
from typing import Set

# Third-party imports
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import soundfile as sf
import tensorflow as tf
import torch
import whisper
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import (
    Add,
    Conv1D,
    Dense,
    Dropout,
    Flatten,
    GlobalAveragePooling1D,
    Input,
    LayerNormalization,
    MultiHeadAttention,
)

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

In [2]:
import pandas as pd

# Read the input CSV file
df = pd.read_csv("C:/Users/yingx/OneDrive/Documents/GitHub/Capstone/data/static_annotations_averaged_songs_1_2000.csv")  # replace with your actual filename

# Define a function to determine emotion
def get_emotion(row):
    if row["valence_mean"] <= 5 and row["arousal_mean"] <= 5:
        return "Sad"
    elif row["valence_mean"] <= 5 and row["arousal_mean"] >= 5:
        return "Anger"
    elif row["valence_mean"] >= 5 and row["arousal_mean"] <= 5:
        return "Relaxed"
    elif row["valence_mean"] >= 5 and row["arousal_mean"] >= 5:
        return "Happy"
    else:
        return "Unknown"

# Apply the function to each row
df["emotion"] = df.apply(get_emotion, axis=1)

# Create a new DataFrame with only id and emotion
output = df[["song_id", "emotion"]].rename(columns={"song_id": "id"})

# Save to a new CSV file
output.to_csv("emotion2.csv", index=False)

print("emotion2.csv has been created successfully!")


emotion2.csv has been created successfully!


In [4]:
# Merge features.csv, transcription.csv, static_annotations_averaged_songs.csv, and emotion.csv into a singular CSV file

# Read csv
features_df = pd.read_csv(r'C:/Users/yingx/data/features.csv')
transcription_df = pd.read_csv(r'C:/Users/yingx/data/transcription.csv')
emotion_df = pd.read_csv(r'C:/Users/yingx/OneDrive/Documents/GitHub/Capstone/emotion2.csv')

# Merge on 'id' cloumn
merged_df = features_df.merge(transcription_df, on='id', how='left').merge(emotion_df, on='id', how='left')

# Save the merged DataFrame to a new CSV file
merged_df.to_csv(r'C:/Users/yingx/data/merged_data.csv', index=False)
print("Data merged successfully into merged_data.csv")
print(merged_df.head())


Data merged successfully into merged_data.csv
   id   duration       tempo  spectral_centroid  zero_crossing_rate  \
0   2  45.060998  139.674831        1898.573651            0.036521   
1   3  45.034875   95.703125        1049.662623            0.013481   
2   4  45.034875   84.720799        1918.833508            0.037387   
3   5  45.034875  123.046875        1900.762700            0.036257   
4   7  45.060998  120.185320        1268.646669            0.012906   

                                      chroma_feature  \
0  [0.40816259384155273, 0.33845534920692444, 0.3...   
1  [0.6668778657913208, 0.6146547794342041, 0.617...   
2  [0.4336817264556885, 0.3621821701526642, 0.355...   
3  [0.4125290513038635, 0.40957579016685486, 0.38...   
4  [0.5245635509490967, 0.48823004961013794, 0.54...   

                                               mfccs       rms   harmonicity  \
0  [-200.25965881347656, 171.7724609375, -32.7078...  0.105231 -8.804523e-06   
1  [-285.9258728027344, 159.13

In [8]:
# Split the merged data into training and testing sets
# 1. Read the merged data
merged_df = pd.read_csv(r'C:/Users/yingx/data/merged_data.csv')
# 2. Keep only rows where emotion is not NaN (only labeled songs)
labeled_df = merged_df.dropna(subset=['emotion'])
# 3. Define features and target
X = labeled_df.drop(columns=['id', 'emotion'])
y = labeled_df['emotion']
# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

print("Split completed successfully!")

Train shape: (1395, 9) (1395,)
Test shape: (349, 9) (349,)
Split completed successfully!


In [None]:
# Prepare data for the model
# First, handle columns that contain string representations of lists
import ast

# Convert string representations of lists to actual numeric values
def convert_string_lists_to_floats(df):
    """Convert columns containing string representations of lists to numeric columns"""
    df_copy = df.copy()
    
    for col in df_copy.columns:
        try:
            # Check if column contains string representations of lists
            sample = str(df_copy[col].iloc[0])
            if sample.startswith('['):
                # Convert string representation to list, then to float
                df_copy[col] = df_copy[col].apply(lambda x: float(ast.literal_eval(x)[0]) if isinstance(x, str) else x)
        except:
            # If conversion fails, leave the column as is
            pass
    
    return df_copy

# Apply conversion to training and test data
X_train = convert_string_lists_to_floats(X_train)
X_test = convert_string_lists_to_floats(X_test)

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
num_classes = len(label_encoder.classes_)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for CNN (add channel dimension)
# Shape: (samples, time_steps, channels)
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Convert labels to categorical
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)

print(f"X_train shape: {X_train_reshaped.shape}")
print(f"X_test shape: {X_test_reshaped.shape}")
print(f"y_train shape: {y_train_categorical.shape}")
print(f"y_test shape: {y_test_categorical.shape}")
print(f"Number of emotion classes: {num_classes}")
print(f"Emotion classes: {label_encoder.classes_}")


ValueError: could not convert string to float: '[0.3425864279270172, 0.274483859539032, 0.3878319561481476, 0.357040137052536, 0.5404819846153259, 0.344219446182251, 0.3159320056438446, 0.4085683524608612, 0.29132962226867676, 0.34003308415412903, 0.26292145252227783, 0.37946224212646484]'

In [None]:
# Build Transformer CNN Model
def build_transformer_cnn_model(input_shape, num_classes, num_heads=4, ff_dim=128):
    """
    Build a Transformer CNN model combining CNN layers with Transformer attention.
    
    Args:
        input_shape: Shape of input data (time_steps, channels)
        num_classes: Number of emotion classes
        num_heads: Number of attention heads
        ff_dim: Dimension of feed-forward network
    
    Returns:
        Compiled Keras model
    """
    inputs = Input(shape=input_shape)
    
    # CNN feature extraction
    x = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(inputs)
    x = Dropout(0.2)(x)
    x = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Conv1D(filters=256, kernel_size=3, padding='same', activation='relu')(x)
    x = Dropout(0.2)(x)
    
    # Transformer blocks
    # Layer normalization
    x = LayerNormalization(epsilon=1e-6)(x)
    
    # Multi-head attention
    attention_output = MultiHeadAttention(
        num_heads=num_heads,
        key_dim=256 // num_heads,
        dropout=0.1
    )(x, x)
    
    # Residual connection
    x = Add()([x, attention_output])
    x = LayerNormalization(epsilon=1e-6)(x)
    
    # Feed-forward network
    x = Dense(ff_dim, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(256)(x)
    
    # Residual connection
    residual = x
    x = Add()([x, attention_output])
    x = LayerNormalization(epsilon=1e-6)(x)
    
    # Global average pooling
    x = GlobalAveragePooling1D()(x)
    
    # Classification layers
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    return model

# Build the model
model = build_transformer_cnn_model(
    input_shape=X_train_reshaped.shape[1:],
    num_classes=num_classes,
    num_heads=4,
    ff_dim=128
)

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Display model architecture
print("Model Architecture:")
model.summary()


In [None]:
# Train the Transformer CNN model
# Define callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    verbose=1
)

# Train the model
print("Training Transformer CNN Model...")
history = model.fit(
    X_train_reshaped,
    y_train_categorical,
    validation_data=(X_test_reshaped, y_test_categorical),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

print("\nTraining completed!")

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test_categorical, verbose=0)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Make predictions
y_pred = model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test_categorical, axis=1)

# Convert back to emotion labels
y_pred_emotions = label_encoder.inverse_transform(y_pred_classes)
y_test_emotions = label_encoder.inverse_transform(y_test_classes)

print(f"\nSample Predictions:")
print(f"Predicted: {y_pred_emotions[:10]}")
print(f"Actual:    {y_test_emotions[:10].values}")

# Save the model
model.save('emotion_predictor.h5')
print("\nModel saved as 'emotion_predictor.h5'")
