In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Load the dataset, capped at the first 10,000 rows. The embedding arrays are
# truncated to `slice_num` (10000) rows further down and then hstacked with
# features derived from this frame, so the row counts have to match.
df = pd.read_csv("data/song_lyrics_en.csv", nrows=10_000)


In [2]:

# Features come from the 'song_document' column, the target from 'popularity_bin'.
X, y = df['song_document'], df['popularity_bin']

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)


In [3]:
# Number of leading rows kept from each embedding array.
slice_num = 10_000

In [4]:
import numpy as np

# Read the tag and artist embedding arrays from their .npy files.
tag_embeddings, artist_embeddings = (
    np.load('data/tag_embeddings.npy'),
    np.load('data/artist_embeddings.npy'),
)

In [5]:
# Take index 0 along axis 1 and keep only the first `slice_num` rows,
# then display the resulting shape.
tag_embeddings = tag_embeddings[:slice_num, 0, :]
tag_embeddings.shape

(10000, 3)

In [6]:
# Take index 0 along axis 1 and keep only the first `slice_num` rows,
# then display the resulting shape.
artist_embeddings = artist_embeddings[:slice_num, 0, :]
artist_embeddings.shape

(10000, 600)

In [7]:
# The lyric split (X_train / X_test) and this split only pick the same rows when
# every array has the same number of rows, because train_test_split derives its
# shuffle from the row count and random_state. Fail early with a clear message
# instead of hitting a shape error when the features are concatenated later.
assert len(artist_embeddings) == len(tag_embeddings) == len(X), (
    f"row count mismatch: artist={len(artist_embeddings)}, "
    f"tag={len(tag_embeddings)}, lyrics={len(X)}"
)

# Split both embedding arrays in a single call so they share the same
# train/test indices (same test_size and seed as the lyric split).
(
    artist_embedding_train,
    artist_embedding_test,
    tag_embedding_train,
    tag_embedding_test,
) = train_test_split(
    artist_embeddings,
    tag_embeddings,
    test_size=0.1,
    random_state=42,
)

In [10]:
# Display the shape of the training portion of the tag embeddings.
tag_embedding_train.shape

(9000, 3)

In [13]:
# Display the shape of the training portion of the artist embeddings.
artist_embedding_train.shape

(9000, 600)

In [8]:

# Two text encoders with default settings: raw term counts and TF-IDF weights.
bow_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# Fit each encoder on the training documents and encode them in one step.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Encode the held-out documents with the already-fitted encoders.
X_test_bow = bow_vectorizer.transform(X_test)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [11]:
# Display the shape of the TF-IDF training matrix.
X_train_tfidf.shape

(9000, 67866)

In [14]:
from scipy.sparse import hstack

# Embedding blocks appended (in this order) to the right of every text matrix.
train_extras = [tag_embedding_train, artist_embedding_train]
test_extras = [tag_embedding_test, artist_embedding_test]

# Bag of Words + embeddings
X_train_bow_concat = hstack([X_train_bow, *train_extras])
X_test_bow_concat = hstack([X_test_bow, *test_extras])

# TF-IDF + embeddings
X_train_tfidf_concat = hstack([X_train_tfidf, *train_extras])
X_test_tfidf_concat = hstack([X_test_tfidf, *test_extras])


In [15]:

# Fit one default SVC per feature set and predict on the matching test matrix.
svc_bow = SVC().fit(X_train_bow_concat, y_train)
y_pred_bow = svc_bow.predict(X_test_bow_concat)

svc_tfidf = SVC().fit(X_train_tfidf_concat, y_train)
y_pred_tfidf = svc_tfidf.predict(X_test_tfidf_concat)

# Accuracy and weighted F1 for the Bag of Words model.
accuracy_bow, f1_bow = (
    accuracy_score(y_test, y_pred_bow),
    f1_score(y_test, y_pred_bow, average='weighted'),
)

# Accuracy and weighted F1 for the TF-IDF model.
accuracy_tfidf, f1_tfidf = (
    accuracy_score(y_test, y_pred_tfidf),
    f1_score(y_test, y_pred_tfidf, average='weighted'),
)

# Report both models side by side.
print(f"Bag of Words - Accuracy: {accuracy_bow:.4f}, F1-score: {f1_bow:.4f}")
print(f"TF-IDF - Accuracy: {accuracy_tfidf:.4f}, F1-score: {f1_tfidf:.4f}")


Bag of Words - Accuracy: 0.5970, F1-score: 0.5167
TF-IDF - Accuracy: 0.6700, F1-score: 0.6391
