In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

# Compare two text encodings (Bag of Words vs. TF-IDF) for predicting
# 'popularity_bin' from 'song_document' with a default-parameter SVC, and
# report accuracy and weighted F1 on a held-out 20% test split.

DATA_PATH = "data/song_lyrics_en.csv"
MAX_ROWS = 10000

# Load only the first MAX_ROWS rows. `nrows` stops parsing once enough rows
# have been read, instead of loading the entire CSV and then discarding most
# of it with `.head()`.
df = pd.read_csv(DATA_PATH, nrows=MAX_ROWS)

# Rows with a missing document or label cannot be used: scikit-learn's text
# vectorizers raise on NaN documents and SVC rejects NaN targets. Filter them
# out for modelling while leaving `df` itself untouched.
df_clean = df.dropna(subset=['song_document', 'popularity_bin'])

# Use 'song_document' as features and 'popularity_bin' as the target
X = df_clean['song_document']
y = df_clean['popularity_bin']

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text Encoding: Bag of Words (raw token counts).
# The vocabulary is learned from the training split only, so no information
# from the test documents leaks into the features.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Text Encoding: TF-IDF (counts re-weighted by inverse document frequency),
# again fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Model Training: SVC for Bag of Words
svc_bow = SVC()
svc_bow.fit(X_train_bow, y_train)
y_pred_bow = svc_bow.predict(X_test_bow)

# Model Training: SVC for TF-IDF
svc_tfidf = SVC()
svc_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = svc_tfidf.predict(X_test_tfidf)

# Evaluation: Accuracy and F1-score for Bag of Words.
# 'weighted' averages the per-class F1 scores weighted by class support.
accuracy_bow = accuracy_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow, average='weighted')

# Evaluation: Accuracy and F1-score for TF-IDF
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf, average='weighted')

# Display the results
print(f"Bag of Words - Accuracy: {accuracy_bow:.4f}, F1-score: {f1_bow:.4f}")
print(f"TF-IDF - Accuracy: {accuracy_tfidf:.4f}, F1-score: {f1_tfidf:.4f}")
