In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import joblib


In [None]:
CSV_PATH = "song_lyrics_425000_sampled.csv"

df = pd.read_csv(CSV_PATH)

# Fail fast when either of the two columns this notebook relies on is absent
assert "clean_lyrics" in set(df.columns)
assert "tag" in set(df.columns)

# The five genres the classifiers are trained to separate
target_genres = ["pop", "rock", "rap", "rb", "country"]

# Row filtering, applied in the same order as before:
#   1. drop rows with missing lyrics or a missing tag,
#   2. drop rows whose lyrics are empty after stripping whitespace,
#   3. keep only rows whose tag is one of the target genres.
df = (
    df
    .loc[lambda frame: frame["clean_lyrics"].notna()]
    .loc[lambda frame: frame["tag"].notna()]
    .loc[lambda frame: frame["clean_lyrics"].str.strip() != ""]
    .loc[lambda frame: frame["tag"].isin(target_genres)]
)

print("Data size:", df.shape)
print(df["tag"].value_counts())


Data size: (424999, 12)
tag
country    85000
rap        85000
rb         85000
rock       85000
pop        84999
Name: count, dtype: int64


In [None]:
# Inputs are the lyric strings, labels are the genre tags
X_text, y_tag = df["clean_lyrics"].values, df["tag"].values

# Hold out 10% of the rows for testing; stratifying on the tag keeps the
# genre proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y_tag, test_size=0.1, random_state=42, stratify=y_tag
)

print("Train size:", len(X_train_text))
print("Test size:", len(X_test_text))

# Alphabetically ordered list of the genre labels present in the training split
genres = sorted(set(y_train))
print("Genres:", genres)


Train size: 382499
Test size: 42500
Genres: ['country', 'pop', 'rap', 'rb', 'rock']


In [None]:
binary_models = {}
binary_vectorizers = {}

# TF-IDF fitting is unsupervised: it only looks at the training texts, never at
# the labels. Fitting a fresh vectorizer inside the per-genre loop therefore
# produced the same vocabulary and IDF weights every time, at the cost of
# re-tokenising the whole training corpus once per genre. Fit it once and reuse
# the resulting matrix for every binary model.
vect = TfidfVectorizer(
    max_features=150_000,
    ngram_range=(1, 2),
    min_df=3,
    stop_words="english",
    sublinear_tf=True
)
X_train_vec = vect.fit_transform(X_train_text)

for g in genres:
    print("\n===============================")
    print(f"Training binary model for: {g}")
    print("===============================")

    # Binary target: 1 = this genre, 0 = others
    y_train_bin = (y_train == g).astype(int)

    # One-vs-rest logistic regression. class_weight="balanced" compensates for
    # the positive class being a minority of the rows; random_state pins the
    # data shuffling done by the "saga" solver so reruns give the same model.
    model = LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        C=2.0,
        solver="saga",
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train_vec, y_train_bin)

    # Keep the per-genre lookup structure that the prediction helpers expect;
    # every genre key points at the same fitted vectorizer instance.
    binary_models[g] = model
    binary_vectorizers[g] = vect

    # NOTE: this report is computed on the training data itself, so it is an
    # optimistic sanity check, not an estimate of generalisation performance.
    y_pred_bin = model.predict(X_train_vec)
    print(classification_report(y_train_bin, y_pred_bin, digits=3))



Training binary model for: country
              precision    recall  f1-score   support

           0      0.981     0.895     0.936    305999
           1      0.688     0.930     0.791     76500

    accuracy                          0.902    382499
   macro avg      0.835     0.912     0.863    382499
weighted avg      0.922     0.902     0.907    382499


Training binary model for: pop
              precision    recall  f1-score   support

           0      0.964     0.762     0.851    306000
           1      0.482     0.886     0.624     76499

    accuracy                          0.787    382499
   macro avg      0.723     0.824     0.738    382499
weighted avg      0.867     0.787     0.806    382499


Training binary model for: rap
              precision    recall  f1-score   support

           0      0.986     0.943     0.964    305999
           1      0.806     0.945     0.870     76500

    accuracy                          0.944    382499
   macro avg      0.896     

In [None]:
def predict_genre_separate_models(text: str):
    """
    Score one lyrics string with every per-genre binary classifier.

    For each genre in the notebook-level `genres` list, the text is vectorized
    with that genre's entry in `binary_vectorizers` and scored with that
    genre's entry in `binary_models`.

    returns:
        best_genre: genre whose positive-class probability is highest
                    (the first one in `genres` order when there is a tie)
        scores: dict mapping genre -> positive-class probability
    """
    document = [text]  # the vectorizers expect an iterable of documents

    scores = {}
    for genre in genres:
        features = binary_vectorizers[genre].transform(document)
        # Row 0 is the single input document; column 1 is the positive class
        scores[genre] = binary_models[genre].predict_proba(features)[0][1]

    best_genre = max(scores, key=lambda genre: scores[genre])
    return best_genre, scores


In [None]:
# Score the whole test set in one batch per genre instead of calling the
# single-text helper once per song (which issues a one-row transform and a
# one-row predict_proba for every genre and every test document).
# Each column of `genre_probs` holds one genre's positive-class probability,
# in the same order as `genres`.
genre_probs = np.column_stack([
    binary_models[g].predict_proba(
        binary_vectorizers[g].transform(X_test_text)
    )[:, 1]
    for g in genres
])

# argmax returns the first maximum, so ties resolve to the earliest genre in
# `genres` exactly as the per-text helper does.
y_pred = [genres[idx] for idx in genre_probs.argmax(axis=1)]

print("\n===============================")
print("Evaluation: Separate Binary Models")
print("===============================")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))


Evaluation: Separate Binary Models
Accuracy: 0.6122588235294117
              precision    recall  f1-score   support

     country      0.673     0.723     0.697      8500
         pop      0.386     0.325     0.353      8500
         rap      0.822     0.812     0.817      8500
          rb      0.608     0.628     0.618      8500
        rock      0.538     0.573     0.555      8500

    accuracy                          0.612     42500
   macro avg      0.606     0.612     0.608     42500
weighted avg      0.606     0.612     0.608     42500



In [None]:
# Persist both lookup tables (genre -> model, genre -> vectorizer) to disk
artifacts = (
    (binary_models, "genre_binary_models.pkl"),
    (binary_vectorizers, "genre_binary_vectorizers.pkl"),
)

for payload, filename in artifacts:
    joblib.dump(payload, filename)

# Report only after every file has been written
for _, filename in artifacts:
    print("Saved: " + filename)


Saved: genre_binary_models.pkl
Saved: genre_binary_vectorizers.pkl


In [None]:
# Smoke test: run a hand-picked lyrics sample through the per-genre binary
# models and show which genre scores highest.
lyrics_example = """
One thing, I don't know why
It doesn't even matter how hard you try
Keep that in mind, I designed this rhyme
To explain in due time, all I know
Time is a valuable thing
Watch it fly by as the pendulum swings
Watch it count down to the end of the day
The clock ticks life away, it's so unreal
Didn't look out below
Watch the time go right out the window
Tryin' to hold on, d-didn't even know
I wasted it all just to watch you go

I kept everything inside
And even though I tried, it all fell apart
What it meant to me will eventually be
A memory of a time when I tried so hard

I tried so hard and got so far
But in the end, it doesn't even matter
I had to fall to lose it all
But in the end, it doesn't even matter
"""

# `score_table` receives the genre -> probability dict returned by the helper;
# only the winning label is printed in this cell.
predicted, score_table = predict_genre_separate_models(lyrics_example)

print("Predicted genre:", predicted)


Predicted genre: rock


In [None]:
def predict_genre_with_percentages(text: str, top_k=None):
    """
    Rank the genres for one lyrics string by positive-class probability.

    text: lyrics string
    top_k: if set (e.g. 3), only return the top_k genres; 0 returns an empty
           ranking, None returns every genre

    returns:
        best_genre: genre with highest probability (always taken from the
                    full ranking, so it is valid even when top_k is 0)
        scores_sorted: list of (genre, probability_float) sorted desc

    raises:
        ValueError: if top_k is negative (a negative slice bound would
                    silently drop entries from the end of the ranking)
    """
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be None or a non-negative integer, got {top_k!r}")

    # Reuse the per-genre scoring helper rather than duplicating its loop
    _, scores = predict_genre_separate_models(text)

    # Sort by probability descending; the sort is stable, so tied genres keep
    # the order in which the helper scored them.
    scores_sorted = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    # Pick the winner before truncating so that top_k=0 cannot cause an
    # IndexError on an empty list.
    best_genre = scores_sorted[0][0]

    if top_k is not None:
        scores_sorted = scores_sorted[:top_k]

    return best_genre, scores_sorted


In [None]:
def print_genre_scores(scores_sorted):
    """
    Print a small table of genre probabilities as percentages.

    scores_sorted: iterable of (genre, prob) pairs, e.g. the second value
                   returned by predict_genre_with_percentages; entries are
                   printed in the order given
    """
    print("Genre probabilities:")
    for entry in scores_sorted:
        label, probability = entry
        # Genre name left-aligned in 8 columns, percentage with 2 decimals
        row = "  {:8s} : {:5.2f}%".format(label, probability * 100)
        print(row)


In [None]:
# Second sample: rebinds `lyrics_example` and prints the full ranked list of
# per-genre scores rather than just the winning label.
# NOTE(review): each genre is scored by its own independent binary classifier,
# so the printed percentages are not a distribution and need not sum to 100%.
lyrics_example = """
I was so high I did not recognize
The fire burning in her eyes
The chaos that controlled my mind
Whispered goodbye as she got on a plane
Never to return again
But always in my heart, oh
This love has taken its toll on me
She said goodbye too many times before
And her heart is breaking in front of me
And I have no choice, 'cause I won't say goodbye anymore
Whoa
Whoa
Whoa
I tried my best to feed her appetite
Keep her coming every night
So hard to keep her satisfied, oh
Kept playing love like it was just a game
Pretending to feel the same
Then turn around and leave again, but uh-oh
This love has taken its toll on me
She said goodbye too many times before
And her heart is breaking in front of me
And I have no choice, 'cause I won't say goodbye anymore
Whoa
Whoa
Whoa
I'll fix these broken things, repair your broken wings
And make sure everything's alright (it's alright, it's alright), oh, oh
My pressure on your hips, sinking my fingertips
Into every inch of you because I know that's what you want me to do
This love has taken its toll on me
She said goodbye too many times before
Her heart is breaking in front of me
And I have no choice 'cause I won't say goodbye anymore
This love has taken its toll on me
She said goodbye too many times before
And my heart is breaking in front of me
She said goodbye too many times before
This love has taken its toll on me (oh, yeah, yeah)
She said goodbye too many times before (yeah)
And her heart is breaking in front of me (yeah, oh)
And I have no choice 'cause I won't say goodbye anymore (yeah)3
"""

# No top_k is passed, so every genre appears in the printed ranking
best_genre, scores_sorted = predict_genre_with_percentages(lyrics_example)
print("Predicted main genre:", best_genre)
print_genre_scores(scores_sorted)


Predicted main genre: pop
Genre probabilities:
  pop      : 82.59%
  rock     : 38.15%
  rb       : 37.73%
  country  : 14.11%
  rap      :  1.16%
