In [1]:
# build_binary_era_datasets.py
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Input CSV (its "song_era" column drives everything below) and the root
# folder that receives one sub-folder per era.
RAW_PATH = "../datasets_old/song_lyrics_map_era.csv"
OUTPUT_DIR = "../binary_datasets_thunder"

# Upper bound on how many positives (and, equally, negatives) any single
# binary dataset may contain.
MAX_TOTAL_LIMIT = 200_000

os.makedirs(OUTPUT_DIR, exist_ok=True)
df = pd.read_csv(RAW_PATH)

# Distinct era labels in sorted order; one binary dataset is built per label.
ALL_ERAS = sorted(df["song_era"].unique())
print("ALL ERAS:", ALL_ERAS)


def compute_balanced_samples(
    pos_count: int,
    negative_sizes: dict,
    max_total_limit: int | None = None,
):
    """
    pos_count          = number of items in target class
    negative_sizes     = dict of other_eras -> size
    max_total_limit    = optional hard cap (e.g. 200k) applied to pos & neg
    """
    # Average size across other classes
    other_counts = list(negative_sizes.values())
    max_neg_per_class = sum(other_counts) / len(other_counts)

    # Initial target for negatives = min(pos, avg_of_others)
    neg_total_target = min(pos_count, int(max_neg_per_class))

    # Can't sample more than we actually have in all negatives combined
    max_possible_neg = sum(other_counts)
    neg_total_target = min(neg_total_target, max_possible_neg)

    # Apply global max cap (if any)
    if max_total_limit is not None:
        neg_total_target = min(neg_total_target, max_total_limit)

    # Also cap positives to this target so pos/neg are equal
    pos_samples = min(pos_count, neg_total_target)
    neg_total_target = pos_samples  # enforce balance

    if neg_total_target == 0:
        return {
            "pos_samples": 0,
            "neg_total": 0,
            "neg_per_class": {era: 0 for era in negative_sizes},
        }

    num_classes = len(negative_sizes)
    base_each = neg_total_target // num_classes
    leftover = neg_total_target - base_each * num_classes

    distribution = {era: 0 for era in negative_sizes}
    free_slots = 0

    # First pass: give base_each to each class (capped by availability)
    for era, size in negative_sizes.items():
        take = min(size, base_each)
        distribution[era] = take
        free_slots += (base_each - take)

    # Add leftover into free_slots pool
    free_slots += leftover

    # Second pass: fill remaining slots fairly among classes that still have room
    # Note: this is simple but effective; dataset is large so perf is fine.
    while free_slots > 0:
        progressed = False
        for era, size in negative_sizes.items():
            if free_slots <= 0:
                break
            if distribution[era] < size:
                distribution[era] += 1
                free_slots -= 1
                progressed = True
        if not progressed:
            # No class can take more; break to avoid infinite loop
            break

    # Final neg_total might be slightly less than target if we ran out of data
    final_neg_total = sum(distribution.values())
    # To keep strict balance, also shrink pos_samples if needed
    pos_samples = min(pos_samples, final_neg_total)

    return {
        "pos_samples": pos_samples,
        "neg_total": final_neg_total,
        "neg_per_class": distribution,
    }


# Per-era row counts, computed once so the loop below does not re-scan the
# whole frame for every (target era, other era) pair.
era_counts = df["song_era"].value_counts()

for era in ALL_ERAS:
    print("\n========================================")
    print(f"BUILDING DATASET FOR ERA = {era}")
    print("========================================")

    # 1) POSITIVE samples (all rows where era == target)
    pos_all = df[df["song_era"] == era]
    num_pos = len(pos_all)
    print(f"Positive available: {num_pos}")

    # 2) NEGATIVE class sizes (per other era), as plain ints for clean logs
    negative_sizes = {
        other_era: int(era_counts[other_era])
        for other_era in ALL_ERAS
        if other_era != era
    }
    print("Negative sizes:", negative_sizes)

    # 3) Compute how many to sample (pos + per-negative-class)
    sampling_plan = compute_balanced_samples(
        pos_count=num_pos,
        negative_sizes=negative_sizes,
        max_total_limit=MAX_TOTAL_LIMIT,
    )

    pos_samples = sampling_plan["pos_samples"]
    neg_per_class = sampling_plan["neg_per_class"]
    neg_total = sampling_plan["neg_total"]

    print(f"Sampling positives: {pos_samples}")
    print(f"Total negatives: {neg_total}")
    print(f"Negative per class: {neg_per_class}")

    if pos_samples == 0 or neg_total == 0:
        # Nothing is written for this era; in particular no empty output
        # folder is left behind for later steps that list OUTPUT_DIR.
        print(f"Skip era {era}: not enough data to build balanced dataset.")
        continue

    # 4) Sample positives
    pos_df = pos_all.sample(n=pos_samples, random_state=42)

    # 5) Sample negatives per class using computed distribution
    neg_dfs = []
    for other_era, n_samples in neg_per_class.items():
        if n_samples <= 0:
            continue
        subset = df[df["song_era"] == other_era]
        neg_dfs.append(subset.sample(n=n_samples, random_state=42))

    neg_df = pd.concat(neg_dfs, axis=0).reset_index(drop=True)

    print(f"Sampled positives: {len(pos_df)}")
    print(f"Sampled negatives: {len(neg_df)}")

    # 6) Combine dataset
    combined = pd.concat([pos_df, neg_df], axis=0).reset_index(drop=True)

    # 7) Add binary label: 1 for the target era, 0 for every other era
    binary_col = f"is_{era}"
    combined[binary_col] = (combined["song_era"] == era).astype(int)

    print("Final combined dataset size:", len(combined))
    print("Label balance:\n", combined[binary_col].value_counts())

    # 8) Train/Val/Test split (stratified).  Test is 10% of everything;
    #    val is 10% of the remaining 90%.
    train_df, test_df = train_test_split(
        combined,
        test_size=0.10,
        random_state=42,
        stratify=combined[binary_col],
    )
    train_df, val_df = train_test_split(
        train_df,
        test_size=0.10,
        random_state=42,
        stratify=train_df[binary_col],
    )

    print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

    # 9) Save datasets (the folder is only created once there is data to write)
    era_folder = os.path.join(OUTPUT_DIR, era)
    os.makedirs(era_folder, exist_ok=True)

    train_df.to_csv(os.path.join(era_folder, "train.csv"), index=False)
    val_df.to_csv(os.path.join(era_folder, "val.csv"), index=False)
    test_df.to_csv(os.path.join(era_folder, "test.csv"), index=False)

print("\nAll binary datasets created successfully with balanced pos/neg & fair per-era negatives!")

ALL ERAS: ['1970s', '1980s', '1990s', '2000s', '2010s', '2020s']

BUILDING DATASET FOR ERA = 1970s
Positive available: 64361
Negative sizes: {'1980s': np.int64(95967), '1990s': np.int64(219135), '2000s': np.int64(443960), '2010s': np.int64(1676867), '2020s': np.int64(725925)}
Sampling positives: 64361
Total negatives: 64361
Negative per class: {'1980s': 12873, '1990s': 12872, '2000s': 12872, '2010s': 12872, '2020s': 12872}
Sampled positives: 64361
Sampled negatives: 64361
Final combined dataset size: 128722
Label balance:
 is_1970s
1    64361
0    64361
Name: count, dtype: int64
Train: 104264 | Val: 11585 | Test: 12873

BUILDING DATASET FOR ERA = 1980s
Positive available: 95967
Negative sizes: {'1970s': np.int64(64361), '1990s': np.int64(219135), '2000s': np.int64(443960), '2010s': np.int64(1676867), '2020s': np.int64(725925)}
Sampling positives: 95967
Total negatives: 95967
Negative per class: {'1970s': 19194, '1990s': 19194, '2000s': 19193, '2010s': 19193, '2020s': 19193}
Sampled pos

In [3]:
# train_logreg_binary.py
import os
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


BASE_DIR = "../binary_datasets_thunder"
SAVE_DIR = "../logreg_binary_thunder"
TEXT_COL = "clean_lyrics"

os.makedirs(SAVE_DIR, exist_ok=True)

# Only treat an entry of BASE_DIR as an era when it actually contains a
# training split; stray files and empty folders are ignored instead of
# crashing the loop at read_csv.
eras = sorted(
    name
    for name in os.listdir(BASE_DIR)
    if os.path.isfile(os.path.join(BASE_DIR, name, "train.csv"))
)
print("Found binary dataset eras:", eras)


# Train, evaluate and save one TF-IDF + logistic-regression model per era.
for era in eras:
    print("\n========================================")
    print(f" TRAINING LOGISTIC REGRESSION FOR ERA = {era}")
    print("========================================")

    era_folder = os.path.join(BASE_DIR, era)

    # ---------- 1. LOAD DATA ----------
    train_df = pd.read_csv(f"{era_folder}/train.csv")
    val_df   = pd.read_csv(f"{era_folder}/val.csv")
    test_df  = pd.read_csv(f"{era_folder}/test.csv")

    # Binary target column: "is_<era>".
    bin_col = f"is_{era}"

    y_train = train_df[bin_col].values
    y_val   = val_df[bin_col].values
    y_test  = test_df[bin_col].values

    print(f"Training samples: {len(train_df)}, Positive ratio: {train_df[bin_col].mean():.3f}")

    # ---------- 2. TF-IDF (fit on train only) ----------
    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=80_000,
        ngram_range=(1, 2),
        min_df=3,
    )

    print("Fitting TF-IDF...")
    X_train = vectorizer.fit_transform(train_df[TEXT_COL])
    X_val   = vectorizer.transform(val_df[TEXT_COL])
    X_test  = vectorizer.transform(test_df[TEXT_COL])

    print("TF-IDF shape:", X_train.shape)

    # ---------- 3. Logistic Regression ----------
    # n_jobs is intentionally not passed: the liblinear solver ignores it and
    # scikit-learn only emits a warning when it is set.
    clf = LogisticRegression(
        max_iter=1500,
        class_weight="balanced",
        solver="liblinear",
    )

    print("Training Logistic Regression model...")
    clf.fit(X_train, y_train)

    # ---------- 4. EVALUATION ----------
    def eval_split(name, X, y):
        """Print accuracy and the classification report for one split; return the accuracy."""
        preds = clf.predict(X)
        acc = accuracy_score(y, preds)
        print(f"\n=== {name} ACCURACY: {acc:.4f}")
        print(classification_report(y, preds))
        return acc

    eval_split("TRAIN", X_train, y_train)
    eval_split("VAL",   X_val,   y_val)
    eval_split("TEST",  X_test,  y_test)

    # ---------- 5. SAVE MODEL ----------
    save_path = f"{SAVE_DIR}/{era}"
    os.makedirs(save_path, exist_ok=True)

    joblib.dump(clf, f"{save_path}/logreg.joblib")
    joblib.dump(vectorizer, f"{save_path}/tfidf.joblib")

    print(f"Saved model & TF-IDF vectorizer → {save_path}")

print("\n🎉 All binary logistic regression models trained successfully!")

Found binary dataset eras: ['1970s', '1980s', '1990s', '2000s', '2010s', '2020s']

 TRAINING LOGISTIC REGRESSION FOR ERA = 1970s


Training samples: 104264, Positive ratio: 0.500
Fitting TF-IDF...
TF-IDF shape: (104264, 80000)
Training Logistic Regression model...





=== TRAIN ACCURACY: 0.8388
              precision    recall  f1-score   support

           0       0.87      0.79      0.83     52132
           1       0.81      0.88      0.85     52132

    accuracy                           0.84    104264
   macro avg       0.84      0.84      0.84    104264
weighted avg       0.84      0.84      0.84    104264


=== VAL ACCURACY: 0.7625
              precision    recall  f1-score   support

           0       0.79      0.71      0.75      5792
           1       0.74      0.81      0.77      5793

    accuracy                           0.76     11585
   macro avg       0.76      0.76      0.76     11585
weighted avg       0.76      0.76      0.76     11585


=== TEST ACCURACY: 0.7694
              precision    recall  f1-score   support

           0       0.80      0.72      0.76      6437
           1       0.75      0.81      0.78      6436

    accuracy                           0.77     12873
   macro avg       0.77      0.77      0.77    




=== TRAIN ACCURACY: 0.7829
              precision    recall  f1-score   support

           0       0.82      0.73      0.77     77733
           1       0.76      0.84      0.79     77733

    accuracy                           0.78    155466
   macro avg       0.79      0.78      0.78    155466
weighted avg       0.79      0.78      0.78    155466


=== VAL ACCURACY: 0.6920
              precision    recall  f1-score   support

           0       0.71      0.64      0.68      8637
           1       0.67      0.74      0.71      8637

    accuracy                           0.69     17274
   macro avg       0.69      0.69      0.69     17274
weighted avg       0.69      0.69      0.69     17274


=== TEST ACCURACY: 0.6917
              precision    recall  f1-score   support

           0       0.72      0.64      0.67      9597
           1       0.67      0.75      0.71      9597

    accuracy                           0.69     19194
   macro avg       0.69      0.69      0.69    




=== TRAIN ACCURACY: 0.7325
              precision    recall  f1-score   support

           0       0.75      0.70      0.72    162000
           1       0.72      0.76      0.74    162000

    accuracy                           0.73    324000
   macro avg       0.73      0.73      0.73    324000
weighted avg       0.73      0.73      0.73    324000


=== VAL ACCURACY: 0.6441
              precision    recall  f1-score   support

           0       0.65      0.62      0.63     18000
           1       0.64      0.67      0.65     18000

    accuracy                           0.64     36000
   macro avg       0.64      0.64      0.64     36000
weighted avg       0.64      0.64      0.64     36000


=== TEST ACCURACY: 0.6437
              precision    recall  f1-score   support

           0       0.65      0.61      0.63     20000
           1       0.64      0.67      0.65     20000

    accuracy                           0.64     40000
   macro avg       0.64      0.64      0.64    




=== TRAIN ACCURACY: 0.7348
              precision    recall  f1-score   support

           0       0.74      0.72      0.73    162000
           1       0.73      0.75      0.74    162000

    accuracy                           0.73    324000
   macro avg       0.73      0.73      0.73    324000
weighted avg       0.73      0.73      0.73    324000


=== VAL ACCURACY: 0.6467
              precision    recall  f1-score   support

           0       0.65      0.63      0.64     18000
           1       0.64      0.66      0.65     18000

    accuracy                           0.65     36000
   macro avg       0.65      0.65      0.65     36000
weighted avg       0.65      0.65      0.65     36000


=== TEST ACCURACY: 0.6497
              precision    recall  f1-score   support

           0       0.65      0.64      0.64     20000
           1       0.65      0.66      0.65     20000

    accuracy                           0.65     40000
   macro avg       0.65      0.65      0.65    




=== TRAIN ACCURACY: 0.7569
              precision    recall  f1-score   support

           0       0.75      0.77      0.76    110072
           1       0.76      0.74      0.75    109928

    accuracy                           0.76    220000
   macro avg       0.76      0.76      0.76    220000
weighted avg       0.76      0.76      0.76    220000


=== VAL ACCURACY: 0.6737
              precision    recall  f1-score   support

           0       0.67      0.69      0.68     18000
           1       0.68      0.66      0.67     18000

    accuracy                           0.67     36000
   macro avg       0.67      0.67      0.67     36000
weighted avg       0.67      0.67      0.67     36000


=== TEST ACCURACY: 0.6772
              precision    recall  f1-score   support

           0       0.67      0.69      0.68     20000
           1       0.68      0.66      0.67     20000

    accuracy                           0.68     40000
   macro avg       0.68      0.68      0.68    




=== TRAIN ACCURACY: 0.8308
              precision    recall  f1-score   support

           0       0.82      0.85      0.83    162000
           1       0.85      0.81      0.83    162000

    accuracy                           0.83    324000
   macro avg       0.83      0.83      0.83    324000
weighted avg       0.83      0.83      0.83    324000


=== VAL ACCURACY: 0.7903
              precision    recall  f1-score   support

           0       0.78      0.81      0.79     18000
           1       0.80      0.77      0.79     18000

    accuracy                           0.79     36000
   macro avg       0.79      0.79      0.79     36000
weighted avg       0.79      0.79      0.79     36000


=== TEST ACCURACY: 0.7926
              precision    recall  f1-score   support

           0       0.78      0.82      0.80     20000
           1       0.81      0.77      0.79     20000

    accuracy                           0.79     40000
   macro avg       0.79      0.79      0.79    

In [6]:
!uv pip install gensim

[2mUsing Python 3.11.13 environment at: /workspace/.venv[0m
[2K[2mResolved [1m5 packages[0m [2min 178ms[0m[0m                                         [0m
[2K[37mâ ™[0m [2mPreparing packages...[0m (0/1)                                                   
[2K[1A[37mâ ™[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m     0 B/26.52 MiB           [1A
[2K[1A[37mâ ™[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 16.00 KiB/26.52 MiB         [1A
[2K[1A[37mâ ™[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 32.00 KiB/26.52 MiB         [1A
[2K[1A[37mâ ™[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 42.92 KiB/26.52 MiB         [1A
[2K[1A[37mâ ™[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 58.92 KiB/26.52 MiB         [1A
[2K[1A[37mâ ™[0m [2mPreparing packages...[0m (0/1)--------------[0m[0m 62.92 KiB/26.52 MiB         [1A
[2K[1A[37mâ ™[0m [2mPreparing packages...[0m (0/1)----

In [8]:
# train_logreg_binary_word2vec.py
import os
import joblib
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# -------------------------------------------------
# CONFIG
# -------------------------------------------------
BASE_DIR = "../binary_datasets_thunder"
SAVE_DIR = "../logreg_binary_word2vec"
TEXT_COL = "clean_lyrics"

os.makedirs(SAVE_DIR, exist_ok=True)

# Word2Vec hyper-parameters (passed to gensim's Word2Vec in the training loop).
EMBED_DIM = 768
WINDOW = 15
MIN_COUNT = 5
WORKERS = 32

# Only treat an entry of BASE_DIR as an era when it actually contains a
# training split; stray files and empty folders are ignored instead of
# crashing the training loop at read_csv.
eras = sorted(
    name
    for name in os.listdir(BASE_DIR)
    if os.path.isfile(os.path.join(BASE_DIR, name, "train.csv"))
)
print("Found binary dataset eras:", eras)


# -------------------------------------------------
# TOKENIZER
# -------------------------------------------------
def tokenize(text: str):
    """Lower-case ``text`` and split it into tokens on runs of whitespace."""
    lowered = text.lower()
    tokens = lowered.split()
    return tokens


# -------------------------------------------------
# SENTENCE EMBEDDING
# -------------------------------------------------
def sentence_embedding(tokens, wv):
    """
    Average the word vectors of ``tokens`` into a single embedding.

    Args:
        tokens: iterable of word strings.
        wv: word-vector lookup supporting ``word in wv`` and ``wv[word]``
            (in this file, the ``.wv`` of a trained Word2Vec model).

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or
        an all-zero vector when none of the tokens is in the vocabulary.
    """
    vectors = [wv[word] for word in tokens if word in wv]
    if vectors:
        return np.mean(vectors, axis=0)

    # Size the fallback from the lookup itself so it always matches the
    # vectors this model produces; fall back to the module-level EMBED_DIM
    # only for lookups that do not expose a vector_size.
    try:
        dim = wv.vector_size
    except AttributeError:
        dim = EMBED_DIM
    return np.zeros(dim)


def embed_corpus(list_tokenized, wv):
    """Embed every tokenized document and stack the results row-wise."""
    rows = []
    for doc_tokens in list_tokenized:
        rows.append(sentence_embedding(doc_tokens, wv))
    return np.vstack(rows)


# -------------------------------------------------
# TRAINING LOOP (ONE MODEL PER ERA)
# -------------------------------------------------
# Train, evaluate and save one Word2Vec + logistic-regression model per era.
for era in eras:
    print("\n========================================")
    print(f" TRAINING WORD2VEC + LOGISTIC REGRESSION FOR ERA = {era}")
    print("========================================")

    era_folder = os.path.join(BASE_DIR, era)

    # ---------- 1. LOAD ----------
    train_df = pd.read_csv(f"{era_folder}/train.csv")
    val_df   = pd.read_csv(f"{era_folder}/val.csv")
    test_df  = pd.read_csv(f"{era_folder}/test.csv")

    # Binary target column: "is_<era>".
    bin_col = f"is_{era}"

    y_train = train_df[bin_col].values
    y_val   = val_df[bin_col].values
    y_test  = test_df[bin_col].values

    print(f"Training samples: {len(train_df)}, Positive ratio: {train_df[bin_col].mean():.3f}")


    # ---------- 2. Tokenize ----------
    # astype(str) guarantees tokenize() always receives a string, even for
    # cells pandas parsed as non-string values.
    train_tokens = train_df[TEXT_COL].astype(str).apply(tokenize).tolist()
    val_tokens   = val_df[TEXT_COL].astype(str).apply(tokenize).tolist()
    test_tokens  = test_df[TEXT_COL].astype(str).apply(tokenize).tolist()


    # ---------- 3. Train Word2Vec on TRAIN only ----------
    print("Training Word2Vec...")
    w2v = Word2Vec(
        sentences=train_tokens,
        vector_size=EMBED_DIM,
        window=WINDOW,
        min_count=MIN_COUNT,
        workers=WORKERS,
    )
    wv = w2v.wv

    print("Vocabulary size:", len(wv))


    # ---------- 4. Convert to sentence embeddings ----------
    print("Embedding datasets...")
    X_train = embed_corpus(train_tokens, wv)
    X_val   = embed_corpus(val_tokens, wv)
    X_test  = embed_corpus(test_tokens, wv)

    print("Embedding shape:", X_train.shape)


    # ---------- 5. Logistic Regression ----------
    # n_jobs is intentionally not passed: the liblinear solver ignores it and
    # scikit-learn only emits a warning when it is set.
    clf = LogisticRegression(
        max_iter=1500,
        class_weight="balanced",
        solver="liblinear",
    )

    print("Training Logistic Regression...")
    clf.fit(X_train, y_train)


    # ---------- 6. Evaluation ----------
    def eval_split(name, X, y):
        """Print accuracy and the classification report for one split; return the accuracy."""
        preds = clf.predict(X)
        acc = accuracy_score(y, preds)
        print(f"\n=== {name} ACCURACY: {acc:.4f}")
        print(classification_report(y, preds))
        return acc

    eval_split("TRAIN", X_train, y_train)
    eval_split("VAL",   X_val,   y_val)
    eval_split("TEST",  X_test,  y_test)


    # ---------- 7. Save ----------
    save_path = f"{SAVE_DIR}/{era}"
    os.makedirs(save_path, exist_ok=True)

    joblib.dump(clf, f"{save_path}/logreg.joblib")
    w2v.save(f"{save_path}/word2vec.model")

    print(f"Saved LogisticRegression + Word2Vec → {save_path}")

print("\n🎉 All Word2Vec + Logistic Regression binary models trained successfully!")

Found binary dataset eras: ['1970s', '1980s', '1990s', '2000s', '2010s', '2020s']

 TRAINING WORD2VEC + LOGISTIC REGRESSION FOR ERA = 1970s
Training samples: 104264, Positive ratio: 0.500
Training Word2Vec...
Vocabulary size: 58832
Embedding datasets...
Embedding shape: (104264, 768)
Training Logistic Regression...





=== TRAIN ACCURACY: 0.7635
              precision    recall  f1-score   support

           0       0.78      0.73      0.76     52132
           1       0.75      0.80      0.77     52132

    accuracy                           0.76    104264
   macro avg       0.76      0.76      0.76    104264
weighted avg       0.76      0.76      0.76    104264


=== VAL ACCURACY: 0.7538
              precision    recall  f1-score   support

           0       0.78      0.71      0.74      5792
           1       0.73      0.79      0.76      5793

    accuracy                           0.75     11585
   macro avg       0.76      0.75      0.75     11585
weighted avg       0.76      0.75      0.75     11585


=== TEST ACCURACY: 0.7614
              precision    recall  f1-score   support

           0       0.78      0.73      0.75      6437
           1       0.75      0.79      0.77      6436

    accuracy                           0.76     12873
   macro avg       0.76      0.76      0.76    




=== TRAIN ACCURACY: 0.6850
              precision    recall  f1-score   support

           0       0.70      0.64      0.67     77733
           1       0.67      0.73      0.70     77733

    accuracy                           0.68    155466
   macro avg       0.69      0.68      0.68    155466
weighted avg       0.69      0.68      0.68    155466


=== VAL ACCURACY: 0.6825
              precision    recall  f1-score   support

           0       0.70      0.64      0.67      8637
           1       0.67      0.72      0.70      8637

    accuracy                           0.68     17274
   macro avg       0.68      0.68      0.68     17274
weighted avg       0.68      0.68      0.68     17274


=== TEST ACCURACY: 0.6780
              precision    recall  f1-score   support

           0       0.69      0.64      0.66      9597
           1       0.66      0.72      0.69      9597

    accuracy                           0.68     19194
   macro avg       0.68      0.68      0.68    




=== TRAIN ACCURACY: 0.6345
              precision    recall  f1-score   support

           0       0.65      0.58      0.61    162000
           1       0.62      0.69      0.65    162000

    accuracy                           0.63    324000
   macro avg       0.64      0.63      0.63    324000
weighted avg       0.64      0.63      0.63    324000


=== VAL ACCURACY: 0.6351
              precision    recall  f1-score   support

           0       0.65      0.58      0.62     18000
           1       0.62      0.69      0.65     18000

    accuracy                           0.64     36000
   macro avg       0.64      0.64      0.63     36000
weighted avg       0.64      0.64      0.63     36000


=== TEST ACCURACY: 0.6331
              precision    recall  f1-score   support

           0       0.65      0.58      0.61     20000
           1       0.62      0.69      0.65     20000

    accuracy                           0.63     40000
   macro avg       0.63      0.63      0.63    




=== TRAIN ACCURACY: 0.6428
              precision    recall  f1-score   support

           0       0.65      0.62      0.63    162000
           1       0.64      0.67      0.65    162000

    accuracy                           0.64    324000
   macro avg       0.64      0.64      0.64    324000
weighted avg       0.64      0.64      0.64    324000


=== VAL ACCURACY: 0.6394
              precision    recall  f1-score   support

           0       0.65      0.61      0.63     18000
           1       0.63      0.67      0.65     18000

    accuracy                           0.64     36000
   macro avg       0.64      0.64      0.64     36000
weighted avg       0.64      0.64      0.64     36000


=== TEST ACCURACY: 0.6370
              precision    recall  f1-score   support

           0       0.65      0.61      0.63     20000
           1       0.63      0.67      0.65     20000

    accuracy                           0.64     40000
   macro avg       0.64      0.64      0.64    




=== TRAIN ACCURACY: 0.6755
              precision    recall  f1-score   support

           0       0.68      0.67      0.67    110072
           1       0.67      0.68      0.68    109928

    accuracy                           0.68    220000
   macro avg       0.68      0.68      0.68    220000
weighted avg       0.68      0.68      0.68    220000


=== VAL ACCURACY: 0.6647
              precision    recall  f1-score   support

           0       0.67      0.66      0.66     18000
           1       0.66      0.67      0.67     18000

    accuracy                           0.66     36000
   macro avg       0.66      0.66      0.66     36000
weighted avg       0.66      0.66      0.66     36000


=== TEST ACCURACY: 0.6737
              precision    recall  f1-score   support

           0       0.67      0.67      0.67     20000
           1       0.67      0.68      0.67     20000

    accuracy                           0.67     40000
   macro avg       0.67      0.67      0.67    




=== TRAIN ACCURACY: 0.7874
              precision    recall  f1-score   support

           0       0.78      0.80      0.79    162000
           1       0.80      0.77      0.78    162000

    accuracy                           0.79    324000
   macro avg       0.79      0.79      0.79    324000
weighted avg       0.79      0.79      0.79    324000


=== VAL ACCURACY: 0.7850
              precision    recall  f1-score   support

           0       0.78      0.80      0.79     18000
           1       0.79      0.77      0.78     18000

    accuracy                           0.79     36000
   macro avg       0.79      0.78      0.78     36000
weighted avg       0.79      0.79      0.78     36000


=== TEST ACCURACY: 0.7894
              precision    recall  f1-score   support

           0       0.78      0.80      0.79     20000
           1       0.80      0.78      0.79     20000

    accuracy                           0.79     40000
   macro avg       0.79      0.79      0.79    

In [None]:
# train_logreg_multiclass.py
import os
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# -----------------------------------------
# CONFIG
# -----------------------------------------
TRAIN = "../datasets_old/train_split.csv"
VAL   = "../datasets_old/val_split.csv"
TEST  = "../datasets_old/test_split.csv"

TEXT_COL = "clean_lyrics"
LABEL_COL = "labels"

SAVE_DIR = "../logreg_multiclass"
os.makedirs(SAVE_DIR, exist_ok=True)


# -----------------------------------------
# LOAD DATA
# -----------------------------------------
train_df = pd.read_csv(TRAIN)
val_df   = pd.read_csv(VAL)
test_df  = pd.read_csv(TEST)

y_train = train_df[LABEL_COL].values
y_val   = val_df[LABEL_COL].values
y_test  = test_df[LABEL_COL].values

print("Train size:", len(train_df))
print("Val size:  ", len(val_df))
print("Test size: ", len(test_df))
print("Label classes:", sorted(train_df[LABEL_COL].unique()))


# -----------------------------------------
# TF-IDF (vocabulary and IDF weights are fit on the training split only)
# -----------------------------------------
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=120_000,
    ngram_range=(1, 2),
    min_df=3,
)

print("Fitting TF-IDF...")
X_train = vectorizer.fit_transform(train_df[TEXT_COL])
X_val   = vectorizer.transform(val_df[TEXT_COL])
X_test  = vectorizer.transform(test_df[TEXT_COL])

print("TF-IDF shape:", X_train.shape)


# -----------------------------------------
# MULTI-CLASS LOGISTIC REGRESSION
# -----------------------------------------
# With solver="liblinear" the default multi_class="auto" already selects the
# one-vs-rest scheme, so multi_class is not passed explicitly, and n_jobs is
# omitted because liblinear ignores it (scikit-learn only warns when set).
# NOTE(review): recent scikit-learn releases appear to deprecate liblinear for
# more than two classes — confirm against the installed version.
clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",     # helps class imbalance
    solver="liblinear",          # one-vs-rest for multiclass
)

print("Training Logistic Regression...")
clf.fit(X_train, y_train)


# -----------------------------------------
# EVALUATION
# -----------------------------------------
def evaluate(name, X, y):
    """Print the accuracy and per-class report of ``clf`` on one split.

    ``name`` labels the split in the printed header, ``X`` is the feature
    matrix and ``y`` the true labels.  Returns the accuracy.
    """
    predictions = clf.predict(X)
    accuracy = accuracy_score(y, predictions)
    header = f"\n=== {name} Accuracy: {accuracy:.4f}"
    print(header)
    report = classification_report(y, predictions)
    print(report)
    return accuracy

# Report metrics on every split.
evaluate("TRAIN", X_train, y_train)
evaluate("VAL",   X_val,   y_val)
evaluate("TEST",  X_test,  y_test)


# -----------------------------------------
# SAVE MODEL + TF-IDF
# -----------------------------------------
# Both artifacts are needed at inference time: the vectorizer turns raw text
# into the feature space the classifier was trained on.
joblib.dump(clf, f"{SAVE_DIR}/logreg_multiclass.joblib")
joblib.dump(vectorizer, f"{SAVE_DIR}/tfidf_multiclass.joblib")

print(f"\n🎉 Saved model & vectorizer in {SAVE_DIR}")

In [4]:
# train_year_regression.py
import os
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# ------------------------------------------
# CONFIG
# ------------------------------------------
TEXT_COL = "clean_lyrics"
LABEL_COL = "labels"    # numeric year already
DATA_DIR = "../datasets_old"
SAVE_DIR = "../regression_year_model"
os.makedirs(SAVE_DIR, exist_ok=True)

# ------------------------------------------
# LOAD DATA
# ------------------------------------------
# Read the three splits in train / val / test order.
train_df, val_df, test_df = (
    pd.read_csv(f"{DATA_DIR}/{split}_split.csv")
    for split in ("train", "val", "test")
)

# Regression targets, one array per split.
y_train, y_val, y_test = (
    frame[LABEL_COL].values for frame in (train_df, val_df, test_df)
)


# ------------------------------------------
# TF-IDF (fit only on train)
# ------------------------------------------
print("Fitting TF-IDF vectorizer...")

vectorizer = TfidfVectorizer(
    min_df=3,
    ngram_range=(1, 2),
    max_features=120_000,
    stop_words="english",
)

# The vocabulary and IDF weights come from the training text alone; the
# other splits are only transformed.
X_train = vectorizer.fit_transform(train_df[TEXT_COL])
X_val, X_test = (
    vectorizer.transform(frame[TEXT_COL]) for frame in (val_df, test_df)
)

print("TF-IDF shape:", X_train.shape)


# ------------------------------------------
# REGRESSION MODEL (Ridge)
# ------------------------------------------
print("Training Ridge Regression model...")

# L2-regularised linear regression on the TF-IDF features.
model = Ridge(max_iter=3000, alpha=1.0)
model.fit(X_train, y_train)


# ------------------------------------------
# EVALUATION FUNCTION
# ------------------------------------------
def evaluate(name, X, y):
    """Print regression metrics of ``model`` on one split and return its predictions.

    ``name`` labels the split in the printed header, ``X`` is the feature
    matrix and ``y`` the true targets.  Prints MAE, MSE, RMSE and R².
    """
    preds = model.predict(X)

    mse = mean_squared_error(y, preds)
    mae = mean_absolute_error(y, preds)
    r2  = r2_score(y, preds)
    rmse = np.sqrt(mse)

    print(f"\n=== {name} ===")
    print(f"MAE: {mae:.3f}")
    print(f"MSE: {mse:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"R²: {r2:.4f}")

    return preds


# ------------------------------------------
# REPORT METRICS
# ------------------------------------------
# Print metrics for every split (the returned predictions are not kept).
evaluate("TRAIN", X_train, y_train)
evaluate("VAL",   X_val,   y_val)
evaluate("TEST",  X_test,  y_test)


# ------------------------------------------
# SAVE MODEL + TF-IDF
# ------------------------------------------
# Both artifacts are needed at inference time: the vectorizer turns raw text
# into the feature space the regressor was trained on.
joblib.dump(model, f"{SAVE_DIR}/ridge_regressor.joblib")
joblib.dump(vectorizer, f"{SAVE_DIR}/tfidf.joblib")

print("\nModel saved →", SAVE_DIR)

Fitting TF-IDF vectorizer...
TF-IDF shape: (2613233, 120000)
Training Ridge Regression model...

=== TRAIN ===
MAE: 6.020
MSE: 70.026
RMSE: 8.368
R²: 0.3346

=== VAL ===
MAE: 6.264
MSE: 75.471
RMSE: 8.687
R²: 0.2800

=== TEST ===
MAE: 6.265
MSE: 75.717
RMSE: 8.702
R²: 0.2824

Model saved → ../regression_year_model
