In [None]:
import pandas as pd

df = pd.read_csv("/Users/supakornfluk/Understanding-Language-Change-in-Thai-Music/song_lyrics_50000_sampled.csv")

In [2]:
df["tag"].unique()

array(['country', 'pop', 'rap', 'rb', 'rock'], dtype=object)

In [3]:
df["clean_lyrics"]

0        there's a road babe that could take us to glor...
1        you honked the horn on your muddy truck and ti...
2        once i lived a life of wine and roses and i dr...
3          i walked down a long country mile over fence...
4          i should have never called last night we bre...
                               ...                        
49995    dream on my pain my scar my blame you’ve been ...
49996    i'm a lazy boy there's no doubt about it might...
49997    sinner rider rides in with the storm the devil...
49998      enemies and friends the scale it tips toward...
49999    my pale friend how do you feel are you sick fr...
Name: clean_lyrics, Length: 50000, dtype: object

In [4]:
len(df)


50000

In [5]:
print(df.columns)


Index(['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id',
       'language_cld3', 'language_ft', 'language', 'clean_lyrics'],
      dtype='object')


In [None]:
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/supakornfluk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
english_stopwords = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(f"[{string.punctuation}]", " ", text)       # ลบ punctuation
    text = re.sub(r"[^\w\s]", " ", text)                     # ลบ special chars
    text = re.sub(r"\s+", " ", text).strip()                 # ลบ whitespace เกิน
    text = re.sub(r"[^\x00-\x7F]+", " ", text)               # ลบ emoji / non-ASCII
    text = ' '.join([w for w in text.split() if w not in english_stopwords])
    return text


In [9]:
df['clean_lyrics'] = df['clean_lyrics'].apply(clean_text)

In [11]:
print(df.columns)


Index(['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id',
       'language_cld3', 'language_ft', 'language', 'clean_lyrics'],
      dtype='object')


In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')   # บางเวอร์ชันของ nltk ต้องใช้เพิ่ม



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/supakornfluk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/supakornfluk/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer

lemm = WordNetLemmatizer()
df["tokens"] = df["tokens"].apply(lambda x: [lemm.lemmatize(w) for w in x])


In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')   # optional แต่ช่วยให้ lemmatization ดีขึ้น


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/supakornfluk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/supakornfluk/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [28]:
import pandas as pd

genres = ["country", "pop", "rap", "rb", "rock"]
n_samples = 10000  # จำนวนเพลงต่อกลุ่ม

genre_datasets = {}

for g in genres:
    # เพลง genre g
    genre_df = df[df["tag"] == g].sample(n=n_samples, random_state=42, replace=True)
    
    # เพลงไม่ใช่ genre g
    non_genre_df = df[df["tag"] != g].sample(n=n_samples, random_state=42, replace=True)
    
    # รวมกัน
    combined_df = pd.concat([genre_df, non_genre_df]).sample(frac=1, random_state=42).reset_index(drop=True)
    
    # สร้าง label เป็น 1/0 สำหรับ one-vs-all
    combined_df["label"] = (combined_df["tag"] == g).astype(int)
    
    genre_datasets[g] = combined_df
    print(f"{g} dataset: {combined_df['label'].value_counts()}")


country dataset: label
0    10000
1    10000
Name: count, dtype: int64
pop dataset: label
0    10000
1    10000
Name: count, dtype: int64
rap dataset: label
0    10000
1    10000
Name: count, dtype: int64
rb dataset: label
0    10000
1    10000
Name: count, dtype: int64
rock dataset: label
0    10000
1    10000
Name: count, dtype: int64


In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

genres = ["country", "pop", "rap", "rb", "rock"]
results = {}

for g in genres:
    print(f"\n=== Training for {g} ===")
    
    df_g = genre_datasets[g]  # dataset ที่เตรียมไว้
    
    X = df_g["final_text"]
    y = df_g["label"]
    
    # TF-IDF
    vectorizer = TfidfVectorizer(
        max_features=500000,
        ngram_range=(1,2),
        min_df=3,
        stop_words="english",
        sublinear_tf=True
    )
    X_tfidf = vectorizer.fit_transform(X)
    
    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_tfidf, y, test_size=0.1, random_state=42, stratify=y
    )
    
    # Logistic Regression
    model = LogisticRegression(
        max_iter=10000,
        solver="saga",
        C=1.0,
        n_jobs=-1,
        class_weight="balanced"
    )
    model.fit(X_train, y_train)
    
    # Predict + Evaluate
    pred = model.predict(X_test)
    print(classification_report(y_test, pred))
    
    # เก็บทั้ง vectorizer + model
    results[g] = {"model": model, "vectorizer": vectorizer}



=== Training for country ===
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      1000
           1       0.83      0.87      0.85      1000

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000


=== Training for pop ===
              precision    recall  f1-score   support

           0       0.78      0.69      0.73      1000
           1       0.72      0.80      0.76      1000

    accuracy                           0.75      2000
   macro avg       0.75      0.75      0.75      2000
weighted avg       0.75      0.75      0.75      2000


=== Training for rap ===
              precision    recall  f1-score   support

           0       0.91      0.92      0.91      1000
           1       0.92      0.90      0.91      1000

    accuracy                           0.91      2000
   macro avg       0.91      0.91      0.91   

In [69]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# -----------------------------
# Input widget
# -----------------------------
input_box = widgets.Textarea(
    value='',
    placeholder='Paste song lyrics here...',
    description='Lyrics:',
    layout=widgets.Layout(width='100%', height='120px')
)

output_box = widgets.Output()

button = widgets.Button(description="Predict Genre", button_style='success')

# -----------------------------
# Callback function
# -----------------------------
def on_button_clicked(b):
    with output_box:
        clear_output()
        text = input_box.value
        if not text.strip():
            print("Please enter some lyrics.")
            return
        
        pred_genre = predict_genre(text)
        print(f"Predicted genre: {pred_genre}")

button.on_click(on_button_clicked)

# -----------------------------
# Display widgets
# -----------------------------
display(input_box, button, output_box)


Textarea(value='', description='Lyrics:', layout=Layout(height='120px', width='100%'), placeholder='Paste song…

Button(button_style='success', description='Predict Genre', style=ButtonStyle())

Output()

In [None]:
import joblib
import numpy as np

# โหลดโมเดลทั้งหมด
multi_results = joblib.load("genre_models_results.pkl")        # multi-class models
binary_models = joblib.load("genre_binary_models.pkl")         # binary models
binary_vectorizers = joblib.load("genre_binary_vectorizers.pkl") 

def safe_predict_proba(model, X):
    """ใช้ predict_proba หรือ decision_function แบบ sigmoid สำหรับ binary/logistic"""
    # ตรวจสอบ feature size ก่อน
    if X.shape[1] != model.n_features_in_:
        # เติม 0 ให้ feature น้อยกว่า model expects
        X_new = np.zeros((X.shape[0], model.n_features_in_))
        X_new[:, :X.shape[1]] = X.toarray()
        X = X_new
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[0][1]  # probability ของ class '1'
    else:
        score = model.decision_function(X)[0]
        return 1 / (1 + np.exp(-score))

def predict_genre(text, weight_multi=0.6, weight_bin=0.4):
    text_cleaned = clean_text(text)
    scores_multi = {}
    scores_bin = {}
    scores_combined = {}

    for g in multi_results:
        # Multi-class
        vect_multi = multi_results[g]["vectorizer"]
        model_multi = multi_results[g]["model"]
        X_multi = vect_multi.transform([text_cleaned])
        prob_multi = safe_predict_proba(model_multi, X_multi)
        scores_multi[g] = prob_multi

        # Binary
        vect_bin = binary_vectorizers[g]
        model_bin = binary_models[g]
        X_bin = vect_bin.transform([text_cleaned])
        prob_bin = safe_predict_proba(model_bin, X_bin)
        scores_bin[g] = prob_bin

        # รวมคะแนน
        final_score = weight_multi * prob_multi + weight_bin * prob_bin
        scores_combined[g] = final_score

    # genre ที่คะแนนสูงสุด
    pred_multi = max(scores_multi, key=scores_multi.get)
    pred_bin = max(scores_bin, key=scores_bin.get)
    pred_combined = max(scores_combined, key=scores_combined.get)

    return {
        "multi_class": pred_multi,
        "binary": pred_bin,
        "combined": pred_combined,
        "scores_multi": scores_multi,
        "scores_bin": scores_bin,
        "scores_combined": scores_combined
    }

# -----------------------------
# ตัวอย่างใช้งาน
# -----------------------------
lyrics = """
Made a meal and threw it up on Sunday
I've got a lot of things to learn
Said I would and I'll be leaving one day
Before my heart starts to burn

So what's the matter with you?
Sing me something new
Don't you know the cold and wind and rain don't know
They only seem to come and go away

Times are hard when things have got no meaning
I've found a key upon the floor
Maybe you and I will not believe in
The things we find behind the door

So what's the matter with you?
Sing me something new
Don't you know the cold and wind and rain don't know
They only seem to come and go away

Stand by me, nobody knows the way it's gonna be
Stand by me, nobody knows the way it's gonna be
Stand by me, nobody knows the way it's gonna be
Stand by me, nobody knows
Yeah, nobody knows, the way it's gonna be

If you're leaving will you take me with you?
I'm tired of talking on my phone
There is one thing I can never give you
My heart will never be your home

So what's the matter with you?
Sing me something new
Don't you know the cold and wind and rain don't know
They only seem to come and go away

Stand by me, nobody knows the way it's gonna be
Stand by me, nobody knows the way it's gonna be
Stand by me, nobody knows the way it's gonna be
Stand by me, nobody knows
Yeah, nobody knows, the way it's gonna be

The way it's gonna be, yeah
Maybe I can see, yeah
Don't you know the cold and wind and rain don't know
They only seem to come and go away (hey, hey)

Stand by me, nobody knows the way it's gonna be
Stand by me, nobody knows the way it's gonna be
Stand by me, nobody knows the way it's gonna be
Stand by me, nobody knows
Yeah, God only knows, the way it's gonna be
"""
predictions = predict_genre(lyrics)

print("Multi-class model prediction:", predictions["multi_class"])
print("Binary model prediction:", predictions["binary"])
print("Combined prediction:", predictions["combined"])
print("\nScores (combined):", predictions["scores_combined"])


Multi-class model prediction: country
Binary model prediction: pop
Combined prediction: country

Scores (combined): {'country': np.float64(0.5347146321324435), 'pop': np.float64(0.47515933568143154), 'rap': np.float64(0.02160863058459997), 'rb': np.float64(0.2107473133220858), 'rock': np.float64(0.432566512203008)}


In [48]:
import joblib

# Export full results dict
joblib.dump(results, "genre_models_results.pkl")
print("All genre models and vectorizers exported to 'genre_models_results.pkl'!")


All genre models and vectorizers exported to 'genre_models_results.pkl'!


In [63]:
import pandas as pd
import joblib
import numpy as np

# โหลดโมเดล
fluk = joblib.load("genre_models_results.pkl")        # multi-class models
tutor1 = joblib.load("genre_binary_models.pkl")       # binary models
tutor2 = joblib.load("genre_binary_vectorizers.pkl")  # vectorizers

def safe_predict_proba(model, X):
    if X.shape[1] != model.n_features_in_:
        X_new = np.zeros((X.shape[0], model.n_features_in_))
        X_new[:, :X.shape[1]] = X.toarray()
        X = X_new
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[0][1]
    else:
        score = model.decision_function(X)[0]
        return 1 / (1 + np.exp(-score))

def predict_genre_row(text, multi_results=fluk, binary_models=tutor1, binary_vectorizers=tutor2):
    text_cleaned = clean_text(text)
    scores_multi = {}
    scores_bin = {}
    scores_combined = {}

    for g in multi_results:
        # Multi-class
        vect_multi = multi_results[g]["vectorizer"]
        model_multi = multi_results[g]["model"]
        X_multi = vect_multi.transform([text_cleaned])
        prob_multi = safe_predict_proba(model_multi, X_multi)
        scores_multi[g] = prob_multi

        # Binary
        vect_bin = binary_vectorizers[g]
        model_bin = binary_models[g]
        X_bin = vect_bin.transform([text_cleaned])
        prob_bin = safe_predict_proba(model_bin, X_bin)
        scores_bin[g] = prob_bin

        # รวมคะแนน
        final_score = 0.6 * prob_multi + 0.4 * prob_bin
        scores_combined[g] = final_score

    # return predictions ทั้ง 3 แบบ
    pred_multi = max(scores_multi, key=scores_multi.get)
    pred_bin = max(scores_bin, key=scores_bin.get)
    pred_combined = max(scores_combined, key=scores_combined.get)

    return pd.Series({
        "pred_multi": pred_multi,
        "pred_binary": pred_bin,
        "pred_combined": pred_combined
    })

# -------------------------
# อ่านไฟล์ CSV
# -------------------------
df = pd.read_csv("song_lyrics_50000_sampled.csv")

# Apply ฟังก์ชัน predict_genre_row
df[["pred_multi", "pred_binary", "pred_combined"]] = df["lyrics"].apply(predict_genre_row)

# บันทึก CSV ใหม่
df.to_csv("song_lyrics_50000_sampled_with_predictions.csv", index=False)


In [55]:
for g in binary_models:
    print(g, 
          "model expects =", binary_models[g].n_features_in_,
          "vectorizer produces =", binary_vectorizers[g].transform(["text"]).shape[1])


country model expects = 801302 vectorizer produces = 150000
pop model expects = 901302 vectorizer produces = 150000
rap model expects = 801302 vectorizer produces = 150000
rb model expects = 801302 vectorizer produces = 150000
rock model expects = 801302 vectorizer produces = 150000


In [64]:
df = pd.read_csv("song_lyrics_50000_sampled_with_predictions.csv")

In [65]:
df

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language,clean_lyrics,pred_multi,pred_binary,pred_combined
0,Those Were Couldve Been the Days,country,The Deslondes,2015,574,{},"There's a road, babe, that could take us to gl...",2425928,en,en,en,there's a road babe that could take us to glor...,country,pop,pop
1,Kiss Me Cowboy,country,Tommy Atkins,2020,228,{},You honked the horn on your muddy truck\nAnd t...,5995699,en,en,en,you honked the horn on your muddy truck and ti...,country,country,country
2,I Threw Away The Rose,country,Charley Pride,1967,214,{},Once I lived a life of wine and roses and I dr...,1343371,en,en,en,once i lived a life of wine and roses and i dr...,country,country,country
3,A Garden,country,Ye Vagabonds,2015,11,{},[Verse 1]\nI walked down\nA long country mile\...,5769270,en,en,en,i walked down a long country mile over fence...,country,country,country
4,Dirt Road Remix,country,Kidd G & BRELAND,2022,113,{},[Verse 1: Kidd G]\nI should have never called ...,7831879,en,en,en,i should have never called last night we bre...,country,country,country
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Afloat,rock,Riverside,2015,5138,{},"Dream on, my pain\nMy scar\nMy blame\nYou’ve b...",2310586,en,en,en,dream on my pain my scar my blame you’ve been ...,rock,rb,rock
49996,Lazy Boy Dash,rock,Jimmie's Chicken Shack,1999,756,"{""Jimmie\\'s Chicken Shack""}",I'm a lazy boy there's no doubt about it\nMigh...,1173973,en,en,en,i'm a lazy boy there's no doubt about it might...,rock,rock,rock
49997,Sinner Unleashed in the East,rock,Judas Priest,1979,126,{},"Sinner rider, rides in with the storm\nThe dev...",5227285,en,en,en,sinner rider rides in with the storm the devil...,rock,rock,rock
49998,Required Reading,rock,Into It. Over It.,2016,1726,{},"[Verse 1]\nEnemies and friends\nThe scale, it ...",2436055,en,en,en,enemies and friends the scale it tips toward...,rock,rock,rock
