In [1]:
!pip install pandas numpy scikit-learn scipy joblib pyarrow streamlit




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\Acer\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip


In [2]:
import os, re, json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import joblib

# Paths (everything will be saved in current dir)
DATA_PATH = "spotify_millsongdata.csv"   # <-- put your dataset in same folder
TFIDF_VECTORIZER_PATH = "tfidf_vectorizer.pkl"
TFIDF_MATRIX_PATH = "tfidf_matrix.npz"
NUMERIC_SCALER_PATH = "numeric_scaler.pkl"
NUMERIC_MATRIX_PATH = "numeric_matrix.npz"
INDEX_PATH = "index.parquet"
SCHEMA_PATH = "schema.json"

MAX_FEATURES_TFIDF = 80000
MIN_DF_TFIDF = 2
NGRAM_RANGE = (1,2)
TOP_K = 50


In [3]:
def normalize_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.lower().strip()
    s = re.sub(r"\s+", " ", s)
    return s

def save_json(obj, path):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def coalesce(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None


In [4]:
df = pd.read_csv(DATA_PATH)
print("Dataset Shape:", df.shape)

# infer schema
schema = {
    "track_name": coalesce(df, ["song","track_name","title","name"]),
    "artist": coalesce(df, ["artist","artists","artist_name"]),
    "lyrics": coalesce(df, ["text","lyrics"]),
    "album": coalesce(df, ["album","album_name"]),
    "genres": coalesce(df, ["genres","genre","tags"]),
    "numeric_cols": [c for c in [
        "danceability","energy","loudness","speechiness","acousticness",
        "instrumentalness","liveness","valence","tempo","duration_ms","popularity"
    ] if c in df.columns]
}

# clean text cols
for key in ["track_name","artist","album","lyrics","genres"]:
    col = schema.get(key)
    if col and col in df.columns:
        df[col] = df[col].astype(str).map(normalize_text)

df.head()


Dataset Shape: (57650, 4)


Unnamed: 0,artist,song,link,text
0,abba,ahe's my kind of girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"look at her face, it's a wonderful face and it..."
1,abba,"andante, andante",/a/abba/andante+andante_20002708.html,"take it easy with me, please touch me gently l..."
2,abba,as good as new,/a/abba/as+good+as+new_20003033.html,i'll never know why i had to go why i had to p...
3,abba,bang,/a/abba/bang_20598415.html,making somebody happy is a question of give an...
4,abba,bang-a-boomerang,/a/abba/bang+a+boomerang_20002668.html,making somebody happy is a question of give an...


In [5]:
def build_text_corpus(df, schema):
    parts = []
    for key in ["track_name","artist","album","genres","lyrics"]:
        col = schema.get(key)
        if col and col in df.columns:
            parts.append(df[col].fillna(""))
    return parts[0].astype(str).str.cat(parts[1:], sep=" ") if parts else pd.Series([""]*len(df))

# text features
corpus = build_text_corpus(df, schema)
vec = TfidfVectorizer(stop_words="english",
                      max_features=MAX_FEATURES_TFIDF,
                      min_df=MIN_DF_TFIDF,
                      ngram_range=NGRAM_RANGE)
X_text = vec.fit_transform(corpus)

# numeric features
num_cols = schema.get("numeric_cols") or []
if num_cols:
    X_num = df[num_cols].apply(pd.to_numeric, errors="coerce").fillna(0).values
    scaler = StandardScaler()
    X_num_scaled = scaler.fit_transform(X_num)
    X_num_sparse = sparse.csr_matrix(X_num_scaled)
else:
    scaler, X_num_sparse = None, None

print("TF-IDF Shape:", X_text.shape)
if X_num_sparse is not None:
    print("Numeric Shape:", X_num_sparse.shape)


TF-IDF Shape: (57650, 80000)


In [6]:
from scipy.sparse import save_npz

save_npz(TFIDF_MATRIX_PATH, X_text)
joblib.dump(vec, TFIDF_VECTORIZER_PATH)

if scaler is not None:
    save_npz(NUMERIC_MATRIX_PATH, X_num_sparse)
    joblib.dump(scaler, NUMERIC_SCALER_PATH)

df_index = df[[schema["track_name"], schema["artist"]]].reset_index(drop=True)
df_index.to_parquet(INDEX_PATH, index=False)
save_json(schema, SCHEMA_PATH)
print("✅ Artifacts saved!")


✅ Artifacts saved!


In [7]:
TEXT_WEIGHT = 0.85
NUMERIC_WEIGHT = 0.15
ARTIST_BOOST = 0.1

class Recommender:
    def __init__(self):
        from scipy.sparse import load_npz
        self.vec = joblib.load(TFIDF_VECTORIZER_PATH)
        self.X_text = load_npz(TFIDF_MATRIX_PATH)
        try:
            self.scaler = joblib.load(NUMERIC_SCALER_PATH)
            self.X_num = load_npz(NUMERIC_MATRIX_PATH)
        except:
            self.scaler, self.X_num = None, None
        self.index_df = pd.read_parquet(INDEX_PATH)
        self.schema = load_json(SCHEMA_PATH)

    def recommend(self, title, top_n=10):
        mask = self.index_df[self.schema["track_name"]].str.contains(title.lower(), na=False)
        if not mask.any():
            raise ValueError(f"'{title}' not found.")
        idx = mask.idxmax()
        return self.recommend_by_index(idx, top_n)

    def recommend_by_index(self, idx, top_n=10):
        sim_text = cosine_similarity(self.X_text[idx], self.X_text).flatten()
        if self.X_num is not None:
            sim_num = cosine_similarity(self.X_num[idx], self.X_num).flatten()
        else:
            sim_num = np.zeros_like(sim_text)

        sim = TEXT_WEIGHT*sim_text + NUMERIC_WEIGHT*sim_num

        # artist boost
        query_artist = self.index_df.iloc[idx][self.schema["artist"]]
        artist_mask = self.index_df[self.schema["artist"]] == query_artist
        sim = sim + ARTIST_BOOST*artist_mask.astype(float)

        sim[idx] = -1  # exclude self
        rec_idx = np.argsort(sim)[::-1][:top_n]
        return self.index_df.iloc[rec_idx].assign(similarity=sim[rec_idx])


In [8]:
rec = Recommender()
rec.recommend("love", top_n=5)


Unnamed: 0,song,artist,similarity
10460,i'm gonna sing you a sad song,kenny rogers,0.358853
35157,i'm gonna sing,hank williams,0.308778
47,king kong song,abba,0.277603
21770,love song,x-treme,0.263316
39908,sing 'em good my friend,kenny chesney,0.254489
