In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
 
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.neighbors import NearestNeighbors

In [None]:
# step 1 Load data
# pandas is imported once in the notebook's imports cell, so it is not re-imported here.

# Dataset is expected in the SAME folder as this notebook
DATA_PATH = "dataset.csv"

# Read the raw file and keep one row per track_id (first occurrence wins).
# reset_index(drop=True) renumbers the rows 0..n-1 after the duplicates are removed.
df = (
    pd.read_csv(DATA_PATH)
    .drop_duplicates(subset=["track_id"])
    .reset_index(drop=True)
)

print("Dataset loaded successfully")
print("Shape:", df.shape)
df.head()

Dataset loaded successfully
Shape: (89741, 21)


Unnamed: 0,index,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [None]:
# step 2 Define features and target
# Regression target: the popularity score of a track
TARGET = "popularity"

# Numeric audio descriptors (13 columns)
audio_features = [
    "duration_ms",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "time_signature",
]

# Categorical inputs (used for popularity prediction only)
categorical_features = ["explicit", "track_genre"]

# Narrow the frame to identifiers/metadata, the target and the model inputs
metadata_cols = ["track_id", "track_name", "artists", "album_name"]
keep_cols = [*metadata_cols, TARGET, *audio_features, *categorical_features]
df = df.loc[:, keep_cols].copy()

# Show the ten columns with the largest fraction of missing values
missing_share = df.isna().mean().sort_values(ascending=False)
print(missing_share.head(10))


artists           0.000011
album_name        0.000011
track_name        0.000011
track_id          0.000000
acousticness      0.000000
explicit          0.000000
time_signature    0.000000
tempo             0.000000
valence           0.000000
liveness          0.000000
dtype: float64


In [None]:
# step 3 Split data into train and test sets
# Model inputs are the audio features followed by the categorical features
feature_cols = audio_features + categorical_features
X = df[feature_cols]
y = df[TARGET].astype(float)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (71792, 15)
Test size: (17949, 15)


In [None]:
# step 4 Define preprocessing pipelines
# Numeric columns: fill missing values with the median, then standardize
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent value, then one-hot encode
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer; any column not listed is dropped
column_routes = [
    ("num", numeric_transformer, audio_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocess = ColumnTransformer(transformers=column_routes, remainder="drop")


In [None]:
# step 5 Train and evaluate Ridge Regression model
ridge_model = Ridge(alpha=3.0, random_state=42)

# Preprocessing and the regressor are fitted together so the test rows are
# transformed with statistics learned from the training rows only.
ridge_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", ridge_model)
])

ridge_pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred_ridge = ridge_pipeline.predict(X_test)

rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"[Ridge] RMSE={rmse_ridge:.3f}, R2={r2_ridge:.3f}")

# Score every track so the recommender can read df["pred_popularity"].
# recommend_for_user sorts and reports on this column, and no other cell in the
# notebook creates it, so without this line a fresh "Restart & Run All" fails
# with a KeyError.
# X holds the feature columns of all rows of df (train and test), so the
# predictions for training rows are in-sample.
# NOTE(review): Ridge is the only model fitted in this notebook, so it is
# assumed to be the intended source of pred_popularity — confirm.
df["pred_popularity"] = ridge_pipeline.predict(X)


[Ridge] RMSE=16.844, R2=0.321


In [None]:
# step 6 Build audio embedding for similarity
# Similarity between tracks uses only the numeric audio features (13 dimensions)
audio_df = df[audio_features].copy()
column_medians = audio_df.median(numeric_only=True)
audio_df = audio_df.fillna(column_medians)

# Standardize each feature so no single column dominates the distance
audio_scaler = StandardScaler()
X_audio_scaled = audio_scaler.fit_transform(audio_df)  # shape: (n_songs, 13)

# Lookup from track_id to the row number of that track in X_audio_scaled
id_to_idx = dict(zip(df["track_id"].values, range(len(df))))

print("X_audio_scaled shape:", X_audio_scaled.shape)


X_audio_scaled shape: (89741, 13)


In [14]:
# step 7 Build user profile from liked songs
def build_user_profile(liked_track_ids):
    """
    Represent a user's taste as the mean embedding of the songs they liked.

    Track ids that are not present in id_to_idx are silently skipped.

    Returns
    -------
    (user_vec, valid_ids)
        user_vec  : mean of the X_audio_scaled rows of the known liked tracks
        valid_ids : the liked track ids that were found, in input order

    Raises
    ------
    ValueError
        If none of the given track ids is known.
    """
    known_ids = []
    row_numbers = []
    for track_id in liked_track_ids:
        if track_id in id_to_idx:
            known_ids.append(track_id)
            row_numbers.append(id_to_idx[track_id])

    if not known_ids:
        raise ValueError("None of the liked track_ids exist in the dataset.")

    # Column-wise average over the selected rows -> one vector per user
    profile = X_audio_scaled[row_numbers].mean(axis=0)  # (13,)
    return profile, known_ids


In [15]:
# step 8 Two-stage recommendation (candidate generation + ranking)
def recommend_for_user(
    liked_track_ids,
    k=20,
    alpha=0.75,
    candidate_pool=5000,
    filter_by_genre=True
):
    """
    Two-stage recommender.

    Stage 1: Candidate generation
      - Exclude the liked tracks themselves
      - Optionally keep only tracks whose genre matches one of the liked tracks
      - Keep the `candidate_pool` tracks with the highest predicted popularity
    Stage 2: Ranking
      - Select the k candidates with the smallest cosine distance to the
        user profile
      - Re-rank those k tracks by

        final_score = alpha * similarity + (1 - alpha) * normalized_pred_popularity

        where predicted popularity is min-max normalized over the k selected
        tracks only (not over the whole candidate pool).

    Parameters
    ----------
    liked_track_ids : iterable of track ids the user liked; unknown ids are
        ignored by build_user_profile.
    k : maximum number of recommendations to return (>= 1).
    alpha : weight of the similarity term in final_score.
    candidate_pool : maximum number of candidates kept after stage 1 (>= 1),
        or None for no cap.
    filter_by_genre : restrict candidates to the genres of the liked tracks.

    Returns
    -------
    DataFrame sorted by final_score (descending) with the columns track_id,
    track_name, artists, track_genre, popularity, pred_popularity,
    similarity_to_user and final_score. It is empty (same columns) when no
    candidate survives stage 1.

    Raises
    ------
    ValueError
        If k or candidate_pool is smaller than 1, or (from
        build_user_profile) if none of the liked ids is known.
    KeyError
        If df has no 'pred_popularity' column.
    """
    if k < 1:
        raise ValueError("k must be a positive integer.")
    if candidate_pool is not None and candidate_pool < 1:
        raise ValueError("candidate_pool must be a positive integer or None.")
    if "pred_popularity" not in df.columns:
        raise KeyError(
            "df has no 'pred_popularity' column; score every track with the "
            "popularity model before requesting recommendations."
        )

    user_vec, valid_ids = build_user_profile(liked_track_ids)

    base_cols = ["track_id", "track_name", "artists", "track_genre", "popularity", "pred_popularity"]

    # Work with boolean masks and row positions instead of copying df and
    # reusing its index labels: X_audio_scaled is a plain array aligned with
    # the row ORDER of df, so positions stay correct even if df's index is
    # not 0..n-1.
    liked_mask = df["track_id"].isin(valid_ids).to_numpy()
    cand_mask = ~liked_mask

    # Optional: restrict candidates to liked genres (reduces search space)
    if filter_by_genre and "track_genre" in df.columns:
        genres = df["track_genre"]
        liked_genres = genres[liked_mask].dropna().unique()
        if len(liked_genres) > 0:
            cand_mask &= genres.isin(liked_genres).to_numpy()

    cand_pos = np.flatnonzero(cand_mask)
    if cand_pos.size == 0:
        # Nothing left to recommend; fitting NearestNeighbors on an empty
        # array would raise, so return an empty, correctly shaped result.
        return pd.DataFrame(columns=base_cols + ["similarity_to_user", "final_score"])

    # Candidate pool: highest predicted popularity first. The stable sort makes
    # the order of ties deterministic; NaN predictions sort last.
    pred_all = df["pred_popularity"].to_numpy(dtype=float)
    by_popularity = np.argsort(-pred_all[cand_pos], kind="stable")[:candidate_pool]
    pool_pos = cand_pos[by_popularity]

    # Cosine nearest neighbours of the user profile within the pool
    nn = NearestNeighbors(metric="cosine", algorithm="brute")
    nn.fit(X_audio_scaled[pool_pos])
    dists, inds = nn.kneighbors(
        user_vec.reshape(1, -1),
        n_neighbors=min(k, pool_pos.size),
    )

    # Translate neighbour indices (relative to the pool) back to rows of df
    rec_pos = pool_pos[inds[0]]
    recs = df.iloc[rec_pos][base_cols].copy()

    # Similarity = 1 - cosine distance
    recs["similarity_to_user"] = 1.0 - dists[0]

    # Min-max normalize predicted popularity over the selected tracks; the
    # small epsilon avoids division by zero when all predictions are equal.
    pred = recs["pred_popularity"].to_numpy(dtype=float)
    pred_norm = (pred - pred.min()) / (pred.max() - pred.min() + 1e-12)

    recs["final_score"] = alpha * recs["similarity_to_user"].to_numpy() + (1 - alpha) * pred_norm

    return recs.sort_values("final_score", ascending=False).reset_index(drop=True)


In [None]:
# test the recommendation function
# Draw 3 random tracks as stand-in "liked songs" (swap in real user likes if available)
liked = df["track_id"].sample(n=3, random_state=42).tolist()
print("Liked track_ids:", liked)

recs = recommend_for_user(
    liked_track_ids=liked,
    k=20,
    alpha=0.75,
    candidate_pool=5000,
    filter_by_genre=True,
)
recs.head(20)


Liked track_ids: ['6MxGvnJWqdGS0chQypGXhB', '1VKnVxZZnubjTEJG6Tme1y', '3Lu4WAsmpag6CZSN0H3Wor']


Unnamed: 0,track_id,track_name,artists,track_genre,popularity,pred_popularity,similarity_to_user,final_score
0,4d6eqRtpDX7tydHJGDZUBQ,She Drives Me Crazy,Fine Young Cannibals,synth-pop,74,35.115366,0.912985,0.931622
1,0en6SFmN4eaErErH126wbJ,What Have I Done to Deserve This? (with Dusty ...,Pet Shop Boys;Dusty Springfield,synth-pop,61,34.261234,0.921878,0.930274
2,43hajUyjyHEJCIoDtXSs63,What Have I Done to Deserve This?,Pet Shop Boys;Dusty Springfield,synth-pop,0,34.320791,0.920328,0.929671
3,1AMbMn3yvTExSbSyq5h6aR,"Weird Science - From ""Weird Science"" Soundtrack",Oingo Boingo,synth-pop,24,35.213397,0.904261,0.926
4,6nEGxVCcMhIugOHk37APU7,Things Can Only Get Better,Howard Jones,synth-pop,23,35.410443,0.894026,0.920173
5,2pZbERJUZsfkKRn1EU1mgD,You Can Win If You Want,Modern Talking,synth-pop,56,35.447357,0.893529,0.920147
6,4IcLNRtHFMTEAvoLklnZjA,A Girl Like You,Easton Corbin,country,0,17.311799,0.899532,0.754406
7,5IfdDiOJitysgex3qEL0bn,$50 Dollars and a Flask of Crown,Bleu Edmondson,country,43,16.453786,0.887931,0.737651
8,1uJjQSOlc8ql6cs0cVWQH6,No Body,Blake Shelton,country,2,16.743643,0.882414,0.736234
9,3b1VmMmh8KVV5es53mROfG,No Body,Blake Shelton,country,0,16.743643,0.882414,0.736234
