<a href="https://colab.research.google.com/github/unknown-spec10/Data-Science/blob/main/Spotify_Hybrid_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kagglehub --upgrade



In [None]:
import kagglehub

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Download the dataset (or reuse kagglehub's local cache) and use the directory it
# returns. Hard-coding the cache directory only works on a machine where the
# dataset was already downloaded, so a fresh kernel would fail at read_csv.
# The handle pins version 1, the same version the previous fixed path pointed at.
data_path = Path(
    kagglehub.dataset_download("undefinenull/million-song-dataset-spotify-lastfm/versions/1")
)


songs_data_path = data_path / 'Music Info.csv'
users_data_path = data_path / 'User Listening History.csv'

In [None]:
# Load data: the first frame holds the per-song content data, the second the
# user/track interactions used for the collaborative part.
df_songs, df_inter = map(pd.read_csv, (songs_data_path, users_data_path))

In [None]:
# (rows, columns) of the interaction table, as a quick size check.
df_inter.shape

(9711301, 5)

In [None]:
# Index encoding
# Every track gets ONE integer index that is shared by both frames and equals the
# track's row position in df_songs. Encoding each frame independently (category
# codes per frame) gives the same track different indices whenever the two frames
# do not contain exactly the same set of ids, and sorted category codes do not
# match df_songs row order, which is the order of the rows of the content matrix.
df_songs = df_songs.drop_duplicates(subset="track_id").reset_index(drop=True)
df_songs["track_idx"] = np.arange(len(df_songs))

# Look up each interaction's track in the df_songs vocabulary; ids that are not
# in df_songs come back as -1 and are dropped, since they have no content row
# and a negative row index is not valid for the sparse matrix.
track_index = pd.Index(df_songs["track_id"])
inter_track_idx = track_index.get_indexer(df_inter["track_id"])
known_track = inter_track_idx >= 0
df_inter = df_inter.loc[known_track].assign(track_idx=inter_track_idx[known_track])

# Users only appear in df_inter, so per-frame category codes are sufficient.
df_inter["user_idx"] = df_inter["user_id"].astype("category").cat.codes

In [None]:
# Lookup tables in both directions between a track id and its integer index.
track2idx = {
    track_id: idx
    for track_id, idx in zip(df_songs["track_id"], df_songs["track_idx"])
}
idx2track = dict(zip(track2idx.values(), track2idx.keys()))

# Content Preprocessing :-)

In [None]:
# Print the column names to identify the correct artist column name
# (the content feature list below selects columns by name).
print(df_songs.columns)

Index(['track_id', 'name', 'artist', 'spotify_preview_url', 'spotify_id',
       'tags', 'genre', 'year', 'duration_ms', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'track_idx'],
      dtype='object')


In [None]:
def get_content_matrix(df_songs):
    """Encode song metadata as a content-feature matrix.

    The profile of a song is built from its artist and year (one-hot encoded),
    its tags (TF-IDF, at most 3000 terms) and four audio features
    (danceability, energy, valence, tempo; standardised).

    Parameters
    ----------
    df_songs : pandas.DataFrame
        Must contain the columns "artist", "tags", "year", "danceability",
        "energy", "valence" and "tempo". The frame itself is not modified.

    Returns
    -------
    tuple
        ``(X_content, col_tf)``: the transformed feature matrix, with one row
        per row of ``df_songs`` in the same order, and the fitted
        ``ColumnTransformer`` that produced it.
    """
    audio_cols = ["danceability", "energy", "valence", "tempo"]
    profile_cols = ["artist", "tags", "year"] + audio_cols

    # Missing tags become the empty string so the vectorizer gets text for every row.
    profile = df_songs[profile_cols].assign(
        tags=lambda frame: frame["tags"].fillna('')
    )

    # "tags" is passed as a bare string (not a list) so the vectorizer receives
    # a 1-D column of documents rather than a 2-D frame.
    transformers = [
        ("artist", OneHotEncoder(handle_unknown='ignore'), ["artist"]),
        ("tags", TfidfVectorizer(max_features=3000), "tags"),
        ("year", OneHotEncoder(handle_unknown='ignore'), ["year"]),
        ("features", StandardScaler(), audio_cols),
    ]
    encoder = ColumnTransformer(transformers)
    return encoder.fit_transform(profile), encoder

# Collaborative Similarity :-)

In [None]:
def build_collaborative_matrix(df_inter, n_items, n_users):
    """Build the item-by-user play-count matrix and its item-item similarity.

    Parameters
    ----------
    df_inter : pandas.DataFrame
        Interactions with columns "playcount", "track_idx" and "user_idx".
    n_items, n_users : int
        Number of rows and columns of the interaction matrix.

    Returns
    -------
    tuple
        ``(R, S_collab)``: the sparse ``(n_items, n_users)`` matrix of play
        counts, and the sparse cosine similarity between its rows.
    """
    item_rows = df_inter["track_idx"]
    user_cols = df_inter["user_idx"]
    plays = df_inter["playcount"]

    R = csr_matrix((plays, (item_rows, user_cols)), shape=(n_items, n_users))
    # dense_output=False keeps the similarity matrix sparse.
    return R, cosine_similarity(R, dense_output=False)

# Hybrid Recommender :-)

In [None]:
def hybrid_recommend(seed_track_id, X_content, S_collab, α=0.7, k=10):
    """Recommend tracks similar to a seed track by blending two similarity scores.

    The score of every candidate is
    ``α * collaborative_similarity + (1 - α) * content_similarity``,
    both measured against the seed track.

    Parameters
    ----------
    seed_track_id
        Track id to look up in the notebook-level ``track2idx`` mapping.
    X_content
        Content-feature matrix; row ``track2idx[seed_track_id]`` is used as
        the seed's profile and compared against all rows.
    S_collab
        Sparse item-item similarity matrix, indexed by the same track index.
    α : float, default 0.7
        Weight of the collaborative score; the content score gets ``1 - α``.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``k`` track ids, best first, never including the seed itself.
        Empty if the seed is unknown or ``k`` is not positive.

    Raises
    ------
    ValueError
        If the collaborative and content scores cover a different number of tracks.
    """
    # A non-positive k asks for nothing; without this guard a negative k would
    # slice from the end and return almost the entire catalogue.
    if k <= 0 or seed_track_id not in track2idx:
        return []
    i = track2idx[seed_track_id]

    # Collaborative score
    collab_scores = S_collab[i].toarray().ravel()

    # Content score
    content_scores = cosine_similarity(X_content[i], X_content).ravel()

    if collab_scores.shape != content_scores.shape:
        raise ValueError(
            "collaborative and content scores cover a different number of tracks: "
            f"{collab_scores.shape[0]} vs {content_scores.shape[0]}"
        )

    # Hybrid
    hybrid = α * collab_scores + (1 - α) * content_scores

    # Get top-k indices (excluding self), best score first. The boolean mask
    # removes the seed without a Python-level loop over every track.
    ranked = np.argsort(hybrid)[::-1]
    top_idx = ranked[ranked != i][:k]
    return [idx2track[j] for j in top_idx]

# Testing :-(

In [None]:
# Fit the content features once; the fitted transformer is kept alongside the matrix.
X_content, col_tf = get_content_matrix(df_songs)

# Index codes start at 0, so each matrix dimension is the largest code plus one.
n_items = df_songs.track_idx.max() + 1
n_users = df_inter.user_idx.max() + 1
R, S_collab = build_collaborative_matrix(df_inter=df_inter, n_items=n_items, n_users=n_users)

# Test: recommend five tracks for the first song in the catalogue.
seed = df_songs["track_id"].iloc[0]
hybrid_recommend(seed, X_content, S_collab, α=0.7, k=5)

['TRGSZLI128F4230F3A',
 'TRARVZG128F1497359',
 'TRMSIMM128F9316F1D',
 'TRKSBMG128F92E2E43',
 'TRIOWFB128EF35C9ED']