In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


# Load the song catalogue; the index is rebuilt so it is a plain 0..n-1 range.
df_songs = pd.read_csv("ex.csv").reset_index(drop=True)

# Load the user-song interaction log.
df_interact = pd.read_csv("Processed_User_Song_Interaction.csv")

# Each song's identifier is its 0-based row position in the catalogue.
df_songs['song_id'] = df_songs.index

# Shift the interaction song ids down by one.
# NOTE(review): presumably the CSV ids are 1-based and this lines them up with
# the 0-based catalogue ids assigned above — confirm against the data.
df_interact['song_id'] -= 1


ModuleNotFoundError: No module named 'surprise'

In [None]:
# One-hot encode the genre of every song; missing genres become the
# explicit category 'Unknown'.
metadata = df_songs[['Genre']].fillna('Unknown')
encoder = OneHotEncoder()
genre_encoded = encoder.fit_transform(metadata).toarray()

# Project the one-hot matrix onto at most 20 principal components
# (fewer when there are fewer than 20 one-hot columns).
n_genre_components = min(20, genre_encoded.shape[1])
pca = PCA(n_components=n_genre_components, random_state=42)
meta_pca = pca.fit_transform(genre_encoded)

In [None]:
# Treat listen counts as ratings on the scale [0, max observed listen_count].
max_listen_count = df_interact['listen_count'].max()
reader = Reader(rating_scale=(0, max_listen_count))

rating_columns = ['user_id', 'song_id', 'listen_count']
data = Dataset.load_from_df(df_interact[rating_columns], reader)

# Train on every interaction (no hold-out split in this cell).
trainset = data.build_full_trainset()
model = SVD(n_factors=50, random_state=42)
model.fit(trainset)

# Keep a handle on the fitted model's item-factor matrix.
svd_vectors = model.qi

In [None]:

# Map Surprise inner item ids (row positions of model.qi) back to raw song ids.
inner_id_to_raw_id = {inner_id: trainset.to_raw_iid(inner_id) for inner_id in range(trainset.n_items)}

# SVD item factors, one row per item in the trainset, keyed by raw song_id.
svd_df = pd.DataFrame(model.qi, columns=[f"svd_{i}" for i in range(model.qi.shape[1])])
svd_df['song_id'] = svd_df.index.map(inner_id_to_raw_id)

# Genre PCA features; meta_pca was computed from df_songs, so rows share its order.
meta_df = pd.DataFrame(meta_pca, columns=[f"pca_{i}" for i in range(meta_pca.shape[1])])
meta_df['song_id'] = df_songs['song_id']

# Keep only songs that have both SVD factors and genre features, then attach
# the catalogue columns. Merging on a column yields a fresh 0..n-1 index, so
# row i of combined_df corresponds to row i of combined_vectors.
combined_df = pd.merge(svd_df, meta_df, on='song_id', how='inner')
combined_df = pd.merge(df_songs, combined_df, on='song_id', how='inner')

# Feature matrix: the svd_* columns followed by the pca_* columns.
feature_cols = [col for col in combined_df.columns if col.startswith(('svd_', 'pca_'))]
combined_vectors = combined_df[feature_cols].values



In [None]:
def recommend_for_user(user_id, top_k=10):
    """Recommend songs the user has not listened to yet.

    Reads the notebook globals ``model``, ``trainset``, ``meta_pca``,
    ``combined_vectors``, ``combined_df`` and ``df_interact``. Row ``i`` of
    ``combined_vectors`` must describe row ``i`` of ``combined_df``.

    Parameters
    ----------
    user_id
        Raw user id as it appears in ``df_interact['user_id']``.
    top_k : int, default 10
        Maximum number of songs to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``top_k`` highest-scoring unlistened songs with the columns
        ``Song-Name``, ``Singer/Artists``, ``Genre`` and ``score``, ordered by
        descending score. If ``trainset.to_inner_uid`` raises ``ValueError``
        for ``user_id``, the message ``"User ID <user_id> not found."`` is
        returned instead.
    """
    try:
        user_vector = model.pu[trainset.to_inner_uid(user_id)]
    except ValueError:
        return f"User ID {user_id} not found."

    # Pad the user factors with zeros so the vector has the same width as the
    # item vectors (SVD factors followed by genre PCA components).
    # NOTE(review): because the padding is all zeros, the genre components
    # only enter the cosine through the item norm — confirm this is intended.
    user_augmented = np.hstack([user_vector, np.zeros(meta_pca.shape[1])])
    similarities = cosine_similarity([user_augmented], combined_vectors).flatten()

    listened = df_interact.loc[df_interact['user_id'] == user_id, 'song_id']

    # Select rows and scores with the same positional boolean mask, so the
    # result does not depend on combined_df carrying a default 0..n-1 index.
    unlistened_mask = ~combined_df['song_id'].isin(listened).to_numpy()
    unlistened_df = combined_df[unlistened_mask].copy()
    unlistened_df['score'] = similarities[unlistened_mask]

    top_k_df = unlistened_df.sort_values(by='score', ascending=False).head(top_k)

    return top_k_df[['Song-Name', 'Singer/Artists', 'Genre', 'score']]


In [None]:
# Example: show the recommendations for one user, using the default top_k of 10.
recommend_for_user("user_52")

Unnamed: 0,Song-Name,Singer/Artists,Genre,score
716,Sanson Ki Zarurat Hai Jaise,Kumar Sanu,BollywoodRomantic,0.422703
1243,Tanha Tanha Yahan Pe Jeena,Asha Bhosle,BollywoodDance,0.372663
2264,O Chhalia Re Chhalia,"Asha Bhosle, Mohammed Rafi",BollywoodDance,0.360918
2083,Paayal Meri,"Alka Yagnik, Udit Narayan",BollywoodDance,0.353555
2014,Dil Me Kayi Armaan,Aaman Trikha,BollywoodSad,0.340998
1766,Desi Beat,"Amrita Kak, Mika Singh",BollywoodDance,0.331093
165,Gal Ban Gayi,"Meet Bros, Neha Kakkar, Sukhbir Singh, Yo Yo H...",BollywoodDance,0.324658
1312,Aye Dil Laaya Hai Bahaar,"Hariharan, Kavita Krishnamurthy",BollywoodDance,0.312875
917,It’s the Time to Disco,"K.K., Loy Mendonsa, Shaan, Vasundhara Das",BollywoodDance,0.312468
2304,Jashna Hai Mohabbat Ka,Kumar Sanu,BollywoodDance,0.294009


In [None]:

from surprise import accuracy
from surprise.model_selection import train_test_split

# Split the data into train/test sets.
# The evaluation objects get their own names so that the full-data `trainset`
# and `model` read by recommend_for_user (and used to build combined_vectors)
# are not overwritten by this hold-out experiment.
eval_trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
eval_model = SVD(n_factors=50, random_state=42)
eval_model.fit(eval_trainset)
predictions = eval_model.test(testset)

# Compute RMSE on the held-out 20%.
rmse = accuracy.rmse(predictions)
print("📊 RMSE của mô hình SVD:", rmse)


In [None]:

from sklearn.preprocessing import MinMaxScaler
from surprise import accuracy
from surprise.model_selection import train_test_split

# Scale listen_count into the range [0, 1].
scaler = MinMaxScaler()
df_interact['normalized_listen'] = scaler.fit_transform(df_interact[['listen_count']])

# Use the normalized listen count instead of the raw listen_count.
# This experiment uses its own names (norm_*) so that the `reader`, `data`,
# `trainset` and `model` built from the raw counts — which recommend_for_user
# and the earlier evaluation cell read — are not overwritten.
norm_reader = Reader(rating_scale=(0, 1))
norm_data = Dataset.load_from_df(df_interact[['user_id', 'song_id', 'normalized_listen']], norm_reader)

# Train/test split.
norm_trainset, testset = train_test_split(norm_data, test_size=0.2, random_state=42)

# Retrain the model on the normalized target.
norm_model = SVD(n_factors=50, reg_all=0.05, random_state=42)
norm_model.fit(norm_trainset)

# Predict and compute RMSE.
# Note: this RMSE is measured on the [0, 1] scale, so it is not directly
# comparable with an RMSE computed on raw listen counts.
predictions = norm_model.test(testset)
rmse = accuracy.rmse(predictions)
print("📊 RMSE sau khi chuẩn hóa dữ liệu và huấn luyện lại:", rmse)
