In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np

# Load data and models (example placeholders)
user_data = pd.read_csv(r'C:\Users\hp\Desktop\.vscode\merging\dataset 1 - Spotify_data.csv')
spotify_songs = pd.read_csv(r'C:\Users\hp\Desktop\.vscode\merging\DATA - spotify_dataset.csv')

# Rename columns for merging
spotify_songs = spotify_songs.rename(columns={'track_id': 'Song_ID', 'track_popularity': 'Popularity'})
user_data = user_data.rename(columns={'User ID': 'User_ID'})

# Merge user data with song data based on 'User_ID'
merged_df = pd.merge(user_data, spotify_songs, on='User_ID', how='inner')

# Collaborative Filtering Dataset
cf_features = ['User_ID', 'Song_ID', 'Popularity']
cf_filtered = merged_df[cf_features]

# Content-Based Filtering Dataset
cbf_features = ['Song_ID', 'Genre', 'Album']
cbf_filtered = merged_df[cbf_features].copy()

# Fill NaN values in textual features
text_features = ['Genre', 'Album']
for feature in text_features:
    cbf_filtered[feature] = cbf_filtered[feature].fillna('')

# TF-IDF on textual features for Content-Based Filtering
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
cbf_filtered['combined_features'] = cbf_filtered[text_features].apply(lambda x: ' '.join(x), axis=1)
textual_features = tfidf_vectorizer.fit_transform(cbf_filtered['combined_features'])

# Ensure the data is sufficient
if textual_features.shape[0] == 0 or textual_features.shape[1] == 0:
    print("Insufficient data for TF-IDF vectorization.")
else:
    print(f"TF-IDF matrix shape: {textual_features.shape}")

# Precompute content-based song similarities
song_similarity_matrix = cosine_similarity(textual_features, textual_features)

# Collaborative Filtering Recommendation Function
user_item_matrix = cf_filtered.pivot_table(index='User_ID', columns='Song_ID', values='Popularity', fill_value=0)
sparse_matrix = csr_matrix(user_item_matrix.values)
user_similarity_matrix = cosine_similarity(sparse_matrix, dense_output=False)

def get_cf_scores(user_id):
    if user_id not in user_item_matrix.index:
        return pd.Series(dtype=float)

    user_index = user_item_matrix.index.get_loc(user_id)
    similarity_scores = user_similarity_matrix[user_index].toarray().ravel()
    similarity_scores[user_index] = 0

    similarity_df = pd.DataFrame({'user': user_item_matrix.index, 'similarity': similarity_scores}).set_index('user')
    target_user_songs = user_item_matrix.loc[user_id]
    unseen_songs = target_user_songs[target_user_songs == 0].index

    song_scores = pd.Series(0, index=unseen_songs)
    for similar_user_id, row in similarity_df[similarity_df['similarity'] > 0].iterrows():
        sim_score = row['similarity']
        similar_user_songs = user_item_matrix.loc[similar_user_id]
        song_scores = song_scores.add(similar_user_songs[unseen_songs] * sim_score, fill_value=0)

    return song_scores

def get_cbf_scores(user_id):
    user_indices = merged_df.index[merged_df['User_ID'] == user_id].tolist()
    scores = pd.Series(0.0, index=cbf_filtered['Song_ID'].unique())
    for user_index in user_indices:
        sim_scores = song_similarity_matrix[user_index]
        for idx, score in enumerate(sim_scores):
            song_id = merged_df.iloc[user_index]['Song_ID']
            scores.loc[song_id] += score
    return scores

def get_top_n_recommendations(user_id, n=5):
    cf_scores = get_cf_scores(user_id)
    cbf_scores = get_cbf_scores(user_id)
    combined_scores = cf_scores.add(cbf_scores, fill_value=0)
    top_songs = combined_scores.nlargest(n)
    return combined_scores, top_songs

# Main script to predict top 5 songs for a specified user
if __name__ == "__main__":
    user_id = input("Please enter your User ID: ")
    if user_id not in user_item_matrix.index:
        print("User ID not found in the dataset.")
    else:
        combined_scores, top_songs = get_top_n_recommendations(user_id, n=5)
        
        print(f"All similarity scores for User '{user_id}':")
        for song_id, score in combined_scores.items():
            print(f"- Song ID: {song_id}, Score: {score}")
        
        print(f"\nTop 5 song recommendations for User '{user_id}':")
        for song_id, score in top_songs.items():
            print(f"- Song ID: {song_id}, Score: {score}")


TF-IDF matrix shape: (10000, 22)
All similarity scores for User 'User_78':
- Song ID: Song_1, Score: 0.0
- Song ID: Song_10, Score: 0.0
- Song ID: Song_100, Score: 0.0
- Song ID: Song_1000, Score: 0.0
- Song ID: Song_10000, Score: 0.0
- Song ID: Song_1001, Score: 0.0
- Song ID: Song_1002, Score: 0.0
- Song ID: Song_1003, Score: 0.0
- Song ID: Song_1004, Score: 0.0
- Song ID: Song_1005, Score: 0.0
- Song ID: Song_1006, Score: 0.0
- Song ID: Song_1007, Score: 0.0
- Song ID: Song_1008, Score: 0.0
- Song ID: Song_1009, Score: 0.0
- Song ID: Song_101, Score: 0.0
- Song ID: Song_1010, Score: 0.0
- Song ID: Song_1011, Score: 0.0
- Song ID: Song_1012, Score: 0.0
- Song ID: Song_1013, Score: 0.0
- Song ID: Song_1014, Score: 0.0
- Song ID: Song_1015, Score: 0.0
- Song ID: Song_1016, Score: 0.0
- Song ID: Song_1017, Score: 0.0
- Song ID: Song_1018, Score: 0.0
- Song ID: Song_1019, Score: 0.0
- Song ID: Song_102, Score: 0.0
- Song ID: Song_1020, Score: 0.0
- Song ID: Song_1021, Score: 0.0
- Song I