# HYBRID APPROACH: Genres and PlayCount

In this approach, we will use a Hybrid Recommender System (Content

In [2]:
pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## 1. Getting All The Needed Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
unique_artists_df = pd.read_pickle('unique_artists_df.pkl')
playlists_genres_df = pd.read_pickle('playlists_genres_df.pkl')
playlist_genres_dup= pd.read_pickle('playlist_genres_dup.pkl')
tracks_df = pd.read_csv('tracks_df.csv')
common_genres = np.load("common_genres.npy", allow_pickle=True)
common_genres = common_genres.tolist()
artist_vectors = np.load("artist_vectors.npy", allow_pickle=True)
merged_df = pd.read_csv('merged_df')

In [None]:
merged_pcount_genresf= pd.read_csv('/home/jovyan/Spotify/merged_pcount_genres.csv')

In [None]:
merged_mil_df= pd.read_csv('/home/jovyan/Spotify/merged_mil_df .csv')

In [5]:
merged_pcount_genres = merged_mil_df.merge(unique_artists_df[['artist_name', 'genres']], on='artist_name', how='inner')

In [None]:
merged_pcount_genres

In [9]:
merged_pcount_genresf.to_csv('merged_pcount_genres.csv', index=False)

In [7]:
merged_pcount_genresf.isnull().sum()

track_uri       0
artist_uri      0
artist_name     0
track_name     14
user_id         0
play_count      0
genres          0
dtype: int64

In [8]:
merged_pcount_genresf.dropna(subset=['track_name'], inplace=True)

## 2. Getting Content-Based System 

In [11]:
def cb_recommend(playlist_pid):
    # Retrieve genres of the given playlist
    my_all_genres = playlist_genres_dup.loc[playlist_pid, 'genres']
    
    # Count occurrences of each genre
    my_genre_counts = Counter(my_all_genres)
    total_genres = len(my_all_genres)

    # Compute playlist genre percentage vector
    my_vector = []
    total_genres = len(my_all_genres)
    for genre in common_genres:
        count = my_genre_counts.get(genre, 0) #retrieve the value of key (genres) in the dictionaery Counter
        percentage = (count / total_genres ) * 100
        my_vector.append(percentage)
    my_vector = np.array(my_vector)

    # Compute cosine similarity with each artist and store the similarity score
    similarities = {}
    for idx, artist_vector in enumerate(artist_vectors):  
        artist_name = unique_artists_df['artist_name'][idx] 
        similarity = cosine_similarity([my_vector], [artist_vector])
        similarities[artist_name] = similarity[0][0]

    # Select Top 5 Relevant Artists
    sorted_artists = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    top_artists = [artist for artist, _ in sorted_artists[:5]]

    # Get tracks not already in the playlist
    existing_tracks = set(tracks_df[tracks_df['playlist_pid'] == playlist_pid]['track_uri'])

    recommended_songs = []
    for artist in top_artists:
        if len(recommended_songs) >= 5:
            break

        artist_songs = tracks_df[tracks_df['artist_name'] == artist]
        artist_songs = artist_songs[~artist_songs['track_uri'].isin(existing_tracks)]

        if not artist_songs.empty:
            top_song = artist_songs.sample(n=1).iloc[0]
            score = similarities[artist]  # The similarity score for this artist
            recommended_songs.append({
                'track_name': top_song['track_name'],
                'artist_name': top_song['artist_name'],
                'album_name': top_song['album_name'],
                'similarity_score': score  
            })

    # Create a DataFrame to display the recommended songs with their scores
    recommended_songs_df = pd.DataFrame(recommended_songs)
    return recommended_songs_df


In [7]:
playlist_pid = input("What Is Your Playlist ID?: ")
recommended_songs = cb_recommend(int(playlist_pid))
recommended_songs

What Is Your Playlist ID?:  110


Unnamed: 0,track_name,artist_name,album_name,similarity_score
0,"Soy un Truhán, Soy un Señor",Julio Iglesias,A MIS 33 AÑOS,0.80268
1,Amor Libre,Camilo Sesto,20 Grandes Exitos,0.80268
2,Madalena,Ivan Lins,Cantando Historias Ivan Lins,0.80268
3,"Hasta Siempre, Comandante (Song for Che Guevara)",Carlos Puebla,Cancion Protesta: Protest Songs of Latin America,0.77827
4,Balada (Tchê Tcherere Tchê Tchê),Gusttavo Lima,E Você,0.77827


## 3. Creating Collaborative-Filtering System.

### 3.1 Load The Pre-Trained Model.

In [14]:
#Load the model
import pickle
with open("/home/jovyan/Spotify/trained_model_CF.pkl", "rb") as f:
    model = pickle.load(f)

In [15]:
interaction_df = pd.read_csv('interaction_df.csv')
import scipy.sparse as sparse
interaction_matrix = sparse.load_npz("interaction_matrix.npz")

In [14]:
def cf_recommendations(playlist_id, model, interaction_matrix, interaction_df, track_af_dataframe, N=5):
    # Get recommendations from the model
    recommendations = model.recommend(playlist_id, interaction_matrix[playlist_id], N=N)
    
    # Convert the output to a DataFrame
    recommendations_df = pd.DataFrame({
        'track_idx': recommendations[0],
        'score': recommendations[1]
    })
    
   
    interaction_df['track_idx'] = interaction_df['track_uri'].astype('category').cat.codes
    
    # Create a mapping of track_idx to track_uri
    idx_to_uri = dict(zip(interaction_df['track_idx'], interaction_df['track_uri']))
    
    # Map the track indices to track_uris
    recommendations_df['track_uri'] = recommendations_df['track_idx'].map(idx_to_uri)
    
    # Merge with track_af_dataframe to get detailed track information
    recommended_tracks = recommendations_df.merge(track_af_dataframe, on='track_uri')

    recommended_tracks = recommended_tracks.drop_duplicates(subset=['track_uri'])
    
   
    return recommended_tracks[['track_uri','track_name', 'artist_name', 'album_name', 'score']]

In [12]:
playlist_id = int(input("Enter Playlist ID: "))  # Take user input for playlist ID
recommended_songs_df = cf_recommendations(playlist_id, model, interaction_matrix, interaction_df, tracks_df, N=5)
recommended_songs_df

Enter Playlist ID:  110


Unnamed: 0,track_uri,track_name,artist_name,album_name,score
0,spotify:track:7yq4Qj7cqayVTp3FF9CWbm,Riptide,Vance Joy,Dream Your Life Away,7.4e-05
28448,spotify:track:6RrXd9Hph4hYR4bf3dbM6H,My Girl,The Temptations,The Temptations Sing Smokey,7.3e-05
45566,spotify:track:1mqlc0vEP9mU1kZgTi6LIQ,September,"Earth, Wind & Fire","Now, Then & Forever",7.3e-05
68207,spotify:track:3yrSvpt2l1xhsV9Em88Pul,Brown Eyed Girl,Van Morrison,Blowin' Your Mind!,7.2e-05
88285,spotify:track:2H3ZUSE54pST4ubRd5FzFR,Ain't No Mountain High Enough,Marvin Gaye,United,7.2e-05


# 4. HYBRID RECOMMENDER SYSTEM

In [17]:
from sklearn.preprocessing import MinMaxScaler

def hybrid_recommend(playlist_pid, model, interaction_matrix, interaction_df, track_af_dataframe, cb_weight=0.5, cf_weight=0.5, N=5):
    # Get Content-Based recommendations
    cb_df = cb_recommend(playlist_pid)
    
    # Get Collaborative Filtering recommendations
    cf_df = cf_recommendations(playlist_pid, model, interaction_matrix, interaction_df, track_af_dataframe, N=100)
    
    # Normalize similarity scores for both methods
    cb_df['similarity_score'] = MinMaxScaler().fit_transform(cb_df[['similarity_score']])
    cf_df['score'] = MinMaxScaler().fit_transform(cf_df[['score']])
    
    # Merge both DataFrames on track_uri (track name can also be used)
    hybrid_df = cb_df.merge(cf_df, on=['track_name', 'artist_name', 'album_name'], how='outer')
    
    # Fill missing scores with 0 (for tracks that appear in only one method)
    hybrid_df['similarity_score'] = hybrid_df['similarity_score'].fillna(0)
    hybrid_df['score'] = hybrid_df['score'].fillna(0)

    
    # Compute final hybrid score
    hybrid_df['final_score'] = (cb_weight * hybrid_df['similarity_score']) + (cf_weight * hybrid_df['score'])
    
    # Sort and return top N recommendations
    hybrid_df = hybrid_df.sort_values(by='final_score', ascending=False).head(N)
    
    return hybrid_df[['track_name', 'artist_name', 'album_name', 'final_score']]


In [18]:
playlist_id = int(input("Enter Playlist ID: "))  # Take user input for playlist ID
recommended_songs_h = hybrid_recommend(playlist_id ,model, interaction_matrix, interaction_df, tracks_df, cb_weight=0.5, cf_weight=0.5, N=5)
recommended_songs_h

Enter Playlist ID:  110


Unnamed: 0,track_name,artist_name,album_name,final_score
68,Riptide,Vance Joy,Dream Your Life Away,0.5
104,¿Quieres Ser Mi Amante?,Camilo Sesto,Camilo,0.5
96,Vitoriosa,Ivan Lins,Nova Bis - Ivan Lins,0.5
47,La Carretera,Julio Iglesias,"1, Volumen 1",0.5
58,My Girl,The Temptations,The Temptations Sing Smokey,0.492453


## Why can't we use CF system to test on new playlist?

Because it uses ALS model to train on the existing dataset, so it can predict which playlist is close to which one. So the new dataset have to be added in the dataset (or if the new plalist cvontains song already in the dataset). Then, we have to retrain the model. 