
Project: Song Recommendation System Based on User Mood
This project aims to create a system that suggests songs based on a user's mood. We will use Spotify and Genius APIs to fetch user data, process this data to create embeddings using a pre-trained transformer model, store these embeddings in a FAISS index, and use LangChain and MLflow to manage the retrieval and generation processes.
 Step-by-Step Guide
 
 1. Setup Environment and Install Dependencies
**Why:** To ensure all necessary packages and tools are available for the project.
**Action:** Install the required libraries such as `lyricsgenius`, `spotipy`, `transformers`, `scikit-learn`, `faiss-cpu`, `tqdm`, and `mlflow`.
**Commands:**


In [None]:
%pip install lyricsgenius
%pip install spotipy
%pip install spotipy lyricsgenius transformers scikit-learn gtts pydub librosa
%pip install faiss-cpu
%pip install tqdm

In [None]:
# import pandas as pd


# data = pd.read_csv("spotify/data/data.csv")
# genre_data = pd.read_csv('spotify/data/data_by_genres.csv')
# year_data = pd.read_csv('spotify/data/data_by_year.csv')


In [None]:
# import os
# import pandas as pd
# import tqdm 

# # show state of the progress bar
# tqdm.tqdm.pandas()

# # Setting the base directory using list of directory names
# base_dir = "data"

# # Building paths by further extending the base directory
# data_path = os.path.join(base_dir, "data.csv")
# genre_data_path = os.path.join(base_dir, "data_by_genres.csv")
# year_data_path = os.path.join(base_dir, "data_by_year.csv")

# # Reading the data using pandas
# data = pd.read_csv(data_path)
# genre_data = pd.read_csv(genre_data_path)
# year_data = pd.read_csv(year_data_path)


In [None]:
# # spotify 


# import spotipy
# client_id = '***REMOVED***'
# client_secret = '***REMOVED***'

# tqdm.tqdm.pandas()

# from spotipy.oauth2 import SpotifyClientCredentials
# client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
# sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# # check health 
# sp.trace = False
# track = sp.track('7qiZfU4dY1lWllzX7mPBI3')
# print(track['name'])
# # Now you have the access token to make requests to the Spotify API


In [None]:
# # login to Spotify account as user and get their playlists
# import spotipy.util as util

# username = 'eqanbww3jh63cgf4ot5zyyr5d'
# scope = 'playlist-read-private'
# token = util.prompt_for_user_token(username, scope, client_id, client_secret, redirect_uri='http://localhost:8888/callback')
# if token:
#     sp = spotipy.Spotify(auth=token)
#     playlists = sp.user_playlists(username)
#     for playlist in playlists['items']:
#         print(playlist['name'])

# else:
#     print("Can't get token for", username)

# # list songs in all the playlist 


In [1]:
# Install required libraries for the project
# This ensures all necessary packages are available for audio processing, text embedding, API interactions, and data management
%pip install lyricsgenius spotipy transformers scikit-learn gtts pydub librosa faiss-cpu tqdm mlflow

# Import essential libraries for the project
import gc  # For managing memory through garbage collection
import logging
import os

import faiss
import lyricsgenius
import numpy as np
import psutil  # For monitoring system memory
import spotipy
import torch
from spotipy.oauth2 import SpotifyOAuth
from transformers import AutoTokenizer, AutoModel

# Set up logging to monitor and log the flow of execution and potential issues
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Credentials are read from the environment so that real secrets never have to
# be saved with the notebook. The literals are only placeholder fallbacks.
# The variable names follow the ones documented by spotipy and lyricsgenius.
SPOTIFY_CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID', '***REMOVED***')
SPOTIFY_CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET', '***REMOVED***')
SPOTIFY_REDIRECT_URI = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:8888/callback')
GENIUS_API_TOKEN = os.environ.get('GENIUS_ACCESS_TOKEN', '***REMOVED***')


# Initialize the Spotify API with user credentials for accessing music-related data
logger.info("Setting up Spotify API...")
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=SPOTIFY_CLIENT_ID,
                                               client_secret=SPOTIFY_CLIENT_SECRET,
                                               redirect_uri=SPOTIFY_REDIRECT_URI,
                                               scope="user-top-read user-library-read playlist-read-private"))

# Initialize the Genius API with your credentials to fetch song lyrics
logger.info("Setting up Genius API...")
genius = lyricsgenius.Genius(GENIUS_API_TOKEN)

# Load a pre-trained transformer model and tokenizer for processing lyrics into embeddings
logger.info("Loading pre-trained transformer model...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

# Ensure the model operates on CPU to prevent GPU memory overflow issues
device = torch.device("cpu")
# Actually place the model on that device and switch it to inference mode so
# the tokenized inputs (which are moved to `device`) and the weights agree.
model.to(device)
model.eval()

# Define a function to embed textual data using the transformer model to get fixed-size numerical vectors
def embed_text(text):
    """Embed text as a fixed-size vector by mean-pooling the transformer output.

    Args:
        text: The text to embed. It is passed straight to the module-level
            ``tokenizer`` with truncation and padding enabled.

    Returns:
        A NumPy array holding the mean of the model's last hidden state over
        the token axis (dim 1), i.e. one row per tokenized input sequence.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    # Inference only: disabling autograd avoids building and retaining a
    # computation graph for every song, which would only waste memory.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    # .cpu() is a no-op on CPU tensors and keeps this working on other devices.
    return embeddings.cpu().numpy()

# Function to monitor and log the memory usage to manage resources efficiently
def log_memory_usage():
    """Log the resident set size (RSS) of the current process in megabytes."""
    mem_info = psutil.Process().memory_info()
    logger.info(f"Memory usage: {mem_info.rss / 1024 ** 2:.2f} MB")

# Retrieve and log the user's most listened tracks from Spotify
def get_spotify_top_tracks(sp, limit=5, time_range='medium_term'):
    """Return the current user's top tracks from Spotify.

    Args:
        sp: Client object exposing ``current_user_top_tracks``.
        limit: Number of tracks to request.
        time_range: Time window forwarded to the Spotify call.

    Returns:
        The ``'items'`` list of the Spotify response.
    """
    logger.info(f"Fetching top {limit} tracks from Spotify...")
    tracks = sp.current_user_top_tracks(limit=limit, time_range=time_range)['items']
    logger.info(f"Fetched {len(tracks)} tracks.")
    return tracks

# Fetch and log playlists created by the user on Spotify
def get_spotify_playlists(sp):
    """Return all playlists of the current Spotify user.

    The playlists endpoint is paginated, so a single call only yields the
    first page. This follows the response's ``next`` links until every page
    has been collected.

    Args:
        sp: Client object exposing ``current_user_playlists`` and ``next``.

    Returns:
        A list with the ``'items'`` entries of every page, in response order.
    """
    logger.info("Fetching user playlists from Spotify...")
    results = sp.current_user_playlists()
    playlists = list(results['items'])
    # Keep requesting pages while the response advertises another one.
    while results.get('next'):
        results = sp.next(results)
        if not results:
            break
        playlists.extend(results['items'])
    logger.info(f"Fetched {len(playlists)} playlists.")
    return playlists

# Fetch and log the audio features of tracks from Spotify which includes metrics like tempo, energy, etc.
def get_audio_features(sp, track_ids):
    """Fetch Spotify audio features (tempo, energy, ...) for the given tracks.

    Args:
        sp: Client object exposing ``audio_features``.
        track_ids: A single track id string or an iterable of track ids.

    Returns:
        A list of audio-feature entries in the same order as ``track_ids``.
        An empty input yields an empty list without contacting the API.
        NOTE(review): Spotify may return ``None`` for tracks it has no
        features for - callers should be prepared for that; confirm against
        the Web API documentation.
    """
    logger.info("Fetching audio features from Spotify...")
    # A bare string is a single id, not an iterable of one-character ids.
    ids = [track_ids] if isinstance(track_ids, str) else list(track_ids)

    # The endpoint accepts a bounded number of ids per request (100 according
    # to the Spotify Web API reference), so larger lists are sent in chunks.
    max_ids_per_request = 100
    audio_features = []
    for start in range(0, len(ids), max_ids_per_request):
        audio_features.extend(sp.audio_features(ids[start:start + max_ids_per_request]))

    logger.info(f"Fetched audio features for {len(audio_features)} tracks.")
    return audio_features

# Retrieve and log lyrics for specified songs using the Genius API
def get_lyrics(artist, title):
    """Look up a song on Genius and return its lyrics.

    Args:
        artist: Artist name used for the search.
        title: Song title used for the search.

    Returns:
        The ``lyrics`` attribute of the matched song, or ``None`` when the
        search yields no song.
    """
    logger.info(f"Fetching lyrics for {title} by {artist} from Genius...")
    song = genius.search_song(title, artist)
    if not song:
        # Nothing matched: report it and let the caller decide what to do.
        logger.warning(f"Lyrics for {title} by {artist} not found.")
        return None
    logger.info(f"Fetched lyrics for {title}.")
    return song.lyrics

# Convert audio features into a numerical vector for processing and comparison
def audio_features_to_vector(audio_features):
    """Pack selected audio features into a 1-D NumPy array.

    Args:
        audio_features: Mapping that contains the eight feature keys listed
            below. Any other keys are ignored.

    Returns:
        A NumPy array with the feature values in this fixed order:
        danceability, energy, speechiness, acousticness, instrumentalness,
        liveness, valence, tempo.
    """
    feature_keys = (
        'danceability',
        'energy',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
    )
    return np.array([audio_features[key] for key in feature_keys])

# Create and log a FAISS index for efficient similarity searches among large datasets
def create_faiss_index(data, dimension):
    """Build a flat L2 FAISS index over the given vectors.

    Args:
        data: 2-D array-like of vectors, one row per item.
        dimension: Expected width of each row.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``data``.

    Raises:
        ValueError: If ``data`` is not 2-D or its row width differs from
            ``dimension``.
    """
    logger.info(f"Creating FAISS index with dimension {dimension}...")
    # FAISS operates on C-contiguous float32 matrices. The audio vectors are
    # built as float64, so normalise the layout and dtype here once.
    vectors = np.ascontiguousarray(data, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] != dimension:
        raise ValueError(
            f"Expected data of shape (n, {dimension}), got {vectors.shape}"
        )
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    logger.info("FAISS index created.")
    log_memory_usage()
    return index

# Batch processing to manage memory usage while fetching and processing data from Spotify
def fetch_and_process_data(sp, limit=5, batch_size=2):
    """Fetch the user's top tracks and turn them into embeddings and vectors.

    For every top track that has both lyrics and audio features, the lyrics
    are embedded with ``embed_text`` and the audio features are converted with
    ``audio_features_to_vector``. Tracks are processed in batches, with a
    garbage-collection pass and a memory log after each batch.

    Args:
        sp: Spotify client passed on to the Spotify helper functions.
        limit: Number of top tracks to request.
        batch_size: Number of tracks processed between memory clean-ups.

    Returns:
        A tuple ``(lyrics_data, lyrics_embeddings, audio_vectors,
        playlist_names)`` where ``lyrics_data`` is a list of dicts with the
        keys ``id``, ``embedding``, ``artist``, ``title`` and ``lyrics``,
        ``lyrics_embeddings`` and ``audio_vectors`` are row-aligned stacked
        arrays, and ``playlist_names`` lists the user's playlist names.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if no track yields
            both lyrics and audio features (there would be nothing to stack).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    tracks = get_spotify_top_tracks(sp, limit=limit)
    track_ids = [track['id'] for track in tracks]
    audio_features = get_audio_features(sp, track_ids)

    playlists = get_spotify_playlists(sp)
    playlist_names = [playlist['name'] for playlist in playlists]

    lyrics_data = []
    audio_vectors = []

    for i in range(0, len(tracks), batch_size):
        batch_tracks = tracks[i:i + batch_size]
        batch_audio_features = audio_features[i:i + batch_size]

        for track, audio_feature in zip(batch_tracks, batch_audio_features):
            artist = track['artists'][0]['name']
            title = track['name']
            if audio_feature is None:
                # Without audio features no vector can be built, and the two
                # output arrays must stay row-aligned, so skip the track.
                logger.warning(f"No audio features for {title} by {artist}; skipping.")
                continue
            lyrics = get_lyrics(artist, title)
            if not lyrics:
                continue
            lyrics_data.append({
                'id': track['id'],
                'embedding': embed_text(lyrics),
                'artist': artist,
                'title': title,
                'lyrics': lyrics,
            })
            audio_vectors.append(audio_features_to_vector(audio_feature))

        # Release memory after processing each batch
        gc.collect()
        log_memory_usage()

    if not lyrics_data:
        # np.vstack would fail on an empty list with an unhelpful message.
        raise ValueError(
            "No tracks with both lyrics and audio features were found; "
            "nothing to embed."
        )

    lyrics_embeddings = np.vstack([song['embedding'] for song in lyrics_data])
    audio_vectors = np.vstack(audio_vectors)

    return lyrics_data, lyrics_embeddings, audio_vectors, playlist_names

# Execute the data fetching and processing
logger.info("Fetching and processing data...")
lyrics_data, lyrics_embeddings, audio_vectors, playlist_names = fetch_and_process_data(sp, limit=5)

# Create indices for the embeddings and vectors to facilitate efficient similarity searches.
# The dimension is read from each array instead of being hard-coded, so the
# indices stay correct if the embedding model or the audio feature set changes.
lyrics_index = create_faiss_index(lyrics_embeddings, lyrics_embeddings.shape[1])
audio_index = create_faiss_index(audio_vectors, audio_vectors.shape[1])



Collecting mlflow
  Obtaining dependency information for mlflow from https://files.pythonhosted.org/packages/19/7a/7d5594ddcaaff7a92caed1d7822cfe52ed01fe06c94b4ad88bcfef579c32/mlflow-2.15.1-py3-none-any.whl.metadata
  Downloading mlflow-2.15.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.15.1 (from mlflow)
  Obtaining dependency information for mlflow-skinny==2.15.1 from https://files.pythonhosted.org/packages/ec/a1/3812743e5dd83317d0469a46d737f0ab5c084fecfecc03a1ac8a7e7ec0d8/mlflow_skinny-2.15.1-py3-none-any.whl.metadata
  Downloading mlflow_skinny-2.15.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Obtaining dependency information for alembic!=1.10.0,<2 from https://files.pythonhosted.org/packages/df/ed/c884465c33c25451e4a5cd4acad154c29e5341e3214e220e7f3478aa4b0d/alembic-1.13.2-py3-none-any.whl.metadata
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Obtaining dependency in

INFO:__main__:Setting up Spotify API...
INFO:__main__:Setting up Genius API...
INFO:__main__:Loading pre-trained transformer model...
INFO:__main__:Fetching and processing data...
INFO:__main__:Fetching top 5 tracks from Spotify...
INFO:spotipy.oauth2:User authentication requires interaction with your web browser. Once you enter your credentials and give authorization, you will be redirected to a url.  Paste that url you were directed to to complete the authorization.
INFO:spotipy.oauth2:Opened https://accounts.spotify.com/authorize?client_id=10cc8ee290404da9ab9d7b061d526193&response_type=code&redirect_uri=http%3A%2F%2Flocalhost%3A8888%2Fcallback&scope=user-top-read+user-library-read+playlist-read-private in your browser
INFO:__main__:Fetched 5 tracks.
INFO:__main__:Fetching audio features from Spotify...
INFO:__main__:Fetched audio features for 5 tracks.
INFO:__main__:Fetching user playlists from Spotify...
INFO:__main__:Fetched 50 playlists.
INFO:__main__:Fetching lyrics for THE GREA

Searching for "THE GREATEST" by Billie Eilish...


INFO:__main__:Fetched lyrics for THE GREATEST.


Done.


INFO:__main__:Fetching lyrics for No Surprises by Radiohead from Genius...


Searching for "No Surprises" by Radiohead...


INFO:__main__:Fetched lyrics for No Surprises.
INFO:__main__:Memory usage: 696.61 MB
INFO:__main__:Fetching lyrics for Bunker by Balthazar from Genius...


Done.
Searching for "Bunker" by Balthazar...


INFO:__main__:Fetched lyrics for Bunker.
INFO:__main__:Fetching lyrics for Candy by Paolo Nutini from Genius...


Done.
Searching for "Candy" by Paolo Nutini...


INFO:__main__:Fetched lyrics for Candy.
INFO:__main__:Memory usage: 822.36 MB
INFO:__main__:Fetching lyrics for Fake Plastic Trees by Radiohead from Genius...


Done.
Searching for "Fake Plastic Trees" by Radiohead...


INFO:__main__:Fetched lyrics for Fake Plastic Trees.
INFO:__main__:Memory usage: 826.78 MB
INFO:__main__:Creating FAISS index with dimension 768...
INFO:__main__:FAISS index created.
INFO:__main__:Memory usage: 827.02 MB
INFO:__main__:Creating FAISS index with dimension 8...
INFO:__main__:FAISS index created.
INFO:__main__:Memory usage: 827.03 MB


Done.
