In [1]:
# code sketch:
# 1. small subset of songs, get lyrics / info from spotify / genius api
#     turn lyrics coupled w metadata into vector using BERT or something
#.    store in Pinecone? FAISS
# 2. user input > gpt 4 / langchain to get semantic vector rep. of that
#.     compare using cosine sim., closest 10 get suggested


In [None]:
# SPOTIFY / GENIUS END:
#.   (first, parse user input, find vector vs. "energy" and other things that could narrow the search)
#    pick 100 random songs that work > get their metadata and send to genius > extract lyrics, save as text files
#          if no lyrics, reject

In [None]:
# LANGCHAIN:
#   turn the lyrics into vectors > store in pinecone
#   user input from gpt into langchain > comapre to vector store
#   Find top 10 songs in song names > send back to spotify to pull up those songs as playable

In [98]:
# Global imports
from tqdm import tnrange, tqdm_notebook, tqdm
from time import sleep
from lyricsgenius import Genius
import time
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import random

from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import DeepLake
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS
from langchain.vectorstores import Pinecone


#get my secret sauce variables
from dotenv import load_dotenv
import os
load_dotenv() 

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')
SPOTIFY_CLIENT_ID = os.getenv('SPOTIFY_CLIENT_ID')
SPOTIFY_CLIENT_SECRET=os.getenv('SPOTIFY_CLIENT_SECRET')
token = os.getenv('GENIUS_ACCESS_TOKEN')

In [131]:
#Spotify Handling, where we get the 100 sorted, random songs with lyrics:

# need to add song length detection, nothing over 6 mins.

#Authentication - without user
client_credentials_manager = SpotifyClientCredentials(client_id='36c36f81c0ff4b9da4cdbb44d12de650', client_secret='4ecf40e8b9f5443bae664b740a319465')
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

#pick 100 random songs

import random
import string

#this part basically picks random characters from the list of all characters

def get_random_search():
    # A list of all characters that can be chosen.
    characters = string.ascii_lowercase

    # Gets a random character from the characters string.
    random_character = random.choice(characters)
    random_search = ''

    # Places the wildcard character at the beginning, or both beginning and end, randomly.
    switch = random.randint(0, 1)
    if switch == 0:
        random_search = random_character + '%'
    elif switch == 1:
        random_search = '%' + random_character + '%'

    return random_search


trackList = [] #empty tracklist array we're gonna fill

songNumber = 500 # number of songs we're gonna sample from randomly
pbar = tqdm(total = songNumber) #progress bar init

while len(trackList) <= songNumber: 
    ranNum = random.randint(1,4000) # make sure the offset is a random one so there's never repeats, and they aren't all super pop. new.
    query = get_random_search() #use the function to get a random search on a letter, ie. e% so any song that starts with that letter
    track = sp.search(query, limit=1, offset = ranNum, type='track') #query spotify
    trackURI = track['tracks']['items'][0]['uri']
    trackURI = trackURI.replace('spotify:album:', '')
    trackName = track['tracks']['items'][0]['name']
    trackArtist = track['tracks']['items'][0]['artists'][0]['name']
    audio_features = sp.audio_features(trackURI)
    instrumentalness = audio_features[0]['instrumentalness'] #does this have lyrics?
    
    if instrumentalness <= 0.5: #if no lyrics, reject and get a new one
        sleep(0.1)
        pbar.update(1)
        array = [trackName, trackArtist]
        trackList.append(array)




101it [6:39:33, 237.36s/it]                             | 0/500 [00:00<?, ?it/s][A
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'b%', 'limit': 1, 'offset': 2430, 'type': 'track', 'market': None} returned 400 due to Bad request.


SpotifyException: http status: 400, code:-1 - https://api.spotify.com/v1/search?q=b%25&limit=1&offset=2430&type=track:
 Bad request., reason: None

In [None]:
# GENIUS PART , now we take the song lists and download the lyrics for each one into separate text files for parsing with LangCHAIN:
# going to use lyricsgenius which is a python wrapper for GENIUS API
genius = Genius(token)

import time
import lyricsgenius
from requests.exceptions import Timeout

genius = lyricsgenius.Genius("my_token")  # replace with your token
genius.verbose = False

#custom get song lyrics function, with timeout behavior
def get_song_lyrics(title, artist, max_retries=5, delay_between_retries=10):
    for attempt in range(max_retries):
        try:
            song = genius.search_song(title=title, artist=artist)
            if song is not None:
                return song
            else:
                print(f"No lyrics found for {title} by {artist}.")
                return None
        except Timeout:
            if attempt < max_retries - 1:  # it's not the last attempt
                print(f"Timeout occurred when getting lyrics for {title} by {artist}. "
                      f"Waiting {delay_between_retries} seconds before retrying...")
                time.sleep(delay_between_retries)
                continue
            else:  # it's the last attempt
                print(f"Still experiencing timeouts after {max_retries} attempts. "
                      f"Giving up on getting lyrics for {title} by {artist}.")
                return None
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

for i in tqdm(trackList):
    title = i[0]
    artist = i[1]
    song = get_song_lyrics(title, artist,5,10)
    if song != None and "[Verse" in song.lyrics:
        lyrics = song.lyrics
        with open('lyrics/%s_%s.txt'%(title,artist), 'w') as f:
            f.write(lyrics)
            

In [None]:
#LANGCHAIN TIME
# here gonna take each lyric file and summarize it / vectorize it and store in pinecone
# initialize langchain / openAI
import pinecone

embeddings = OpenAIEmbeddings(openai_api_key = openai_api_key)

docs=[]

for filename in tqdm(os.listdir(directory)):
    if filename.endswith(".txt"):  # check if the file is a .txt file
        file_path = os.path.join(directory, filename)
        # now you can do something with the file
        loader = TextLoader(file_path)
        docs.append(loader.load())
        

In [None]:
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment='asia-northeast1-gcp'
)

index_name="spotifysemantic"

for i in tqdm(docs):
    docsearch=Pinecone.from_documents(i, embeddings, index_name=index_name)





In [None]:
docsearch = Pinecone.from_existing_index(index_name, embeddings)
query = "something is wrong but i can't put my finger on it"
ans = docsearch.similarity_search(query)

for i in ans:
    print(i.metadata)