In [1]:
import requests
from bs4 import BeautifulSoup

# Artist id for Kjarkas; extract_artist_lyrics() interpolates it into the
# `letras=` query parameter of the musica.com listing URL.
kjarkas_id = 8988


def extract_lyrics_links(artist_url: str) -> list[str]:
    """Collect the song-page links listed on an artist's lyrics index page.

    Args:
        artist_url: URL of the artist listing page to download.

    Returns:
        The ``href`` of every anchor inside the ``<ul class="listado-letras">``
        element, in document order. An empty list is returned when the page
        contains no such element (for example an error page).

    Raises:
        requests.RequestException: If the download fails or exceeds the timeout.
    """
    # requests has no default timeout; cap the wait so a stalled connection
    # cannot hang the notebook indefinitely.
    response = requests.get(url=artist_url, timeout=30)
    print(f"Response from artist page: {response.status_code}")
    soup = BeautifulSoup(response.text, "html.parser")
    links_ul = soup.find("ul", class_="listado-letras")
    if links_ul is None:
        # Without this guard the next line would raise AttributeError on None.
        return []
    # href=True skips anchors that lack an href, which would raise KeyError.
    return [a["href"] for a in links_ul.find_all("a", href=True)]

def extract_song_lyric(song_url: str) -> str:
    """Download one song page and return its title and lyrics as plain text.

    The lyrics are the ``<p>`` elements that follow the "LETRA" (or
    "LETRA EN ESPAÑOL") ``<h2>`` heading and whose nearest ``<div>`` ancestor
    has ``id="letra"``.

    Args:
        song_url: URL of the song page to download.

    Returns:
        A string starting with "Título: <title>" followed by the lyrics, or an
        empty string when the page has no ``<h1>`` title or no lyrics heading.

    Raises:
        requests.RequestException: If the download fails or exceeds the timeout.
    """
    # requests has no default timeout; cap the wait so one stalled page cannot
    # block a scrape of many songs.
    response = requests.get(url=song_url, timeout=30)
    # Force UTF-8 decoding of the body instead of relying on requests' guess.
    response.encoding = "utf-8"
    print(f"Response from lyrics page: {response.status_code}")
    soup = BeautifulSoup(response.text, "html.parser")

    title_tag = soup.find("h1")
    if title_tag is None:
        # Not a regular song page (e.g. an error response). Treat it like a
        # page without lyrics instead of raising AttributeError mid-scrape.
        return ""
    song_title = title_tag.get_text()
    print(f"Title: {song_title}")

    header = soup.find("h2", string="LETRA")
    if header is None:
        header = soup.find("h2", string="LETRA EN ESPAÑOL")
    if header is None:
        return ""

    paragraphs = []
    for p in header.find_all_next("p"):
        # Look the parent up once; keep only paragraphs directly owned by
        # the div with id="letra".
        parent_div = p.find_parent("div")
        if parent_div is not None and parent_div.get("id") == "letra":
            paragraphs.append(p.get_text(separator="\n"))

    lyrics = "\n".join(paragraphs)
    return f"Título: {song_title} \n\n {lyrics}"

def extract_artist_lyrics(artist: str, artist_id: int) -> str:
    """Scrape every listed song of an artist and save the result to a text file.

    The output starts with the artist name, followed by one section per song;
    sections are separated by lines containing ``===``. The same text is
    written to ``<artist_id>_<artist>.txt`` (UTF-8) in the working directory.

    Args:
        artist: Artist name, used as the file header and in the file name.
        artist_id: Numeric id placed in the ``letras=`` parameter of the
            musica.com listing URL.

    Returns:
        The full text that was written to the file.
    """
    lyrics_url = f"https://www.musica.com/letras.asp?letras={artist_id}&orden=alf"
    links = extract_lyrics_links(lyrics_url)
    print(f"found {len(links)} songs")
    artist_header = f"{artist}\n===\n"
    lyrics = []
    for link in links:
        print(f"extracting song from: {link}")
        song = extract_song_lyric(link)
        # extract_song_lyric returns "" for pages without lyrics; skipping
        # those avoids blank sections between the "===" separators.
        if song:
            lyrics.append(song)
    lyrics_str = "\n\n===\n\n".join(lyrics)
    artist_str = artist_header + lyrics_str
    with open(f"{artist_id}_{artist}.txt", "w", encoding="utf-8") as file:
        file.write(artist_str)
    return artist_str

In [2]:
# One-off scrape: uncomment to (re)generate 8988_Kjarkas.txt, the file the indexing cell reads.
# extract_artist_lyrics("Kjarkas", kjarkas_id)

In [3]:
import chromadb
# In-memory Chroma client: the collection lives only as long as the kernel.
chroma_client = chromadb.EphemeralClient()
# get_or_create_collection keeps this cell re-runnable; create_collection
# raises once "kjarkas_songs2" already exists in the running client.
chroma_collection = chroma_client.get_or_create_collection(name="kjarkas_songs2")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [4]:
from dotenv import load_dotenv
import openai
import os
from llama_index.core import (
    VectorStoreIndex, 
    SimpleDirectoryReader, 
    StorageContext,
    load_index_from_storage,
    Document,
    PromptTemplate
)
from llama_index.vector_stores.chroma import ChromaVectorStore

load_dotenv()
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

# Use gpt-4o-mini in place of LlamaIndex's default LLM; `llm` stays bound
# so the model object remains available by name.
Settings.llm = llm = OpenAI(model="gpt-4o-mini")

In [5]:
def get_artist_documents(filename: str) -> list[Document]:
    """Load a scraped lyrics file and turn each song section into a Document.

    The file is expected to contain the artist name first, then song sections,
    all separated by ``===``.

    Args:
        filename: Path of the lyrics text file to read.

    Returns:
        One Document per non-blank song section, each tagged with
        ``category="music"`` and the artist name as metadata.
    """
    # The file is written as UTF-8, so read it back as UTF-8 rather than with
    # the platform default encoding (which breaks accented characters on
    # systems whose default is not UTF-8).
    with open(filename, encoding="utf-8") as file:
        data = file.read()
    songs = data.split("===")
    # The first section is the artist header, not a song.
    artist = songs.pop(0).strip()

    return [
        Document(
            text=song,
            metadata={
                "category": "music",
                "artist": artist,
            },
        )
        for song in songs
        # Skip whitespace-only sections so no empty documents get embedded.
        if song.strip()
    ]
    

In [6]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# E5 models are trained with "query: " / "passage: " prefixes on their inputs;
# set them explicitly so queries and lyrics are embedded the way the model expects.
embed_model = HuggingFaceEmbedding(
    model_name="intfloat/multilingual-e5-base",
    query_instruction="query: ",
    text_instruction="passage: ",
)
Settings.embed_model = embed_model

In [7]:
# Back the index with the Chroma collection, then build it from the song documents.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
kjarkas_documents = get_artist_documents("8988_Kjarkas.txt")
index = VectorStoreIndex.from_documents(
    kjarkas_documents,
    storage_context=storage_context,
)
# PERSIST_DIR = "lyrics_index"

# if not os.path.exists(PERSIST_DIR):
#     documents = get_artist_documents("8988_Kjarkas.txt")
#     index = VectorStoreIndex.from_documents(documents, show_progress=True)
#     index.storage_context.persist(persist_dir=PERSIST_DIR)
# else:
#     storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) 
#     index = load_index_from_storage(storage_context)

Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given


In [8]:
# Query engine over the lyrics index; its QA prompt is replaced via update_prompts in a later cell.
query_engine = index.as_query_engine(verbose=True)

In [9]:
# Custom QA prompt. {context_str} and {query_str} are the placeholders filled in
# at query time — presumably with the retrieved lyrics and the user's question.
qa_template_str = """
    You are an expert in Bolivian Folk music, your task is to guide and teach the user 
    about your field. Answer the user queries only with supported data in your context.
    Your context may contain complete lyrics or parts of them in different languages, but
    your answer will always be in Spanish. 

    Context information is below.
    ---------------------
    {context_str}
    ---------------------
    Given the context information and not prior knowledge, 
    answer the query with detailed source information, include direct quotes and use bullet lists in your 
    answers, in one of the bullets detail the tone/sentiment of the song.
    Query: {query_str}
    Answer: 
"""
qa_template = PromptTemplate(qa_template_str)

In [10]:
# Swap the response synthesizer's text-QA prompt for the custom template.
prompt_overrides = {"response_synthesizer:text_qa_template": qa_template}
query_engine.update_prompts(prompt_overrides)

In [11]:
# NOTE(review): the recorded run of this cell ended in
# "ValueError: Expected where to have exactly one operator, got {} in query."
# Possibly a chromadb / llama-index-vector-stores-chroma version mismatch — confirm.
abandonment_query = "cuales canciones de los Kjarkas hablan de abandono?"
response = query_engine.query(abandonment_query)

ValueError: Expected where to have exactly one operator, got {} in query.

In [None]:
from IPython.display import Markdown, display
# Render the answer text as Markdown in the cell output.
display(Markdown(response.response))

In [None]:
# Show the source nodes attached to the response, to see what the answer was based on.
response.source_nodes

In [None]:
# Chat engine built on the same index, using the library's default chat mode.
chat_engine = index.as_chat_engine(verbose=True)

In [None]:
# NOTE: rebinds `response`, replacing the query-engine result from the earlier cell.
response = chat_engine.chat("que canciones de los kjarkas hablan de abandono?")

In [None]:
# Render the chat reply as Markdown in the cell output.
display(Markdown(response.response))

In [None]:
# Send a follow-up question to the same chat engine and render the reply.
nature_reply = chat_engine.chat("y cuales hablan de la naturaleza?")
display(Markdown(nature_reply.response))