In [67]:
import requests
from bs4 import BeautifulSoup

# Numeric musica.com artist id for Kjarkas; extract_artist_lyrics interpolates
# it into the `letras.asp?letras=<id>` listing URL and into the output filename.
kjarkas_id = 8988


def extract_lyrics_links(artist_url: str) -> list[str]:
    """Collect the song-page URLs listed on an artist's lyrics index page.

    Args:
        artist_url: URL of the artist page containing the
            ``<ul class="listado-letras">`` song list.

    Returns:
        The ``href`` of every anchor inside that list, in page order. An empty
        list is returned when the page has no such list (for example an error
        page), so callers see "no songs" instead of an AttributeError.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # requests waits indefinitely by default; bound the wait so a stalled
    # server cannot hang the notebook cell forever.
    response = requests.get(url=artist_url, timeout=30)
    print(f"Response from artist page: {response.status_code}")
    soup = BeautifulSoup(response.text, "html.parser")
    links_ul = soup.find("ul", class_="listado-letras")
    if links_ul is None:
        return []
    # href=True skips anchors that carry no href attribute, which would
    # otherwise raise KeyError on a["href"].
    return [a["href"] for a in links_ul.find_all("a", href=True)]

def extract_song_lyric(song_url: str) -> str:
    """Download one song page and return its title and lyrics as plain text.

    The lyrics are the ``<p>`` elements that follow the "LETRA" (or
    "LETRA EN ESPAÑOL") ``<h2>`` heading and whose nearest enclosing ``<div>``
    has ``id="letra"``.

    Args:
        song_url: URL of the song's lyrics page.

    Returns:
        A string starting with ``Título: <title>`` followed by a blank line and
        the lyric paragraphs joined by newlines. An empty string is returned
        when the page has no ``<h1>`` title or no lyrics heading, so one odd
        page cannot abort a whole scraping run.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # requests waits indefinitely by default; bound the wait per song page.
    response = requests.get(url=song_url, timeout=30)
    # Decode the body as UTF-8 regardless of what the response headers declare.
    response.encoding = "utf-8"
    print(f"Response from lyrics page: {response.status_code}")
    soup = BeautifulSoup(response.text, "html.parser")

    title_tag = soup.find("h1")
    if title_tag is None:
        # Not a recognizable song page; report "no lyrics" rather than crash.
        return ""
    song_title = title_tag.get_text()
    print(f"Title: {song_title}")

    # Try the plain heading first, then the translated-lyrics variant.
    header = None
    for heading_text in ("LETRA", "LETRA EN ESPAÑOL"):
        header = soup.find("h2", string=heading_text)
        if header is not None:
            break
    if header is None:
        return ""

    paragraphs = []
    for p in header.find_all_next("p"):
        # Only the nearest enclosing <div> is checked, and it is looked up once.
        parent_div = p.find_parent("div")
        if parent_div is not None and parent_div.get("id") == "letra":
            paragraphs.append(p.get_text(separator="\n"))

    lyrics = "\n".join(paragraphs)
    return f"Título: {song_title} \n\n {lyrics}"

def extract_artist_lyrics(artist: str, artist_id: int) -> str:
    """Scrape every song listed for an artist on musica.com and save the corpus.

    The song list is read from the artist's alphabetical ``letras.asp`` page,
    each linked song is downloaded with ``extract_song_lyric``, and the result
    is written as UTF-8 to ``"{artist_id}_{artist}.txt"`` in the working
    directory.

    Args:
        artist: Artist name; used as the corpus header and in the filename.
        artist_id: Numeric artist id used in the musica.com listing URL.

    Returns:
        The text written to the file: the artist name, a ``===`` line, then the
        songs separated by ``===`` lines. Songs for which no lyrics could be
        extracted are left out.
    """
    lyrics_url = f"https://www.musica.com/letras.asp?letras={artist_id}&orden=alf"
    links = extract_lyrics_links(lyrics_url)
    print(f"found {len(links)} songs")
    artist_header = f"{artist}\n===\n"
    lyrics = []
    for link in links:
        print(f"extracting song from: {link}")
        song_text = extract_song_lyric(link)
        # extract_song_lyric returns "" when a page has no lyrics; keeping those
        # would leave empty "===" sections in the corpus, which turn into blank
        # documents when the file is split on "===" later.
        if song_text:
            lyrics.append(song_text)
    lyrics_str = "\n\n===\n\n".join(lyrics)
    artist_str = artist_header + lyrics_str
    with open(f"{artist_id}_{artist}.txt", "w", encoding="utf-8") as file:
        file.write(artist_str)
    return artist_str

In [None]:
# Scrape all Kjarkas songs and write the corpus file. The trailing semicolon
# keeps the notebook from echoing the entire returned corpus as cell output.
extract_artist_lyrics("Kjarkas", kjarkas_id);

In [1]:
from dotenv import load_dotenv
import openai
import os
from llama_index.core import (
    VectorStoreIndex, 
    SimpleDirectoryReader, 
    StorageContext,
    load_index_from_storage,
    Document,
    PromptTemplate
)
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this provides the API key for the LLM behind the
# query/chat engines below — confirm which variables are required.
load_dotenv()

True

In [2]:
def get_artist_documents(filename: str) -> list[Document]:
    """Load a scraped lyrics corpus and turn each song into a Document.

    The file is expected to hold the artist name first, followed by songs, with
    ``===`` separating every section.

    Args:
        filename: Path to the UTF-8 corpus file.

    Returns:
        One Document per non-empty song section, with surrounding whitespace
        removed from the text and ``category``/``artist`` metadata attached.
    """
    # The corpus is written as UTF-8; decode it explicitly instead of relying
    # on the platform default encoding, which would garble accented lyrics.
    with open(filename, encoding="utf-8") as file:
        data = file.read()
    songs = data.split("===")
    artist = songs.pop(0).strip()

    documents = []
    for song in songs:
        text = song.strip()
        # Sections that are only whitespace (songs with no lyrics) would become
        # blank documents in the index, so they are skipped.
        if not text:
            continue
        documents.append(
            Document(
                text=text,
                metadata={
                    "category": "music",
                    "artist": artist,
                },
            )
        )
    return documents
    

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Use the multilingual E5 embedding model and register it on the global
# llama_index Settings object so indexing and retrieval share the same model.
# NOTE(review): the E5 model card describes "query: " / "passage: " input
# prefixes; none are configured here — confirm whether retrieval needs them.
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
Settings.embed_model = embed_model

In [4]:
PERSIST_DIR = "lyrics_store2"

# Embedding the corpus is the expensive step, so reuse the persisted index when
# the directory is already there and only build (and persist) it otherwise.
if os.path.exists(PERSIST_DIR):
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
else:
    documents = get_artist_documents("8988_Kjarkas.txt")
    index = VectorStoreIndex.from_documents(documents, show_progress=True)
    index.storage_context.persist(persist_dir=PERSIST_DIR)

In [5]:
# Query engine over the lyrics index; used below for one-off questions via .query().
query_engine = index.as_query_engine(verbose=True)

In [8]:
# Custom question-answering prompt: restricts answers to the retrieved context,
# asks for Spanish output with direct quotes and bullet lists (one bullet on
# tone/sentiment), and exposes the {context_str} / {query_str} placeholders.
qa_template_str = """
    You are an expert in Bolivian Folk music, your task is to guide and teach the user 
    about your field. Answer the user queries only with supported data in your context.
    Your context may contain complete lyrics or parts of them in different languages, but
    your answer will always be in Spanish. 

    Context information is below.
    ---------------------
    {context_str}
    ---------------------
    Given the context information and not prior knowledge, 
    answer the query with detailed source information, include direct quotes and use bullet lists in your 
    answers, in one of the bullets detail the tone/sentiment of the song.
    Query: {query_str}
    Answer: 
"""
qa_template = PromptTemplate(qa_template_str)

In [9]:
# Swap the response synthesizer's text QA prompt for the custom template.
# This affects `query_engine` only.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_template}
)

In [12]:
# Example question (Spanish): "which Kjarkas songs talk about abandonment?"
response = query_engine.query("cuales canciones de los Kjarkas hablan de abandono?")

In [13]:
# Print the answer text (str(response)) rather than displaying the response object.
print(response)

- La canción "Llorando se fue" de Kjarkas habla de abandono, donde se menciona que la persona se fue llorando y dejó solo al protagonista con dolor.
  - Fragmento de la letra: "Llorando se fue y me dejo solo y sin su amor"
  - Tono/sentimiento de la canción: Melancólico y nostálgico, con una sensación de pérdida y tristeza.

- Otra canción de Kjarkas que habla de abandono es "El adios", donde se despide de su amor y reconoce que puede haber causado dolor.
  - Fragmento de la letra: "adios sigo mi camino recuerdame con cariño"
  - Tono/sentimiento de la canción: Melancólico y reflexivo, con un tono de despedida y perdón.


In [None]:
# Display the source nodes attached to the response.
# NOTE(review): presumably these are the retrieved lyric chunks behind the
# answer — useful for checking that the quotes are really in the corpus.
response.source_nodes

In [14]:
# Conversational interface over the same index; it keeps the user messages in
# memory between turns and answers by calling a query-engine tool.
# NOTE(review): this engine is created from `index`, not from `query_engine`,
# so the custom qa_template set with update_prompts does not appear to apply
# here — the tool outputs logged by the chat cells are in English and carry no
# quotes or bullets. Confirm, and pass the template through if the same answer
# style is wanted in chat.
chat_engine = index.as_chat_engine(verbose=True)

In [15]:
# First chat turn; with verbose=True the engine logs the tool call it makes.
response = chat_engine.chat("que canciones de los kjarkas hablan de abandono?")

Added user message to memory: que canciones de los kjarkas hablan de abandono?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"canciones de los kjarkas sobre abandono"}
Got output: The songs by Kjarkas that touch on the theme of abandonment are "Al partir" and "Llorando se fue."



In [16]:
# Print the chat engine's final answer text.
print(response)

Las canciones de Los Kjarkas que hablan sobre abandono son "Al partir" y "Llorando se fue".


In [17]:
# Follow-up turn ("and which ones talk about nature?"): the question names no
# artist, so it depends on the chat memory to carry over "los kjarkas".
print(chat_engine.chat("y cuales hablan de la naturaleza?"))

Added user message to memory: y cuales hablan de la naturaleza?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"canciones de los kjarkas sobre naturaleza"}
Got output: The songs by Kjarkas mentioned in the context reflect themes related to nature, specifically mountains, villages, and valleys. The lyrics describe elements like black braids, brown skin of mountains, and lost villages, portraying a connection to the natural world.

Las canciones de Los Kjarkas que hablan sobre la naturaleza incluyen temas relacionados con montañas, pueblos y valles. Las letras describen elementos como trenzas negras, piel marrón de las montañas y pueblos perdidos, mostrando una conexión con el mundo natural.
