<a href="https://colab.research.google.com/github/towardsai/ragbook-notebooks/blob/main/notebooks/Chapter%2005%20-%20LlamaIndex_Introduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## SETUP

In [20]:
# Load the IPython autoreload extension
%load_ext autoreload

# Mode 2: automatically reload all imported modules, so edits made to
# modules on disk are picked up without restarting the kernel
%autoreload 2

In [2]:
%pip install -q langchain openai python-dotenv pypdf deeplake langchain-community llama-index-vector-stores-deeplake
%pip install -q requests beautifulsoup4 llama-index faiss-cpu openai llama-index-vector-stores-faiss

Note: you may need to restart the kernel to use updated packages.


In [21]:
import os
import openai

from dotenv import load_dotenv

# Read variables from a local .env file (if one exists) into the environment
load_dotenv()

# os.environ['ACTIVELOOP_TOKEN']

# Register the OpenAI key with the client, or abort early when it is missing
api_key = os.environ.get("OPENAI_API_KEY")
if api_key:
    openai.api_key = api_key
else:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

In [22]:
import logging
import sys

# Send log records to stdout so they appear in the notebook output.
# Use level=logging.DEBUG for more verbose output, or level=logging.INFO
# for less detailed information.
# basicConfig already attaches a stdout StreamHandler to the root logger, so no
# extra handler is added (a second one would print every record twice).
# force=True replaces handlers that are already installed, which keeps the cell
# idempotent when it is re-run.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

## Imports

In [23]:
# Use the `llama_index.core` / integration-package import paths that the later
# cells of this notebook rely on; the legacy top-level `llama_index.*` paths are
# not available in llama-index >= 0.10.
import faiss

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.storage.storage_context import StorageContext
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.indices.composability import ComposableGraph
from llama_index.core.indices.keyword_table import KeywordTableIndex

  WikipediaReader = download_loader("WikipediaReader")




## Get data

In [1]:
import requests
from bs4 import BeautifulSoup
import time
from typing import List, Tuple
import json
import os

def get_song_urls(base_url: str, timeout: float = 10.0) -> List[str]:
    """Collect the share URL of every song listed on an artist page.

    Args:
        base_url: URL of the artist page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The non-empty ``data-shareurl`` attribute values of the
        ``li.songList-table-row.--song`` elements, in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection
    response = requests.get(base_url, timeout=timeout)
    # Surface HTTP errors instead of parsing an error page into an empty list
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    song_elements = soup.select('li.songList-table-row.--song')

    # Keep only the rows that actually carry a share URL
    share_urls = (element.get('data-shareurl') for element in song_elements)
    return [share_url for share_url in share_urls if share_url]

def get_lyrics_and_meaning(url: str, timeout: float = 10.0) -> Tuple[str, str]:
    """Fetch a song's lyrics and its written interpretation.

    The lyrics are read from ``div.lyric-original`` on the song page and the
    meaning from ``div.lyric-meaning`` on the ``significado.html`` sub-page.

    Args:
        url: URL of the song page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        ``(lyrics_text, meaning_text)``; either element is ``""`` when the
        corresponding element is not present on its page.
    """
    # Get lyrics
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    lyrics = soup.select_one('div.lyric-original')
    # separator="\n" keeps the text of adjacent elements on separate lines;
    # without it consecutive verses are glued together ("...ojosPor aquello...")
    lyrics_text = lyrics.get_text(separator="\n", strip=True) if lyrics else ""

    # Get meaning (lives on a sub-page of the song URL)
    meaning_url = url.rstrip('/') + '/significado.html'
    meaning_response = requests.get(meaning_url, timeout=timeout)
    meaning_soup = BeautifulSoup(meaning_response.text, 'html.parser')

    meaning = meaning_soup.select_one('div.lyric-meaning')
    meaning_text = meaning.get_text(separator="\n", strip=True) if meaning else ""

    return lyrics_text, meaning_text

def scrape_cerati_lyrics() -> List[Tuple[str, str]]:
    """Scrape lyrics and meanings for every song on the Gustavo Cerati page.

    Returns:
        One ``(lyrics, meaning)`` tuple per song for which at least one of the
        two texts is non-empty, in the order the songs are listed.
    """
    collected = []

    for song_url in get_song_urls("https://www.letras.com/gustavo-cerati/"):
        print(f"Scraping: {song_url}")
        song_lyrics, song_meaning = get_lyrics_and_meaning(song_url)
        # Skip songs for which neither text could be extracted
        if not (song_lyrics or song_meaning):
            time.sleep(1)  # Be nice to the server
            continue
        collected.append((song_lyrics, song_meaning))
        time.sleep(1)  # Be nice to the server

    return collected

def load_or_scrape_cerati_data(file_path='cerati_lyrics_and_meanings.json'):
    """Return the Cerati lyrics/meaning pairs, using a JSON file as a cache.

    Args:
        file_path: Path of the JSON cache file.

    Returns:
        The parsed JSON content when ``file_path`` exists; otherwise the
        freshly scraped result of ``scrape_cerati_lyrics()``.
    """
    if os.path.exists(file_path):
        print(f"Loading existing data from {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    print("Scraping Cerati lyrics and meanings...")
    cerati_data = scrape_cerati_lyrics()
    print(f"Scraped {len(cerati_data)} songs.")

    if not cerati_data:
        # Do not cache an empty result: the file would be loaded on every
        # later call and the scrape would never be retried.
        print(f"No songs scraped; not writing {file_path}")
        return cerati_data

    # Save the data to a file
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(cerati_data, f, ensure_ascii=False, indent=2)
    print(f"Data saved to {file_path}")

    return cerati_data

# Run the scraper
if __name__ == "__main__":
    # Reuse the cached JSON file when present; otherwise scrape and cache
    cerati_data = load_or_scrape_cerati_data()
    print(f"Collected data for {len(cerati_data)} songs.")

    # Preview the first 100 characters of the first entry as a sanity check
    if cerati_data:
        print("\nSample data:")
        print("Lyrics:", ("No lyrics" if not cerati_data[0][0] else f"{cerati_data[0][0][:100]}..."))
        print("Meaning:", ("No meaning" if not cerati_data[0][1] else f"{cerati_data[0][1][:100]}..."))

Loading existing data from cerati_lyrics_and_meanings.json
Collected data for 169 songs.

Sample data:
Lyrics: Por aquello que encontré en tus ojosPor aquello que perdí en la luchaConocer la otra mitad es pocoCo...
Meaning: La Búsqueda de la Plenitud en 'Vivo' de Gustavo CeratiLa canción 'Vivo' de Gustavo Cerati, una figur...


## FAISS

In [13]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document, Settings
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.storage.storage_context import StorageContext
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.indices.composability import ComposableGraph
from llama_index.core.indices.keyword_table import KeywordTableIndex
from llama_index.core.query_engine import MultiStepQueryEngine
from llama_index.core.readers.download import download_loader
import pickle
import os

# faiss is needed below to build the raw index; import it in this cell so the
# cell does not depend on an import made elsewhere in the notebook
import faiss

# Download and initialize the WikipediaReader
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()

# Fetch Gustavo Cerati's Wikipedia page
# NOTE: these documents are loaded but not added to any index in this cell
cerati_wiki_documents = loader.load_data(pages=['Gustavo Cerati'])

# Wrap every (lyrics, meaning) pair in two Documents tagged by type.
# cerati_data is expected to be available from the scraping cell.
lyrics_documents = []
meaning_documents = []
for lyrics, meaning in cerati_data:
    lyrics_doc = Document(text=lyrics, metadata={"type": "lyrics"})
    meaning_doc = Document(text=meaning, metadata={"type": "meaning"})
    lyrics_documents.append(lyrics_doc)
    meaning_documents.append(meaning_doc)

# Set up models
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-4o-mini")

# Update global settings so the indices below use these models
Settings.embed_model = embed_model
Settings.llm = llm

# Create FAISS index (flat L2 index sized to the embedding dimension)
d = 1536  # dimensionality of text-embedding-3-small
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Create VectorStoreIndex for lyrics, stored in the FAISS vector store
lyrics_index = VectorStoreIndex.from_documents(
    lyrics_documents,
    storage_context=storage_context
)

# Create KeywordTableIndex for meanings
meaning_index = KeywordTableIndex.from_documents(
    meaning_documents,
)

# Create a composable graph: a keyword-table root over the two child indices,
# each described by a one-line summary
graph = ComposableGraph.from_indices(
    KeywordTableIndex,
    [lyrics_index, meaning_index],
    index_summaries=["Lyrics of Gustavo Cerati songs", "Meanings of Gustavo Cerati songs"]
)

# Create a query engine over the graph
query_engine = graph.as_query_engine()

# Function to generate creative text
def generate_creative_text(prompt):
    """Ask the graph query engine for new Cerati-style lyrics.

    Args:
        prompt: Theme the new song should be inspired by; it is embedded
            verbatim in the instruction sent to ``query_engine``.

    Returns:
        Whatever ``query_engine.query`` returns for the assembled instruction.
    """
    instruction_parts = [
        "Using the style and themes of Gustavo Cerati's lyrics, and considering their meanings, ",
        f"create a new full length song lyrics in spanish inspired by the following prompt: {prompt}. ",
        "The response should be in the style of song lyrics. It should include an analysis of its meaning at the end",
    ]
    return query_engine.query("".join(instruction_parts))

# Example usage: send one sample prompt through generate_creative_text
# and print the response returned by the query engine
creative_prompt = "Cráneo abierto, el disco que no fue y la historia del ictus que me llevó. Dedicada a Cris, Topi, Morgui y Naho"
new_text = generate_creative_text(creative_prompt)
print(new_text)

In the shadows of a broken dream,  
Donde el tiempo se detuvo sin querer,  
Un eco de silencio en mi ser,  
Un susurro de recuerdos que no se pueden ver.  

En el laberinto de un cráneo abierto,  
Donde se esconde el misterio incierto,  
Se desvanecen los colores del ayer,  
En la danza eterna de un amanecer.  

El disco que no fue, la canción sin final,  
Un suspiro perdido en la inmensidad,  
La melodía que se desvaneció en el viento,  
En el eco de un lamento sin aliento.  

La historia del ictus que me llevó,  
Un latido que se perdió en la oscuridad,  
Un destello de luz en la penumbra,  
En el abrazo eterno de la eternidad.  

Dedicada a Cris, Topi, Morgui y Naho,  
En el lienzo de la vida que se despliega,  
En cada nota de este canto sincero,  
En la melodía eterna que el corazón entrega.  

Análisis: La canción habla de la experiencia de enfrentar la pérdida, la nostalgia y la esperanza a través de metáforas visuales y emocionales. Describe un viaje interno a través de la memo

## Tools

In [44]:
from llama_index.core import VectorStoreIndex, Document, Settings, ServiceContext
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.indices.keyword_table import KeywordTableIndex
from llama_index.core.query_engine import MultiStepQueryEngine
from llama_index.core.readers.download import download_loader

from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors.pydantic_selectors import PydanticSingleSelector
from llama_index.core.tools import QueryEngineTool, ToolMetadata
import pickle
import os


def load_or_scrape_cerati_data(file_path='cerati_lyrics_and_meanings.json'):
    """Return the Cerati lyrics/meaning pairs, using a JSON file as a cache.

    Args:
        file_path: Path of the JSON cache file.

    Returns:
        The parsed JSON content when ``file_path`` exists; otherwise the
        freshly scraped result of ``scrape_cerati_lyrics()``.
    """
    # json is not imported in this cell; import it locally so loading the
    # cache does not depend on another cell having been run first
    import json

    if os.path.exists(file_path):
        print(f"Loading existing data from {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    print("Scraping Cerati lyrics and meanings...")
    cerati_data = scrape_cerati_lyrics()
    print(f"Scraped {len(cerati_data)} songs.")

    if not cerati_data:
        # Do not cache an empty result: the file would be loaded on every
        # later call and the scrape would never be retried.
        print(f"No songs scraped; not writing {file_path}")
        return cerati_data

    # Save the data to a file
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(cerati_data, f, ensure_ascii=False, indent=2)
    print(f"Data saved to {file_path}")

    return cerati_data

def create_documents(cerati_data):
    """Turn (lyrics, meaning) pairs into two parallel lists of Documents.

    Args:
        cerati_data: Iterable of two-item ``(lyrics, meaning)`` entries.

    Returns:
        ``(lyrics_documents, meaning_documents)``: one Document per entry in
        each list, tagged with metadata ``{"type": "lyrics"}`` and
        ``{"type": "meaning"}`` respectively, in input order.
    """
    # Build both documents of a song together, in a single pass over the input
    paired = [
        (
            Document(text=song_lyrics, metadata={"type": "lyrics"}),
            Document(text=song_meaning, metadata={"type": "meaning"}),
        )
        for song_lyrics, song_meaning in cerati_data
    ]
    lyrics_documents = [lyrics_doc for lyrics_doc, _ in paired]
    meaning_documents = [meaning_doc for _, meaning_doc in paired]
    return lyrics_documents, meaning_documents

# Load or scrape Cerati data
cerati_data = load_or_scrape_cerati_data()

# Create documents
lyrics_documents, meaning_documents = create_documents(cerati_data)

# Download and initialize the WikipediaReader
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()
cerati_wiki_documents = loader.load_data(pages=['Gustavo Cerati', 'Soda Stereo'])

print(f"Number of lyrics documents: {len(lyrics_documents)}")
print(f"Number of meaning documents: {len(meaning_documents)}")
print(f"Number of Wikipedia documents: {len(cerati_wiki_documents)}")

# Set up models with the specified versions
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-4o-mini", temperature=0.7)

# Update global settings; the indices below pick the models up from here
Settings.embed_model = embed_model
Settings.llm = llm


# Create SimpleVectorStore
vector_store = SimpleVectorStore()
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Create VectorStoreIndex for lyrics and Wikipedia biography.
# No service_context is passed: that name is never defined in this notebook,
# and the models are already configured through the global Settings above.
lyrics_bio_index = VectorStoreIndex.from_documents(
    lyrics_documents + cerati_wiki_documents,
    storage_context=storage_context,
)

# Create KeywordTableIndex for meanings
meaning_index = KeywordTableIndex.from_documents(
    meaning_documents,
)

# Create query engine tools; the descriptions are what the router's selector
# is given to choose between them
lyrics_bio_tool = QueryEngineTool(
    query_engine=lyrics_bio_index.as_query_engine(similarity_top_k=20),
    metadata=ToolMetadata(
        name="lyrics_and_bio",
        description="Useful for questions about Gustavo Cerati's lyrics and biography"
    )
)

# NOTE(review): similarity_top_k is a vector-retrieval setting; confirm it has
# any effect on a KeywordTableIndex query engine
meaning_tool = QueryEngineTool(
    query_engine=meaning_index.as_query_engine(similarity_top_k=20),
    metadata=ToolMetadata(
        name="song_meanings",
        description="Useful for questions about the meanings of Gustavo Cerati's songs"
    )
)

# Create a router query engine that picks a single tool per query
router_query_engine = RouterQueryEngine(
    selector=PydanticSingleSelector.from_defaults(),
    query_engine_tools=[lyrics_bio_tool, meaning_tool]
)

# Function to generate autobiographical creative text
def generate_autobiographical_text(prompt):
    """Ask the router query engine for new Cerati-style lyrics.

    Args:
        prompt: Theme the new song should be inspired by; it is embedded
            verbatim in the instruction sent to ``router_query_engine``.

    Returns:
        Whatever ``router_query_engine.query`` returns for the instruction.
    """
    style_clause = "Using the style and themes of Gustavo Cerati's lyrics, and considering their meanings, "
    task_clause = f"create a new full length song lyrics in spanish inspired by the following prompt: {prompt}. "
    format_clause = "The response should be in the style of song lyrics. It should include an analysis of its meaning at the end"

    full_request = style_clause + task_clause + format_clause
    return router_query_engine.query(full_request)

# Example usage: send one sample prompt through generate_autobiographical_text
# and print the response returned by the router query engine
creative_prompt = "Cráneo abierto"
new_text = generate_autobiographical_text(creative_prompt)
print(new_text)

Loading existing data from cerati_lyrics_and_meanings.json


  WikipediaReader = download_loader("WikipediaReader")


Number of lyrics documents: 169
Number of meaning documents: 169
Number of Wikipedia documents: 2
**Cráneo Abierto**

(Verse 1)  
En la penumbra de un susurro,  
las sombras juegan en mi piel,  
un cráneo abierto, un laberinto,  
donde los ecos vuelven a nacer.  
Las ideas fluyen como ríos,  
en un torrente de luz y de fe,  
cada pensamiento es un destello,  
un reflejo de lo que no se ve.

(Chorus)  
Cráneo abierto, corazón sincero,  
buscando respuestas en el silencio.  
Las voces del alma, un canto eterno,  
en este viaje, soy el viajero.  
Cráneo abierto, sin miedo a caer,  
las verdades ocultas empiezan a arder.  
En la fragilidad de lo que soy,  
encuentro la fuerza, me vuelvo a crear.

(Verse 2)  
Las cicatrices cuentan historias,  
de amores perdidos y sueños de ayer,  
cada herida es una memoria,  
un mapa que guía mi amanecer.  
En la tormenta de mis pensamientos,  
las dudas flotan como un papel,  
pero en el caos, hallo el momento,  
donde el dolor se convierte en miel.

(C