# Cifras.com.br Data Availability

In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

from bs4 import BeautifulSoup

options = Options()
options.add_argument("--headless")

driver = webdriver.Firefox(options=options)

url = "https://www.cifras.com.br/robots.txt"

driver.get(url)

soup = BeautifulSoup(driver.page_source, "lxml")

txt = soup.find("pre")

driver.quit()

print(txt)

<pre>User-agent: googlebot
User-agent: google
User-agent: bingbot
User-agent: bing
Disallow:

User-agent: *
Disallow:
Crawl-delay: 10

Sitemap: https://www.cifras.com.br/sitemap_index.xml.gz
</pre>


GREAT NEWS! Cifras.com.br allow us to scrape data according to their website by respecting a 10s interval between requests.

## Check if data is available (i.e. URL link exists)

First, build a function that checks if a URL is available following the sitemap information.

In [5]:
!zcat sitemap_index.xml.gz

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>https://www.cifras.com.br/sitemaps/artists-1.xml.gz</loc><lastmod>2023-12-07T02:33:52+00:00</lastmod></sitemap><sitemap><loc>https://www.cifras.com.br/sitemaps/artists-2.xml.gz</loc><lastmod>2023-12-07T02:33:52+00:00</lastmod></sitemap><sitemap><loc>https://www.cifras.com.br/sitemaps/artists-3.xml.gz</loc><lastmod>2023-12-07T02:33:52+00:00</lastmod></sitemap><sitemap><loc>https://www.cifras.com.br/sitemaps/artists-4.xml.gz</loc><lastmod>2023-12-07T02:33:52+00:00</lastmod></sitemap><sitemap><loc>https://www.cifras.com.br/sitemaps/top-songs-1.xml.gz</loc><lastmod>2023-12-07T02:33:52+00:00</lastmod></sitemap><sitemap><loc>https://www.cifras.com.br/sitemaps/songs-1.xml.gz</loc><lastmod>2023-12-07T02:33:52+00:00</lastmod></sitemap><sitemap><loc>https://www.cifras.com.br/sitemaps/songs-2.xml.gz</loc><lastmod>2023-12-07T02:33:52+00:00</lastmod></sitemap><sitemap><loc>https:/

In [13]:
import xml.etree.ElementTree as ET

def extract_links(xml_file):
    links = []
    
    # Parse the XML content
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    # Define the namespace
    namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    
    # Find all <loc> tags within the defined namespace
    loc_tags = root.findall('.//ns:loc', namespace)
    
    # Extract the text content of each <loc> tag (i.e., the links)
    links = [loc.text for loc in loc_tags]
    
    return links

# Extract links from the XML content
xml_file = 'sitemap_index.xml'
result_links = extract_links(xml_file)

# Print the extracted links
for link in result_links:
    print(link)


https://www.cifras.com.br/sitemaps/artists-1.xml.gz
https://www.cifras.com.br/sitemaps/artists-2.xml.gz
https://www.cifras.com.br/sitemaps/artists-3.xml.gz
https://www.cifras.com.br/sitemaps/artists-4.xml.gz
https://www.cifras.com.br/sitemaps/top-songs-1.xml.gz
https://www.cifras.com.br/sitemaps/songs-1.xml.gz
https://www.cifras.com.br/sitemaps/songs-2.xml.gz
https://www.cifras.com.br/sitemaps/songs-3.xml.gz
https://www.cifras.com.br/sitemaps/songs-4.xml.gz
https://www.cifras.com.br/sitemaps/songs-5.xml.gz
https://www.cifras.com.br/sitemaps/songs-6.xml.gz
https://www.cifras.com.br/sitemaps/songs-7.xml.gz
https://www.cifras.com.br/sitemaps/songs-8.xml.gz
https://www.cifras.com.br/sitemaps/songs-9.xml.gz
https://www.cifras.com.br/sitemaps/songs-10.xml.gz
https://www.cifras.com.br/sitemaps/songs-11.xml.gz
https://www.cifras.com.br/sitemaps/songs-12.xml.gz
https://www.cifras.com.br/sitemaps/songs-13.xml.gz
https://www.cifras.com.br/sitemaps/songs-14.xml.gz
https://www.cifras.com.br/sitemap

In [14]:
import subprocess
import os

download_folder = "sitemap"
os.makedirs(download_folder, exist_ok=True)

# Iterate over the extracted links and download each file into the specified folder
for link in result_links:
    subprocess.run(["wget", link, "-P", download_folder])


--2024-01-08 17:07:26--  https://www.cifras.com.br/sitemaps/artists-1.xml.gz
Resolving www.cifras.com.br (www.cifras.com.br)... 172.67.74.6, 104.26.7.6, 104.26.6.6
Connecting to www.cifras.com.br (www.cifras.com.br)|172.67.74.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 134814 (132K) [application/octet-stream]
Saving to: ‘sitemap/artists-1.xml.gz’

     0K .......... .......... .......... .......... .......... 37% 1.13M 0s
    50K .......... .......... .......... .......... .......... 75% 14.2M 0s
   100K .......... .......... .......... .                    100% 1.56M=0.07s

2024-01-08 17:07:27 (1.94 MB/s) - ‘sitemap/artists-1.xml.gz’ saved [134814/134814]

--2024-01-08 17:07:27--  https://www.cifras.com.br/sitemaps/artists-2.xml.gz
Resolving www.cifras.com.br (www.cifras.com.br)... 172.67.74.6, 104.26.6.6, 104.26.7.6
Connecting to www.cifras.com.br (www.cifras.com.br)|172.67.74.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 

HTTP request sent, awaiting response... 200 OK
Length: 319640 (312K) [application/octet-stream]
Saving to: ‘sitemap/songs-6.xml.gz’

     0K .......... .......... .......... .......... .......... 16% 2.17M 0s
    50K .......... .......... .......... .......... .......... 32% 2.50M 0s
   100K .......... .......... .......... .......... .......... 48% 1.03M 0s
   150K .......... .......... .......... .......... .......... 64% 16.2M 0s
   200K .......... .......... .......... .......... .......... 80% 24.1M 0s
   250K .......... .......... .......... .......... .......... 96% 4.31M 0s
   300K .......... ..                                         100% 42.4M=0.1s

2024-01-08 17:07:35 (2.87 MB/s) - ‘sitemap/songs-6.xml.gz’ saved [319640/319640]

--2024-01-08 17:07:35--  https://www.cifras.com.br/sitemaps/songs-7.xml.gz
Resolving www.cifras.com.br (www.cifras.com.br)... 104.26.6.6, 172.67.74.6, 104.26.7.6
Connecting to www.cifras.com.br (www.cifras.com.br)|104.26.6.6|:443... connected.
HTTP r

   150K .......... .......... .......... .......... .......... 61%  818K 0s
   200K .......... .......... .......... .......... .......... 77% 2.31M 0s
   250K .......... .......... .......... .......... .......... 92% 2.07M 0s
   300K .......... .......... ...                             100% 12.7M=0.3s

2024-01-08 17:07:44 (1.07 MB/s) - ‘sitemap/songs-14.xml.gz’ saved [331667/331667]

--2024-01-08 17:07:44--  https://www.cifras.com.br/sitemaps/songs-15.xml.gz
Resolving www.cifras.com.br (www.cifras.com.br)... 104.26.6.6, 104.26.7.6, 172.67.74.6
Connecting to www.cifras.com.br (www.cifras.com.br)|104.26.6.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 324600 (317K) [application/octet-stream]
Saving to: ‘sitemap/songs-15.xml.gz’

     0K .......... .......... .......... .......... .......... 15% 1.58M 0s
    50K .......... .......... .......... .......... .......... 31% 6.72M 0s
   100K .......... .......... .......... .......... .......... 47% 3.31M 0s
   

Resolving www.cifras.com.br (www.cifras.com.br)... 104.26.6.6, 104.26.7.6, 172.67.74.6
Connecting to www.cifras.com.br (www.cifras.com.br)|104.26.6.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 272199 (266K) [application/octet-stream]
Saving to: ‘sitemap/songs-23.xml.gz’

     0K .......... .......... .......... .......... .......... 18% 2.28M 0s
    50K .......... .......... .......... .......... .......... 37% 1.32M 0s
   100K .......... .......... .......... .......... .......... 56% 28.6M 0s
   150K .......... .......... .......... .......... .......... 75% 1.55M 0s
   200K .......... .......... .......... .......... .......... 94% 11.5M 0s
   250K .......... .....                                      100% 40.9M=0.1s

2024-01-08 17:07:50 (2.70 MB/s) - ‘sitemap/songs-23.xml.gz’ saved [272199/272199]

--2024-01-08 17:07:50--  https://www.cifras.com.br/sitemaps/songs-24.xml.gz
Resolving www.cifras.com.br (www.cifras.com.br)... 104.26.7.6, 172.67.74.6, 104

HTTP request sent, awaiting response... 200 OK
Length: 270844 (264K) [application/octet-stream]
Saving to: ‘sitemap/playlists-7.xml.gz’

     0K .......... .......... .......... .......... .......... 18% 2.16M 0s
    50K .......... .......... .......... .......... .......... 37% 1.23M 0s
   100K .......... .......... .......... .......... .......... 56% 14.1M 0s
   150K .......... .......... .......... .......... .......... 75% 1.11M 0s
   200K .......... .......... .......... .......... .......... 94% 21.6M 0s
   250K .......... ....                                       100% 30.9M=0.1s

2024-01-08 17:07:59 (2.30 MB/s) - ‘sitemap/playlists-7.xml.gz’ saved [270844/270844]

--2024-01-08 17:07:59--  https://www.cifras.com.br/sitemaps/playlists-8.xml.gz
Resolving www.cifras.com.br (www.cifras.com.br)... 104.26.7.6, 104.26.6.6, 172.67.74.6
Connecting to www.cifras.com.br (www.cifras.com.br)|104.26.7.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 269292 (263K) [

In [19]:
xml_files = os.listdir(download_folder)

In [20]:
xml_files[0][0:-3]

'artists-1.xml'

In [29]:
for f in xml_files:
    print("--------------------------------------------------")
    file = download_folder + "/" + f
    print(file)
    print()
    
    xml_content = subprocess.run(["zcat", file], capture_output=True, text=True).stdout

    save_file = file[:-3]
    with open(save_file, "w") as text_file:
        text_file.write(xml_content)

    print(f"File saved as: {save_file}")

--------------------------------------------------
sitemap/artists-1.xml.gz

File saved as: sitemap/artists-1.xml
--------------------------------------------------
sitemap/artists-2.xml.gz

File saved as: sitemap/artists-2.xml
--------------------------------------------------
sitemap/artists-3.xml.gz

File saved as: sitemap/artists-3.xml
--------------------------------------------------
sitemap/artists-4.xml.gz

File saved as: sitemap/artists-4.xml
--------------------------------------------------
sitemap/top-songs-1.xml.gz

File saved as: sitemap/top-songs-1.xml
--------------------------------------------------
sitemap/songs-1.xml.gz

File saved as: sitemap/songs-1.xml
--------------------------------------------------
sitemap/songs-2.xml.gz

File saved as: sitemap/songs-2.xml
--------------------------------------------------
sitemap/songs-3.xml.gz

File saved as: sitemap/songs-3.xml
--------------------------------------------------
sitemap/songs-4.xml.gz

File saved as: sitema

In [38]:
import requests

def check_links():
    #for i in range(24):
    i=3
    sitemap_songfile = download_folder + "/songs-" + str(i) + ".xml"

    tree = ET.parse(sitemap_songfile)
    root = tree.getroot()
   
    # Define the namespace
    namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    
    # Find all <loc> tags within the defined namespace
    loc_tags = root.findall('.//ns:loc', namespace)
    
    # Extract the text content of each <loc> tag (i.e., the links)
    links = [loc.text for loc in loc_tags]
    
    return links

count = 0
for link in check_links():
    print(link)
    count += 1
print(count)


https://www.cifras.com.br/cifra/jorge-e-mateus/cobertor-ainda-bem-pot-pourri
https://www.cifras.com.br/cifra/jorge-e-mateus/entao-valeu
https://www.cifras.com.br/cifra/jorge-e-mateus/anjo-de-amor-dois-pot-pourri
https://www.cifras.com.br/cifra/misc-traditional/banner-funf-schwane
https://www.cifras.com.br/cifra/ashe/real-love
https://www.cifras.com.br/cifra/the-wedding-present/skin-diving
https://www.cifras.com.br/tablatura-baixo/the-chats/southport-superman
https://www.cifras.com.br/cifra/pp/abrigo
https://www.cifras.com.br/cifra/ashe/save-myself
https://www.cifras.com.br/tablatura-baixo/first-aid-kit/ugly
https://www.cifras.com.br/cifra/desavanco/propaganda
https://www.cifras.com.br/cifra/desavanco/olhos
https://www.cifras.com.br/cifra/garoua/beijo-doce
https://www.cifras.com.br/cifra/chase-eagleson/magic
https://www.cifras.com.br/cifra/jaehyun-nct/forever-only
https://www.cifras.com.br/cifra/desavanco/cotovelo
https://www.cifras.com.br/cifra/takan-cavalo-branco/vem-forca-da-ayahuasc

In [49]:
test_url = "https://www.cifras.com.br/cifra/mestrinho/bom-dia"
aux = test_url[32:].split("/")
filename = aux[0] + "-" + aux[1]
print(filename)import requests

def check_links():
    #for i in range(24):
    i=3
    sitemap_songfile = download_folder + "/songs-" + str(i) + ".xml"

    tree = ET.parse(sitemap_songfile)
    root = tree.getroot()
   
    # Define the namespace
    namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    
    # Find all <loc> tags within the defined namespace
    loc_tags = root.findall('.//ns:loc', namespace)
    
    # Extract the text content of each <loc> tag (i.e., the links)
    links = [loc.text for loc in loc_tags]
    
    return links

count = 0
for link in check_links():
    print(link)
    count += 1
print(count)


mestrinho-bom-dia


In [5]:
import xml.etree.ElementTree as ET

download_folder = "sitemap"
sitemap_genres = download_folder + "/genres-1.xml"

tree = ET.parse(sitemap_genres)
root = tree.getroot()

# Define the namespace
namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

# Find all <loc> tags within the defined namespace
loc_tags = root.findall('.//ns:loc', namespace)

# Extract the text content of each <loc> tag (i.e., the links)
links = [loc.text for loc in loc_tags]
    
links


['https://www.cifras.com.br/genero/rockn-roll',
 'https://www.cifras.com.br/genero/mpb',
 'https://www.cifras.com.br/genero/reggae',
 'https://www.cifras.com.br/genero/forro',
 'https://www.cifras.com.br/genero/rock-alternativo',
 'https://www.cifras.com.br/genero/samba-e-pagode',
 'https://www.cifras.com.br/genero/country',
 'https://www.cifras.com.br/genero/sertanejo',
 'https://www.cifras.com.br/genero/lambada',
 'https://www.cifras.com.br/genero/raphip-hop',
 'https://www.cifras.com.br/genero/gospel',
 'https://www.cifras.com.br/genero/axe-music',
 'https://www.cifras.com.br/genero/brasil',
 'https://www.cifras.com.br/genero/australiano',
 'https://www.cifras.com.br/genero/bossa-nova',
 'https://www.cifras.com.br/genero/latinas',
 'https://www.cifras.com.br/genero/besteirol',
 'https://www.cifras.com.br/genero/brega',
 'https://www.cifras.com.br/genero/dance-music',
 'https://www.cifras.com.br/genero/disco',
 'https://www.cifras.com.br/genero/especial-de-natal',
 'https://www.cifra

In [6]:
import xml.etree.ElementTree as ET

download_folder = "sitemap"
sitemap_genres = download_folder + "/playlists-1.xml"

tree = ET.parse(sitemap_genres)
root = tree.getroot()

# Define the namespace
namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

# Find all <loc> tags within the defined namespace
loc_tags = root.findall('.//ns:loc', namespace)

# Extract the text content of each <loc> tag (i.e., the links)
links = [loc.text for loc in loc_tags]
    
links


['https://www.cifras.com.br/perfil/2974857/playlist/brown-jackson_330596',
 'https://www.cifras.com.br/perfil/4069345/playlist/great-songs_330592',
 'https://www.cifras.com.br/perfil/4069341/playlist/louange_330591',
 'https://www.cifras.com.br/perfil/1856241/playlist/laura-pausini_330590',
 'https://www.cifras.com.br/perfil/4069333/playlist/3_330589',
 'https://www.cifras.com.br/perfil/4068876/playlist/favourites-1_330587',
 'https://www.cifras.com.br/perfil/3342087/playlist/cats-stuff_330586',
 'https://www.cifras.com.br/perfil/3682866/playlist/ukulele_330585',
 'https://www.cifras.com.br/perfil/3906409/playlist/fdo_330583',
 'https://www.cifras.com.br/perfil/1442948/playlist/traceys_330582',
 'https://www.cifras.com.br/perfil/3653100/playlist/markys-songs_330580',
 'https://www.cifras.com.br/perfil/4069225/playlist/louvor-ippa_330579',
 'https://www.cifras.com.br/perfil/3852597/playlist/neil-young_330578',
 'https://www.cifras.com.br/perfil/3852597/playlist/neil-young_330576',
 'htt

In [13]:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup

import xml.etree.ElementTree as ET

# capabilities = DesiredCapabilities.FIREFOX
# capabilities["pageLoadStrategy"] = "eager"

# driver = webdriver.Firefox(options=options, desired_capabilities=capabilities)


def scrape_cifra_genre(url):

    try:
        options = Options()
        options.add_argument("--headless")

        driver = webdriver.Firefox(options=options)

        driver.get(url)
        driver.implicitly_wait(10)

        soup = BeautifulSoup(driver.page_source, "lxml")

        cifrasoup = soup.find(class_="all-artists-list__content")
                
        driver.quit()
        
    except Exception as e:
        print(f"Scraping Error!!!!")
        print(e)
        return "ERROR!!"

    return cifrasoup

url = "https://www.cifras.com.br/genero/rockn-roll?page=2"
res = scrape_cifra_genre(url)
print(res)


<div class="all-artists-list__content"><a href="/baba-cosmica"><div class="all-artists-list__item"><img alt="Baba Cósmica" class="component-avatar lazy avatar--sm" data-src="https://www.letras.com.br/arquivos/fotos/artistas/thumb1/165/16483,195511.jpg" onerror="this.src = '/assets/img/svg/artist-no-photo.svg'" src="/assets/img/svg/artist-no-photo.svg"/> <div class="artist-name"> Baba Cósmica </div></div></a> <a href="/baby-animals"><div class="all-artists-list__item"><img alt="Baby Animals" class="component-avatar lazy avatar--sm" data-src="https://www.cifras.com.br/assets/img/svg/artist-no-photo.svg" onerror="this.src = '/assets/img/svg/artist-no-photo.svg'" src="/assets/img/svg/artist-no-photo.svg"/> <div class="artist-name"> Baby Animals </div></div></a> <a href="/babys"><div class="all-artists-list__item"><img alt="Babys" class="component-avatar lazy avatar--sm" data-src="https://www.cifras.com.br/assets/img/svg/artist-no-photo.svg" onerror="this.src = '/assets/img/svg/artist-no-ph

In [18]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

def scrape_cifra_genre(url):
    try:
        options = Options()
        options.add_argument("--headless")

        driver = webdriver.Firefox(options=options)

        driver.get(url)
        driver.implicitly_wait(10)

        soup = BeautifulSoup(driver.page_source, "lxml")

        cifrasoup = soup.find(class_="all-artists-list__content")
        
        # Extract artist names using BeautifulSoup
        artist_elements = cifrasoup.find_all('div', class_='artist-name')
        artist_names = [artist.text.strip() for artist in artist_elements]
        print("Artist Names:", artist_names)
                
        driver.quit()
        
    except Exception as e:
        print(f"Scraping Error!!!!")
        print(e)
        return "ERROR!!"

    return cifrasoup

url = "https://www.cifras.com.br/genero/rockn-roll?page=2"
res = scrape_cifra_genre(url)
# You can use 'res' for further processing if needed


Artist Names: ['Baba Cósmica', 'Baby Animals', 'Babys', 'Babyshambles', 'Bachelor Girl', 'Bad Company', "Bad Joker's Cream", 'Bad Verona', 'Badlands', 'Baia & Rockboys', 'BaianaSystem', 'Bajaga i Instruktori', 'Band Geeks', 'Banda Artificio', 'Banda Caos', 'Banda Cartada', 'Banda Contra Ordem', 'Banda FETO', 'Banda Frida', 'Banda Genes', 'Banda Heiwa', 'Banda HOPE', 'Banda Ideia Livre', 'Banda Kauze', 'Banda Malta', 'Banda Marujo', 'Banda New Way', 'Banda Offdead', 'Banda Ozoorríveis', 'Banda S.F.C', 'Banda Shampoo', 'Banda SkyHawk', 'Banda Sn', 'Banda Start 07', 'Banda Vulgar', 'BandaTRI', 'Baranga', 'Barão Vermelho', 'Bardot', 'Barefoot Truth', 'Baris Manco', 'Barney Bentall', 'Baroness', 'Basca', 'Batisteriom', 'Battersea Park', 'Bay City Rollers', 'BB Brunes', "Be'lakor", 'Beat Crusaders', 'Beatallica', 'Beatrice', 'Beau Brummels', 'Beaverloop', 'Before Their Eyes', 'Behind Crimson Eyes', 'Being As An Ocean', 'Bel Aguiar', 'Belduinos', 'Bell X1', 'Bella Morte', 'Benji Davis Project

In [26]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

def scrape_cifra_genre(url):
    try:
        options = Options()
        options.add_argument("--headless")

        driver = webdriver.Firefox(options=options)

        driver.get(url)
        driver.implicitly_wait(10)

        soup = BeautifulSoup(driver.page_source, "lxml")

        cifrasoup = soup.find(class_="all-artists-list__content")
        
        # Extract artist names and href attributes using BeautifulSoup
        artists_info = []
        for artist_element in cifrasoup.find_all('a'):
            artist_name = artist_element.find('div', class_='artist-name').text.strip()
            artist_href = artist_element['href']  # Corrected this line
            artists_info.append({'name': artist_name, 'href': artist_href})

        print("Artists Information:", artists_info)
                
        driver.quit()
        
    except Exception as e:
        print(f"Scraping Error!!!!")
        print(e)
        return "ERROR!!"

    return cifrasoup

url = "https://www.cifras.com.br/genero/rockn-roll?page=2"
res = scrape_cifra_genre(url)
# You can use 'res' for further processing if needed


Artists Information: [{'name': 'Baba Cósmica', 'href': '/baba-cosmica'}, {'name': 'Baby Animals', 'href': '/baby-animals'}, {'name': 'Babys', 'href': '/babys'}, {'name': 'Babyshambles', 'href': '/babyshambles'}, {'name': 'Bachelor Girl', 'href': '/bachelor-girl'}, {'name': 'Bad Company', 'href': '/bad-company'}, {'name': "Bad Joker's Cream", 'href': '/bad-jokers-cream'}, {'name': 'Bad Verona', 'href': '/bad-verona'}, {'name': 'Badlands', 'href': '/badlands'}, {'name': 'Baia & Rockboys', 'href': '/baia-rockboys'}, {'name': 'BaianaSystem', 'href': '/baianasystem'}, {'name': 'Bajaga i Instruktori', 'href': '/bajaga-i-instruktori'}, {'name': 'Band Geeks', 'href': '/band-geeks'}, {'name': 'Banda Artificio', 'href': '/banda-artificio'}, {'name': 'Banda Caos', 'href': '/banda-caos'}, {'name': 'Banda Cartada', 'href': '/banda-cartada'}, {'name': 'Banda Contra Ordem', 'href': '/banda-contra-ordem'}, {'name': 'Banda FETO', 'href': '/banda-feto'}, {'name': 'Banda Frida', 'href': '/banda-frida'}, 

In [1]:
# Get the number of pages genres have
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

def get_genre_num_pages(url):
    try:
        options = Options()
        options.add_argument("--headless")

        driver = webdriver.Firefox(options=options)

        driver.get(url)
        driver.implicitly_wait(10)

        soup = BeautifulSoup(driver.page_source, "lxml")

        cifrasoup = soup.find(class_="pagination")
        
        print(cifrasoup)
                
        driver.quit()
        
    except Exception as e:
        print(f"Scraping Error!!!!")
        print(e)
        return "ERROR!!"

    return cifrasoup

url = "https://www.cifras.com.br/genero/rockn-roll"
res = get_genre_num_pages(url)


<ul class="pagination"><li aria-disabled="true" aria-label="« Anterior" class="page-item disabled"><span aria-hidden="true" class="page-link">‹</span></li> <li aria-current="page" class="page-item active"><span class="page-link">1</span></li> <li class="page-item"><a class="page-link" href="https://www.cifras.com.br/genero/rockn-roll?page=2">2</a></li> <li class="page-item"><a class="page-link" href="https://www.cifras.com.br/genero/rockn-roll?page=3">3</a></li> <li class="page-item"><a class="page-link" href="https://www.cifras.com.br/genero/rockn-roll?page=4">4</a></li> <li class="page-item"><a class="page-link" href="https://www.cifras.com.br/genero/rockn-roll?page=5">5</a></li> <li class="page-item"><a class="page-link" href="https://www.cifras.com.br/genero/rockn-roll?page=6">6</a></li> <li class="page-item"><a class="page-link" href="https://www.cifras.com.br/genero/rockn-roll?page=7">7</a></li> <li class="page-item"><a class="page-link" href="https://www.cifras.com.br/genero/roc

In [9]:
# Get the number of pages genres have
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

def get_genre_num_pages(url):
    max_pages = 1
    try:
        options = Options()
        options.add_argument("--headless")

        driver = webdriver.Firefox(options=options)

        driver.get(url)
        driver.implicitly_wait(10)

        soup = BeautifulSoup(driver.page_source, "lxml")

        cifrasoup = soup.find(class_="pagination")
        
        for pages in cifrasoup.find_all('li'):
            i_page = pages.find('a', class_='page-link')
            if i_page:
                i_page = i_page.text.strip()
                max_pages = int(i_page) if i_page.isnumeric() else max_pages
                
        driver.quit()
        
    except Exception as e:
        print(f"Scraping Error!!!!")
        print(e)
        return "ERROR!!"

    return max_pages

url = "https://www.cifras.com.br/genero/gospel"
res = get_genre_num_pages(url)
res

59

In [33]:
# Get the number of pages genres have
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

def get_genre_num_pages(url):
    max_pages = 1
    try:
        options = Options()
        options.add_argument("--headless")

        driver = webdriver.Firefox(options=options)

        driver.get(url)
        driver.implicitly_wait(10)

        soup = BeautifulSoup(driver.page_source, "lxml")

        cifrasoup = soup.find(class_="pagination")
        
        if cifrasoup:
            for pages in cifrasoup.find_all('li'):
                i_page = pages.find('a', class_='page-link')
                if i_page:
                    i_page = i_page.text.strip()
                    max_pages = int(i_page) if i_page.isnumeric() else max_pages
            
        driver.quit()
        
    except Exception as e:
        print(f"Scraping Error!!!!")
        print(e)
        return "ERROR!!"

    return max_pages

url = "https://www.cifras.com.br/genero/lambada"
res = get_genre_num_pages(url)
res

1