In [1]:
import pandas as pd
import numpy as np
import time
import codecs
import seaborn as sns
import swifter
from polyglot.detect import Detector
import requests
import json



In [2]:
# Build a sanitized copy of all_lyrics.csv once: the raw file is decoded as
# UTF-8 with errors='replace' (each undecodable byte becomes U+FFFD) and the
# decoded text is written to all_lyrics2.csv. Skipped when that copy exists.

try:
    # Only probing for existence; the context manager closes the handle at once
    # instead of leaving it open for the lifetime of the kernel.
    with open("data/all_lyrics2.csv", 'r') as fh:
        create_all_lyrics2 = False
except FileNotFoundError:
    create_all_lyrics2 = True

if create_all_lyrics2:
    with codecs.open('data/all_lyrics.csv', 'r', encoding='utf-8', errors='replace') as fdata:
        data = fdata.read()
    # 'with' guarantees the output file is flushed and closed even if write() fails.
    with open("data/all_lyrics2.csv", "w", encoding='utf-8') as f:
        f.write(data)

In [3]:
# Load the sanitized lyrics, drop rows that have no lyric, label missing
# languages as "Desconhecido" and keep only the first row of each LyricLink.

df_all_lyrics = (
    pd.read_csv('data/all_lyrics2.csv', encoding='utf8')
    .drop(columns=["Unnamed: 0"])
    .dropna(subset=["Lyric"])
    # assign() writes the filled column back explicitly. The previous
    # df["Language"].fillna(..., inplace=True) acted on a column selection,
    # which is deprecated and has no effect under pandas Copy-on-Write.
    .assign(Language=lambda frame: frame["Language"].fillna("Desconhecido"))
)

# cumcount() numbers the rows of each LyricLink group starting at 0, so
# comparing with 0 keeps the first occurrence of every link.
is_first_occurrence = df_all_lyrics.groupby(['LyricLink']).cumcount() == 0
df_all_lyrics = df_all_lyrics[is_first_occurrence].reset_index(drop=True)

In [5]:
# (rows, columns) of the lyrics frame at this point of the pipeline.
df_all_lyrics.shape

(390400, 3)

In [4]:
# Number of lyrics per value of the Language column.
df_all_lyrics["Language"].value_counts()

eng             201335
Desconhecido    154733
esp              10351
bra               7422
jpn               6908
out               4861
kor               1473
fra               1220
ita               1134
ale                860
hol                 83
en                  11
icon-cifra           9
Name: Language, dtype: int64

In [25]:
# Language detection with the Polyglot library.
# Only rows whose mapped language is "Desconhecido" or "bra" are kept as
# candidates for detection.

df_all_lyrics = df_all_lyrics[df_all_lyrics["Language"].isin(["Desconhecido", "bra"])]

def lang_detector(lyric):
    """Detect the languages of a lyric with Polyglot's ``Detector``.

    Parameters
    ----------
    lyric : str
        Text handed to ``Detector``.

    Returns
    -------
    dict
        Maps each detected language code to its confidence, keeping only
        languages whose confidence is greater than 0. If the detector raises,
        the entries collected before the failure are returned (an empty dict
        when it fails up front); the error is not propagated.
    """
    dict_result = {}
    try:
        detector = Detector(lyric, quiet=True)
        for language in detector.languages:
            if language.confidence > 0:
                dict_result[language.code] = language.confidence
    except Exception:
        # Best-effort: a lyric the detector cannot handle simply yields what was
        # gathered so far. Unlike the former `return` inside `finally`, this
        # lets KeyboardInterrupt / SystemExit propagate, so a long apply can
        # still be interrupted.
        pass
    return dict_result
    
# Only lyrics for which Portuguese ('pt') is the single detected language are kept.

start = time.time()

result = df_all_lyrics["Lyric"].swifter.apply(lang_detector)
only_portuguese = result.map(lambda langs: len(langs) == 1 and 'pt' in langs)
df_all_lyrics = df_all_lyrics[only_portuguese].reset_index(drop=True)

end = time.time()

# Wall-clock duration of the detection and filtering step.
print('\nTime(s): ' + ("%.2f" % (end - start)) + 's\n')

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=161768, style=ProgressStyle(description_wi…



Time(s): 12.16s



In [75]:
# Preview the first five Portuguese lyrics.
df_all_lyrics.head()

Unnamed: 0,Lyric,Language,LyricLink
0,[André Drake]. Minha princesa eu troco tudo pr...,Desconhecido,/andre-drake/bae-ft-v-p-rap.html
1,"[Refrão]. Mais views, pode ser?. Cê sabe, que ...",Desconhecido,/andre-drake/by-chance-brazilian-remix.html
2,"4 Paredes uma jaula, na minha mão uma faca. Um...",Desconhecido,/andre-drake/casa-do-terror.html
3,"[Refrão 1]. Na ""madruga"" não consigo dormir. m...",Desconhecido,/andre-drake/na-madrugada.html
4,[Vp Rap]. Vp de novo que convida. Dessa vez se...,Desconhecido,/andre-drake/v-p-convida-parte-2.html


In [77]:
# Load the frame that links each song to its artist and keep only the first
# row of every SLink.

df_all_songs_links = (
    pd.read_csv('data/all_songs_links.csv', encoding='utf8')
    .drop(columns=["Unnamed: 0"])
)

# cumcount() is 0 for the first row of each SLink group.
first_per_song = df_all_songs_links.groupby(['SLink']).cumcount() == 0
df_all_songs_links = df_all_songs_links[first_per_song].reset_index(drop=True)

df_all_songs_links.head()

Unnamed: 0,ALink,SName,SLink
0,/50-cent/,In da Club,/50-cent/in-da-club.html
1,/50-cent/,Candy Shop,/50-cent/candy-shop.html
2,/50-cent/,21 Questions,/50-cent/21-questions.html
3,/50-cent/,P.I.M.P.,/50-cent/p-i-m-p.html
4,/50-cent/,Window Shopper,/50-cent/window-shopper.html


In [79]:
# Join every song (artist link, song name, song link) with its lyric.
df_lyrics = df_all_songs_links.merge(df_all_lyrics, left_on='SLink', right_on='LyricLink')

# errors="ignore": no cell shown in this notebook creates "LanguageDetected",
# so a strict drop raises KeyError on a fresh Restart & Run All. The column is
# still removed whenever it is present.
df_lyrics = df_lyrics.drop(
    columns=["LyricLink", "LanguageDetected", "Language"],
    errors="ignore",
)
df_lyrics.head()

Unnamed: 0,ALink,SName,SLink,Lyric
0,/andre-drake/,Bae ft. V.P Rap,/andre-drake/bae-ft-v-p-rap.html,[André Drake]. Minha princesa eu troco tudo pr...
1,/andre-drake/,By Chance (Brazilian Remix),/andre-drake/by-chance-brazilian-remix.html,"[Refrão]. Mais views, pode ser?. Cê sabe, que ..."
2,/andre-drake/,Casa do Terror,/andre-drake/casa-do-terror.html,"4 Paredes uma jaula, na minha mão uma faca. Um..."
3,/andre-drake/,Na Madrugada,/andre-drake/na-madrugada.html,"[Refrão 1]. Na ""madruga"" não consigo dormir. m..."
4,/andre-drake/,V.P Convida Parte 2,/andre-drake/v-p-convida-parte-2.html,[Vp Rap]. Vp de novo que convida. Dessa vez se...


In [80]:
# Load the artist data and discard the Artist, Songs and Popularity columns.

df_artists = pd.read_csv('data/artists-data.csv', encoding='utf8').drop(
    columns=["Artist", "Songs", "Popularity"]
)

def genres_map(genres_str):
    """Turn a ';'-separated genre string into a ', '-separated one.

    Each fragment is stripped of surrounding whitespace and every
    "COLETÂNEA" entry is removed (previously only the first occurrence was
    removed, so a repeated entry survived).

    Parameters
    ----------
    genres_str : str
        Genres separated by ';'.

    Returns
    -------
    str
        The remaining genres joined by ", "; "" when nothing remains.
    """
    genres = [fragment.strip() for fragment in genres_str.split(";")]
    return ', '.join(genre for genre in genres if genre != "COLETÂNEA")

# Drop artists without genres, then normalise the Genres string.
# assign() builds a new frame, so the column is not written onto a filtered
# slice of the original (the pattern that triggers SettingWithCopyWarning).
df_artists = (
    df_artists
    .dropna(subset=["Genres"])
    .assign(Genres=lambda frame: frame["Genres"].map(genres_map))
)

df_artists.head()

Unnamed: 0,ParentGenre,Genres,Link
0,Hip Hop,"Hip Hop, Rap, Black Music",/50-cent/
1,Black Music,"Black Music, R&B, Hip Hop",/aaliyah/
2,Black Music,"Black Music, Rap, Hip Hop",/ace-hood/
3,Rap,"Rap, Black Music, Pop",/akon/
4,Black Music,"Black Music, Soul Music, R&B",/alicia-keys/


In [81]:
# Join the lyrics frame with the artists frame on the artist link; the
# duplicated key column "Link" is discarded afterwards.

df_lyrics = (
    df_lyrics
    .merge(df_artists, left_on='ALink', right_on='Link')
    .drop(columns=["Link"])
)

In [24]:
# Distinct artist links, re-indexed from 0.
serie_alinks = (
    df_lyrics['ALink']
    .drop_duplicates()
    .reset_index(drop=True)
)
serie_alinks.head()

0     /andre-drake/
1         /ao-cubo/
2      /artur-desh/
3    /baianasystem/
4       /banca-lps/
Name: ALink, dtype: object

In [16]:
# Fetch artist genres from the Vagalume API (used to fill the Genres_API column).
def get_genres(alink, max_retries=20, retry_delay=0.1, timeout=10):
    """Return the genres Vagalume lists for an artist as a ", "-joined string.

    Requests ``https://www.vagalume.com.br<alink>index.js``, reads the
    ``artist -> genre`` list from the JSON payload and joins the ``name`` of
    every entry.

    Parameters
    ----------
    alink : str
        Artist link, e.g. "/50-cent/".
    max_retries : int, optional
        Retries after the first failed attempt. The default of 20 allows up
        to 21 attempts, matching the limit that used to be hard-coded.
    retry_delay : float, optional
        Seconds slept between attempts (default 0.1, as before).
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``. Without one, a single
        stalled connection could block the whole run indefinitely; a timed-out
        attempt is retried like any other failure.

    Returns
    -------
    str
        Genre names joined by ", ". An empty string when every attempt
        failed, in which case the URL and the last exception are printed.
    """
    url = "https://www.vagalume.com.br" + alink + "index.js"
    last_error = None

    for attempt in range(max_retries + 1):
        try:
            result = requests.get(url, timeout=timeout)
            result_dict = json.loads(result.text)['artist']
            return ', '.join(genre_dict['name'] for genre_dict in result_dict['genre'])
        except Exception as e:
            # Any failure (network, invalid JSON, unexpected payload) is retried.
            last_error = e
            if attempt < max_retries:
                time.sleep(retry_delay)

    print("Erro em ", url)
    print(last_error)
    return ""
    

In [19]:
# Query the API once per distinct artist link and report the elapsed time.
start = time.time()
series_genres_api = serie_alinks.apply(get_genres)
end = time.time()

print('\nTime(s): ' + ("%.2f" % (end - start)) + 's\n')


Time(s): 1802.62s



In [26]:
# Pair every distinct artist link with the genres returned by the API, then
# attach those genres to each song row through the artist link.
df_alink_genre = pd.DataFrame({
    "Alink_2": serie_alinks,
    "Genres_API": series_genres_api,
})

df_lyrics = df_lyrics.merge(df_alink_genre, left_on='ALink', right_on='Alink_2')
df_lyrics = df_lyrics.drop(columns=["Alink_2"])

In [27]:
# Save the resulting dataframe as "lyrics.csv": ';' is the field separator
# and the index is not written.

df_lyrics.to_csv("lyrics.csv", sep = ';', index = False)

In [28]:
# Preview the first five rows of the final frame.
df_lyrics.head()

Unnamed: 0,ALink,SName,SLink,Lyric,ParentGenre,Genres,Genres_API
0,/andre-drake/,Bae ft. V.P Rap,/andre-drake/bae-ft-v-p-rap.html,[André Drake]. Minha princesa eu troco tudo pr...,Black Music,"Black Music, Hip Hop, Rap","Black Music, Hip Hop, Rap"
1,/andre-drake/,By Chance (Brazilian Remix),/andre-drake/by-chance-brazilian-remix.html,"[Refrão]. Mais views, pode ser?. Cê sabe, que ...",Black Music,"Black Music, Hip Hop, Rap","Black Music, Hip Hop, Rap"
2,/andre-drake/,Casa do Terror,/andre-drake/casa-do-terror.html,"4 Paredes uma jaula, na minha mão uma faca. Um...",Black Music,"Black Music, Hip Hop, Rap","Black Music, Hip Hop, Rap"
3,/andre-drake/,Na Madrugada,/andre-drake/na-madrugada.html,"[Refrão 1]. Na ""madruga"" não consigo dormir. m...",Black Music,"Black Music, Hip Hop, Rap","Black Music, Hip Hop, Rap"
4,/andre-drake/,V.P Convida Parte 2,/andre-drake/v-p-convida-parte-2.html,[Vp Rap]. Vp de novo que convida. Dessa vez se...,Black Music,"Black Music, Hip Hop, Rap","Black Music, Hip Hop, Rap"
