In [21]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
import collections
import os

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [23]:
# Identifier of the pretrained language-detection checkpoint. The tokenizer and
# the sequence-classification model are both loaded from this same identifier.
# NOTE(review): from_pretrained presumably downloads and caches the files on
# first use -- confirm the runtime has network access.
checkpoint = "papluca/xlm-roberta-base-language-detection"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [24]:
# Input table of filtered lyrics (tab-separated). The file can be found in the
# directory `data/intermediate`.
lyrics_tsv = "df_all_artists_filtered.tsv"
df_all_artists = pd.read_csv(lyrics_tsv, sep="\t")

# Show the last rows as a quick check that the load worked.
df_all_artists.tail()

Unnamed: 0,Artist,Title,Lyric
5606,Khalid,Young dumb,so you're still thinking of me just like i kno...
5607,Khalid,Khalid - Vertigo (Tradução Português),será que é melhor apenas acreditar nas teorias...
5608,Khalid,Better (Miles Away Remix),i'm not really drunk i never get that fucked u...
5609,Khalid,Khalid - Better (Official Music Video),users considering it's a virus or malware must...
5610,Khalid,Better (Rennie! Remix),love to see you shine in the night like the di...


In [25]:
# Only the leading characters of each lyric are handed to the language
# detector; the cut-off lives in one named constant.
MAX_LYRIC_CHARS = 200
sequences = df_all_artists['Lyric'].str.slice(stop=MAX_LYRIC_CHARS).tolist()
sequences[:5]

["one one one one one   talkin' in my sleep at night makin' myself crazy out of my mind out of my mind wrote it down and read it out hopin' it would save me too many times too many times  refrain my lov",
 "if you don't wanna see me   did a full 80 crazy thinking 'bout the way i was did the heartbreak change me maybe but look at where i ended up i'm all good already so moved on it's scary i'm not where y",
 "you call me all friendly tellin' me how much you miss me that's funny i guess you've heard my songs well i'm too busy for your business go find a girl who wants to listen 'cause if you think i was bor",
 "i know it's hot i know we've got something that money can't buy fighting in fits biting your lip loving 'til late in the night  pre tell me i'm too crazy you can't tame me can't tame me tell me i have",
 "i see the moon i see the moon i see the moon oh when you're looking at the sun i'm not a fool i'm not a fool not a fool no you're not fooling anyone  pre oh but when you're gone 

In [26]:
def generate_batch(lst, batch_size):
    """Yield consecutive slices of ``lst`` holding at most ``batch_size`` items.

    Args:
        lst: A sliceable sequence that supports ``len()`` (e.g. a list).
        batch_size: Maximum number of items per batch; must be a positive
            integer.

    Yields:
        ``lst[i : i + batch_size]`` for ``i = 0, batch_size, 2 * batch_size, ...``.
        The last batch is shorter when ``len(lst)`` is not a multiple of
        ``batch_size``; an empty ``lst`` yields nothing.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the check runs when iteration starts.
    """
    # A negative step makes range() empty and would silently drop every item,
    # so non-positive sizes are rejected explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(lst), batch_size):
        yield lst[start : start + batch_size]

In [27]:
# Each batch handed to the tokenizer is a plain list of strings, e.g.
#     ["It's such a wonderful world.", "Estoy muy contenta."]

batch_size = 50
input_batches = generate_batch(sequences, batch_size)
lyric_language = []

# Inference only: with autograd switched off no graph is recorded per batch,
# which keeps memory use flat across the whole loop.
with torch.no_grad():
    for batch in input_batches:
        # Pad to the longest item of the batch and truncate over-long inputs.
        model_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        output = model(**model_inputs)
        predictions = torch.nn.functional.softmax(output.logits, dim=-1)
        # Index of the highest-scoring class for every row of the batch.
        ranking = np.argmax(predictions.numpy(), axis=1)
        # One list of labels per batch; the following cell flattens them.
        lyric_language.append([model.config.id2label[int(i)] for i in ranking])

In [28]:
# Flatten the per-batch label lists into one label per lyric. Items that are
# already plain strings are kept as they are, so re-running this cell does not
# split labels such as 'en' into single characters.
lyric_language = [
    label
    for entry in lyric_language
    for label in ([entry] if isinstance(entry, str) else entry)
]
lyric_language[:5]

['en', 'en', 'en', 'en', 'en']

In [29]:
# Attach the predicted language label to every row. This relies on
# lyric_language holding exactly one label per row, in row order.
df_all_artists['lang'] = lyric_language
# Show the last rows to check the new column.
df_all_artists.tail()

Unnamed: 0,Artist,Title,Lyric,lang
5606,Khalid,Young dumb,so you're still thinking of me just like i kno...,en
5607,Khalid,Khalid - Vertigo (Tradução Português),será que é melhor apenas acreditar nas teorias...,pt
5608,Khalid,Better (Miles Away Remix),i'm not really drunk i never get that fucked u...,en
5609,Khalid,Khalid - Better (Official Music Video),users considering it's a virus or malware must...,en
5610,Khalid,Better (Rennie! Remix),love to see you shine in the night like the di...,en


In [34]:
# Count how often each predicted language label occurs across all lyrics.
collections.Counter(lyric_language)

Counter({'en': 4684,
         'sw': 656,
         'ja': 186,
         'fr': 5,
         'es': 23,
         'nl': 5,
         'th': 1,
         'ur': 24,
         'ru': 3,
         'tr': 9,
         'zh': 1,
         'pl': 2,
         'hi': 2,
         'de': 1,
         'el': 2,
         'it': 4,
         'pt': 3})

In [35]:
# Inspect the rows whose predicted label is 'es'.
is_es = df_all_artists['lang'].eq('es')
df_all_artists.loc[is_es]

Unnamed: 0,Artist,Title,Lyric,lang
207,Dua Lipa,Dua Lipa - Blow Your Mind (Mwah) (Traducción a...,verso se que hace calor sé que tenemos algo q...,es
208,Dua Lipa,Dua Lipa - Hotter Than Hell (Traducción al Esp...,ah yeah ah yeah verso el me llama el diablo ...,es
209,Dua Lipa,Dua Lipa - IDGAF (Traducción al Español),verso me llamas todo amigable dime cuanto me ...,es
606,Drake,Fire & Desire,dedicate i dedicate the world yeah oh i dedica...,es
1328,Billie Eilish,Eres Tú,hoy desperté con ganas de besarte tengo una se...,es
2302,Lady Gaga,Americano - Gregori Klosman Remix,interlude la la la la la la la la la la la la ...,es
2788,Beyoncé,Diva (Homecoming Live),beyoncé i'm aa diva hey i'm a i'm aa diva hey ...,es
2809,Beyoncé,Si Yo Fuera un Chico,letra de si yo fuera un chico verso si yo fu...,es
2818,Beyoncé,Amor Gitano,letra de amor gitano ft alejandro fernández v...,es
2853,Beyoncé,Oye,letra de oye oye lo que tengo que decir no p...,es


In [32]:
# Write the table, including the `lang` column, as pipe-delimited text and
# leave out the row index.
df_all_artists.to_csv('df_all_lyrics_with_lang.csv', sep='|', index=False)

In [36]:
# Shell out to print the first five lines of the exported file as a spot check.
! head -5 df_all_lyrics_with_lang.csv

Artist|Title|Lyric|lang
Dua Lipa|New Rules|one one one one one   talkin' in my sleep at night makin' myself crazy out of my mind out of my mind wrote it down and read it out hopin' it would save me too many times too many times  refrain my love he makes me feel like nobody else nobody else but my love he doesn't love me so i tell myself i tell myself  pre one don't pick up the phone you know he's only callin' 'cause he's drunk and alone two don't let him in you'll have to kick him out again three don't be his friend you know you're gonna wake up in his bed in the morning and if you're under him you ain't gettin' over him   i got new rules i count 'em i got new rules i count 'em i gotta tell them to myself i got new rules i count 'em i gotta tell them to myself   i keep pushin' forwards but he keeps pullin' me backwards nowhere to turn no way nowhere to turn no now i'm standin' back from it i finally see the pattern i never learn i never learn  refrain but my love he doesn't love me so 