# **LSTM Trained with a sample of lyrics from each decade**

## **0. File Preparation**

### **0.1 Requirements**

In [None]:
!pip install laserembeddings
!python -m laserembeddings download-models

Collecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting transliterate==1.10.2
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 1.9 MB/s 
[?25hCollecting subword-nmt<0.4.0,>=0.3.6
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting sacremoses==0.0.35
  Downloading sacremoses-0.0.35.tar.gz (859 kB)
[K     |████████████████████████████████| 859 kB 10.6 MB/s 
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.35-py3-none-any.whl size=883989 sha256=83c3934bc5b66db8530bec31c3ad57a454dfcc9d2bf937495e1899ed0bcda52b
  Stored in directory: /root/.cache/pip/wheels/d1/ff/0e/e00ff1e22100702ac8b24e709551ae0fb29db9ffc843510a64
Successfully built sacremoses
Installing collected packages: mock, tr

### **0.2 Imports**

In [None]:
import pandas as pd
import numpy as np
import random
from random import sample


#Shell command
from IPython.display import JSON
from google.colab import output
from subprocess import getoutput
import os

#Text Processing
import string
import re

#Modeling
#from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

#Neural Networks
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input, Concatenate, BatchNormalization
from tensorflow.keras.models import Model, Sequential

# Reshaping datasets to tensors
import tensorflow as tf
from tensorflow.keras.utils import to_categorical


#for Colab file dealing
import glob
#You can mount your Google Drive files by running the following code snippet
from google.colab import drive
drive.mount('/content/gdrive') # Now all files in: /content/gdrive/My Drive/location_of_the_file
from os import listdir
from os.path import isfile, join

Mounted at /content/gdrive


In [None]:
#Laser
from laserembeddings import Laser

### **0.3 Functions**

#### **0.3.1 For Text Processing**

In [None]:
def lyrics_preprocessing(text_data):
    """Clean raw lyric paragraphs before they are embedded.

    Each text goes through these steps, in order:
      1. every ``[...]`` span that sits on a single line is deleted together
         with its brackets (e.g. section tags such as ``[Chorus]``);
      2. any bracket character that is still left (unpaired) is deleted;
      3. ``_``, ``!``, ``?`` and ``-`` are each replaced by one space;
      4. the literal footers ``EmbedShare URLCopyEmbedCopy`` and
         ``EmbedShareURLCopyEmbedCopy`` are deleted.

    Args:
        text_data: iterable of strings.

    Returns:
        A list with one cleaned string per input element, in the same order.
    """
    preprocessed_texts = []
    for text in text_data:
        # Bracketed spans must be removed BEFORE stray brackets are stripped;
        # the other way round there is nothing left for this pattern to match.
        text = re.sub(r"\[.*?\]", "", text)
        text = re.sub(r"[\[\]]", "", text)

        # Punctuation that should act as a word separator.
        text = re.sub(r"[_!?\-]", " ", text)

        # NOTE(review): the original author flagged the first footer removal as
        # "not working" - confirm the exact footer text present in the data.
        text = re.sub(r"EmbedShare URLCopyEmbedCopy", "", text)
        text = re.sub(r"EmbedShareURLCopyEmbedCopy", "", text)

        preprocessed_texts.append(text)

    return preprocessed_texts

In [None]:
def get_paragraphs_preprocessed(Files, mypath, df):
  """Split lyric files into paragraphs and store them in ``df``.

  Every file is read in full and split on blank lines (``"\\n\\n"``). When the
  first line of a paragraph contains ``[``, that first line is dropped and the
  remaining lines are kept.

  Args:
      Files: sequence of file names located in ``mypath``.
      mypath: directory holding the files; joined to each name with ``'/'``.
      df: DataFrame that receives the result. Its ``'title'`` and
          ``'paragraph'`` columns are overwritten in place.

  Returns:
      The same ``df`` object, with one row per paragraph: ``'title'`` is the
      source file name and ``'paragraph'`` is the paragraph text.
  """
  titles = []
  paragraphs = []
  for file_name in Files:
    # Context manager guarantees the handle is closed even if read() fails.
    with open(mypath+'/'+file_name, 'r') as f:
      data = f.read()

    for paragraph in data.split("\n\n"):
      lines = paragraph.split("\n")

      # A '[' on the first line marks a header line; keep only what follows.
      if '[' in lines[0]:
        paragraph = "\n".join(lines[1:])

      titles.append(file_name)
      paragraphs.append(paragraph)

  df['title'] = titles
  df['paragraph'] = paragraphs

  return df



#### **0.3.2 For Model Evaluation**

In [None]:
# Soft F1 metric usable as a Keras metric
def f1(y_true, y_pred):
    """Return 2 * (sum(t * p) + eps) / (sum(t) + sum(p) + eps).

    Both tensors are flattened first, and the predictions are used as given
    (no thresholding is applied here). ``eps`` is ``K.epsilon()``.
    """
    truth = K.flatten(y_true)
    guess = K.flatten(y_pred)
    overlap = K.sum(truth * guess) + K.epsilon()
    total = K.sum(truth) + K.sum(guess) + K.epsilon()
    return 2 * overlap / total

#### **0.3.3 For Labeling**

In [None]:
def labeling(l_embeddings, df):
  """Predict a sexist / not-sexist label for each embedding and store it in ``df``.

  The embeddings are reshaped to ``[-1, 1, 1024]`` and passed to the
  module-level ``model`` (it is not a parameter, so whichever model is bound
  to that name when this function runs is the one used).

  Columns written to ``df`` (in place):
    * ``'sexist_label'``             - 0 when P(class 0) > P(class 1), else 1
    * ``'sexist_label probability'`` - probability of the chosen class
    * ``'probability_sexist'``       - second value of each prediction (class 1)
    * ``'probability_NOT_sexist'``   - first value of each prediction (class 0)

  Args:
      l_embeddings: embeddings reshapeable to ``[-1, 1, 1024]``.
      df: DataFrame with one row per embedding, in the same order.

  Returns:
      A copy of ``df`` sorted by ``'probability_sexist'``, highest first.
  """
  Xnew = tf.reshape(l_embeddings, [-1, 1, 1024])

  # The first value of each prediction is for class 0 and the second for class 1.
  # float64 keeps the column dtype the same as the former np.append-based loop.
  probs = np.asarray(model.predict(Xnew), dtype=np.float64)
  p_not_sexist = probs[:, 0, 0]
  p_sexist = probs[:, 0, 1]

  # Strict '>' so ties (and NaN comparisons) fall to class 1, as before.
  picks_class_0 = p_not_sexist > p_sexist

  df['sexist_label'] = np.where(picks_class_0, 0, 1).astype('int')
  df['sexist_label probability'] = np.where(picks_class_0, p_not_sexist, p_sexist)
  df['probability_sexist'] = p_sexist
  df['probability_NOT_sexist'] = p_not_sexist

  return df.sort_values('probability_sexist', ascending=False)

## **1. Dataset**

In [None]:
# Paragraphs to train and test (CSV files that carry a 'true_label (0,1 or NA)' column)
labeled_2021 = '/content/gdrive/My Drive/predicted_2021.csv'
labeled_60s = '/content/gdrive/My Drive/predicted_60s.csv'
labeled_round2 = '/content/gdrive/My Drive/lyrics_Predicted_Round2.csv'

l2021_df = pd.read_csv(labeled_2021)
l60s_df = pd.read_csv(labeled_60s)
lround2 = pd.read_csv(labeled_round2)
# Only this file has a 'decade' column; drop it so the three frames line up.
lround2 = lround2.drop(columns=['decade'])

# dataframe to be used
tdf = pd.concat([l2021_df, l60s_df, lround2])
# Keep only rows that have a true label, and discard the 'NAP' marker rows.
tdf = tdf.dropna(subset=['true_label (0,1 or NA)'])
tdf = tdf.replace([1.0, 0.0],[1,0])
# .copy() so the column assignment below writes to an independent frame
# instead of a filtered slice (avoids pandas' SettingWithCopyWarning).
tdf = tdf[(tdf['true_label (0,1 or NA)'] != 'NAP')].copy()
# A former extra pd.to_numeric(..., downcast='integer') call discarded its
# result and has been removed; this is the conversion that takes effect.
tdf['true_label (0,1 or NA)'] = pd.to_numeric(tdf['true_label (0,1 or NA)'])
tdf

Unnamed: 0,title,paragraph,label,"true_label (0,1 or NA)","racialized_person (0,1 or NA)",Reason,label probability,probability_sexist,probability_NOT_sexist
0,lyricstxtBailemos_Dani Fernandez.txt,Bailaremos\nBailaremos\nBailemos\nBailemos\nBa...,1.0,0,0,,9.897.588.491.439.810,9.897.588.491.439.810,11.161.846.108.734.600
1,lyricstxtNathy Peluso Bzrp Music Sessions Vol....,Motherfuckin' ladies dancin'\nMotherfu-Motherf...,1.0,1,0,motherhood-related,9.618.873.596.191.400,9.618.873.596.191.400,3.857.753.425.836.560
2,lyricstxtSafaera_Bad Bunny.txt,"Bla, bla, bla, bla, bla, bla\nEy, yo', yo', yo...",1.0,0,0,,9.564.121.961.593.620,9.564.121.961.593.620,4.455.895.721.912.380
3,lyricstxtIndeciso_Reik.txt,Victoria ella no es un secreto\nQue tú a mí me...,1.0,1,0,hypersexualization,9.481.527.805.328.360,9.481.527.805.328.360,5.226.750.299.334.520
5,lyricstxtLa Jeepeta _Nio Garcia.txt,"Arrebata'o, dando vuelta en la jeepeta (Dando ...",1.0,1,1,"body shaming, sexual harassment",9.465.652.704.238.890,9.465.652.704.238.890,54.083.433.002.233.500
...,...,...,...,...,...,...,...,...,...
760,lyricstxtbailamos_enrique iglesias.txt,Don't let the world in outside\nDon't let a mo...,0.0,0,0,,998.933.732.509.613,11.397.618.800.401.600,998.933.732.509.613
761,lyricstxtanybody seen my baby_The rolling ston...,We came to rock for Brooklyn\nAnd Queens\nAnd ...,0.0,0,0,,9.989.345.669.746.390,11.381.290.387.362.200,9.989.345.669.746.390
762,lyricstxtCalma _Pedro Cap.txt,Desde la isla del encanto\nFarru lanzai Pedro ...,0.0,0,0,,9.989.352.822.303.770,11.362.035.293.132.000,9.989.352.822.303.770
763,lyricstxtUptown Funk_Mark Ronson Bruno Mars.txt,"Doh\nDoh-doh-doh, doh-doh-doh, doh-doh\nDoh-do...",0.0,0,0,,9.989.357.590.675.350,1.135.141.239.501.530,9.989.357.590.675.350


In [None]:
#training = "C:/Users/Lau/Desktop/TFG/LYRICS_TFG/training_dataset.csv"
# Labeled subset taken from the round-2 file only.
# NOTE(review): training_df is not referenced by any later cell visible in this
# notebook (the model is trained from tdf) - confirm whether it is still needed.
training_df = lround2.copy()
training_df = training_df.dropna(subset=['true_label (0,1 or NA)'])
training_df = training_df.replace([1.0, 0.0],[1,0])
# A former extra pd.to_numeric(..., downcast='integer') call discarded its
# result and has been removed; this is the conversion that takes effect.
training_df['true_label (0,1 or NA)'] = pd.to_numeric(training_df['true_label (0,1 or NA)'])
training_df

Unnamed: 0,title,paragraph,label,"true_label (0,1 or NA)","racialized_person (0,1 or NA)",Reason,label probability,probability_sexist,probability_NOT_sexist
0,lyricstxtElla No Es Tuya _Rochy RD.txt,"Ella no e' tuya, te vendió sueño (Sí, porque c...",1.0,1.0,0.0,attribute stereotyoing,995.332.658.290.863,995.332.658.290.863,479.504.419.490.695
1,lyricstxtElla No Es Tuya _Rochy RD.txt,"Ella no e' tuya, te vendió sueño\nDice que no ...",1.0,1.0,0.0,attribute stereotyoing,9.952.580.332.756.040,9.952.580.332.756.040,4.873.421.508.818.860
2,lyricstxtAmanece_Anuel AA.txt,Y como Karol G en mi cama (Cama)\nComo Becky G...,1.0,1.0,0.0,"hypersexualization, paternalism, attribute ste...",9.944.848.418.235.770,9.944.848.418.235.770,5.661.717.616.021.630
3,lyricstxtMorado_J Balvin.txt,"Yo pedí un trago y ella la botella (Uh, uh, uh...",1.0,1.0,0.0,victim blaming,993.961.751.461.029,993.961.751.461.029,6.203.438.155.353.060
4,lyricstxtMorado_J Balvin.txt,Yo pedí un trago y ella la botella (Ah-ah)\nAb...,1.0,1.0,0.0,victim blaming,9.939.129.948.616.020,9.939.129.948.616.020,6.251.112.557.947.630
...,...,...,...,...,...,...,...,...,...
760,lyricstxtbailamos_enrique iglesias.txt,Don't let the world in outside\nDon't let a mo...,0.0,0.0,0.0,,998.933.732.509.613,11.397.618.800.401.600,998.933.732.509.613
761,lyricstxtanybody seen my baby_The rolling ston...,We came to rock for Brooklyn\nAnd Queens\nAnd ...,0.0,0.0,0.0,,9.989.345.669.746.390,11.381.290.387.362.200,9.989.345.669.746.390
762,lyricstxtCalma _Pedro Cap.txt,Desde la isla del encanto\nFarru lanzai Pedro ...,0.0,0.0,0.0,,9.989.352.822.303.770,11.362.035.293.132.000,9.989.352.822.303.770
763,lyricstxtUptown Funk_Mark Ronson Bruno Mars.txt,"Doh\nDoh-doh-doh, doh-doh-doh, doh-doh\nDoh-do...",0.0,0.0,0.0,,9.989.357.590.675.350,1.135.141.239.501.530,9.989.357.590.675.350


## **2. Lyrics to be Labeled**

In [None]:
"""# Lyrics to be labeled 
mypath60s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/1960-1969'
mypath70s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/1970-1979'
mypath80s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/1980-1989'
mypath90s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/1990-1999'
mypath00s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/2000-2009'
mypath10s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/2010-2019'
mypath20s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/2020-2021'

mypaths = [mypath60s, mypath70s, mypath80s, mypath90s, mypath00s, mypath10s, mypath20s]

Files60s = [f for f in listdir(mypath60s) if isfile(join(mypath60s, f))]
Files70s = [f for f in listdir(mypath70s) if isfile(join(mypath70s, f))]
Files80s = [f for f in listdir(mypath80s) if isfile(join(mypath80s, f))]
Files90s = [f for f in listdir(mypath90s) if isfile(join(mypath90s, f))]
Files00s = [f for f in listdir(mypath00s) if isfile(join(mypath00s, f))]
Files10s = [f for f in listdir(mypath10s) if isfile(join(mypath10s, f))]
Files20s = [f for f in listdir(mypath20s) if isfile(join(mypath20s, f))]

#Eliminate songs that are already labeled
titles = tdf['title'].tolist()

def del_labeled(list_files, list_titles):
  for fl in list_files:
    if fl in list_titles:
      list_files.remove(fl)
  return list_files

Files = [Files60s, Files70s, Files80s, Files90s, Files00s, Files10s, Files20s]
for i in Files: 
  i = del_labeled(i, titles)



cols=['title', 'paragraph', 'label']
lyfinal_df = pd.DataFrame(columns=cols)


i = 0
lyrics_df_list = []
for path in mypaths: 
  new_ly_df = pd.DataFrame(columns=cols)
  ldf = get_paragraphs_preprocessed(Files[i], path, new_ly_df)
  lyrics_df_list.append(ldf)
  i+=1

lyfinal_df = pd.concat(lyrics_df_list)"""


# Reload the previously exported table, discard the sexism-related outputs of
# the earlier run, and blank the label column so it can be predicted again.
lyfinal_df = (
    pd.read_csv('/content/gdrive/My Drive/DEF_labeled_df.csv')
    .drop(columns=[
        'Unnamed: 0',
        'sexist_label probability',
        'racialized_label',
        'probability_sexist',
        'probability_NOT_sexist',
    ])
    .assign(sexist_label='')
)
lyfinal_df

Unnamed: 0,title,paragraph,sexist_label,label_racialized,label_racialized probability,probability_racialized,probability_NOT_racialised
0,lyricstxtCant Feel My Face_The Weeknd.txt,I can't feel my face when I'm with you (I can'...,,0,0.569428,0.392769,0.569428
1,lyricstxtgrande_paolo vallesi.txt,"Paolo: Yo soy quien, se dormía en las clases d...",,1,0.498606,0.498606,0.450495
2,lyricstxtScatman_Scatman john.txt,I'm the Scatman\nSki-bi dibby dib yo da dub du...,,0,0.754303,0.199873,0.754303
3,lyricstxtDont Wanna Go Home_Jason Derulo.txt,I just met this sexy Haitian girl moving like ...,,0,0.733986,0.215519,0.733986
4,lyricstxtCon Calma_Daddy Yankee.txt,"Con calma, yo quiero ver como ella lo menea (C...",,0,0.713123,0.232265,0.713123
...,...,...,...,...,...,...,...
17402,lyricstxtCorazon de neon_OMD.txt,"Barcelona, Moscú, Casablanca\nBruselas, Madrid...",,0,0.643671,0.294289,0.643671
17403,lyricstxtHave you seen her_MC hammer.txt,I see her face and I can't let go\nShe's in my...,,0,0.705960,0.243503,0.705960
17404,lyricstxtHappy _Pharrell Williams.txt,(Because I'm happy)\nClap along if you feel li...,,0,0.744704,0.211503,0.744704
17405,lyricstxtMagic_Coldplay.txt,"And I don't, and I don't, and I don't, and I d...",,0,0.747309,0.207083,0.747309


## **3. LSTM Training**

### **3.1 Data Preparation**

In [None]:
# Encoder object whose embed_sentences() is used below to embed paragraphs.
laser = Laser() # instance of the LASER class used for embeddings extraction

In [None]:
# Build the model inputs (X) and targets (y) from the labeled dataframe `tdf`.
X_tobe_processed = tdf['paragraph']

# Clean the text, then embed each paragraph with LASER.
# NOTE(review): every paragraph is embedded with lang='es', while the displayed
# data also contains English lyrics - confirm this is intended.
X_processed = lyrics_preprocessing(X_tobe_processed)
X_embeddings = laser.embed_sentences(X_processed, lang = 'es')

# Target labels, taken from the same rows and in the same order as X_embeddings.
y = tdf['true_label (0,1 or NA)']


In [None]:
# Hold out 33% of the labeled paragraphs for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.33, random_state=42)

In [None]:
# Embed the paragraphs that still need a label.
# astype(str) turns any non-string cell (e.g. a missing value) into text so the
# regex-based preprocessing can run on every row.
lyfinal_df['paragraph'] = lyfinal_df['paragraph'].astype(str)
lyrics_processed = lyrics_preprocessing(lyfinal_df['paragraph'])
# NOTE(review): lang='es' is used for all rows, as for the training data - confirm.
lyrics_embeddings = laser.embed_sentences(lyrics_processed, lang = 'es')

### **3.2 Modeling**

In [None]:
# KFOLD CROSS-VAL BASED ON: https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-k-fold-cross-validation-with-keras.md
#
# The train and test parts of the earlier split are merged again and re-split
# by KFold, so that each fold trains on its own subset and is scored on the
# samples it has not seen.


# - - - - - TRAIN FEATURES - - - - -
# Each sample becomes a sequence of length 1 with 1024 features.
X1_laser = tf.reshape(X_train, [-1, 1, 1024])

Y1 = to_categorical(y_train, 2)
Y1_reshaped = tf.reshape(Y1, [-1, 1, 2])

print('Train data shapes:',X1_laser.shape, Y1_reshaped.shape)

# - - - - - TEST FEATURES - - - - -
X2_laser = tf.reshape(X_test, [-1, 1, 1024])

Y2 = to_categorical(y_test, 2)
Y2_reshaped = tf.reshape(Y2, [-1, 1, 2])

print('Test data shapes:', X2_laser.shape, Y2_reshaped.shape)


# Full labeled set; KFold draws its own train/test indices from it.
inputs = np.concatenate((X1_laser, X2_laser), axis=0)
targets = np.concatenate((Y1_reshaped, Y2_reshaped), axis=0)

# Define per-fold score containers
acc_per_fold = []
f1_per_fold = []
auc_per_fold = []
loss_per_fold = []

num_folds = 10
# Define the K-fold Cross Validator (seeded so the folds are the same on every run)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(inputs, targets):

  # Define the model architecture (a fresh model for every fold)
  model = tf.keras.Sequential()
  model.add(LSTM(100, input_shape=(1, 1024), return_sequences=True))
  model.add(Dense(1024,activation='relu')) # MUST BE 2 hidden layers
  model.add(Dropout(0.5))
  model.add(Dense(128,activation='sigmoid'))
  model.add(Dense(2, activation='sigmoid'))

  # Compile the model
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy'), f1, tf.keras.metrics.AUC(name='auc')])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Fit data to model.
  # Use THIS fold's indices: previously every fold was fitted on X1_laser and
  # scored on X2_laser, so all ten "folds" repeated the same single split.
  history = model.fit(inputs[train], targets[train], validation_data=(inputs[test], targets[test]), epochs=20, batch_size=100)


  # Generate generalization metrics on the samples held out for this fold.
  # scores = [loss, accuracy, f1, auc], following the order given to compile().
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'\nScore for fold {fold_no}: \n')
  print("Accuracy: %.2f%%" % (scores[1]*100))
  print("F1: %.2f%%" % (scores[2]*100))
  print("AUC: %.2f%%" % (scores[3]*100))
  # The loss is a raw value, not a percentage, so it is printed without '%'.
  print("Loss: %.4f" % (scores[0]))
  print('\n------------------------------------------------------------------------\n')

  acc_per_fold.append(scores[1] * 100)
  f1_per_fold.append(scores[2] * 100)
  auc_per_fold.append(scores[3] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

# After the loop, `model` is the model trained in the last fold; later cells
# that call model.predict() use that one.

Train data shapes: (1115, 1, 1024) (1115, 1, 2)
Test data shapes: (550, 1, 1024) (550, 1, 2)
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 1: 

Accuracy: 87.09%
F1: 84.66%
AUC: 92.26%
Loss: 0.44%

------------------------------------------------------------------------

------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 2: 

Accuracy: 86.55%
F1: 84.47%
AUC: 92.16%
Loss: 0.44%

-----------------------------

In [None]:
# == Report the score of every fold, then the mean (and std) over all folds ==
rule = '------------------------------------------------------------------------'

print(rule)
print('Score per fold')
for i, fold_acc in enumerate(acc_per_fold):
  fold_loss = loss_per_fold[i]
  fold_f1 = f1_per_fold[i]
  fold_auc = auc_per_fold[i]
  print(rule)
  print(f'> Fold {i+1} - Loss: {fold_loss} - Accuracy: {fold_acc} - F1: {fold_f1} - AUC: {fold_auc}%')
print(rule)

print('Average scores for all folds:')
print(f'> Loss: {np.mean(loss_per_fold)}')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> F1: {np.mean(f1_per_fold)} (+- {np.std(f1_per_fold)})')
print(f'> AUC: {np.mean(auc_per_fold)} (+- {np.std(auc_per_fold)})')

print(rule)

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.43777212500572205 - Accuracy: 87.09090948104858 - F1: 84.66131091117859 - AUC: 92.25817918777466%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.4353945553302765 - Accuracy: 86.54545545578003 - F1: 84.46762561798096 - AUC: 92.16181635856628%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.5123358368873596 - Accuracy: 81.99999928474426 - F1: 81.3953161239624 - AUC: 90.70711135864258%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.43693333864212036 - Accuracy: 83.45454335212708 - F1: 82.2370171546936 - AUC: 91.65553450584412%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.4867892265319824 - Accuracy: 86.72727346420288 - F1: 85.38281321

In [None]:
#model.save('final_model_sexist.h5')

## **4. Labeling**

In [None]:
# Label every unlabeled paragraph. labeling() calls predict() on whatever
# `model` is bound at module level when this cell runs - after the training
# cell, that is the model built in the last iteration of its loop.
lyfinal_df = labeling(lyrics_embeddings, lyfinal_df)

In [None]:
# Order rows from most to least likely sexist and display the result.
# labeling() already returns the frame sorted by this column, so this mainly
# binds the result to the name final_df.
final_df = lyfinal_df.sort_values('probability_sexist', ascending=False)
final_df

Unnamed: 0,title,paragraph,sexist_label,label_racialized,label_racialized probability,probability_racialized,probability_NOT_racialised,sexist_label probability,probability_sexist,probability_NOT_sexist
0,lyricstxtCant Feel My Face_The Weeknd.txt,I can't feel my face when I'm with you (I can'...,1,0,0.569428,0.392769,0.569428,0.997119,0.997119,0.003082
1,lyricstxtgrande_paolo vallesi.txt,"Paolo: Yo soy quien, se dormía en las clases d...",1,1,0.498606,0.498606,0.450495,0.997040,0.997040,0.003022
3578,lyricstxtCant Feel My Face_The Weeknd.txt,I can't feel my face when I'm with you\nBut I ...,1,0,0.994051,0.004910,0.994051,0.996978,0.996978,0.003009
2,lyricstxtScatman_Scatman john.txt,I'm the Scatman\nSki-bi dibby dib yo da dub du...,1,0,0.754303,0.199873,0.754303,0.996891,0.996891,0.003088
17312,lyricstxtI Really Like You_Carly Rae Jepsen.txt,Who gave you eyes like that?\nSaid you could k...,1,0,0.865534,0.110881,0.865534,0.996621,0.996621,0.003286
...,...,...,...,...,...,...,...,...,...,...
17397,lyricstxtNobody told me_John Lennon.txt,Everybody's smoking and no one's getting high\...,0,0,0.854194,0.112793,0.854194,0.996889,0.003313,0.996889
17401,lyricstxtNumb_U2.txt,Don't struggle\nDon't jerk\nDon't collar\nDon'...,0,0,0.793198,0.163980,0.793198,0.996888,0.003306,0.996888
17403,lyricstxtHave you seen her_MC hammer.txt,I see her face and I can't let go\nShe's in my...,0,0,0.705960,0.243503,0.705960,0.996885,0.003299,0.996885
17404,lyricstxtHappy _Pharrell Williams.txt,(Because I'm happy)\nClap along if you feel li...,0,0,0.744704,0.211503,0.744704,0.996867,0.003258,0.996867


In [None]:
# Export the labeled table. The path is relative, so the file is written to the
# current working directory.
# NOTE(review): the input of the same name was read from
# '/content/gdrive/My Drive/' - confirm whether this file is meant to replace it.
# The index is written as well; it comes back as the 'Unnamed: 0' column that
# the loading cell drops.
final_df.to_csv('DEF_labeled_df.csv')