# **LSTM Trained on Lyrics from the 1960s, 1970s, 2020 and 2021**



## **0. File Preparation**

### **0.1 Requirements**

In [None]:
!pip install laserembeddings
!python -m laserembeddings download-models

Collecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting subword-nmt<0.4.0,>=0.3.6
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting transliterate==1.10.2
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 2.8 MB/s 
Collecting sacremoses==0.0.35
  Downloading sacremoses-0.0.35.tar.gz (859 kB)
[K     |████████████████████████████████| 859 kB 19.7 MB/s 
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.35-py3-none-any.whl size=883989 sha256=5ab4cd9ffd7ab1a504b830d105ae07af701648f0081f64cf33474f8ed4fce73c
  Stored in directory: /root/.cache/pip/wheels/d1/ff/0e/e00ff1e22100702ac8b24e709551ae0fb29db9ffc843510a64
Successfully built sacremoses
Installing collected packages: mock, translit

### **0.2 Imports**

In [None]:
import pandas as pd
import numpy as np
import random
from random import sample

#Text Processing
import string
import re

#Modeling
#from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

#Neural Networks
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input, Concatenate, BatchNormalization
from tensorflow.keras.models import Model, Sequential

# Reshaping datasets to tensors
import tensorflow as tf
from tensorflow.keras.utils import to_categorical


#for Colab file dealing
import glob
#You can mount your Google Drive files by running the following code snippet
from google.colab import drive
drive.mount('/content/gdrive') # Now all files in: /content/gdrive/My Drive/location_of_the_file
from os import listdir
from os.path import isfile, join

Mounted at /content/gdrive


In [None]:
#Laser
from laserembeddings import Laser

### **0.3 Functions**

#### **0.3.1 For Text Processing**

In [None]:
def tweet_preprocessing(text_data):
    """Normalize raw tweets into plain, space-separated text.

    For every tweet: drops '#', replaces links with 'URL', replaces retweet
    prefixes ('RT @name:') and mentions ('@name') with 'USER', turns every
    non-word character and underscore into a space, collapses whitespace and
    joins digits that are separated only by whitespace.

    Args:
        text_data: iterable of tweet strings.

    Returns:
        List of cleaned strings, in the same order as the input.
    """
    preprocessed_texts = []
    for text in text_data:
        # hashtags -> words
        text = text.replace('#', '')
        # URLs -> URL. The www branch must consume NON-space characters;
        # the previous pattern only matched "www." followed by whitespace.
        text = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', 'URL', text)
        # Retweet prefix first: once the bare mention is rewritten there is
        # no '@' left for the 'RT @name:' pattern to match.
        text = re.sub(r'RT @[A-Za-z0-9_-]+:', 'USER', text)
        text = re.sub(r'@[A-Za-z0-9_-]+', 'USER', text)
        # symbols and underscores -> space ('_' is a word char, so list it explicitly)
        text = re.sub(r'[\W_]', ' ', text)
        text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs
        text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)  # remove spaces between numbers
        preprocessed_texts.append(text)

    return preprocessed_texts

In [None]:
def lyrics_preprocessing(text_data):
    """Clean lyric paragraphs before they are embedded.

    Each item is converted to ``str`` and then, in order:
      1. bracketed tags such as ``[Chorus]`` are deleted together with their content,
      2. the Genius "EmbedShare URLCopyEmbedCopy" watermark is deleted,
      3. every non-word character and underscore becomes a space,
      4. whitespace runs are collapsed and the result is stripped.

    Args:
        text_data: iterable of paragraphs (any type; ``str()`` is applied).

    Returns:
        List of cleaned strings, in the same order as the input.
    """
    preprocessed_texts = []
    for text in text_data:
        text = str(text).strip()

        # Delete everything between square brackets. This has to run before the
        # symbol stripping below: once the brackets are gone the tag text can no
        # longer be told apart from the lyrics.
        text = re.sub(r'\[.*?\]', '', text)

        # Get rid of Genius watermarks (with or without separators in between)
        text = re.sub(r'EmbedShare[\W_]*URLCopyEmbedCopy', '', text)

        text = re.sub(r'[\W_]', ' ', text)  # symbols, '-', '_', '!', '?', stray brackets
        text = re.sub(r'\s+', ' ', text).strip()  # collapse spaces, drop edge padding

        preprocessed_texts.append(text)

    return preprocessed_texts

In [None]:
def get_paragraphs_preprocessed (Files, mypath, df):
  """Split lyric files into paragraphs and store them in ``df``.

  Every file is read whole and split on blank lines ("\\n\\n"). When the first
  line of a paragraph contains '[', that line is dropped and the remaining
  lines are kept. One row is produced per paragraph, including empty ones.

  Args:
    Files: list of file names located inside ``mypath``.
    mypath: directory that holds the files.
    df: DataFrame that receives the 'title' and 'paragraph' columns
      (modified in place).

  Returns:
    The same ``df``, with 'title' set to the source file name and
    'paragraph' set to the paragraph text.
  """
  titles = []
  paragraphs = []
  for file_name in Files:
    # Context manager so the handle is closed even if reading fails;
    # the previous version never closed the files it opened.
    with open(mypath+'/'+file_name, 'r') as f:
      data = f.read()

    for block in data.split("\n\n"):
      lines = block.split("\n")
      if '[' in lines[0]:
        # First line contains a bracket (e.g. "[Verse 1]"): drop that line
        block = "\n".join(lines[1:])

      titles.append(file_name)
      paragraphs.append(block)

  df['title'] = titles
  df['paragraph'] = paragraphs

  return df



#### **0.3.2 For Model Evaluation**

In [None]:
# f1 evaluation
def f1(y_true, y_pred):
    """Soft F1 metric computed on the flattened tensors.

    Returns 2 * (sum(y_true * y_pred) + eps) / (sum(y_true) + sum(y_pred) + eps),
    where eps is the Keras backend epsilon.
    """
    truth = K.flatten(y_true)
    pred = K.flatten(y_pred)
    overlap = K.sum(truth * pred)
    total = K.sum(truth) + K.sum(pred)
    return 2 * (overlap + K.epsilon()) / (total + K.epsilon())

#### **0.3.3 For Labeling**

In [None]:
def del_labeled(list_files, list_titles):
  """Remove from ``list_files`` every file name that appears in ``list_titles``.

  The list is filtered in one pass instead of calling ``remove`` while
  iterating, which skipped the element following each removed one and so
  left some already-labeled files in the result.

  Args:
    list_files: list of file names; updated in place.
    list_titles: iterable of titles that are already labeled.

  Returns:
    The same ``list_files`` object, without the labeled entries.
  """
  labeled = set(list_titles)
  # Slice assignment keeps the caller's list object, as the original did
  list_files[:] = [fl for fl in list_files if fl not in labeled]
  return list_files

In [None]:
def labeling (l_embeddings, df, classifier=None):
  """Predict a label for every embedding and attach the results to ``df``.

  The embeddings are reshaped to (-1, 1, 1024) and passed to ``predict``.
  In each prediction the first value is read as the probability of class 0
  and the second as the probability of class 1; the label is 0 only when the
  first value is strictly larger.

  Args:
    l_embeddings: embeddings, one per row of ``df``.
    df: DataFrame that receives the columns 'label', 'label probability',
      'probability_sexist' and 'probability_NOT_sexist' (added in place).
    classifier: object exposing ``predict``. When omitted, the notebook-level
      ``model`` variable is used, as before.

  Returns:
    A copy of ``df`` sorted by 'probability_sexist' in descending order.
  """
  if classifier is None:
    classifier = model  # notebook-level global set by the training cell

  Xnew = tf.reshape(l_embeddings, [-1, 1, 1024])

  # float64 matches the dtype the former element-wise np.append produced
  probs = np.asarray(classifier.predict(Xnew), dtype=float)

  # The first value of the prediction is for class 0 and the second for class 1
  p_not_sxist = probs[:, 0, 0]
  psxist = probs[:, 0, 1]
  is_class0 = p_not_sxist > psxist

  # Vectorized replacement for the per-row loop (np.append copies the whole
  # array on every call, which made the loop quadratic)
  df['label'] = np.where(is_class0, 0, 1).astype('int')
  df['label probability'] = np.where(is_class0, p_not_sxist, psxist)
  df['probability_sexist'] = psxist
  df['probability_NOT_sexist'] = p_not_sxist

  df = df.sort_values('probability_sexist', ascending=False)

  return df

## **1. Dataset**

In [None]:
# Paragraphs to train and test
labeled_2021 = '/content/gdrive/My Drive/predicted_2021.csv' #365
labeled_60s = '/content/gdrive/My Drive/predicted_60s.csv' #665

l2021_df = pd.read_csv(labeled_2021)
l60s_df = pd.read_csv(labeled_60s)

# Name of the manually assigned label column
label_col = 'true_label (0,1 or NA)'

#dataframe to be used
tdf = pd.concat([l2021_df, l60s_df])
tdf = tdf.dropna(subset=[label_col])  # keep only rows that have a label
tdf = tdf.replace([1.0, 0.0],[1,0])
# Drop rows marked 'NAP'; .copy() so the assignment below writes to an
# independent frame instead of a slice of the concatenated one
tdf = tdf[(tdf[label_col] != 'NAP')].copy()
# Convert the labels to numbers (a former standalone pd.to_numeric call whose
# result was discarded has been removed)
tdf[label_col] = pd.to_numeric(tdf[label_col])

## **2. Lyrics to be Labeled**

In [None]:
# Lyrics to be labeled: one Drive folder per decade
mypath80s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/1980-1989'
mypath90s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/1990-1999'
mypath00s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/2000-2009'
mypath10s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/2010-2019'
mypath21s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/2020-2021'

mypaths = [mypath80s, mypath90s, mypath00s, mypath10s, mypath21s]

# Regular files found in each folder, in the same order as mypaths
Files80s, Files90s, Files00s, Files10s, Files21s = (
    [f for f in listdir(folder) if isfile(join(folder, f))]
    for folder in mypaths
)

# Eliminate songs that are already labeled (only the 2020-2021 folder is filtered)
titles = tdf['title'].tolist()
Files21s = del_labeled(Files21s, titles)

Files = [Files80s, Files90s, Files00s, Files10s, Files21s]

cols=['title', 'paragraph', 'label', 'decade']

# One frame per folder, tagged 1980, 1990, ... following the order of mypaths
lyrics_df_list = []
for step, (path, folder_files) in enumerate(zip(mypaths, Files)):
  folder_df = get_paragraphs_preprocessed(folder_files, path, pd.DataFrame(columns=cols))
  folder_df['decade'] = 1980 + 10 * step
  lyrics_df_list.append(folder_df)

lyrics_df = pd.concat(lyrics_df_list)

lyrics_df

Unnamed: 0,title,paragraph,label,decade
0,lyricstxtSin amor_Ivan.txt,Me bebo la penúltima cerveza\nSin respirar\nMe...,,1980
1,lyricstxtSin amor_Ivan.txt,No aguanto en casa solo\nSin nada que hacer\nH...,,1980
2,lyricstxtSin amor_Ivan.txt,"Sin, sin, sin amor\nMe siento libre pero algo ...",,1980
3,lyricstxtSin amor_Ivan.txt,"Sin, sin, sin amor\nMe siento libre pero algo ...",,1980
4,lyricstxtSin amor_Ivan.txt,"Sin, sin, sin amor\nMe siento libre pero algo ...",,1980
...,...,...,...,...
813,lyricstxtYo x Ti Tu x Mi_ROSALçA.txt,Colgando del cuello los juguete' (Del cuello l...,,2020
814,lyricstxtYo x Ti Tu x Mi_ROSALçA.txt,Somos dos cantantes como los de ante'\nEl resp...,,2020
815,lyricstxtYo x Ti Tu x Mi_ROSALçA.txt,"(Woh-oh, oh-oh)\nY yo por ti, tú por mí, ¿quié...",,2020
816,lyricstxtYo x Ti Tu x Mi_ROSALçA.txt,Somos dos cantantes como los de ante'\nEl resp...,,2020


## **3. LSTM Training**

### **3.1 Data Preparation**

In [None]:
laser = Laser() # LASER encoder instance; its embed_sentences() is used below to embed the paragraphs

In [None]:
# Labeled paragraphs: clean the text, then embed it
X_tobe_processed = tdf['paragraph']

# Text cleaning followed by LASER sentence embeddings (language code 'es')
X_processed = lyrics_preprocessing(X_tobe_processed)
X_embeddings = laser.embed_sentences(X_processed, lang = 'es')

# Target: the manually assigned label column
y = tdf['true_label (0,1 or NA)']

In [None]:
# Hold out 33% of the labeled paragraphs for testing; fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.33, random_state=42)

### **3.2 Modeling**

In [None]:
# KFOLD CROSS-VAL BASED ON: https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-k-fold-cross-validation-with-keras.md


# - - - - - TRAIN FEATURES - - - - -
X1_laser = tf.reshape(X_train, [-1, 1, 1024])

Y1 = to_categorical(y_train, 2)
Y1_reshaped = tf.reshape(Y1, [-1, 1, 2])

print('Train data shapes:',X1_laser.shape, Y1_reshaped.shape)

# - - - - - TEST FEATURES - - - - -
X2_laser = tf.reshape(X_test, [-1, 1, 1024])

Y2 = to_categorical(y_test, 2)
Y2_reshaped = tf.reshape(Y2, [-1, 1, 2])

print('Test data shapes:', X2_laser.shape, Y2_reshaped.shape)


# Merge both parts again: K-fold generates its own train/test partitions
inputs = np.concatenate((X1_laser, X2_laser), axis=0)
targets = np.concatenate((Y1_reshaped, Y2_reshaped), axis=0)

# Define per-fold score containers
acc_per_fold = []
f1_per_fold = []
auc_per_fold = []
loss_per_fold = []

num_folds = 10
# Define the K-fold Cross Validator (seeded so the folds are the same on every run)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# K-fold Cross Validation model evaluation.
# Each fold must train and evaluate on the indices produced by kfold.split;
# previously those indices were ignored and every iteration reused the fixed
# X1_laser / X2_laser split, so the ten "folds" measured the same hold-out set.
# Note: after the loop, `model` is the network trained in the last fold.
for fold_no, (train, test) in enumerate(kfold.split(inputs, targets), start=1):

  # Define the model architecture
  model = tf.keras.Sequential()
  model.add(LSTM(100, input_shape=(1, 1024), return_sequences=True))
  model.add(Dense(1024,activation='relu')) # MUST BE 2 hidden layers
  model.add(Dropout(0.5))
  model.add(Dense(128,activation='sigmoid'))
  # NOTE(review): a 2-unit sigmoid output with binary_crossentropy scores the two
  # classes independently; confirm this is intended rather than softmax.
  model.add(Dense(2, activation='sigmoid'))

  # Compile the model
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy'), f1, tf.keras.metrics.AUC(name='auc')])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Fit data to model using this fold's partition
  history = model.fit(inputs[train], targets[train], validation_data=(inputs[test], targets[test]), epochs=20, batch_size=100)


  # Generate generalization metrics on this fold's held-out part
  # scores = [loss, accuracy, f1, auc], following the compile() metric order
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'\nScore for fold {fold_no}: \n')
  print("Accuracy: %.2f%%" % (scores[1]*100))
  print("F1: %.2f%%" % (scores[2]*100))
  print("AUC: %.2f%%" % (scores[3]*100))
  print("Loss: %.4f" % (scores[0]))  # the loss is not a percentage
  print('\n------------------------------------------------------------------------\n')

  acc_per_fold.append(scores[1] * 100)
  f1_per_fold.append(scores[2] * 100)
  auc_per_fold.append(scores[3] * 100)
  loss_per_fold.append(scores[0])

Train data shapes: (690, 1, 1024) (690, 1, 2)
Test data shapes: (340, 1, 1024) (340, 1, 2)
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 1: 

Accuracy: 87.94%
F1: 86.80%
AUC: 94.97%
Loss: 0.38%

------------------------------------------------------------------------

------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 2: 

Accuracy: 87.35%
F1: 86.24%
AUC: 95.09%
Loss: 0.36%

-------------------------------

In [None]:
# == Per-fold scores followed by their averages ==
separator = '------------------------------------------------------------------------'

print(separator)
print('Score per fold')
fold_scores = zip(loss_per_fold, acc_per_fold, f1_per_fold, auc_per_fold)
for fold_idx, (fold_loss, fold_acc, fold_f1, fold_auc) in enumerate(fold_scores, start=1):
  print(separator)
  print(f'> Fold {fold_idx} - Loss: {fold_loss} - Accuracy: {fold_acc} - F1: {fold_f1} - AUC: {fold_auc}%')
print(separator)

# Mean over folds; the value after "+-" is the standard deviation
print('Average scores for all folds:')
print(f'> Loss: {np.mean(loss_per_fold)}')
for metric_name, metric_values in (('Accuracy', acc_per_fold), ('F1', f1_per_fold), ('AUC', auc_per_fold)):
  print(f'> {metric_name}: {np.mean(metric_values)} (+- {np.std(metric_values)})')

print(separator)

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.3822953701019287 - Accuracy: 87.94117569923401 - F1: 86.7958664894104 - AUC: 94.97491121292114%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.36340418457984924 - Accuracy: 87.35294342041016 - F1: 86.23794913291931 - AUC: 95.08649706840515%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.3360816240310669 - Accuracy: 87.64705657958984 - F1: 85.38581728935242 - AUC: 94.82179880142212%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.35372620820999146 - Accuracy: 87.64705657958984 - F1: 85.76995730400085 - AUC: 94.73832249641418%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.36780378222465515 - Accuracy: 87.64705657958984 - F1: 86.417883

## **4. Labeling**

In [None]:
# Rows drawn from each third of the ranking, and the seed that makes the draw repeatable
SAMPLES_PER_TERCILE = 50
SAMPLE_SEED = 42

decades = [1980,1990, 2000, 2010, 2020]
sample_sorted_decades = []
for decade in decades:

  decade_df = lyrics_df[lyrics_df['decade'] ==decade]
  # assign() and copy() build independent frames, so no column is written
  # onto a slice of lyrics_df (the cause of the SettingWithCopyWarning)
  decade_df = decade_df.assign(paragraph=decade_df['paragraph'].astype(str))
  decade_df = decade_df[decade_df['paragraph'] != ''].copy()
  if decade_df.empty:
    # Nothing to embed or label for this decade
    continue

  decade_df_processed = lyrics_preprocessing(decade_df['paragraph'])
  decade_df_embeddings = laser.embed_sentences(decade_df_processed, lang = 'es')
  # labeling() returns the frame sorted by 'probability_sexist', highest first
  decade_df = labeling(decade_df_embeddings, decade_df)

  # get 50 from top, 50 middle, 50 low.
  # Split row positions rather than the frame itself, and cap n at the size of
  # each third so a small decade does not make sample() raise.
  samples = []
  for positions in np.array_split(np.arange(len(decade_df)), 3):
    tercile = decade_df.iloc[positions]
    n_rows = min(SAMPLES_PER_TERCILE, len(tercile))
    samples.append(tercile.sample(n=n_rows, random_state=SAMPLE_SEED))

  decade_df = pd.concat(samples)

  sample_sorted_decades.append(decade_df)


final_df = pd.concat(sample_sorted_decades)
final_df = final_df.sort_values('probability_sexist', ascending=False)


final_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,title,paragraph,label,decade,label probability,probability_sexist,probability_NOT_sexist
312,lyricstxtRelacion_Sech.txt,Ella siempre estaba cuando tú no estabas\nFue ...,1,2020,0.990960,0.990960,0.007631
2676,lyricstxtScream and Shout_will.i.am Britney Sp...,Hi! I’m Tunechi – I give the girls my room key...,1,2010,0.990256,0.990256,0.008354
85,lyricstxtCon Calma_Daddy Yankee.txt,"Con calma, yo quiero ver como ella lo menea\nM...",1,2020,0.990128,0.990128,0.008479
808,lyricstxtAdicto with Anuel AA Ozuna_Tainy.txt,(¡Ozuna!)\nSoy adicto a tu' parte'\nMe hiciste...,1,2020,0.990053,0.990053,0.008563
6145,lyricstxtSi Se Da_Myke Towers.txt,"(Pri, yah, yah, yah, ¡Farru!)\nY si se da, bab...",1,2010,0.989980,0.989980,0.008639
...,...,...,...,...,...,...,...
2571,lyricstxten navidad_rosana.txt,Para que todos los días sean navidad\nPara que...,0,1990,0.995805,0.004170,0.995805
2145,lyricstxtHeaven for everyone_Queen.txt,This could be heaven\nThis could be heaven\nTh...,0,1990,0.995818,0.004159,0.995818
3249,lyricstxtOne Love to Give_Stephanie.txt,One love to give\nOne song to sing\nTwo hearts...,0,1980,0.995886,0.004100,0.995886
2399,lyricstxtstreets of love_the rolling stones.txt,"And I, I walk the streets of love\nAnd they're...",0,2000,0.995900,0.004096,0.995900


In [None]:
#final_df.to_csv('lyrics_Predicted_Round2.csv')