# **LSTM Trained with a Set of Tweets and Lyrics from Existing Datasets**


## **0. File Preparation**

### **0.1 Requirements**

In [None]:
# Install the `laserembeddings` package, then run its `download-models` command
# so that `Laser()` can be instantiated later in the notebook.
!pip install laserembeddings
!python -m laserembeddings download-models

Collecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting sacremoses==0.0.35
  Downloading sacremoses-0.0.35.tar.gz (859 kB)
[K     |████████████████████████████████| 859 kB 4.9 MB/s 
[?25hCollecting transliterate==1.10.2
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 3.4 MB/s 
[?25hCollecting subword-nmt<0.4.0,>=0.3.6
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.35-py3-none-any.whl size=883989 sha256=dc3a87e7839414970951dabcea67a6e554c42bbcd1bb136f6327bf0dff11b909
  Stored in directory: /root/.cache/pip/wheels/d1/ff/0e/e00ff1e22100702ac8b24e709551ae0fb29db9ffc843510a64
Successfully built sacremoses
Installing collected packages: moc

### **0.2 Imports**

In [None]:
import pandas as pd
import numpy as np
import random
from random import sample

#Text Processing
import string
import re

#Modeling
#from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

#Neural Networks
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input, Concatenate, BatchNormalization
from tensorflow.keras.models import Model, Sequential

# Reshaping datasets to tensors
import tensorflow as tf
from tensorflow.keras.utils import to_categorical


#for Colab file dealing
import glob
#You can mount your Google Drive files by running the following code snippet
from google.colab import drive
drive.mount('/content/gdrive') # Now all files in: /content/gdrive/My Drive/location_of_the_file
from os import listdir
from os.path import isfile, join

Mounted at /content/gdrive


In [None]:
#Laser
from laserembeddings import Laser

### **0.3 Functions**

#### **0.3.1 For Text Processing**

In [None]:
def tweet_preprocessing(text_data):
    """Normalise raw tweets before they are embedded.

    For every tweet, in this order:
      1. drop the '#' sign so hashtags become plain words,
      2. replace URLs (``www.…`` or ``http(s)://…``) with the token ``URL``,
      3. replace retweet headers (``RT @name:``) and mentions (``@name``)
         with the token ``USER``,
      4. turn every non-word character and underscore into a space,
      5. collapse runs of whitespace into a single space,
      6. re-join digits that were separated by whitespace.

    Parameters
    ----------
    text_data : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        One cleaned string per input tweet, in the same order. Leading or
        trailing single spaces are not stripped.
    """
    preprocessed_texts = []
    for text in text_data:
        # hashtags -> words
        text = re.sub(r'#', '', text)
        # URLs -> URL. The character class must be "not whitespace": the
        # previous pattern (www\.[\s]+) only matched "www." followed by
        # whitespace, so www-style links were never replaced.
        text = re.sub(r'(www\.[^\s]+)|(https?://[^\s]+)', 'URL', text)
        # Retweet headers must be handled BEFORE plain mentions; otherwise the
        # mention rule consumes "@name" first and "RT ...:" is left behind.
        text = re.sub(r'RT @[A-Za-z0-9_-]+:', 'USER', text)
        text = re.sub(r'@[A-Za-z0-9_-]+', 'USER', text)
        # symbols (including ! and ?) and underscores -> space
        text = re.sub(r'[\W_]', ' ', text)
        # collapse whitespace
        text = re.sub(r'\s+', ' ', text)
        # remove spaces between numbers ("1 000" -> "1000")
        text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
        preprocessed_texts.append(text)

    return preprocessed_texts

In [None]:
def lyrics_preprocessing(text_data):
    """Clean lyric paragraphs before they are embedded.

    For every paragraph, in this order:
      1. cast to ``str`` and strip surrounding whitespace,
      2. delete bracketed annotations such as ``[Chorus]`` (same line only),
      3. delete any stray ``[`` or ``]`` that is left,
      4. turn every non-word character and underscore into a space,
      5. collapse runs of whitespace into a single space,
      6. delete the Genius "EmbedShare URLCopyEmbedCopy" watermark.

    Parameters
    ----------
    text_data : iterable
        Paragraphs; each item is converted with ``str()``.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order.
    """
    preprocessed_texts = []
    for text in text_data:
        text = str(text).strip()

        # Delete everything between square brackets. This has to run while the
        # brackets are still present; it used to run after they had already
        # been stripped and therefore never matched anything.
        text = re.sub(r'\[.*?\]', '', text)
        # Unbalanced brackets are removed without leaving a space behind.
        text = re.sub(r'[\[\]]', '', text)

        # symbols (including ! ? -) and underscores -> space
        text = re.sub(r'[\W_]', ' ', text)
        # collapse whitespace
        text = re.sub(r'\s+', ' ', text)

        # Get rid of Genius watermarks (with or without the inner space)
        text = re.sub('EmbedShare ?URLCopyEmbedCopy', '', text)

        preprocessed_texts.append(text)

    return preprocessed_texts

In [None]:
def get_paragraphs_preprocessed (Files, mypath, df):
  """Split lyric files into paragraphs and store them in ``df``.

  Each file is read in full and split on blank lines ("\\n\\n"). When the
  first line of a paragraph contains '[', that line is dropped.

  Parameters
  ----------
  Files : sequence of str
      File names located inside ``mypath``.
  mypath : str
      Directory holding the files; joined to each name with '/'.
  df : pandas.DataFrame
      Frame that receives the ``title`` and ``paragraph`` columns. It is
      modified in place and also returned.

  Returns
  -------
  pandas.DataFrame
      ``df`` with one row per paragraph: ``title`` is the source file name,
      ``paragraph`` is the paragraph text.
  """
  titles = []
  paragraphs = []
  for file_name in Files:
    # Context manager so the handle is closed even if reading fails
    # (the file used to be opened and never closed).
    with open(mypath + '/' + file_name, 'r') as handle:
      data = handle.read()

    for paragraph in data.split("\n\n"):
      lines = paragraph.split("\n")
      # Drop the first line when it contains '['.
      if '[' in lines[0]:
        paragraph = "\n".join(lines[1:])

      titles.append(file_name)
      paragraphs.append(paragraph)

  df['title'] = titles
  df['paragraph'] = paragraphs

  return df



#### **0.3.2 For Model Evaluation**

In [None]:
# f1 evaluation
def f1(y_true, y_pred):
    """Soft F1 score computed with Keras backend ops.

    Both tensors are flattened; the score is
    ``2 * (sum(y_true * y_pred) + eps) / (sum(y_true) + sum(y_pred) + eps)``
    with ``eps = K.epsilon()``.
    """
    truth = K.flatten(y_true)
    pred = K.flatten(y_pred)
    overlap = K.sum(truth * pred) + K.epsilon()
    total = K.sum(truth) + K.sum(pred) + K.epsilon()
    return 2 * overlap / total

#### **0.3.3 For Labeling**

In [None]:
def labeling (l_embeddings, df, classifier=None):
  """Predict a class for every embedding and attach the results to ``df``.

  Parameters
  ----------
  l_embeddings : array-like
      Embeddings reshaped here to ``(-1, 1, 1024)`` before prediction.
  df : pandas.DataFrame
      Frame with one row per embedding, in the same order. The four result
      columns are written into it in place.
  classifier : object with ``predict``, optional
      Model used for prediction. When omitted, the notebook-level ``model``
      variable is used, as before.

  Returns
  -------
  pandas.DataFrame
      A copy of ``df`` sorted by ``probability_sexist`` (descending) with
      the columns ``label`` (0/1), ``label probability`` (score of the
      chosen class), ``probability_sexist`` and ``probability_NOT_sexist``.
  """
  clf = model if classifier is None else classifier

  # model.predict accepts NumPy input, so np.reshape replaces tf.reshape here.
  Xnew = np.reshape(l_embeddings, (-1, 1, 1024))

  # Predictions are indexed as [sample][0][class]; keep the first (only used)
  # step. Cast to float64 to match the dtype the append-based loop produced.
  probs = np.asarray(clf.predict(Xnew), dtype='float64')[:, 0, :]

  # The first value of the prediction is for class 0 and the second for class 1
  p_not_sxist = probs[:, 0]
  psxist = probs[:, 1]

  # Class 0 wins only on a strictly larger score; ties go to class 1.
  picks_class_0 = p_not_sxist > psxist

  df['label'] = np.where(picks_class_0, 0, 1).astype('int')
  df['label probability'] = np.where(picks_class_0, p_not_sxist, psxist)
  df['probability_sexist'] = psxist
  df['probability_NOT_sexist'] = p_not_sxist

  df = df.sort_values('probability_sexist', ascending=False)

  return df


## **1. Dataset**

In [None]:
# Read the training CSV from the mounted Google Drive.
# Note: `training` first holds the file path and is then rebound to the
# DataFrame returned by `pd.read_csv`.
training = '/content/gdrive/My Drive/training_dataset.csv'
training = pd.read_csv(training)
# Work on a copy so the frame as read from disk stays available in `training`.
training_df = training.copy()
# Bare last expression: displays the frame as the cell output.
training_df

Unnamed: 0.1,Unnamed: 0,text,Class,language,dataset,Category,highlight
0,0,Red One Sugababes Girls bring the fun of life ...,sexism,en,lyrics,Not specified,Not specified
1,1,I guess it was yourself you were involved with...,sexism,en,lyrics,Not specified,Not specified
2,2,Bill collectors at my door What can you do for...,sexism,en,lyrics,Not specified,Not specified
3,3,I ain't cooking all day (I ain't your mama!) I...,sexism,en,lyrics,Not specified,Not specified
4,4,All hands on deck All in front all in the back...,sexism,en,lyrics,Not specified,Not specified
...,...,...,...,...,...,...,...
21772,3595,"""Experimentos que surgen en la ociosidad de la...",not_sexism,es,MeTwo,Not specified,Not specified
21773,3596,Mucho feminismo pero la Pedroche en tetas. Por...,sexism,es,MeTwo,Not specified,Not specified
21774,3597,hermana estaba contando a madrastra que un gom...,not_sexism,es,MeTwo,Not specified,Not specified
21775,3598,"@AdrianFtm24 @s0ymia Mucho feminismo, pero mir...",sexism,es,MeTwo,Not specified,Not specified


## **2. Lyrics to be Labeled**

In [None]:
# One Drive folder of lyric text files per period.
mypath60s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/1960-1969'
mypath70s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/1970-1979'
mypath21s = '/content/gdrive/MyDrive/2021 Music Lyrics [Laura Casanovas]/2020-2021'

# Names of the regular files directly inside each folder (sub-directories are skipped).
Files60s = [f for f in listdir(mypath60s) if isfile(join(mypath60s, f))]
Files70s = [f for f in listdir(mypath70s) if isfile(join(mypath70s, f))]
Files21s = [f for f in listdir(mypath21s) if isfile(join(mypath21s, f))]

# Column layout of the frame that will hold the paragraphs to be labeled.
cols=['title', 'paragraph', 'label']

# Empty frame; rows are added by get_paragraphs_preprocessed.
lyrics_df = pd.DataFrame(columns=cols)

In [None]:
# Fill the frame with one row per paragraph of the 2020-2021 files, then
# replace the raw paragraph text with its cleaned version.
lyrics_df = get_paragraphs_preprocessed(Files21s, mypath21s, lyrics_df)
lyrics_df['paragraph'] = lyrics_preprocessing(lyrics_df['paragraph'])
# Bare last expression: displays the frame as the cell output.
lyrics_df

Unnamed: 0,title,paragraph,label
0,lyricstxt_Aitana.txt,,
1,lyricstxt_Aitana.txt,Voy a salir no más fingir no más servir La noc...,
2,lyricstxt_Aitana.txt,Tira porque te toca a ti perder Que aquí ya se...,
3,lyricstxt_Aitana.txt,Pero si me toca toca tócame Yo decido el cuánd...,
4,lyricstxt_Aitana.txt,En un chico malo no no no Pa fuera lo malo no ...,
...,...,...,...
1661,lyricstxtThe Business_Tisto.txt,Mama please don t worry bout me Cause I m abou...,
1662,lyricstxtThe Business_Tisto.txt,Let s get down let s get down to business Give...,
1663,lyricstxtThe Business_Tisto.txt,Back and forth back and forth with the bullshi...,
1664,lyricstxtThe Business_Tisto.txt,Let s get down let s get down to business Give...,


## **3. LSTM Training**

### **3.1 Data Preparation**

In [None]:
# Create the LASER object whose `embed_sentences` method is called in the cells below.
laser = Laser() # importing class for using embeddings extraction

In [None]:
# train data: tweets from the 'exist', 'exist_test' and 'MeTwo' datasets.
# .copy() makes train_data an independent frame, so writing the numeric labels
# below no longer assigns into a slice of training_df (SettingWithCopyWarning).
train_data = training_df[(training_df['dataset']=='exist')|(training_df['dataset']=='exist_test')|(training_df['dataset']=='MeTwo')].copy()
texts_tobe_processed_train = train_data['text']


texts_processed_train = tweet_preprocessing(texts_tobe_processed_train)


train_embeddings = laser.embed_sentences(texts_processed_train, lang = 'en') 

# Encode the string classes as integers: sexism -> 1, not_sexism -> 0.
train_data['Class'] = train_data['Class'].replace(['sexism', 'not_sexism'],[1,0])
train_labels = train_data['Class']
train_labels = train_labels.astype('int64')


# test data: rows of the 'lyrics' dataset.

#since I do not really know what '-1' means I will drop the 145 rows with value -1 for the testing part
test_data = training_df[(training_df['dataset']=='lyrics')&(training_df['Class']!='-1')].copy()

texts_tobe_processed_test = test_data['text']

texts_processed_test = tweet_preprocessing(texts_tobe_processed_test)


test_embeddings = laser.embed_sentences(texts_processed_test, lang = 'en')

# Same label encoding as for the training rows.
test_data['Class'] = test_data['Class'].replace(['sexism', 'not_sexism'],[1,0])
test_labels = test_data['Class']
test_labels = test_labels.astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [None]:
# Embed the paragraphs that are going to be labeled.
# The paragraphs are cast to str and passed through lyrics_preprocessing again
# (the 'paragraph' column was already cleaned when lyrics_df was built).
# NOTE(review): every paragraph is embedded with lang='es', although the
# 2020-2021 folder also appears to contain English songs; confirm this is intended.
lyrics_df['paragraph'] = lyrics_df['paragraph'].astype(str)
lyrics_processed = lyrics_preprocessing(lyrics_df['paragraph'])
lyrics_embeddings = laser.embed_sentences(lyrics_processed, lang = 'es')

### **3.2 Modeling**

In [None]:
# - - - - - TRAIN FEATURES - - - - -
# Each embedding becomes a sequence of length 1 with 1024 features.
X1_laser = tf.reshape(train_embeddings, [-1, 1, 1024])

Y1 = to_categorical(train_labels, 2)
Y1_reshaped = tf.reshape(Y1, [-1, 1, 2])

print('Train data shapes:',X1_laser.shape, Y1_reshaped.shape)

# - - - - - TEST FEATURES - - - - -
X2_laser = tf.reshape(test_embeddings, [-1, 1, 1024])

Y2 = to_categorical(test_labels, 2)
Y2_reshaped = tf.reshape(Y2, [-1, 1, 2])

print('Test data shapes:', X2_laser.shape, Y2_reshaped.shape)


# Pool tweets and lyrics; the K-fold splitter below draws every fold from this pool.
inputs = np.concatenate((X1_laser, X2_laser), axis=0)
targets = np.concatenate((Y1_reshaped, Y2_reshaped), axis=0)

# Define per-fold score containers 
acc_per_fold = []
f1_per_fold = []
auc_per_fold = []
loss_per_fold = []

num_folds = 10

# Define the K-fold Cross Validator
# random_state fixes the shuffle so the fold assignment is reproducible.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(inputs, targets):

  # Define the model architecture (a fresh model for every fold)
  model = tf.keras.Sequential()
  model.add(LSTM(100, input_shape=(1, 1024), return_sequences=True))
  model.add(Dense(1024,activation='relu')) # MUST BE 2 hidden layers
  model.add(Dropout(0.5))
  model.add(Dense(128,activation='sigmoid'))
  model.add(Dense(2, activation='sigmoid'))

  # Compile the model
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy'), f1, tf.keras.metrics.AUC(name='auc')])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Fit data to model.
  # The fold indices produced by kfold.split must actually be used: before,
  # every iteration trained on X1_laser and evaluated on X2_laser, so all
  # "folds" repeated the same fixed split and the loop was not a cross-validation.
  history = model.fit(inputs[train], targets[train], validation_data=(inputs[test], targets[test]), epochs=20, batch_size=100)


  # Generate generalization metrics on the held-out fold
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'\nScore for fold {fold_no}: \n')
  print("Accuracy: %.2f%%" % (scores[1]*100))
  print("F1: %.2f%%" % (scores[2]*100))
  print("AUC: %.2f%%" % (scores[3]*100))
  # The loss is not a percentage, so it is printed without the '%' suffix.
  print("Loss: %.4f" % (scores[0]))
  print('\n------------------------------------------------------------------------\n')

  acc_per_fold.append(scores[1] * 100)
  f1_per_fold.append(scores[2] * 100)
  auc_per_fold.append(scores[3] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

# After the loop, `model` refers to the network trained in the last fold.

Train data shapes: (14678, 1, 1024) (14678, 1, 2)
Test data shapes: (387, 1, 1024) (387, 1, 2)
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 1: 

Accuracy: 69.51%
F1: 65.10%
AUC: 74.51%
Loss: 0.69%

------------------------------------------------------------------------

------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 2: 

Accuracy: 71.32%
F1: 66.73%
AUC: 77.86%
Loss: 0.64%

---------------------------

In [None]:
# == Report the score of every fold, then mean (and std) over all folds ==
separator = '------------------------------------------------------------------------'

print(separator)
print('Score per fold')
for fold_idx, fold_acc in enumerate(acc_per_fold):
  print(separator)
  print(f'> Fold {fold_idx + 1} - Loss: {loss_per_fold[fold_idx]} - Accuracy: {fold_acc} - F1: {f1_per_fold[fold_idx]} - AUC: {auc_per_fold[fold_idx]}%')
print(separator)

print('Average scores for all folds:')
print(f'> Loss: {np.mean(loss_per_fold)}')
# Accuracy, F1 and AUC share one line format: mean followed by standard deviation.
for metric_name, fold_values in (('Accuracy', acc_per_fold), ('F1', f1_per_fold), ('AUC', auc_per_fold)):
  print(f'> {metric_name}: {np.mean(fold_values)} (+- {np.std(fold_values)})')

print(separator)

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.6945356130599976 - Accuracy: 69.50904130935669 - F1: 65.09531736373901 - AUC: 74.51408505439758%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.6442931294441223 - Accuracy: 71.3178277015686 - F1: 66.72589182853699 - AUC: 77.85956859588623%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.7596547603607178 - Accuracy: 66.66666865348816 - F1: 65.60471057891846 - AUC: 73.1179416179657%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.6943183541297913 - Accuracy: 69.76743936538696 - F1: 66.47533178329468 - AUC: 75.94595551490784%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.7083387970924377 - Accuracy: 68.73385310173035 - F1: 65.4515683650

## **4. Labeling**

In [None]:
# Label every lyric paragraph. `labeling` reads the notebook-level `model`
# variable, i.e. whichever model the training cell assigned last.
lyrics_df = labeling(lyrics_embeddings, lyrics_df)
# Uncomment to save the labeled paragraphs to a CSV file.
#lyrics_df.to_csv('lyrics_Predicted_2021.csv')