# **Initial Models with existing Dataset**

The goal is to determine which model performs better.

## **0. File Preparation**

### **0.1 Requirements**


In [None]:
!pip install laserembeddings
!python -m laserembeddings download-models

Collecting laserembeddings
  Downloading laserembeddings-1.1.2-py3-none-any.whl (13 kB)
Collecting sacremoses==0.0.35
  Downloading sacremoses-0.0.35.tar.gz (859 kB)
[K     |████████████████████████████████| 859 kB 5.2 MB/s 
[?25hCollecting subword-nmt<0.4.0,>=0.3.6
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting transliterate==1.10.2
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 3.4 MB/s 
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.35-py3-none-any.whl size=883989 sha256=039e704a3867537a0e08c6db842951ec01943ff298887b28323a84fdef2ebc10
  Stored in directory: /root/.cache/pip/wheels/d1/ff/0e/e00ff1e22100702ac8b24e709551ae0fb29db9ffc843510a64
Successfully built sacremoses
Installing collected packages: mock, tra

### **0.2 Imports**

In [None]:
import pandas as pd
import os
import numpy as np
from collections import Counter

import regex as re  # NOTE: the plain `import re` further down rebinds `re`, so the stdlib module is what the notebook ends up using
import string

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

import re
from laserembeddings import Laser

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input, Concatenate, BatchNormalization
from tensorflow.keras.models import Model, Sequential

# Reshaping datasets to tensors
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# Colab file handling
import glob
# Mount Google Drive so that files stored there can be read from this runtime
from google.colab import drive
drive.mount('/content/gdrive') # Drive files are now reachable under /content/gdrive/My Drive/<path_to_file>
from os import listdir
from os.path import isfile, join

Mounted at /content/gdrive


### **0.3 Functions**

#### **0.3.1 For Text Processing**

In [None]:
def tweet_preprocessing(text_data):
    """Normalise raw tweets (or other short texts) before they are embedded.

    Steps, applied to every text in order:
      1. drop the ``#`` of hashtags so only the word remains;
      2. replace URLs (``www.…`` and ``http(s)://…``) with the token ``URL``;
      3. replace retweet headers (``RT @name:``) and plain mentions (``@name``)
         with the token ``USER``;
      4. turn underscores and every other non-word character into a space;
      5. collapse runs of whitespace into a single space;
      6. glue digits that are separated only by whitespace (``10 000`` -> ``10000``).

    Parameters
    ----------
    text_data : iterable of str
        The texts to clean (a list, a pandas Series, ...).

    Returns
    -------
    list of str
        The cleaned texts, in input order. Leading/trailing spaces are not stripped.
    """
    preprocessed_texts = []
    for text in text_data:
        # hashtags -> words
        text = re.sub(r'#', '', text)
        # URLs -> URL. The www branch must consume NON-whitespace ([^\s]+);
        # with [\s]+ it only matched "www." followed by blanks, i.e. never a real link.
        text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
        # Retweet headers first: once the bare-mention rule below has rewritten
        # "@name" to "USER", the "RT @name:" pattern can no longer match.
        text = re.sub(r'RT @[A-Za-z0-9_-]+:', 'USER', text)
        text = re.sub(r'@[A-Za-z0-9_-]+', 'USER', text)
        # "_" counts as a word character, so \W alone would leave it in place
        text = re.sub(r'_', ' ', text)
        # every remaining symbol (including ! and ?) -> space
        text = re.sub(r'\W', ' ', text)
        # collapse whitespace
        text = re.sub(r'\s+', ' ', text)
        # remove spaces between numbers
        text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
        preprocessed_texts.append(text)

    return preprocessed_texts

#### **0.3.2 For Model Evaluation**

In [None]:
# f1 evaluation
def f1(y_true, y_pred):
    """Overlap score between targets and model outputs, used as a Keras metric.

    Both tensors are flattened and the value
    ``2 * (sum(y_true * y_pred) + eps) / (sum(y_true) + sum(y_pred) + eps)``
    is returned, where ``eps`` is the Keras backend epsilon. The predictions are
    used as given (no thresholding or rounding is applied here).
    """
    truth = K.flatten(y_true)
    guess = K.flatten(y_pred)
    overlap = K.sum(truth * guess)
    mass = K.sum(truth) + K.sum(guess)
    return 2 * (overlap + K.epsilon()) / (mass + K.epsilon())

## **1. Data Preparation**

### **1.1 Dataset**

In [None]:
# Read the labelled corpus from the mounted Drive and keep an untouched original:
# all later filtering is done on the `training_df` copy.
training = pd.read_csv('/content/gdrive/My Drive/training_dataset.csv')
training_df = training.copy()
training_df

Unnamed: 0.1,Unnamed: 0,text,Class,language,dataset,Category,highlight
0,0,Red One Sugababes Girls bring the fun of life ...,sexism,en,lyrics,Not specified,Not specified
1,1,I guess it was yourself you were involved with...,sexism,en,lyrics,Not specified,Not specified
2,2,Bill collectors at my door What can you do for...,sexism,en,lyrics,Not specified,Not specified
3,3,I ain't cooking all day (I ain't your mama!) I...,sexism,en,lyrics,Not specified,Not specified
4,4,All hands on deck All in front all in the back...,sexism,en,lyrics,Not specified,Not specified
...,...,...,...,...,...,...,...
21772,3595,"""Experimentos que surgen en la ociosidad de la...",not_sexism,es,MeTwo,Not specified,Not specified
21773,3596,Mucho feminismo pero la Pedroche en tetas. Por...,sexism,es,MeTwo,Not specified,Not specified
21774,3597,hermana estaba contando a madrastra que un gom...,not_sexism,es,MeTwo,Not specified,Not specified
21775,3598,"@AdrianFtm24 @s0ymia Mucho feminismo, pero mir...",sexism,es,MeTwo,Not specified,Not specified


### **1.2 Cleaning and Embedding**

In [None]:
laser = Laser() # LASER encoder instance; its embed_sentences() method is called below to turn the cleaned texts into embeddings

#### **1.2.1 Dataset: Exist, Exist test & MeTwo for training and Lyrics for testing** 

In [None]:
# train data: the three tweet corpora (exist, exist_test and MeTwo).
# .copy() makes an independent frame, so recoding 'Class' below is a plain
# column assignment instead of a write to a slice of training_df (which raised
# SettingWithCopyWarning), and re-running the cell always starts from training_df.
train_data = training_df[training_df['dataset'].isin(['exist', 'exist_test', 'MeTwo'])].copy()
texts_tobe_processed_train = train_data['text']


texts_processed_train = tweet_preprocessing(texts_tobe_processed_train)


# NOTE(review): lang is fixed to 'en' although the frame also holds rows whose
# 'language' is 'es' (e.g. MeTwo) — confirm this is intended.
train_embeddings = laser.embed_sentences(texts_processed_train, lang = 'en')

# sexism -> 1, not_sexism -> 0; any other value is left as it is
train_data['Class'] = train_data['Class'].replace(['sexism', 'not_sexism'], [1, 0])
train_labels = train_data['Class']
train_labels = train_labels.astype('int64')


# test data

# The meaning of Class == '-1' is unknown, so those rows (145 of them) are
# dropped from the test split.
test_data = training_df[(training_df['dataset'] == 'lyrics') & (training_df['Class'] != '-1')].copy()

texts_tobe_processed_test = test_data['text']

texts_processed_test = tweet_preprocessing(texts_tobe_processed_test)


test_embeddings = laser.embed_sentences(texts_processed_test, lang = 'en')

test_data['Class'] = test_data['Class'].replace(['sexism', 'not_sexism'], [1, 0])
test_labels = test_data['Class']
test_labels = test_labels.astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


#### **1.2.2 Dataset: Exist for training & Exist_test for testing**


In [None]:
# train data: only the 'exist' rows.
# .copy() makes an independent frame, so recoding 'Class' below is a plain
# column assignment instead of a write to a slice of training_df (which raised
# SettingWithCopyWarning), and re-running the cell always starts from training_df.
train_data2 = training_df[training_df['dataset'] == 'exist'].copy()
texts_tobe_processed_train2 = train_data2['text']


texts_processed_train2 = tweet_preprocessing(texts_tobe_processed_train2)


# NOTE(review): lang is fixed to 'en'; the frame has a 'language' column, so
# confirm that every 'exist' row really is English.
train_embeddings2 = laser.embed_sentences(texts_processed_train2, lang = 'en')

# sexism -> 1, not_sexism -> 0; any other value is left as it is
train_data2['Class'] = train_data2['Class'].replace(['sexism', 'not_sexism'], [1, 0])
train_labels2 = train_data2['Class']
train_labels2 = train_labels2.astype('int64')


# test data: only the 'exist_test' rows

test_data2 = training_df[training_df['dataset'] == 'exist_test'].copy()

texts_tobe_processed_test2 = test_data2['text']

texts_processed_test2 = tweet_preprocessing(texts_tobe_processed_test2)


test_embeddings2 = laser.embed_sentences(texts_processed_test2, lang = 'en')

test_data2['Class'] = test_data2['Class'].replace(['sexism', 'not_sexism'], [1, 0])
test_labels2 = test_data2['Class']
test_labels2 = test_labels2.astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


## **2. Modeling**

### **2.1 SVM + Dataset: Exist, Exist test & MeTwo for training and Lyrics for testing**

In [None]:
### Train with tweets

In [None]:
# Fit a linear-kernel SVM on the training embeddings (fit() returns the fitted
# estimator), then predict labels for the test embeddings.
model = svm.SVC(kernel='linear', degree=3, gamma='auto', tol=0.001).fit(train_embeddings, train_labels)
prediction = model.predict(test_embeddings)

In [None]:
# measure model performance
# (cross_val_score is already imported in the imports cell at the top)

# 1) 10-fold cross-validation inside the TRAINING data
accuracy = cross_val_score(model, train_embeddings, train_labels, scoring='accuracy', cv=10).mean()
f1_ = cross_val_score(model, train_embeddings, train_labels, cv=10, scoring= 'f1').mean()
auc = cross_val_score(model, train_embeddings, train_labels, cv=10, scoring= 'roc_auc').mean()

print('ACC:', accuracy)
print('F1:', f1_)
print('AUC:', auc)

# 2) Held-out performance on the test split. The cross-validated numbers above
# never touch test_embeddings, so on their own they cannot show how the model
# transfers to the test dataset; `prediction` was computed for exactly that.
test_accuracy = accuracy_score(test_labels, prediction)
test_f1 = f1_score(test_labels, prediction)
# AUC needs a continuous score, not hard labels: use the signed distance to the hyperplane
test_auc = roc_auc_score(test_labels, model.decision_function(test_embeddings))

print('Test ACC:', test_accuracy)
print('Test F1:', test_f1)
print('Test AUC:', test_auc)

ACC: 0.6837396380683856
F1: 0.6533043244083196
AUC: 0.7437647023707117


### **2.2 SVM + Dataset: Exist for training & Exist_test for testing**
Now we train on the EXIST training set and test on the EXIST test set. This lets us see whether the model itself performs poorly or whether the classifier is simply not adapting to the lyrics dataset.

In [None]:
# Fit a linear-kernel SVM on the second training split (fit() returns the fitted
# estimator), then predict labels for the second test split.
model2 = svm.SVC(kernel='linear', degree=3, gamma='auto', tol=0.001).fit(train_embeddings2, train_labels2)
prediction2 = model2.predict(test_embeddings2)

In [None]:
# measure model performance

# 1) 10-fold cross-validation inside the TRAINING data
accuracy = cross_val_score(model2, train_embeddings2, train_labels2, cv=10, scoring= 'accuracy').mean()
f1_ = cross_val_score(model2, train_embeddings2, train_labels2, cv=10, scoring= 'f1').mean()
auc = cross_val_score(model2, train_embeddings2, train_labels2, cv=10, scoring= 'roc_auc').mean()

print('ACC:', accuracy)
print('F1:', f1_)
print('AUC:', auc)

# 2) Held-out performance on the test split. The cross-validated numbers above
# never touch test_embeddings2; `prediction2` was computed for exactly that.
test_accuracy2 = accuracy_score(test_labels2, prediction2)
test_f12 = f1_score(test_labels2, prediction2)
# AUC needs a continuous score, not hard labels: use the signed distance to the hyperplane
test_auc2 = roc_auc_score(test_labels2, model2.decision_function(test_embeddings2))

print('Test ACC:', test_accuracy2)
print('Test F1:', test_f12)
print('Test AUC:', test_auc2)

ACC: 0.7084632049758894
F1: 0.6997301330869063
AUC: 0.7696235346104098


### **2.3 LSTM + Datasets: Exist, Exist test & MeTwo**

In [None]:
# - - - - - TRAIN FEATURES - - - - -
# The LSTM layer takes 3-D input, so each 1024-value embedding becomes a
# sequence of length 1: (samples, 1, 1024).
X1_laser = tf.reshape(train_embeddings, [-1, 1, 1024])

# One-hot labels, reshaped to (samples, 1, 2) to line up with the model output
# (return_sequences=True keeps the length-1 time axis).
Y1 = to_categorical(train_labels, 2)
Y1_reshaped = tf.reshape(Y1, [-1, 1, 2])

print('Train data shapes:',X1_laser.shape, Y1_reshaped.shape)

# - - - - - TEST FEATURES - - - - -
X2_laser = tf.reshape(test_embeddings, [-1, 1, 1024])

Y2 = to_categorical(test_labels, 2)
Y2_reshaped = tf.reshape(Y2, [-1, 1, 2])

print('Test data shapes:', X2_laser.shape, Y2_reshaped.shape)

# Pool both splits; the K-fold splitter below carves the pool into folds.
inputs = np.concatenate((X1_laser, X2_laser), axis=0)
targets = np.concatenate((Y1_reshaped, Y2_reshaped), axis=0)

# Define per-fold score containers 
acc_per_fold = []
f1_per_fold = []
auc_per_fold = []
loss_per_fold = []

num_folds = 10
# Define the K-fold Cross Validator
# (fixed random_state so that the fold assignment is the same on every run)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(inputs, targets):

  # Define the model architecture (rebuilt from scratch for every fold)
  model_neur = tf.keras.Sequential()
  model_neur.add(LSTM(100, input_shape=(1, 1024), return_sequences=True))
  model_neur.add(Dense(1024,activation='relu')) # MUST BE 2 hidden layers
  model_neur.add(Dropout(0.5))
  model_neur.add(Dense(128,activation='sigmoid'))
  model_neur.add(Dense(2, activation='sigmoid'))

  # Compile the model
  model_neur.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy'), f1, tf.keras.metrics.AUC(name='auc')])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Fit data to model.
  # Use THIS fold's indices: fitting on X1_laser / X2_laser in every iteration
  # ignored the split and just repeated the same experiment num_folds times.
  history = model_neur.fit(inputs[train], targets[train], validation_data=(inputs[test], targets[test]), epochs=20, batch_size=100)


  # Generate generalization metrics on the samples held out for this fold.
  # scores = [loss, accuracy, f1, auc], in the order given to compile().
  scores = model_neur.evaluate(inputs[test], targets[test], verbose=0)
  print(f'\nScore for fold {fold_no}: \n')
  print("Accuracy: %.2f%%" % (scores[1]*100))
  print("F1: %.2f%%" % (scores[2]*100))
  print("AUC: %.2f%%" % (scores[3]*100))
  # the loss is not a percentage, so it is printed without a % sign
  print("Loss: %.2f" % (scores[0]))
  print('\n------------------------------------------------------------------------\n')
    
  acc_per_fold.append(scores[1] * 100)
  f1_per_fold.append(scores[2] * 100)
  auc_per_fold.append(scores[3] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

Train data shapes: (14678, 1, 1024) (14678, 1, 2)
Test data shapes: (387, 1, 1024) (387, 1, 2)
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 1: 

Accuracy: 71.83%
F1: 65.86%
AUC: 78.21%
Loss: 0.61%

------------------------------------------------------------------------

------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 2: 

Accuracy: 63.57%
F1: 63.37%
AUC: 68.94%
Loss: 0.92%

---------------------------

In [None]:
# == Provide average scores ==
# Per-fold table first, then mean (and standard deviation) over all folds.
rule = '------------------------------------------------------------------------'

print(rule)
print('Score per fold')
for i, fold_acc in enumerate(acc_per_fold):
  print(rule)
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {fold_acc} - F1: {f1_per_fold[i]} - AUC: {auc_per_fold[i]}%')
print(rule)
print('Average scores for all folds:')
print(f'> Loss: {np.mean(loss_per_fold)}')
for metric_name, fold_values in (('Accuracy', acc_per_fold), ('F1', f1_per_fold), ('AUC', auc_per_fold)):
  print(f'> {metric_name}: {np.mean(fold_values)} (+- {np.std(fold_values)})')

print(rule)

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.6113957762718201 - Accuracy: 71.83462381362915 - F1: 65.86218476295471 - AUC: 78.21478843688965%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.9229946732521057 - Accuracy: 63.56589198112488 - F1: 63.368427753448486 - AUC: 68.9401626586914%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.6305097341537476 - Accuracy: 72.09302186965942 - F1: 64.40009474754333 - AUC: 76.59829258918762%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.6248045563697815 - Accuracy: 70.54263353347778 - F1: 66.64077043533325 - AUC: 78.00512909889221%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.6914043426513672 - Accuracy: 70.28423547744751 - F1: 65.17699360

 ### **2.4 LSTM + Dataset: Exist for training & Exist_test for testing**

In [None]:
# - - - - - TRAIN FEATURES - - - - -
# The LSTM layer takes 3-D input, so each 1024-value embedding becomes a
# sequence of length 1: (samples, 1, 1024).
X1_laser2 = tf.reshape(train_embeddings2, [-1, 1, 1024])

# One-hot labels, reshaped to (samples, 1, 2) to line up with the model output
# (return_sequences=True keeps the length-1 time axis).
Y12 = to_categorical(train_labels2, 2)
Y1_reshaped2 = tf.reshape(Y12, [-1, 1, 2])

print('Train data shapes:',X1_laser2.shape, Y1_reshaped2.shape)

# - - - - - TEST FEATURES - - - - -
X2_laser2 = tf.reshape(test_embeddings2, [-1, 1, 1024])

Y22 = to_categorical(test_labels2, 2)
Y2_reshaped2 = tf.reshape(Y22, [-1, 1, 2])

print('Test data shapes:', X2_laser2.shape, Y2_reshaped2.shape)


# Pool the tensors of THIS section (the *2 variables). Using X1_laser / X2_laser
# here re-ran the previous section's experiment on the other dataset.
inputs = np.concatenate((X1_laser2, X2_laser2), axis=0)
targets = np.concatenate((Y1_reshaped2, Y2_reshaped2), axis=0)

# Define per-fold score containers 
acc_per_fold = []
f1_per_fold = []
auc_per_fold = []
loss_per_fold = []

num_folds = 10
# Define the K-fold Cross Validator
# (fixed random_state so that the fold assignment is the same on every run)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(inputs, targets):

  # Define the model architecture (rebuilt from scratch for every fold)
  model_neur2 = tf.keras.Sequential()
  model_neur2.add(LSTM(100, input_shape=(1, 1024), return_sequences=True))
  model_neur2.add(Dense(1024,activation='relu')) # MUST BE 2 hidden layers
  model_neur2.add(Dropout(0.5))
  model_neur2.add(Dense(128,activation='sigmoid'))
  model_neur2.add(Dense(2, activation='sigmoid'))

  # Compile the model
  model_neur2.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy'), f1, tf.keras.metrics.AUC(name='auc')])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Fit data to model, using THIS fold's indices so that every iteration trains
  # and validates on a different split of the pooled data.
  history = model_neur2.fit(inputs[train], targets[train], validation_data=(inputs[test], targets[test]), epochs=20, batch_size=100)


  # Generate generalization metrics on the samples held out for this fold.
  # scores = [loss, accuracy, f1, auc], in the order given to compile().
  scores = model_neur2.evaluate(inputs[test], targets[test], verbose=0)
  print(f'\nScore for fold {fold_no}: \n')
  print("Accuracy: %.2f%%" % (scores[1]*100))
  print("F1: %.2f%%" % (scores[2]*100))
  print("AUC: %.2f%%" % (scores[3]*100))
  # the loss is not a percentage, so it is printed without a % sign
  print("Loss: %.2f" % (scores[0]))
  print('\n------------------------------------------------------------------------\n')
    
  acc_per_fold.append(scores[1] * 100)
  f1_per_fold.append(scores[2] * 100)
  auc_per_fold.append(scores[3] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

Train data shapes: (6977, 1, 1024) (6977, 1, 2)
Test data shapes: (4368, 1, 1024) (4368, 1, 2)
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 1: 

Accuracy: 71.32%
F1: 66.43%
AUC: 77.42%
Loss: 0.65%

------------------------------------------------------------------------

------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Score for fold 2: 

Accuracy: 61.50%
F1: 63.26%
AUC: 68.62%
Loss: 0.87%

---------------------------

In [None]:
# == Provide average scores ==
# Per-fold table first, then mean (and standard deviation) over all folds.
rule = '------------------------------------------------------------------------'

print(rule)
print('Score per fold')
for i, fold_acc in enumerate(acc_per_fold):
  print(rule)
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {fold_acc} - F1: {f1_per_fold[i]} - AUC: {auc_per_fold[i]}%')
print(rule)
print('Average scores for all folds:')
print(f'> Loss: {np.mean(loss_per_fold)}')
for metric_name, fold_values in (('Accuracy', acc_per_fold), ('F1', f1_per_fold), ('AUC', auc_per_fold)):
  print(f'> {metric_name}: {np.mean(fold_values)} (+- {np.std(fold_values)})')

print(rule)

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.649603545665741 - Accuracy: 71.3178277015686 - F1: 66.43450856208801 - AUC: 77.4232268333435%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.8680086135864258 - Accuracy: 61.49870753288269 - F1: 63.260167837142944 - AUC: 68.62267851829529%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.8347148299217224 - Accuracy: 64.59948420524597 - F1: 64.65307474136353 - AUC: 71.36189937591553%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.6843291521072388 - Accuracy: 68.2170569896698 - F1: 64.86336588859558 - AUC: 74.3224561214447%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.6557521224021912 - Accuracy: 68.73385310173035 - F1: 63.981562852859