In [1]:
import pickle
import numpy as np
import pandas as pd
import random

# Extracción de características
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# Modelos
from sklearn.linear_model import LogisticRegression

# Procesado
import sklearn.preprocessing as pr
import multiprocessing as mp
from tensorflow.keras.preprocessing import sequence

# Métricas
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve

# Configuration: switches that decide which models are (re)trained in this run.
# NOTE(review): train_model_3_w2v is not referenced in the visible cells.
train_model_1 = True
train_model_2 = True
train_model_3_w2v = True
train_model_3_lstm = True

# Create the output folders from Python instead of the shell: this behaves the
# same on every platform and exist_ok=True keeps re-runs silent when the
# folders are already there.
import os
for output_dir in ("Models", "Score"):
    os.makedirs(output_dir, exist_ok=True)

Ya existe el subdirectorio o el archivo Models.
Ya existe el subdirectorio o el archivo Score.


# 3.1.- Carga de datos y preparación del validation data

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Load the pickled features (X) and labels (y) from ./Vars.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted files.
with open('./Vars/X_train.pkl', 'rb') as file:
    X = pickle.load(file)
    
with open('./Vars/y_train.pkl', 'rb') as file:
    y = pickle.load(file)

In [4]:
# 80/20 train/validation split, shuffled with a fixed seed and stratified on y.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.80,
    test_size=0.20,
    random_state=52,
    shuffle=True,
    stratify=y
)

# 3.2.- Modelo 1 - Extracción de características TF-IDF + LogisticRegression

La idea es usar este modelo de Machine Learning para ver cómo se comportan los datos, habiendo preprocesado previamente los datos (tokenizado, limpieza, normalizado...) y preparando una extracción de características TF-IDF para alimentar a dicho modelo.

## 3.2.1.- Configuración

In [5]:
class TF_IDF:
    """Thin wrapper around scikit-learn's ``TfidfVectorizer``.

    The wrapped vectorizer is exposed as ``self.vectorizer``; the methods
    below simply delegate to it.
    """

    def __init__(self, ngram_range, strip_accents, max_df, min_df, max_features):
        """Create the underlying vectorizer, forwarding every argument unchanged."""
        vectorizer_options = dict(
            ngram_range=ngram_range,
            strip_accents=strip_accents,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
        )
        self.vectorizer = TfidfVectorizer(**vectorizer_options)

    def fit(self, train_data):
        """Fit the vectorizer on ``train_data``; nothing is returned."""
        self.vectorizer.fit(train_data)

    def transform(self, data):
        """Return whatever the fitted vectorizer produces for ``data``."""
        features = self.vectorizer.transform(data)
        return features

    def get_vocabulary(self):
        """Return the fitted vectorizer's ``vocabulary_`` attribute."""
        return self.vectorizer.vocabulary_

## 3.2.2.- Procesado

In [6]:
# Instantiate the TF-IDF wrapper: 1- to 3-grams, ASCII accent stripping,
# document-frequency bounds (max_df=0.95, min_df=3) and at most 4000 features.
tf_idf = TF_IDF(ngram_range=(1, 3),
                strip_accents='ascii',
                max_df=0.95,
                min_df=3,
                max_features=4000)

In [7]:
# Fit the vectorizer on the training split only.
tf_idf.fit(X_train)

In [8]:
# Transform both splits with the vectorizer fitted on the training split.
X_train_f = tf_idf.transform(X_train)
X_valid_f = tf_idf.transform(X_valid)

In [9]:
# Vocabulary of the fitted vectorizer (its vocabulary_ attribute).
vocab = tf_idf.get_vocabulary()

## 3.2.3.- Entrenamiento

In [10]:
if train_model_1:

    # Values of the LogisticRegression C parameter to sweep
    c_params = [0.01, 0.05, 0.25, 0.5, 1, 10, 100, 1000, 10000]

    # Metrics collected per C value, plus the best configuration seen so far
    train_acc = list()
    valid_acc = list()
    best_c = 0
    best_train_acc = 0
    best_valid_acc = 0

    for c in c_params:
        # Build and train the model.
        # NOTE(review): the captured output shows lbfgs hitting max_iter=500 for
        # some C values; consider a higher max_iter if those runs matter.
        model = LogisticRegression(C=c, solver='lbfgs', max_iter=500)
        model.fit(X_train_f, y_train)

        # Model predictions on both splits
        y_train_pred = model.predict(X_train_f)
        y_valid_pred = model.predict(X_valid_f)

        # Score each split once and reuse the values below
        current_train_acc = accuracy_score(y_train, y_train_pred)
        current_valid_acc = accuracy_score(y_valid, y_valid_pred)
        train_acc.append(current_train_acc)
        valid_acc.append(current_valid_acc)
        print("Val_Accuracy for C={}: {}".format(c, current_valid_acc))

        # Checkpoint: keep the model with the highest validation accuracy.
        # ">=" means a later C replaces an earlier one on a tie, exactly as the
        # previous "equals the running maximum" check did.
        if current_valid_acc >= best_valid_acc:
            # The context manager closes (and flushes) the checkpoint file
            with open("./Models/model_1.pkl", 'wb') as model_file:
                pickle.dump(model, model_file)
            best_c = c
            best_train_acc = current_train_acc
            best_valid_acc = current_valid_acc

    print("Best model\n\tc: {}\n\ttrain acc: {}\n\tval acc: {}".format(best_c, best_train_acc, best_valid_acc))

Val_Accuracy for C=0.01: 0.8378
Val_Accuracy for C=0.05: 0.8587666666666667
Val_Accuracy for C=0.25: 0.8866333333333334
Val_Accuracy for C=0.5: 0.8922333333333333
Val_Accuracy for C=1: 0.8953666666666666
Val_Accuracy for C=10: 0.8968


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Val_Accuracy for C=100: 0.8952666666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Val_Accuracy for C=1000: 0.8951666666666667
Val_Accuracy for C=10000: 0.8951666666666667
Best model
	c: 10
	train acc: 0.9088833333333334
	val acc: 0.8968


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 3.2.4.- Guardando resultados

In [11]:
# Load the checkpointed model 1 from disk.
with open('./Models/model_1.pkl', 'rb') as file:
    model = pickle.load(file)

In [12]:
# (y_true, y_pred) pairs for each split, unpacked into the metric functions later.
error_train_params = (y_train, model.predict(X_train_f))
error_valid_params = (y_valid, model.predict(X_valid_f))

In [13]:
# Scores: confusion matrix, classification report and accuracy for the
# training and validation splits, keyed by metric name.
scores = {
    "Name": "TF-IDF + Logistic Regression",
    "Confusion Matrix train": confusion_matrix(*error_train_params),
    "Report train": classification_report(*error_train_params),
    "Accuracy train": accuracy_score(*error_train_params),
    "Confusion Matrix val": confusion_matrix(*error_valid_params),
    "Report val": classification_report(*error_valid_params),
    "Accuracy val": accuracy_score(*error_valid_params)
}

In [14]:
# Show the model name and its validation accuracy.
print("Name: {}\nAccuracy val data: {}".format(scores["Name"], scores["Accuracy val"]))

Name: TF-IDF + Logistic Regression
Accuracy val data: 0.8968


In [15]:
# Save the scores; the context manager guarantees the file is flushed and closed.
with open("./Score/model_1_scores.pkl", 'wb') as score_file:
    pickle.dump(scores, score_file)

# 3.3.- Modelo 2 - Extracción de características Bag_of_Words + LogisticRegression

## 3.3.1.- Configuración

In [16]:
class BoW:
    """Bag-of-words feature extractor built on scikit-learn's ``CountVectorizer``.

    The wrapped vectorizer is exposed as ``self.vectorizer``.
    """

    def __init__(self, max_features):
        """Create the underlying vectorizer with the given ``max_features``."""
        self.vectorizer = CountVectorizer(max_features=max_features)

    def fit(self, train_data):
        """Fit the vectorizer on ``train_data``; nothing is returned."""
        self.vectorizer.fit(train_data)

    def transform(self, data, normalize=False):
        """Vectorize ``data``.

        When ``normalize`` is truthy the result is additionally passed through
        ``pr.normalize(..., axis=1)``; otherwise the raw output is returned.
        """
        counts = self.vectorizer.transform(data)
        if not normalize:
            return counts
        return pr.normalize(counts, axis=1)

    def get_vocabulary(self):
        """Return the fitted vectorizer's ``vocabulary_`` attribute."""
        return self.vectorizer.vocabulary_

## 3.3.2.- Procesado

In [17]:
# Instantiate the bag-of-words wrapper, capped at 4000 features.
bow = BoW(max_features=4000)

In [18]:
# Fit the vectorizer on the training split only.
bow.fit(X_train)

In [19]:
# Transform (and normalize) both splits.
# Note: this rebinds X_train_f / X_valid_f, replacing the TF-IDF features.
X_train_f = bow.transform(X_train, normalize=True)
X_valid_f = bow.transform(X_valid, normalize=True)

In [20]:
# Vocabulary of the fitted bag-of-words vectorizer.
vocab = bow.get_vocabulary()

## 3.3.3.- Entrenamiento

In [21]:
if train_model_2:

    # Values of the LogisticRegression C parameter to sweep
    c_params = [0.01, 0.05, 0.25, 0.5, 1, 10, 100, 1000, 10000]

    # Metrics collected per C value, plus the best configuration seen so far
    train_acc = list()
    valid_acc = list()
    best_c = 0
    best_train_acc = 0
    best_valid_acc = 0

    for c in c_params:
        # Build and train the model.
        # NOTE(review): the captured output shows lbfgs hitting max_iter=500 for
        # some C values; consider a higher max_iter if those runs matter.
        model = LogisticRegression(C=c, solver='lbfgs', max_iter=500)
        model.fit(X_train_f, y_train)

        # Model predictions on both splits
        y_train_pred = model.predict(X_train_f)
        y_valid_pred = model.predict(X_valid_f)

        # Score each split once and reuse the values below
        current_train_acc = accuracy_score(y_train, y_train_pred)
        current_valid_acc = accuracy_score(y_valid, y_valid_pred)
        train_acc.append(current_train_acc)
        valid_acc.append(current_valid_acc)
        print("Val_Accuracy for C={}: {}".format(c, current_valid_acc))

        # Checkpoint: keep the model with the highest validation accuracy.
        # ">=" means a later C replaces an earlier one on a tie, exactly as the
        # previous "equals the running maximum" check did.
        if current_valid_acc >= best_valid_acc:
            # The context manager closes (and flushes) the checkpoint file
            with open("./Models/model_2.pkl", 'wb') as model_file:
                pickle.dump(model, model_file)
            best_c = c
            best_train_acc = current_train_acc
            best_valid_acc = current_valid_acc

    print("Best model\n\tc: {}\n\ttrain acc: {}\n\tval acc: {}".format(best_c, best_train_acc, best_valid_acc))

Val_Accuracy for C=0.01: 0.8388333333333333
Val_Accuracy for C=0.05: 0.8554333333333334
Val_Accuracy for C=0.25: 0.8747
Val_Accuracy for C=0.5: 0.8805666666666667
Val_Accuracy for C=1: 0.8839333333333333
Val_Accuracy for C=10: 0.8875


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Val_Accuracy for C=100: 0.8856666666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Val_Accuracy for C=1000: 0.8851
Val_Accuracy for C=10000: 0.8852
Best model
	c: 10
	train acc: 0.8977333333333334
	val acc: 0.8875


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 3.3.4.- Guardando resultados

In [22]:
# Load the checkpointed model 2 from disk.
with open('./Models/model_2.pkl', 'rb') as file:
    model = pickle.load(file)

In [23]:
# (y_true, y_pred) pairs for each split, unpacked into the metric functions later.
error_train_params = (y_train, model.predict(X_train_f))
error_valid_params = (y_valid, model.predict(X_valid_f))

In [24]:
# Scores: confusion matrix, classification report and accuracy for the
# training and validation splits, keyed by metric name.
scores = {
    "Name": "BoW + Logistic Regression",
    "Confusion Matrix train": confusion_matrix(*error_train_params),
    "Report train": classification_report(*error_train_params),
    "Accuracy train": accuracy_score(*error_train_params),
    "Confusion Matrix val": confusion_matrix(*error_valid_params),
    "Report val": classification_report(*error_valid_params),
    "Accuracy val": accuracy_score(*error_valid_params)
}

In [25]:
# Save the scores; the context manager guarantees the file is flushed and closed.
with open("./Score/model_2_scores.pkl", 'wb') as score_file:
    pickle.dump(scores, score_file)

# 3.4.- Modelo 3 - LSTM

## 3.4.1.- Vectorización (Word-Embedding)

In [26]:
class callback(CallbackAny2Vec):
    """Word2Vec training callback that prints the loss after every epoch.

    The value read from the model is treated as a running total, so from the
    second epoch onwards the previously stored total is subtracted before
    printing.
    """

    def __init__(self):
        # Number of epochs reported so far
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the loss for the epoch that just ended and remember the total."""
        running_total = model.get_latest_training_loss()
        # On the first epoch there is no previous total to subtract
        epoch_loss = running_total if self.epoch == 0 else running_total - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.loss_previous_step = running_total
        self.epoch += 1

In [27]:
class W2V:
    """Word2Vec wrapper that trains embeddings and turns reviews into padded id sequences.

    Reviews are whitespace-separated strings. ``self.w2v`` holds the gensim
    ``Word2Vec`` model and ``self.vector_size`` the embedding width, which is
    also the default padded sequence length used by ``transform``.
    """

    def __init__(self, vector_size, window, min_count, sg, hs, negative, workers, seed):
        """Create the underlying ``Word2Vec`` model; all arguments are forwarded unchanged."""
        self.vector_size = vector_size
        self.w2v = Word2Vec(vector_size=vector_size,
                            window=window,
                            min_count=min_count,
                            sg=sg,
                            hs=hs,
                            negative=negative,
                            workers=workers,
                            seed=seed)

    def build_vocabulary(self, data):
        """Build the model vocabulary from ``data``."""
        self.w2v.build_vocab(self.buid_corpus(data))

    def update_vocabulary(self, data):
        """Extend the existing vocabulary with the words of ``data`` (``update=True``)."""
        self.w2v.build_vocab(self.buid_corpus(data), update=True)

    def buid_corpus(self, train_data):
        """Split every review on whitespace and return the list of token lists.

        The misspelled name is kept because it is part of the public interface.
        """
        return [review.split() for review in train_data]

    def fit(self, train_data, epochs=2):
        """Build the vocabulary from ``train_data`` and train for ``epochs`` epochs.

        The per-epoch loss is printed through the ``callback`` class.
        """
        # Tokenize once and reuse the corpus for both vocabulary and training
        corpus = self.buid_corpus(train_data)
        self.w2v.build_vocab(corpus)
        self.w2v.train(corpus,
                       total_examples=self.w2v.corpus_count,
                       epochs=epochs,
                       compute_loss=True,
                       callbacks=[callback()])

    def _encode(self, data):
        """Map every review to its list of vocabulary ids, dropping unknown words."""
        word2id = self.w2v.wv.key_to_index
        # Unknown words are skipped: a None placeholder cannot be padded into
        # an integer array.
        return [[word2id[word] for word in review.split() if word in word2id]
                for review in data]

    def transform(self, data, maxlen=None):
        """Return the reviews as a padded integer id matrix.

        Parameters
        ----------
        data : iterable of str
            Whitespace-separated reviews.
        maxlen : int, optional
            Padded sequence length. Defaults to ``self.vector_size``, which is
            the length that was always used before this parameter existed.

        Notes
        -----
        Padding uses ``pad_sequences`` defaults, so the pad value is 0, which
        is also a valid vocabulary id.
        """
        if maxlen is None:
            maxlen = self.vector_size
        return sequence.pad_sequences(self._encode(data), maxlen=maxlen)

    def get_vocabulary(self):
        """Return the word -> id mapping of the trained model."""
        return self.w2v.wv.key_to_index

    def get_weights(self):
        """Return the embedding matrix of the trained model."""
        return self.w2v.wv.vectors

In [28]:
# 95th percentile of the per-review token counts in X_train.
max_words = int(np.quantile([len(words.split()) for words in X_train], 0.95))
# Keyword arguments for the W2V wrapper.
# NOTE(review): 'vector_size' is set to max_words, and W2V.transform pads to
# vector_size, so embedding width and padded sequence length are the same number here.
# NOTE(review): presumably sg=0 selects CBOW (per the variable name) and
# hs=0 / negative=20 selects negative sampling — confirm against the gensim docs.
# NOTE(review): with workers=8 the seed alone may not make training reproducible — confirm.
cbow_params = {
            'vector_size': max_words,
            'window': 10,
            'min_count': 1,
            'sg': 0,
            'hs': 0,
            'negative': 20,
            'workers' : 8,
            'seed' : 52
}

In [29]:
# Build the Word2Vec wrapper from the parameter dictionary.
w2v = W2V(**cbow_params)

In [30]:
# Build the vocabulary from the training split and train for 5 epochs.
w2v.fit(X_train, epochs=5)

Loss after epoch 0: 3383253.0
Loss after epoch 1: 2729093.0
Loss after epoch 2: 2592010.0
Loss after epoch 3: 2409132.0
Loss after epoch 4: 2346413.0


In [31]:
# Add the validation words to the Word2Vec vocabulary so transform() can map them to ids.
# NOTE(review): no training call follows in the visible cells, so these added
# words presumably keep their initial vectors — confirm.
w2v.update_vocabulary(X_valid)

In [32]:
# Encode both splits as padded id sequences.
# Note: this rebinds X_train_f / X_valid_f, replacing the bag-of-words features.
X_train_f = w2v.transform(X_train)
X_valid_f = w2v.transform(X_valid)

In [33]:
X_valid_f

array([[    0,     0,     0, ...,     9,   421,  2147],
       [    0,     0,     0, ...,    39,    21,    81],
       [    0,     0,     0, ...,     8,  6965,  1802],
       ...,
       [    0,     0,     0, ...,    17,   256,    80],
       [  108,     0, 21646, ...,     3,  2544,     6],
       [    0,     0,     0, ...,   206,  2363,   422]])

## 3.4.2.- LSTM

In [34]:
# Parameters for the embedding layer: the Word2Vec matrix and its two dimensions.
# ("emdedding_size" is misspelled, but the name is used by the model-building cell.)
w2v_weights = w2v.get_weights()
vocab_size, emdedding_size = w2v_weights.shape

In [35]:
w2v_weights.shape

(53652, 236)

In [36]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, RepeatVector

with tf.device('gpu:0'):
    # Sequential stack: pretrained embedding -> LSTM -> dense classifier head
    model = Sequential()
    # Embedding initialised from the Word2Vec vectors. It is frozen here, at
    # construction time, so the flag is already in place when compile() runs
    # below; Keras expects trainable changes to be made before compiling.
    # NOTE(review): sequences are padded with 0, which is also a valid word id,
    # and no masking is configured.
    model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, 
                        weights=[w2v_weights], trainable=False))
    # LSTM + Dropout
    model.add(LSTM(256))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    # Single sigmoid unit: binary classification output
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', 
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.0003), 
                  metrics=['accuracy'])

In [37]:
# Mark the first layer (the embedding) as non-trainable, then list every
# layer with its trainable flag.
# NOTE(review): this assignment runs after model.compile(); Keras documents that
# trainable changes need a new compile() call to be taken into account — confirm.
model.layers[0].trainable = False
for l in model.layers:
    print(l.name, l.trainable)

embedding False
lstm True
dropout True
dense True
dropout_1 True
dense_1 True


In [38]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 236)         12661872  
                                                                 
 lstm (LSTM)                 (None, 256)               504832    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 256)               65792     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 13,232,753
Trainable params: 570,881
Non-t

In [39]:
if train_model_3_lstm:
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

    # Training hyper-parameters
    batch_size = 256
    num_epochs = 10

    # Train with validation monitoring:
    #  - EarlyStopping stops after 3 epochs without val_loss improvement and
    #    restores the best weights.
    #  - ModelCheckpoint writes the best model to ./Models/model_3.h5
    #    (no monitor is given, so its default metric is used).
    model.fit(X_train_f, y_train,
              validation_data=(X_valid_f, y_valid),
              batch_size=batch_size, epochs=num_epochs,
              callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True),
                         ModelCheckpoint("./Models/model_3.h5", save_best_only=True)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


## 3.4.3.- Guardamos resultados

In [40]:
# Convert the network output to the same format as y
def prepare_lstm_output(y_pred):
    """Round the predictions and return them as a flat int64 array."""
    rounded = np.asarray(np.round(y_pred))
    labels = rounded.astype(np.int64)
    return labels.ravel()

In [41]:
from tensorflow.keras.models import load_model
# Load the checkpointed LSTM model from disk.
model = load_model("./Models/model_3.h5")

In [42]:
# (y_true, y_pred) pairs for each split; the network output is converted to
# flat integer labels first.
error_train_params = (y_train, prepare_lstm_output(model.predict(X_train_f)))
error_valid_params = (y_valid, prepare_lstm_output(model.predict(X_valid_f)))

In [43]:
# Scores: confusion matrix, classification report and accuracy for the
# training and validation splits, keyed by metric name.
scores = {
    "Name": "Word2Vec + LSTM",
    "Confusion Matrix train": confusion_matrix(*error_train_params),
    "Report train": classification_report(*error_train_params),
    "Accuracy train": accuracy_score(*error_train_params),
    "Confusion Matrix val": confusion_matrix(*error_valid_params),
    "Report val": classification_report(*error_valid_params),
    "Accuracy val": accuracy_score(*error_valid_params)
}

In [44]:
# Save the scores; the context manager guarantees the file is flushed and closed.
with open("./Score/model_3_scores.pkl", 'wb') as score_file:
    pickle.dump(scores, score_file)

# 3.5.- Predicción en conjunto de test
Llegamos a la parte final: vamos a ver cómo se comportan nuestros modelos entrenados con el conjunto de test para posteriormente evaluarlos.

In [46]:
# Load the pickled train and test sets from ./Vars.
# Note: X_train / y_train are rebound here to the full pickled training set,
# replacing the 80% split variables of the same name.
with open('./Vars/X_train.pkl', 'rb') as file:
    X_train = pickle.load(file)
    
with open('./Vars/y_train.pkl', 'rb') as file:
    y_train = pickle.load(file)
    
with open('./Vars/X_test.pkl', 'rb') as file:
    X_test = pickle.load(file)
    
with open('./Vars/y_test.pkl', 'rb') as file:
    y_test = pickle.load(file)

In [47]:
# Helper that returns a previously saved score object
def score_from_pkl(path):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, 'rb') as score_file:
        stored_scores = pickle.load(score_file)
    return stored_scores

In [48]:
# Score files of models 1, 2 and 3, in that order (indexed as paths[0..2] below).
paths = ["./Score/model_1_scores.pkl", 
         "./Score/model_2_scores.pkl", 
         "./Score/model_3_scores.pkl"]

## 3.5.1.- Modelo 1

In [49]:
# Load the checkpointed model 1 from disk.
with open('./Models/model_1.pkl', 'rb') as file:
    model = pickle.load(file)

In [50]:
# Format the input: vectorize the test set with the TF-IDF wrapper fitted earlier.
X_test_f = tf_idf.transform(X_test)

In [51]:
# (y_true, y_pred) pair for the test set.
error_test_params = (y_test, model.predict(X_test_f))

In [52]:
# Load the stored scores
scores = score_from_pkl(paths[0])
# Add the test-set metrics
scores.update({
    "Confusion Matrix test": confusion_matrix(*error_test_params),
    "Report test": classification_report(*error_test_params),
    "Accuracy test": accuracy_score(*error_test_params)
})
# Save the scores; the context manager guarantees the file is flushed and closed
with open(paths[0], 'wb') as score_file:
    pickle.dump(scores, score_file)

In [53]:
if train_model_1:
    # Save the scores; the context manager guarantees the file is flushed and closed
    with open("./Score/model_1_scores.pkl", 'wb') as score_file:
        pickle.dump(scores, score_file)

## 3.5.2.- Modelo 2

In [54]:
# Load the checkpointed model 2 from disk.
with open('./Models/model_2.pkl', 'rb') as file:
    model = pickle.load(file)

In [55]:
# Format the input: vectorize and normalize the test set with the bag-of-words
# wrapper fitted earlier.
X_test_f = bow.transform(X_test, normalize=True)

In [56]:
# (y_true, y_pred) pair for the test set.
error_test_params = (y_test, model.predict(X_test_f))

In [57]:
# Load the stored scores
scores = score_from_pkl(paths[1])
# Add the test-set metrics
scores.update({
    "Confusion Matrix test": confusion_matrix(*error_test_params),
    "Report test": classification_report(*error_test_params),
    "Accuracy test": accuracy_score(*error_test_params)
})
# Save the scores; the context manager guarantees the file is flushed and closed
with open(paths[1], 'wb') as score_file:
    pickle.dump(scores, score_file)

In [58]:
# Guard with model 2's own flag: the previous train_model_1 check was a
# copy-paste slip from the model 1 section.
if train_model_2:
    # Save the scores; the context manager guarantees the file is flushed and closed
    with open("./Score/model_2_scores.pkl", 'wb') as score_file:
        pickle.dump(scores, score_file)

## 3.5.3.- Modelo 3

In [59]:
# Load the checkpointed LSTM model from disk.
model = load_model("./Models/model_3.h5")

In [60]:
# Format the input.
# The Embedding layer was sized from the Word2Vec matrix as it stood when the
# network was built, so growing the vocabulary now would hand the model token
# ids beyond the rows of its embedding matrix. Keep only the words the
# vocabulary already contains instead of adding the test words to it.
known_words = w2v.get_vocabulary()
X_test_known = [" ".join(word for word in review.split() if word in known_words)
                for review in X_test]
X_test_f = w2v.transform(X_test_known)

In [61]:
# (y_true, y_pred) pair for the test set; the network output is converted to
# flat integer labels first.
error_test_params = (y_test, prepare_lstm_output(model.predict(X_test_f)))

In [62]:
# Load the stored scores
scores = score_from_pkl(paths[2])
# Add the test-set metrics
scores.update({
    "Confusion Matrix test": confusion_matrix(*error_test_params),
    "Report test": classification_report(*error_test_params),
    "Accuracy test": accuracy_score(*error_test_params)
})
# Save the scores; the context manager guarantees the file is flushed and closed
with open(paths[2], 'wb') as score_file:
    pickle.dump(scores, score_file)

In [63]:
# Guard with the LSTM's own flag: the previous train_model_1 check was a
# copy-paste slip from the model 1 section.
if train_model_3_lstm:
    # Save the scores; the context manager guarantees the file is flushed and closed
    with open("./Score/model_3_scores.pkl", 'wb') as score_file:
        pickle.dump(scores, score_file)