In [4]:
### Importar librerías
import nltk
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Métodos de Validación
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Flatten, Embedding, LSTM, Bidirectional, BatchNormalization
from tensorflow.keras.layers import MaxPooling1D, GlobalMaxPool1D, Conv1D, Dropout, Flatten
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.utils import to_categorical

In [6]:
# Cargar y tratar datos
df = pd.read_csv("../data/inputs/data_news_sentiment.csv", encoding = "ISO-8859-1")



In [7]:
df.shape
df.describe()
df.info()
df.isna().sum()
df.isnull().sum()/len(df)*100
df.duplicated().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16595 entries, 0 to 16594
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   16595 non-null  object
 1   Sentiment  16595 non-null  object
dtypes: object(2)
memory usage: 259.4+ KB


64

In [8]:
df['Sentence_len'] = df['Sentence'].apply(lambda x: len(x.split(' ')))
df

Unnamed: 0,Sentence,Sentiment,Sentence_len
0,SpiceJet to issue 6.4 crore warrants to promoters,neutral,8
1,MMTC Q2 net loss at Rs 10.4 crore,neutral,8
2,"Mid-cap funds can deliver more, stay put: Experts",positive,8
3,Mid caps now turn into market darlings,positive,7
4,"Market seeing patience, if not conviction: Pra...",neutral,8
...,...,...,...
16590,RISING costs have forced packaging producer Hu...,negative,17
16591,Nordic Walking was first used as a summer trai...,neutral,14
16592,"According shipping company Viking Line , the E...",neutral,16
16593,"In the building and home improvement trade , s...",neutral,18


In [9]:
num_classes = len(df['Sentiment'].unique())
df['Sentiment'].value_counts(normalize=True)

Sentiment
neutral     0.396083
positive    0.362880
negative    0.241036
Name: proportion, dtype: float64

#### Duplicados y NaN´s

In [10]:
# Se crea la columna 'texto' con la unión de las columnas 'title' y 'text'. Será la que se emplee para el análisis.
df['Sentence'].fillna('', inplace=True)
df['Sentiment'].fillna('', inplace=True)
# df['texto'] = df['title'] + ' ' + df['text']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sentence'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sentiment'].fillna('', inplace=True)


In [11]:
# Se eliminan los duplicados y las columnas que no interesan
df.shape[0] != df.drop_duplicates("Sentence").shape[0]
df[(df['Sentence'].isna()) | (df['Sentiment'].isna())]
df[df.duplicated("Sentence",keep=False)].sort_values(by="Sentence")
df = df.drop_duplicates("Sentence")
df.shape

(16008, 3)

#### 1.- Funciones de limpieza y tokenización


In [12]:
def limpiar_texto(texto):
    texto = texto.lower()
    texto_limpio = ""
    for s in texto:
        if s.isalnum() or s.isspace():
            texto_limpio += s
    return texto_limpio

In [13]:
def generar_stopwords(X):
    X = df['Sentence'].apply(limpiar_texto).values
    count_vectorizer = CountVectorizer(max_features = 8000)
    count_vectorizer.fit_transform(X)
    vocabulario_ordenado = sorted(count_vectorizer.vocabulary_.items(), key = lambda x : x[0], reverse=False)
    STOPWORDS = nltk.corpus.stopwords.words("english")
    lista_stopwords = [ item[0] for item in vocabulario_ordenado if item[0] > 'zz' or item[0] < 'aa']
    STOPWORDS = set(STOPWORDS).union(set(lista_stopwords))
    return STOPWORDS

In [14]:
def eliminar_stopwords(texto):
    tokens = nltk.word_tokenize(text = texto)
    tokens = [token for token in tokens if token not in STOPWORDS]
    # return tokens
    return " ".join(tokens) #

In [15]:
def preproceso(texto):
    texto_clean = limpiar_texto(texto)
    tokens = eliminar_stopwords(texto_clean)
    return texto_clean

In [16]:
def target_encoding(sentiment,num_classes):
    label_encoder = LabelEncoder()
    sentiment_encoded = label_encoder.fit_transform(sentiment)
    return to_categorical(sentiment_encoded, num_classes=num_classes)

In [17]:
# definir stopwords
STOPWORDS = generar_stopwords (df['Sentence'])
# ejecutar preproceso
df['Sentence_clean'] = df['Sentence'].apply(preproceso)
X = df['Sentence_clean'].values
# Target encoding
y = target_encoding(df['Sentiment'].values,num_classes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentence_clean'] = df['Sentence'].apply(preproceso)


3.- Vectorización


In [18]:
# vocab_length
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

19755

In [19]:
texts = X
def embed(corpus): 
    return word_tokenizer.texts_to_sequences(corpus)

longest_train = max(texts, key=lambda sentence: len(embed(sentence)))
length_long_sentence = len(embed(longest_train))

train_padded_sentences = pad_sequences(
    embed(texts), 
    length_long_sentence, 
    padding='post'
)

train_padded_sentences

array([[1029,    2,  182, ...,    0,    0,    0],
       [2754,   70,   12, ...,    0,    0,    0],
       [ 521,  276,  110, ...,    0,    0,    0],
       ...,
       [ 754, 6335,   57, ...,    0,    0,    0],
       [ 267,  811,   24, ...,    0,    0,    0],
       [ 185, 1225, 5474, ...,    0,    0,    0]])

## Modelo

<!-- Modelos -->

In [35]:
# Importar Glove

embeddings_dictionary = dict()
embedding_dim = 100

# Load GloVe 100D embeddings
with open('../data/raw/Glove/glove.6B.100d.txt',encoding='utf-8') as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

# embeddings_dictionary

In [36]:
# Now we will load embedding vectors of those words that appear in the
# Glove dictionary. Others will be initialized to 0.

embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        # try:
            embedding_matrix[index] = embedding_vector
        # except: pass
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.18970001,  0.050024  ,  0.19084001, ..., -0.39804   ,
         0.47646999, -0.15983   ],
       ...,
       [ 0.30169001,  0.34529001, -0.031754  , ...,  0.32038999,
        -0.20541   , -0.15347999],
       [ 0.073452  , -0.020464  , -0.05098   , ..., -0.43972   ,
         0.040741  ,  0.37771001],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences, 
    y, 
    test_size=0.25)

In [22]:
X_train

array([[  145,   873,   126, ...,     0,     0,     0],
       [ 4272,    72,  7442, ...,     0,     0,     0],
       [ 2172,    35,    91, ...,     0,     0,     0],
       ...,
       [ 2642,   165, 17164, ...,     0,     0,     0],
       [11611,     2,   589, ...,     0,     0,     0],
       [   44,   540,   206, ...,     0,     0,     0]])

In [39]:
## Modelo

def glove_lstm():
    model = Sequential()
    
    # Capa de Entrada
    model.add(Input(shape = (X_train.shape[1], )))

    model.add(Embedding(
        input_dim=embedding_matrix.shape[0], 
        output_dim=embedding_matrix.shape[1], 
        weights = [embedding_matrix], 
        input_length=length_long_sentence,
        trainable=False
    ))
    
    model.add(Bidirectional(LSTM(
        units = length_long_sentence,
        activation = "tanh",
        return_sequences = True, 
        recurrent_dropout=0.2
    )))
    
    model.add(GlobalMaxPool1D())
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    # model.add(Dense(256, activation = "relu"))
    # model.add(Dropout(0.2))
    model.add(Dense(128, activation = "relu"))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation = "relu"))
    model.add(Dropout(0,2))
    model.add(Dense(num_classes, activation = "softmax"))
    
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

model = glove_lstm()
model.summary()



In [40]:
model = glove_lstm()

checkpoint = ModelCheckpoint(
    '../models/model_glove_lstm.keras', 
    monitor = 'val_loss', 
    verbose = 1, 
    save_best_only = True
)
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss', 
    factor = 0.2, 
    verbose = 1, 
    patience = 5,                        
    min_lr = 0.001
)
history = model.fit(
    X_train, 
    y_train, 
    epochs = 10,
    batch_size = 64,
    validation_data = (X_test, y_test),
    verbose = 1,
    callbacks = [reduce_lr, checkpoint]
)

Epoch 1/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 943ms/step - accuracy: 0.5587 - loss: 0.9453
Epoch 1: val_loss improved from inf to 0.93428, saving model to model_glove_lstm.keras
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 1s/step - accuracy: 0.5590 - loss: 0.9448 - val_accuracy: 0.6627 - val_loss: 0.9343 - learning_rate: 0.0010
Epoch 2/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1000ms/step - accuracy: 0.7197 - loss: 0.6687
Epoch 2: val_loss improved from 0.93428 to 0.69855, saving model to model_glove_lstm.keras
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 1s/step - accuracy: 0.7197 - loss: 0.6686 - val_accuracy: 0.7321 - val_loss: 0.6985 - learning_rate: 0.0010
Epoch 3/10
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 995ms/step - accuracy: 0.7684 - loss: 0.5503
Epoch 3: val_loss improved from 0.69855 to 0.56330, saving model to model_glove_lstm.keras
[1m188/188

KeyboardInterrupt: 

In [27]:
test_loss, test_accuracy = model.evaluate(
    X_test, y_test)


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 125ms/step - accuracy: 0.8786 - loss: 0.3283


In [28]:
test_loss, test_accuracy

(0.3306387960910797, 0.8795602321624756)

In [52]:
df_pred = pd.DataFrame(y_test, columns=['Negative', 'Neutral', 'Positive'])


In [60]:
df_pred

Unnamed: 0,Negative,Positive,Neutral
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
3997,0.0,1.0,0.0
3998,0.0,0.0,1.0
3999,0.0,1.0,0.0
4000,0.0,0.0,1.0


In [61]:
df_pred2 = df_pred[['Positive', 'Neutral','Negative']]

In [24]:
X_test.shape


(4002, 298)

In [21]:
y_pred = model.predict(X_test)
ypred = np.argmax(y_pred, axis = 1)
y_pred_one_hot = np.eye(y_pred.shape[1])[ypred]

[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 128ms/step


In [28]:
y_pred

array([[1.8324562e-03, 4.4972086e-01, 5.4844666e-01],
       [6.9685169e-02, 5.2426562e-02, 8.7788826e-01],
       [8.8785699e-04, 2.2336128e-04, 9.9888879e-01],
       ...,
       [3.9290555e-02, 1.3086728e-03, 9.5940065e-01],
       [1.2796413e-02, 9.2025781e-01, 6.6945843e-02],
       [2.2832664e-02, 9.4525152e-01, 3.1915795e-02]], dtype=float32)

In [27]:
ypred

array([2, 2, 2, ..., 2, 1, 1], dtype=int64)

In [24]:
ytest = np.argmax(y_test, axis = 1)

In [25]:
confusion_matrix(ytest, ypred)

array([[ 841,   78,   48],
       [  82, 1392,   76],
       [  59,  139, 1287]], dtype=int64)

In [None]:
X_test


array([[  853,  1790, 10901, ...,     0,     0,     0],
       [  918,  5481,    92, ...,     0,     0,     0],
       [12392,   746,    15, ...,     0,     0,     0],
       ...,
       [ 3357,    66,   207, ...,     0,     0,     0],
       [ 5508,  1140,   281, ...,     0,     0,     0],
       [19249,   502,   280, ...,     0,     0,     0]])

In [None]:
bad_results=[]
for i in range (X_test.shape[0]):
    if ytest[i] != ypred[i]:
        bad_results.append(X_test[i:]) 


In [None]:
len(bad_results)

889

In [26]:
import tensorflow as tf

# # Ruta del archivo del modelo guardado
# model_path = "proyecto_3/4_notebooks/model_glove_lstm.keras"
# Cargar el modelo guardado
model = load_model('../models/model_glove_lstm.keras', compile=False)
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Mostrar el resumen del modelo para verificar que se cargó correctamente
model.summary()

In [None]:
y_pred = model.predict(X_test)
ypred = np.argmax(y_pred, axis = 1)


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 143ms/step


In [None]:
y_pred_one_hot = np.eye(y_pred.shape[1])[ypred]