In [1]:
import pandas as pd
import numpy as np
import nltk
import unidecode
from utils import preprocess_text, remove_stopwords, lower_token
from nltk.corpus import stopwords
from nltk import word_tokenize
from numpy import array, asarray, zeros

from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.preprocessing.text import Tokenizer

In [121]:
#First we need to download nltk stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rodrigo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rodrigo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
data = pd.read_csv("corpus/corpus.csv", names=['text','sentiment'], header=0)
data2 = pd.read_csv("corpus/2clases_es_generaltassisol_pub.csv", names=['text','sentiment'], header=0)
data3 = pd.read_csv("corpus/data_def.csv", names=['text','sentiment'], header=0)

#Join all dataframes in a single dataframe
data = pd.concat([data,data3,data2])

#Check the data is correctly readed
print("Dataframe data")
data.head()

Dataframe data


Unnamed: 0,text,sentiment
0,"Salgo de #VeoTV , que día más largoooooo...",NONE
1,@PauladeLasHeras No te libraras de ayudar me/n...,NEU
2,@marodriguezb Gracias MAR,P
3,"Off pensando en el regalito Sinde, la que se v...",N
4,Conozco a alguien q es adicto al drama! Ja ja ...,P


In [3]:
#Now we check the shape
print("Dataframe shape")
data.shape

Dataframe shape


(24882, 2)

In [4]:
#Now we check the sentiment count
print("Dataframe sentiment count")
data.sentiment.value_counts()

Dataframe sentiment count


neg         7807
pos         5393
P           3531
N           2865
NONE        2664
NEU         1123
negativo     869
neutro       494
positivo     136
Name: sentiment, dtype: int64

In [5]:
#Pre process tweets and save them as a new column in the dataframe
data['text_clean'] = data['text'].apply(lambda x: preprocess_text(str(x)))
data.head()

Unnamed: 0,text,sentiment,text_clean
0,"Salgo de #VeoTV , que día más largoooooo...",NONE,Salgo de VeoTV que día más largoooooo
1,@PauladeLasHeras No te libraras de ayudar me/n...,NEU,No te libraras de ayudar me nos Besos gracias
2,@marodriguezb Gracias MAR,P,Gracias MAR
3,"Off pensando en el regalito Sinde, la que se v...",N,Off pensando en el regalito Sinde la que se va...
4,Conozco a alguien q es adicto al drama! Ja ja ...,P,Conozco alguien es adicto al drama Ja ja ja te...


In [6]:
#Transform sentences into tokens
tokens = [word_tokenize(sen) for sen in data.text_clean]

#Put all the words to lowercase
lower_tokens = [lower_token(token) for token in tokens]

#Import spanish stopwords
stoplist = stopwords.words('spanish')

#Remove stopwords from sentences for better process
filtered_words = [remove_stopwords(sen, stoplist) for sen in lower_tokens]

#Update processed text from dataframe with the new filtered sentences
data['text_clean'] = [' '.join(sen) for sen in filtered_words]
#Create a new column that will have the same words but as tokens
data['tokens'] = filtered_words

In [7]:
#Transform sentiment label to three columns in dataset for three outputs
pos = []
neg = []
neu = []

for sent in data.sentiment:
    if sent == 'P' or sent=='pos' or sent=='positivo':
        neu.append(0)
        pos.append(1)
        neg.append(0)
    elif sent == 'N' or sent=='neg' or sent=='negativo':
        pos.append(0)
        neg.append(1)
        neu.append(0)
    else:
        neu.append(1)
        pos.append(0)
        neg.append(0)
        
data['Pos'] = pos
data['Neg'] = neg
data['Neu'] = neu


In [8]:
#Redeclare dataframe with selected columns
data = data[['text_clean', 'tokens', 'sentiment', 'Pos', 'Neu', 'Neg']]
data.head()

Unnamed: 0,text_clean,tokens,sentiment,Pos,Neu,Neg
0,salgo veotv día largoooooo,"[salgo, veotv, día, largoooooo]",NONE,0,1,0
1,libraras ayudar besos gracias,"[libraras, ayudar, besos, gracias]",NEU,0,1,0
2,gracias mar,"[gracias, mar]",P,1,0,0
3,off pensando regalito sinde va sgae van corrup...,"[off, pensando, regalito, sinde, va, sgae, van...",N,0,0,1
4,conozco alguien adicto drama ja ja ja suena,"[conozco, alguien, adicto, drama, ja, ja, ja, ...",P,1,0,0


In [9]:
#Split data for test and training 
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

In [10]:
#Get total words in the train dataframe
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]

#Get all the sentence lengths from train dataframe
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]

#Get all the words without duplicates in the train dataframe
TRAINING_VOCAB = sorted(list(set(all_training_words)))

print("%s words, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

152518 words total, with a vocabulary size of 30260
Max sentence length is 39


In [11]:
#Get total words in the test dataframe
all_test_words = [word for tokens in data_test["tokens"] for word in tokens]

#Get all the sentence lengths from test dataframe
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens"]]

#Get all the words without duplicates in the test dataframe
TEST_VOCAB = sorted(list(set(all_test_words)))

print("%s words, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

16770 words total, with a vocabulary size of 7890
Max sentence length is 30


In [12]:
#Now we start using tokenizer for sentences

MAX_SEQUENCE_LENGTH = 50  #Max length that a sentence should have
EMBEDDING_DIM = 300   #Dimension of embedding (the same as the dimension of glove embeddings)

#Declare Tokenizer
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)

#Fit tokenizer with training data
tokenizer.fit_on_texts(data_train["text_clean"].tolist())

#Transform sentences from both datasets into sequences with tokenizer
training_sequences = tokenizer.texts_to_sequences(data_train["text_clean"].tolist())
test_sequences = tokenizer.texts_to_sequences(data_test["text_clean"].tolist())

#Pad the sequences adding 0s to reach the max sequence length
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 30260 unique tokens.


In [13]:
#load glove embeddings
embeddings_dictionary = dict()

#Open glove file
glove_file = open('glove/glove-sbwc.i25.vec', encoding="utf8")

#Iterate all lines in glove file 
for line in glove_file:
    #Split words
    records = line.split()
    
    #The first line should not be considered
    if len(records) == 2:
        continue
        
    #Save data in the dictionary
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions

#Close glove file
glove_file.close()

#Create an array with the glove dimension of the embeddings and total unique tokens
train_embedding_weights = zeros((len(train_word_index)+1, EMBEDDING_DIM))

#Save embedding weights using weights from glove if has the word, otherwise use a random array with the same dimension
for word, index in train_word_index.items():
    train_embedding_weights[index,:] = embeddings_dictionary[word] if word in embeddings_dictionary else np.random.rand(EMBEDDING_DIM)

print(train_embedding_weights.shape)

(30261, 300)


In [39]:
#Now we define the model
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    #Create the embedding layer of the model
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    #Create the model as a sequential model using keras
    model = Sequential()
    #Add the embedding layer to the model
    model.add(embedding_layer)
    #Add a convolutional layer of one dimension and 128 filters with tanh activation
    model.add(Conv1D(128, 10, activation='tanh'))
    #Add a global pooling layer
    model.add(GlobalMaxPooling1D())
    #Add a dense layer with 3 outputs using softmax activation, as we have 3 possible answers
    model.add(Dense(3, activation='softmax'))
    #Compile model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    model.summary()
    return model

In [40]:
#Set the label names of the answers
label_names = ['Pos', 'Neu', 'Neg']
#Set training data to fit model
y_train = data_train[label_names].values
x_train = train_cnn_data

In [41]:
#Set number of epochs and batch size
num_epochs = 5
batch_size = 128

#Set test data to evaluate model
y_test = data_test[label_names].values
X_test = test_cnn_data

#Create model
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

#Fit model and evaluate
hist = model.fit(x_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test), shuffle=True, batch_size=batch_size)

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 50, 300)           9078300   
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 41, 128)           384128    
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 387       
Total params: 9,462,815
Trainable params: 384,515
Non-trainable params: 9,078,300
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
#Test the model with example
text = [
    "Partido complicado pero no perdido. Saldremos de esta 💪 Que la fe sea lo ultimo que se pierda 🙌",
    "@manuval68 @Minsa_Peru @Agencia_Andina @noticias_tvperu @RadioNacionalFM @DiarioElPeruano ASI ES!!!!!!!!!!!!!!!!!!!! MALDITAS FUJUIRRAATAAAAAASSSS!!!!!!!!!!!!!!!! #FujimoristasEnemigosDelPeru!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
    "@Minsa_Peru @Agencia_Andina @noticias_tvperu @RadioNacionalFM @DiarioElPeruano Cuidence por favor !!!!!!!!! distanciamiento fisico ! ❤️❤️❤️❤️🙏🤍💪 tomen distancia 🥺",
    "@rparrawong @Minsa_Peru Tienen el numero de contagios de hoy y fallecidos por regiones please?? 🙁",
    "Partido complicado pero no perdido. Saldremos de esta 💪 Que la fe sea lo ultimo que se pierda 🙌",
    "@pcmperu @nlcr5_ @presidenciaperu @PeruPaisDigital @MTC_GobPeru @Minsa_Peru @EsSaludPeru @elcomercio_peru @larepublica_pe @RPPNoticias @canalN_ @exitosape Otra cojudez de este gobierno incompetente"
]
text = [preprocess_text(t) for t in text]
text = tokenizer.texts_to_sequences(text)
print('----------------')
print(tokenizer.sequences_to_texts(text))
text = pad_sequences(text, maxlen=MAX_SEQUENCE_LENGTH)
predictions = model.predict(text)
for p in predictions:
    p = [round(num,5) for num in p]
    print(p)


----------------
['partido complicado perdido saldremos fe ultimo pierda', 'asi malditas fujuirraataaaaaassss fujimoristasenemigosdelperu', 'cuidence favor distanciamiento fisico tomen distancia', 'numero contagios hoy fallecidos regiones please', 'partido complicado perdido saldremos fe ultimo pierda', 'gobierno incompetente']
[0.96882, 0.02309, 0.00809]
[0.01396, 0.05144, 0.93459]
[0.08014, 0.76291, 0.15694]
[0.0328, 0.92384, 0.04336]
[0.96882, 0.02309, 0.00809]
[0.01748, 0.00121, 0.9813]


In [53]:
#export model
import pickle

with open('tokenizer2.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [54]:
model.save("cnn_model2")


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: cnn_model2\assets
