# TextCNN for sentiment analysis classification problem

In [53]:
import os
import re

import warnings
warnings.simplefilter("ignore", UserWarning)
from matplotlib import pyplot as plt
%matplotlib inline


import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np 
from string import punctuation

from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, roc_auc_score
from sklearn.externals import joblib

import scipy
from scipy.sparse import hstack

In [103]:
from keras.models import Model
from keras.models import Sequential

from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Activation
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate, BatchNormalization
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam

from keras.models import load_model
from keras.utils.vis_utils import plot_model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## 1. Load Raw Data

In [78]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

In [7]:
data = pd.read_csv('./data/tweets.csv', encoding='latin1', usecols=['Sentiment', 'SentimentText'])
# data.columns = ['sentiment', 'text'])

In [8]:
data.columns = ['sentiment', 'text']

In [9]:
print(data.shape)

(1578614, 2)


In [10]:
data.head()

Unnamed: 0,sentiment,text
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


## 2. Clean Data

In [11]:
def tokenize(tweet):
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r"#(\w+)",'', tweet)
    tweet = re.sub(r"@(\w+)", '', tweet)
    tweet = re.sub(r'[^\w\s]', '', tweet)
    tweet = tweet.strip().lower()
    tokens = word_tokenize(tweet)
    return tokens

In [12]:
data['tokens'] = data['text'].progress_map(tokenize)

progress-bar: 100%|██████████| 1578614/1578614 [05:00<00:00, 5247.16it/s]


In [13]:
data.head()

Unnamed: 0,sentiment,text,tokens
0,0,is so sad for my APL frie...,"[is, so, sad, for, my, apl, friend]"
1,0,I missed the New Moon trail...,"[i, missed, the, new, moon, trailer]"
2,1,omg its already 7:30 :O,"[omg, its, already, 730, o]"
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,"[omgaga, im, sooo, im, gunna, cry, ive, been, ..."
4,0,i think mi bf is cheating on me!!! ...,"[i, think, mi, bf, is, cheating, on, me, t_t]"


In [14]:
data['clean_text'] = data['tokens'].progress_map(lambda tokens: ' '.join(tokens))

progress-bar: 100%|██████████| 1578614/1578614 [00:05<00:00, 314056.67it/s]


In [15]:
data.head()

Unnamed: 0,sentiment,text,tokens,clean_text
0,0,is so sad for my APL frie...,"[is, so, sad, for, my, apl, friend]",is so sad for my apl friend
1,0,I missed the New Moon trail...,"[i, missed, the, new, moon, trailer]",i missed the new moon trailer
2,1,omg its already 7:30 :O,"[omg, its, already, 730, o]",omg its already 730 o
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,"[omgaga, im, sooo, im, gunna, cry, ive, been, ...",omgaga im sooo im gunna cry ive been at this d...
4,0,i think mi bf is cheating on me!!! ...,"[i, think, mi, bf, is, cheating, on, me, t_t]",i think mi bf is cheating on me t_t


### Save cleaned Data

In [16]:
data[['sentiment', 'clean_text']].to_csv('./data/cleaned_text.csv')

## 3. Load Cleaned Data

In [23]:
data = pd.read_csv('./data/cleaned_text.csv')
print(data.shape)

(1578614, 3)


In [24]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentiment,clean_text
0,0,0,is so sad for my apl friend
1,1,0,i missed the new moon trailer
2,2,1,omg its already 730 o
3,3,0,omgaga im sooo im gunna cry ive been at this d...
4,4,0,i think mi bf is cheating on me t_t


In [25]:
data.drop('Unnamed: 0', axis=1)

Unnamed: 0,sentiment,clean_text
0,0,is so sad for my apl friend
1,0,i missed the new moon trailer
2,1,omg its already 730 o
3,0,omgaga im sooo im gunna cry ive been at this d...
4,0,i think mi bf is cheating on me t_t
5,0,or i just worry too much
6,1,juuuuuuuuuuuuuuuuussssst chillin
7,0,sunny again work tomorrow tv tonight
8,1,handed in my uniform today i miss you already
9,1,hmmmm i wonder how she my number


In [26]:
tweets = data['clean_text'].map(str).values
labels = data['sentiment'].map(int).values

In [27]:
tweets

array(['is so sad for my apl friend', 'i missed the new moon trailer',
       'omg its already 730 o', ..., 'zzzzzzzzzz wait no i have homework',
       'zzzzzzzzzzzzz meh what am i doing up again',
       'zzzzzzzzzzzzzzzzzzz i wish'], dtype=object)

## 4. Preprocess Data

In [29]:
max_vocab_size = 30000

In [30]:
tokenizer = Tokenizer(num_words=max_vocab_size)

In [31]:
tokenizer.fit_on_texts(tweets)

In [32]:
# note the index starts from 1 instead of 0
word2index = tokenizer.word_index

In [33]:
index2word = {index:word for word, index in word2index.items()}

In [34]:
print(len(word2index))
print(len(index2word))

444270
444270


In [35]:
index2word[1]

'i'

In [36]:
sequences = tokenizer.texts_to_sequences(tweets)

In [72]:
print(len(labels))
print(len(sequences))

1578614
1578614


### Split data for training and testing

In [73]:
train_seq = sequences[:1400000]
train_labels = labels[:1400000]

test_seq = sequences[1400000:]
test_labels = labels[1400000:]

In [74]:
print(len(train_seq))
print(len(train_labels))
print(len(test_seq))
print(len(test_labels))

1400000
1400000
178614
178614


### Pad sequences

In [76]:
max_seq_length = 35

In [77]:
padded_train_seq = pad_sequences(train_seq, max_seq_length)
padded_test_seq = pad_sequences(test_seq, max_seq_length)

## 5. Loas Pretrained Word Embedding

In [104]:
def get_word2vector(file_name):
    word2vector = {}
    with open(file_name, 'r') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vec = np.asarray(values[1:], dtype=np.float32)
            word2vector[word] = vec
    return word2vector

In [105]:
emb_dim = 50
# word2vec_file_path = 'data/glove.6B.50d.txt'
word2vec_file_path = "data/glove.6B.{}d.txt".format(emb_dim)
word2vec = get_word2vector(word2vec_file_path)

In [106]:
len(word2vec)

400000

## 6. Create Word Embedding Matrix

* We first calculate mean and standard deviation for each dimension of the pretrained word embedding.
* We then initialize the word embedding matrix with values from normal distribution with the calculated means and standard deviations
* Finally, we create the word embedding matrix based on the pretrained word embedding.

In [107]:
values = list(word2vec.values())
all_embs = np.stack(values)

print(all_embs.shape)

(400000, 50)


In [108]:
emb_mean, emb_std = all_embs.mean(), all_embs.std()
print(emb_mean)
print(emb_std)

0.02094049
0.6441044


In [109]:
max_word = min(len(word2index) + 1, max_vocab_size + 1)

embedding_matrix = np.random.normal(emb_mean, emb_std, (max_word, emb_dim))
for word, idx in word2index.items():
    if idx <= max_vocab_size:
        vec = word2vec.get(word)
        if vec is not None:
            embedding_matrix[idx] = vec

print(embedding_matrix.shape)

(30001, 50)


## 7. Construct Embedding Layer

In [110]:
embedding_layer = Embedding(
        input_dim = embedding_matrix.shape[0],
        output_dim = embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=max_seq_length,
        trainable=False)

## 8. Construct RNN Model

In [111]:
hidden_state_dim = 128

In [112]:
def RNN_MODEL(hidden_state_dim, max_seq_length):
    
    input_ = Input(shape=(max_seq_length,))
    x = embedding_layer(input_)
#     x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(hidden_state_dim, recurrent_dropout=0.3, return_sequences=True))(x)
    avg_pooling = GlobalAveragePooling1D()(x)
    max_pooling = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pooling, max_pooling])
    output = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=input_, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
    
    return model

In [113]:
rnn_model = RNN_MODEL(hidden_state_dim, max_seq_length)
rnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 35)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 35, 50)       1500050     input_11[0][0]                   
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 35, 256)      137472      embedding_3[0][0]                
__________________________________________________________________________________________________
global_average_pooling1d_5 (Glo (None, 256)          0           bidirectional_5[0][0]            
__________________________________________________________________________________________________
global_max

In [115]:
plot_model(rnn_model, 
           to_file='./model_images/rnn_model.png', 
           show_shapes=True, 
           show_layer_names=True)

<img src='./model_images/rnn_model.png'/>

In [116]:
batch_size = 256
epochs = 2

In [138]:
filepath_1 ="./models/rnn_embeddings/weights-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint_1 = ModelCheckpoint(filepath_1, monitor='val_acc', save_best_only=True, mode='max', verbose=1)

history_rnn = rnn_model.fit(x=padded_train_seq, y=train_labels, validation_split=0.1, 
                        batch_size=batch_size, epochs=epochs, callbacks=[checkpoint_1], verbose=1)

In [None]:
plt.plot(history_rnn.history['loss'], label='loss')
plt.plot(history_rnn.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

## 9. Construct TextCNN

The core idea of TextCNN is using convolutional neural network with different filters to extract features from text and then combine those features to perform the classification task. 

To perform the convolution task, we can use either 1-dimentional convolution or 2-dimentional convolution. They are conceptually different, but computationally they are the same (We will see later). If you are using Keras to build the model, you can read [Keras中Conv1D和Conv2D的区别](https://blog.csdn.net/hahajinbu/article/details/79535172) for details (Note, this article maybe not accurate, but you can get the idea.). 

To explain the conceputal difference between Conv-1D and Conv-2D, let's consider a sentence with 7 tokens that each token is embedded as a vector with dimention of 5. That is, the shape of the sentence is $7\times5$.

If we are using Conv-1D, we treat the input sentence with shape (7, 5) as one dimensional data with 5 channels. We will define rank-1 filter with 5 channels to perform the convolution computation over that one dimension (Note, we did not consider the dimenstion for batch size here). Following picture depicts the 1-dimentional computation:

<img src='images/1-dim-computation.png' style='height:400px;'>

If we are using Conv-2D, we treat the input sentence with shape (7, 5) as two dimensional data with 1 channel. We will define rank-2 filter with 1 channels to perform the convolution computation over that two dimension (Note, we did not consider the dimenstion for batch size here). Following picture depicts the 2-dimentional computation:

<img src='images/2-dim-computation.png' style='height:400px;'>

In this section, we will build two TextCNN models, one for 1-dimentional convolution and the other for 2-dimentional convolution.


### 9.1 TextCNN version 1


In [129]:
def TEXT_CNN_MODEL_v1(filter_sizes, max_seq_length, num_out_filters, drop_rate):
    
    input_ = Input(shape=(max_seq_length,))
    x = embedding_layer(input_)

    pools = []
    for i, filter_size in enumerate(filter_sizes):
        conv = Conv1D(filters=num_out_filters, kernel_size=filter_size, strides=1, padding='valid')(x)
        conv = BatchNormalization(axis=-1)(conv)
        conv = Activation('relu')(conv)
        pool = MaxPooling1D(pool_size=max_seq_length-filter_size+1, strides=1, padding='valid')(conv)
        pools.append(pool)
    concatenated_tensor = Concatenate(axis=1)(pools)
    
    flatten = Flatten()(concatenated_tensor)
    dropout = Dropout(drop_rate)(flatten)
    output = Dense(units=1, activation="sigmoid")(dropout)

    model = Model(inputs=input_, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
    
    return model

In [130]:
textcnn_model = TEXT_CNN_MODEL_v1([2,3,4], max_seq_length, 256, 0.3)
textcnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, 35)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 35, 50)       1500050     input_14[0][0]                   
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 34, 256)      25856       embedding_3[3][0]                
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 33, 256)      38656       embedding_3[3][0]                
__________________________________________________________________________________________________
conv1d_9 (

In [131]:
plot_model(textcnn_model, 
           to_file='./model_images/text_cnn_model.png', 
           show_shapes=True, 
           show_layer_names=True)

<img src='./model_images/text_cnn_model.png'/>

In [132]:
batch_size = 256
epochs = 2

In [133]:
filepath_2 ="./models/textcnn_embeddings/weights-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint_2 = ModelCheckpoint(filepath_2, monitor='val_acc', save_best_only=True, mode='max', verbose=1)

history_textcnn_1 = textcnn_model.fit(x=padded_train_seq, y=train_labels, validation_split=0.1, 
                                batch_size=batch_size, epochs=epochs, callbacks=[checkpoint_2], verbose=1)

Train on 1260000 samples, validate on 140000 samples
Epoch 1/2

Epoch 00001: val_acc improved from -inf to 0.76479, saving model to ./models/textcnn_embeddings/weights-improvement-01-0.7648.hdf5
Epoch 2/2

Epoch 00002: val_acc improved from 0.76479 to 0.77414, saving model to ./models/textcnn_embeddings/weights-improvement-02-0.7741.hdf5


### 9.2 TextRNN - version 2

In [134]:
def TEXT_CNN_MODEL_v2(filter_sizes, max_seq_length, dim, num_out_filters, drop_rate):
    
    input_ = Input(shape=(max_seq_length,))
    x = embedding_layer(input_)
    
    # target_shape: target shape. Tuple of integers. Does not include the batch axis.
    x = Reshape(target_shape=(max_seq_length, dim, 1))(x)
    pools = []
    for filter_size in filter_sizes:
        conv = Conv2D(filters=num_out_filters, kernel_size=(filter_size, dim), strides=(1, 1), padding='valid')(x)
        conv = BatchNormalization(axis=-1)(conv)
        conv = Activation('relu')(conv)
        pool = MaxPooling2D(pool_size=(max_seq_length-filter_size+1, 1), strides=(1,1), padding='valid')(conv)
        pools.append(pool)
    concatenated_tensor = Concatenate(axis=1)(pools)
    
    flatten = Flatten()(concatenated_tensor)
    dropout = Dropout(drop_rate)(flatten)
    output = Dense(units=1, activation="sigmoid")(dropout)

    model = Model(inputs=input_, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
    
    return model

In [135]:
textcnn_model_v2 = TEXT_CNN_MODEL_v2([2,3,4], max_seq_length, emb_dim, 256, 0.3)
textcnn_model_v2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 35)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 35, 50)       1500050     input_15[0][0]                   
__________________________________________________________________________________________________
reshape_6 (Reshape)             (None, 35, 50, 1)    0           embedding_3[4][0]                
__________________________________________________________________________________________________
conv2d_11 (Conv2D)              (None, 34, 1, 256)   25856       reshape_6[0][0]                  
__________________________________________________________________________________________________
conv2d_12 

In [136]:
plot_model(textcnn_model_v2, 
           to_file='./model_images/text_cnn_model_v3.png', 
           show_shapes=True, 
           show_layer_names=True)

<img src='./model_images/text_cnn_model_v3.png'/>

In [137]:
batch_size = 256
epochs = 2

filepath_3 ="./models/textcnn_v2_embeddings/weights-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint_3 = ModelCheckpoint(filepath_3, monitor='val_acc', save_best_only=True, mode='max', verbose=1)

history_textcnn_2 = textcnn_model_v2.fit(x=padded_train_seq, y=train_labels, validation_split=0.1, 
                                batch_size=batch_size, epochs=epochs, callbacks=[checkpoint_3], verbose=1)

Train on 1260000 samples, validate on 140000 samples
Epoch 1/2

Epoch 00001: val_acc improved from -inf to 0.75792, saving model to ./models/textcnn_v2_embeddings/weights-improvement-01-0.7579.hdf5
Epoch 2/2

Epoch 00002: val_acc improved from 0.75792 to 0.76992, saving model to ./models/textcnn_v2_embeddings/weights-improvement-02-0.7699.hdf5


## References:

[1]. The paper: [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf)

[2]. [Overview and benchmark of traditional and deep learning models in text classification](https://ahmedbesbes.com/overview-and-benchmark-of-traditional-and-deep-learning-models-in-text-classification.html) and its [chinese version](https://mp.weixin.qq.com/s/z2bdlhaala2ko55MYiyXNw)

[3]. [Keras中Conv1D和Conv2D的区别](https://blog.csdn.net/hahajinbu/article/details/79535172)

[4]. [Sentiment analysis on Twitter using word2vec and keras](https://ahmedbesbes.com/sentiment-analysis-on-twitter-using-word2vec-and-keras.html)

[5]. [用深度学习（CNN RNN Attention）解决大规模文本分类问题 - 综述和实践](https://zhuanlan.zhihu.com/p/25928551)

[6]. [TextCnn原理及实践](https://blog.csdn.net/john_xyz/article/details/79210088)