[View in Colaboratory](https://colab.research.google.com/github/sumitdua10/CNN_Text_Classification_Restaurent_Reviews/blob/master/IMDB_Text_Classification_Deep_learning(CNN).ipynb)

In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("../input/word2vec-nlp-tutorial"))

['testData.tsv', 'sampleSubmission.csv', 'labeledTrainData.tsv', 'unlabeledTrainData.tsv']


In [0]:
#Read the IMDB dataset with 25K reviews for training. 

df = pd.read_csv("../input/word2vec-nlp-tutorial/labeledTrainData.tsv", sep = '\t', 
                 error_bad_lines=False )
print("Total no. of reviews are ", df.shape[0])
print("cols are ", df.columns)
print("Sample reviews are ")
print(df.loc[:5,['review','sentiment']])


Total no. of reviews are  25000
cols are  Index(['id', 'sentiment', 'review'], dtype='object')
Sample reviews are 
                                              review  sentiment
0  With all this stuff going down at the moment w...          1
1  \The Classic War of the Worlds\" by Timothy Hi...          1
2  The film starts with a manager (Nicholas Bell)...          0
3  It must be assumed that those who praised this...          0
4  Superbly trashy and wondrously unpretentious 8...          1
5  I dont know why people think this is such a ba...          1


In [0]:
#Import the stopwords (common words) to be removed from the corpus
"""
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
s = set(stopwords.words('english'))
s.remove('not')
print("Stopwords length", len(s))
"""

In [0]:
# 3. Remove the puncuation symbols or any other symbols that are not characters [^A-Za-z] and put the text in list Corpus
#s = set(s)
#corpus = []
#for i in range(0, df.shape[0]):
 #   review = re.sub('[^a-zA-Z]', ' ', df['review'][i])
  #  review = review.lower().split()
    #ps = PorterStemmer()
    #review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
   # review = [word for word in review if not word in s]
    #review = ' '.join(review)
    #corpus.append(review)
#print(corpus[0])

In [0]:
word2vec = {}
with open('../input/glove6b50dtxt/glove.6B.50d.txt', encoding="utf8") as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))


Found 400000 word vectors.


In [0]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
MAX_VCOCAB_SIZE = 5000
EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 1500

tokenizer = Tokenizer( filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, split=' ')
sequences = tokenizer.fit_on_texts(df['review'])
word_index = tokenizer.word_index
documents = tokenizer.texts_to_sequences(df['review'])
#print(word_index)
token_count = len(word_index)+1
print('Found {} unique tokens.'.format(token_count))

#print(t.word_counts)
print("Total documents ", tokenizer.document_count)
#print(t.word_index)
#print(t.word_docs)
print("max sequence length:", max(len(s) for s in documents))
print("min sequence length:", min(len(s) for s in documents))

# pad sequences so that we get a N x T matrix
data = pad_sequences(documents, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print('Shape of data tensor:', data.shape)
print(data[1])


Found 88583 unique tokens.
Total documents  25000
max sequence length: 2493
min sequence length: 10
Shape of data tensor: (25000, 1500)
[  1 353 322 ...   0   0   0]


In [0]:
print('Filling pre-trained embeddings...')
embedding_matrix = np.zeros((token_count, EMBEDDING_DIM))
for word, i in word_index.items():
  #if i < MAX_VOCAB_SIZE:
    embedding_vector = word2vec.get(word) #get(word) is used instead of [word] as it won't give exception in case word is not found
    if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
      embedding_matrix[i,:] = embedding_vector

print("Sample embedded dimension ")
print(embedding_matrix[10][:5])


Filling pre-trained embeddings...
Sample embedded dimension 
[ 0.11891   0.15255  -0.082073 -0.74144   0.75917 ]


In [0]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Dropout, Dense, GlobalAveragePooling1D 
from keras.layers import Embedding, Conv2D, GlobalMaxPooling1D 
from keras import regularizers

embedding_layer = Embedding(
  token_count,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False)

In [0]:
model = Sequential()
model.add(embedding_layer)#, input_shape= (token_count, EMBEDDING_DIM))
model.add(Conv1D(filters = 64, kernel_size = 4, padding = 'same', activation='relu'))
                 #input_shape=(token_count,EMBEDDING_DIM)))
model.add(MaxPooling1D())#kernel_size=500))
model.add(Conv1D(filters = 128, kernel_size = 3, padding = 'same',  activation='relu', 
                 kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.25))
model.add(MaxPooling1D())
model.add(Conv1D(filters = 256, kernel_size = 2, padding = 'same', activation='relu'))
model.add(Dropout(0.5))
model.add(MaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(64, activation='relu'))
#model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalMaxPooling1D())

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, 1500, 50)          4429150   
_________________________________________________________________
conv1d_70 (Conv1D)           (None, 1500, 64)          12864     
_________________________________________________________________
max_pooling1d_57 (MaxPooling (None, 750, 64)           0         
_________________________________________________________________
conv1d_71 (Conv1D)           (None, 750, 128)          24704     
_________________________________________________________________
dropout_45 (Dropout)         (None, 750, 128)          0         
_________________________________________________________________
max_pooling1d_58 (MaxPooling (None, 375, 128)          0         
_________________________________________________________________
conv1d_72 (Conv1D)           (None, 375, 256)          65792     
__________

In [0]:
from keras.layers import Input, Dense, Concatenate
from keras.models import Model

inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))

x = embedding_layer(inputs)
print(x)
x1 = Conv1D(filters = 100, kernel_size = 2, padding = 'same', activation='relu')(x)
                 #input_shape=(token_count,EMBEDDING_DIM)))
x1 = GlobalMaxPooling1D()(x1)

x2 = Conv1D(filters = 100, kernel_size = 3, padding = 'same', activation='relu')(x)
                 #input_shape=(token_count,EMBEDDING_DIM)))
x2 = GlobalMaxPooling1D()(x2) #pool_size=1500


x3 = Conv1D(filters = 100, kernel_size = 4, padding = 'same', activation='relu')(x)
                 #input_shape=(token_count,EMBEDDING_DIM)))
x3 = GlobalMaxPooling1D()(x3)

# a layer instance is callable on a tensor, and returns a tensor
print(x3)
x = Concatenate()([x1,x2,x3])
print(x)
x = Dense(256)(x)
x = Dropout(0.25)(x)
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

# This creates a model that includes
# the Input layer and three Dense layers
Fmodel = Model(inputs=inputs, outputs=output)
Fmodel.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
print(Fmodel.summary())


Tensor("embedding_25_3/embedding_lookup/Identity:0", shape=(?, 1500, 50), dtype=float32)
Tensor("global_max_pooling1d_21/Max:0", shape=(?, 100), dtype=float32)
Tensor("concatenate_7/concat:0", shape=(?, 300), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 1500)         0                                            
__________________________________________________________________________________________________
embedding_25 (Embedding)        (None, 1500, 50)     4429150     input_16[0][0]                   
__________________________________________________________________________________________________
conv1d_76 (Conv1D)              (None, 1500, 100)    10100       embedding_25[3][0]               
___________________________________________________________________________________

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, df['sentiment'], 
                                                    test_size=0.2, random_state=42)


print(x_train.shape)
model.fit(x_train, y_train , batch_size=96, epochs=2, validation_split = 0.25)
#score = model.evaluate(x_test, y_test, batch_size=32)

(20000, 1500)
Train on 15000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f142107e2e8>

In [0]:
Fmodel.fit(x_train, y_train , batch_size=96, epochs=2, validation_split = 0.25)

In [0]:
print("Concatenated CNN Result")
print("Loss & accuracty on test set is", Fmodel.evaluate(x_test, y_test))

print("Traditional CNN Result")
print("Loss & accuracty on test set is", model.evaluate(x_test, y_test))

Concatenated CNN Result
 288/5000 [>.............................] - ETA: 2s