<a href="https://colab.research.google.com/github/venky1812/My_NLP_Learning/blob/main/Text_Classification_using_DNN_CNN_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#from google.colab import drive

#drive.mount('/content/gdrive')

# Text Classification pipeline using DNN

1. Tokenize the texts and convert them into word index vectors.





2. Pad the text sequences so that all text vectors are of the same length.

3. Map every word index to an embedding vector by looking up its row in the embedding matrix (equivalent to multiplying a one-hot word vector by that matrix). The embedding matrix can either be populated with pre-trained embeddings or be trained on this corpus.

4. Use the output from Step 3 as the input to a neural network architecture.

In [2]:
#import shutil
#shutil.unpack_archive("drive/MyDrive/ml_dl_datasets/aclImdb_v1.tar.gz", "drive/MyDrive/ml_dl_datasets/")

In [3]:
#!unzip "drive/MyDrive/ml_dl_datasets/imdb_sentiment_analysis/IMDB Dataset.csv.zip" -d "drive/MyDrive/ml_dl_datasets/imdb_sentiment_analysis/"

In [4]:
import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant

In [15]:
import gensim.downloader as api

# Setting seed value for reproducibility 

In [5]:
# Seed TensorFlow's and NumPy's global random generators with a fixed value.
# The np.random.shuffle call used for the train/validation split draws from
# the NumPy generator seeded here.
tf.random.set_seed(1234)
np.random.seed(1234)

Defining dataset directory paths

In [6]:
# Root folder that holds the datasets (relative path).
# NOTE(review): this looks like a mounted Google Drive location, but the
# drive.mount call at the top of the notebook is commented out - confirm the
# path resolves on a fresh kernel.
base_data_dir  = "drive/MyDrive/ml_dl_datasets/"

# Despite the "_DIR" suffix this is the path of a single CSV file; it is
# passed straight to pd.read_csv.
DATA_DIR = os.path.join(base_data_dir, 'imdb_sentiment_analysis/IMDB Dataset.csv') #source: http://ai.stanford.edu/~amaas/data/sentiment/
#TEST_DATA_DIR = os.path.join(base_data_dir, 'imdb_sentiment_analysis/test')

## Hyperparameters 

# Length handed to pad_sequences (maxlen) for both train and test sequences.
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap handed to the Keras Tokenizer (num_words) and used to size
# the embedding matrix.
MAX_NUM_WORDS = 20000 
# Number of columns in the pre-trained embedding matrix.
EMBEDDING_DIM = 100 
# Fraction of the training rows held out as the validation set.
VALIDATION_SPLIT = 0.2



Loading the data

In [7]:
def get_data(data_dir):
  """Read labelled review files from a directory tree.

  Every sub-directory of ``data_dir`` except ``unsup`` is treated as a class
  folder and must be named ``pos`` (label 1) or ``neg`` (label 0). Plain
  files directly under ``data_dir`` are ignored. Folders and the files inside
  them are visited in sorted order, so the result is deterministic.

  Args:
    data_dir: Path of the directory that contains the class folders.

  Returns:
    A ``(texts, labels)`` tuple of two equally long lists: the contents of
    each file and the integer label of the folder it came from.

  Raises:
    KeyError: If a sub-directory other than ``pos``, ``neg`` or ``unsup``
      is present.
  """
  print(data_dir)
  label_index = {'pos': 1, 'neg': 0}
  texts = []   # file contents, one entry per review
  labels = []  # integer class id for the review at the same position
  for name in sorted(os.listdir(data_dir)):
    if name == 'unsup':
      continue
    path = os.path.join(data_dir, name)
    if not os.path.isdir(path):
      continue
    label_id = label_index[name]
    print(path)
    for fname in sorted(os.listdir(path)):
      fpath = os.path.join(path, fname)
      # Context manager closes each handle promptly; the previous
      # open(...).read() left thousands of files to the garbage collector.
      with open(fpath) as review_file:
        texts.append(review_file.read())
      labels.append(label_id)

  return texts, labels


In [8]:
# Load the review CSV; the 'review' and 'sentiment' columns are used below.
dataset = pd.read_csv(DATA_DIR)

cat_map = {'positive':1,'negative':0}

# Series.map turns every value that is missing from cat_map into NaN.
# Fail here, with a clear message, rather than letting NaN labels flow
# into the label encoding and training cells.
sentiment_ids = dataset['sentiment'].map(cat_map)
if sentiment_ids.isna().any():
  unexpected = sorted(set(dataset.loc[sentiment_ids.isna(), 'sentiment'].astype(str)))
  raise ValueError('Unexpected sentiment values in dataset: %s' % unexpected)
dataset['sentiment'] = sentiment_ids

# The first TRAIN_SIZE rows are used for training, the remainder for testing.
TRAIN_SIZE = 40000
train_rows = dataset[:TRAIN_SIZE]
test_rows = dataset[TRAIN_SIZE:]

train_texts, train_labels = train_rows['review'].tolist() , train_rows['sentiment'].tolist()

test_texts, test_labels = test_rows['review'].tolist() , test_rows['sentiment'].tolist()

print(len(train_texts))
print(len(train_labels))
print(len(test_texts))
print(len(test_labels))

40000
40000
10000
10000


In [9]:
# Vectorize the text samples into lists of integer word indexes using the Keras Tokenizer.
# The tokenizer is fit on the training texts only and then applied to both the
# train and test texts, so no vocabulary information comes from the test split.
# '<OOV>' is registered as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS,oov_token = '<OOV>')
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts) # one list of word indexes per training review
test_sequences = tokenizer.texts_to_sequences(test_texts) # same mapping applied to the test reviews
# word -> index mapping; used later when the embedding matrix is filled in.
# It holds every token seen during fitting (the count printed below is larger
# than MAX_NUM_WORDS).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 112174 unique tokens.


In [10]:
### Applying Padding to make text sequences to same length
trainvalid_data  = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

### converting labels to categorical
# Pin the class count so the one-hot width always matches the 2-unit softmax
# output layer instead of being inferred from the largest label in a split.
NUM_CLASSES = 2
trainvalid_labels = to_categorical(np.asarray(train_labels), num_classes=NUM_CLASSES)
# test_labels is replaced by its own one-hot encoding. Only convert while it
# is still the 1-D list of class ids, so re-running this cell does not
# one-hot-encode an already one-hot array.
if np.ndim(test_labels) == 1:
  test_labels = to_categorical(np.asarray(test_labels), num_classes=NUM_CLASSES)


# split the training data into a training set and a validation set

In [11]:
# Shuffle samples and labels with the same permutation so they stay aligned.
# NOTE: the shuffle is applied in place to trainvalid_data/trainvalid_labels
# and consumes the global NumPy random state, so re-running this cell without
# re-running the cells above produces a different split.
indices = np.arange(trainvalid_data.shape[0])
np.random.shuffle(indices)
trainvalid_data = trainvalid_data[indices]
trainvalid_labels = trainvalid_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])
# Slice with an explicit boundary. The previous [:-num_validation_samples]
# form returns an EMPTY training set (and puts everything in validation)
# whenever num_validation_samples rounds down to 0.
split_at = trainvalid_data.shape[0] - num_validation_samples
x_train = trainvalid_data[:split_at]
y_train = trainvalid_labels[:split_at]
x_val = trainvalid_data[split_at:]
y_val = trainvalid_labels[split_at:]
#This is the data we will use for CNN and RNN training
print('Splitting the train data into train and valid is done')

Splitting the train data into train and valid is done


In [16]:
print("preparing the embedding matrix")

# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader
# module (imported as `api`). The result is used below for membership tests
# (`word in model_gigaword`) and vector lookups (`model_gigaword[word]`).
# NOTE(review): presumably these are 100-dimensional vectors, matching
# EMBEDDING_DIM = 100 - confirm if either value is changed.
model_gigaword = api.load("glove-wiki-gigaword-100")



preparing the embedding matrix


In [23]:
# Number of rows in the embedding matrix: the vocabulary cap (or the full
# vocabulary if it is smaller) plus one, so that the largest kept word index
# is itself a valid row.
num_words = 1 + min(MAX_NUM_WORDS, len(word_index))

# Start from all zeros; rows that are never filled in stay zero.
embedding_matrix = np.zeros(shape=(num_words, EMBEDDING_DIM))

# Copy the pre-trained vector of every word that is both inside the cap and
# known to the GloVe model into the row addressed by its word index.
for word , i in word_index.items():
  if i <= MAX_NUM_WORDS and word in model_gigaword:
    embedding_matrix[i] = model_gigaword[word]


In [29]:
def build_cnn_model(num_words,embedding_mode):
  """Build and compile the 1-D convolutional text classifier.

  Args:
    num_words: Vocabulary size, used as the input dimension of the
      embedding layer.
    embedding_mode: ``1`` initialises the embedding layer from the
      module-level ``embedding_matrix`` and freezes it; any other value
      creates a trainable 128-dimensional embedding.

  Returns:
    A compiled ``Sequential`` model with a 2-unit softmax output, trained
    with categorical cross-entropy and RMSprop, reporting accuracy as 'acc'.
  """
  use_pretrained = (embedding_mode == 1)

  if use_pretrained:
    print("Using Pretrained Embeddings")
    # Reads the global embedding_matrix; its weights are not updated in training.
    embedding_layer = Embedding(num_words,EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,trainable=False)
  else:
    print("Training Embeddings")
    # NOTE(review): width is hard-coded to 128 rather than EMBEDDING_DIM - confirm this is intended.
    embedding_layer = Embedding(num_words,128)

  # NOTE(review): the first pooling layer sits directly on the embedding
  # output, before any convolution - confirm a leading Conv1D was not dropped.
  layer_stack = [
    embedding_layer,
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(2, activation='softmax'),
  ]

  cnnmodel = Sequential()
  for layer in layer_stack:
    cnnmodel.add(layer)

  cnnmodel.compile(loss='categorical_crossentropy',
                   optimizer='rmsprop',
                   metrics=['acc'])
  return cnnmodel

In [None]:

# embedding_mode=0 selects the branch of build_cnn_model that trains its own
# embedding layer instead of using the pre-trained matrix.
embedding_model = build_cnn_model(num_words,0)
# Train for 8 epochs in batches of 128; the held-out validation split is
# passed so it is evaluated alongside training.
embedding_model.fit(x_train, y_train,batch_size=128,epochs=8, validation_data=(x_val, y_val))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7ff73b4064e0>

In [None]:
# Evaluate on the test split. The model is compiled with one loss and one
# metric ('acc'), so the two unpacked values are presumably the loss
# (`score`) and the accuracy (`acc`).
score, acc = embedding_model.evaluate(test_data, test_labels)
print('Test accuracy with CNN:', acc)

Test accuracy with CNN: 0.8896999955177307


### Training CNN using pre-trained embeddings

In [30]:
# embedding_mode=1 selects the frozen, pre-trained embedding branch of
# build_cnn_model. This rebinds `embedding_model`, replacing the model that
# was trained with trainable embeddings in the earlier cell.
embedding_model = build_cnn_model(num_words,1)
# Same training setup as the earlier run: 8 epochs, batches of 128, with the
# validation split passed alongside.
embedding_model.fit(x_train, y_train,batch_size=128,epochs=8, validation_data=(x_val, y_val))

Using Pretrained Embeddings
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f93d145d7b8>

In [31]:
# Evaluate the pre-trained-embedding model on the test split. The printed
# label is the same as for the earlier run, so the two accuracies can only be
# told apart by cell position.
score, acc = embedding_model.evaluate(test_data, test_labels)
print('Test accuracy with CNN:', acc)

Test accuracy with CNN: 0.7422000169754028


## Using LSTM model for prediction