<a href="https://colab.research.google.com/github/vinodkumartz/Deep-Learning-Project/blob/main/sentiment_analysis_simplernn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Toy corpus: ten short slogans, small enough that the tokenizer's vocabulary
# and the resulting integer sequences can be inspected by eye.
docs = [
    "go india",
    "india india",
    "hip hip hurray",
    "jeetega bhai jeetega india jeetega",
    "bharat mata ki jai",
    "kohli kohli",
    "sachin sachin",
    "dhoni dhoni",
    "modi ji ki jai",
    "inquilab zindabad",
]

In [None]:
# Tokenizer (Keras text preprocessing) turns raw text into integer word indices.
from keras.preprocessing.text import Tokenizer

# Create a Tokenizer with its default settings (no arguments are passed).
# It has no vocabulary yet; that is built when fit_on_texts() is called on it.
tokenizer = Tokenizer()


In [None]:
# Build the vocabulary from the toy corpus `docs`.
# After this call tokenizer.word_index maps every distinct word to an integer index.
# In this notebook the assigned indices run from 1 upwards; 0 is not given to any
# word and is used later as the padding value.
tokenizer.fit_on_texts(docs)


In [None]:
# Count the distinct words learned by the Tokenizer.
# word_index holds one entry per unique word, so its length is the number of
# unique words (the padding index 0 is not part of word_index).
num_unique_words = len(tokenizer.word_index)

# An assignment alone displays nothing; end the cell with the value so that the
# count is actually shown as the cell's output.
num_unique_words

17

In [None]:
# Convert each document into a list of integers: every word is replaced by its
# index in the vocabulary learned by the Tokenizer. The lists have different
# lengths because the documents have different numbers of words.
sequences = tokenizer.texts_to_sequences(docs)

# An assignment alone displays nothing; end the cell with the value so that the
# encoded documents are shown as the cell's output.
sequences


[[9, 1],
 [1, 1],
 [3, 3, 10],
 [2, 11, 2, 1, 2],
 [12, 13, 4, 5],
 [6, 6],
 [7, 7],
 [8, 8],
 [14, 15, 4, 5],
 [16, 17]]

In [None]:
from keras.utils import pad_sequences
# Pad the sequences so that they all have the same length, which is required for
# batching them into a single array for a neural network.
# - No maxlen is passed; in the output below every row is as long as the longest
#   document (5 tokens).
# - padding='post' appends the zeros at the end of each sequence.
# Note: `sequences` is rebound here from a list of lists to the padded 2-D array.

sequences = pad_sequences(sequences, padding='post')

sequences

array([[ 9,  1,  0,  0,  0],
       [ 1,  1,  0,  0,  0],
       [ 3,  3, 10,  0,  0],
       [ 2, 11,  2,  1,  2],
       [12, 13,  4,  5,  0],
       [ 6,  6,  0,  0,  0],
       [ 7,  7,  0,  0,  0],
       [ 8,  8,  0,  0,  0],
       [14, 15,  4,  5,  0],
       [16, 17,  0,  0,  0]], dtype=int32)

In [None]:
# Importing necessary libraries
from keras.models import Sequential
from keras.layers import Embedding

# Size of the embedding table.
# The Tokenizer numbers words from 1 and the padding value is 0, so the largest
# index fed to the layer equals len(word_index). The table therefore needs
# len(word_index) + 1 rows (indices 0..17 here). With only 17 rows, index 17
# ('zindabad') would fall outside the table.
vocab_size = len(tokenizer.word_index) + 1

# Every padded sequence has the same number of positions (5 for this corpus).
sequence_length = sequences.shape[1]

# Embedding-only model:
# - input_dim: number of rows in the embedding table (largest index + 1).
# - output_dim: size of the vector learned for each index (2).
# - input_length: number of positions in each input sequence.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=2, input_length=sequence_length))

# Displaying the summary of the model architecture
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 5, 2)              34        
                                                                 
Total params: 34
Trainable params: 34
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile the embedding-only model.
# - 'adam' is a gradient-based optimizer that adapts the learning rate per parameter.
# - 'accuracy' is a metric, not a loss function, so it is passed through `metrics`
#   rather than `loss`.
# - No loss is configured: the next cell only calls predict(), which does not need one.
model.compile(optimizer='adam', metrics=['accuracy'])


In [None]:
# Run the padded sequences through the model.
# No fit() call has been made on this model, so these are not trained predictions:
# the Embedding layer simply looks up a 2-value vector for every word index.
pred = model.predict(sequences)

# Print the result: one block per document, each with 5 positions x 2 embedding values.
# Positions that hold the padding index 0 all show the same vector.
print(pred)


[[[-0.03378457 -0.0223366 ]
  [-0.01869411  0.0397962 ]
  [ 0.04738592  0.03497881]
  [ 0.04738592  0.03497881]
  [ 0.04738592  0.03497881]]

 [[-0.01869411  0.0397962 ]
  [-0.01869411  0.0397962 ]
  [ 0.04738592  0.03497881]
  [ 0.04738592  0.03497881]
  [ 0.04738592  0.03497881]]

 [[ 0.00716299  0.02782274]
  [ 0.00716299  0.02782274]
  [-0.02727993 -0.02907238]
  [ 0.04738592  0.03497881]
  [ 0.04738592  0.03497881]]

 [[ 0.04634063 -0.01492285]
  [ 0.04544855 -0.02214533]
  [ 0.04634063 -0.01492285]
  [-0.01869411  0.0397962 ]
  [ 0.04634063 -0.01492285]]

 [[ 0.03687553  0.00542127]
  [ 0.00882751  0.02693466]
  [ 0.00960891 -0.02113671]
  [-0.01385436  0.00159916]
  [ 0.04738592  0.03497881]]

 [[ 0.04272473 -0.02110641]
  [ 0.04272473 -0.02110641]
  [ 0.04738592  0.03497881]
  [ 0.04738592  0.03497881]
  [ 0.04738592  0.03497881]]

 [[ 0.01033523 -0.02812191]
  [ 0.01033523 -0.02812191]
  [ 0.04738592  0.03497881]
  [ 0.04738592  0.03497881]
  [ 0.04738592  0.03497881]]

 [[ 0.

In [None]:
# Importing necessary libraries for the IMDB sentiment analysis example.
from keras.datasets import imdb  # Importing the IMDB dataset for sentiment analysis.
from keras.preprocessing.text import Tokenizer  # Importing the Tokenizer for text preprocessing.
from keras.utils import pad_sequences  # Importing pad_sequences for sequence padding.
from keras import Sequential  # Importing the Sequential model for building neural networks.
from keras.layers import Dense, SimpleRNN, Embedding, Flatten  # Importing layers for the neural network architecture.

# The IMDB dataset is a popular dataset for sentiment analysis, containing movie reviews along with their sentiment labels.
# Tokenizer is used for tokenizing text data, pad_sequences is used for padding sequences to a fixed length,
# Sequential is used to define a linear stack of layers for building neural networks,
# and Dense, SimpleRNN, Embedding, and Flatten are specific layers used to define the architecture of the neural network model.


In [None]:
# Load the IMDB movie-review dataset, already split into training and testing sets.
# Each review is a sequence of integer word indices; each label is its sentiment.
# - (X_train, y_train): training reviews and their sentiment labels.
# - (X_test, y_test): testing reviews and their sentiment labels.
# - num_words=10000 keeps only the 10,000 most frequent words; rarer words are
#   replaced by the out-of-vocabulary index. This bounds every word index below
#   10,000, matching the 10,000-row embedding table in the model summary shown
#   further down (20,000 parameters = 10,000 words x 2 dimensions).
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)


In [None]:
# Bring every review in both splits to a fixed width of 50 word indices so the
# data can be batched as a regular 2-D array.
# - maxlen=50: longer reviews are cut down to 50 entries, shorter ones are padded.
# - padding='post': the padding zeros are appended at the end of a review.
# The same settings are applied to the training split first, then the testing split.
X_train, X_test = (
    pad_sequences(split, padding='post', maxlen=50)
    for split in (X_train, X_test)
)


In [None]:
# Sanity check after padding: expect (number of reviews, 50).
X_train.shape

(25000, 50)

In [None]:
# Build the sentiment classifier: Embedding -> SimpleRNN -> Dense.
#
# The embedding table needs one row for every word index that can occur in the
# data, so its size is derived from the padded arrays: largest index + 1.
imdb_vocab_size = int(max(X_train.max(), X_test.max())) + 1

model = Sequential()
# Map each word index to a trainable 2-dimensional vector.
model.add(Embedding(input_dim=imdb_vocab_size, output_dim=2))
# Read the sequence of word vectors and keep only the final 32-unit state.
model.add(SimpleRNN(32))
# Single sigmoid unit: a value between 0 and 1 for the binary sentiment label.
model.add(Dense(1, activation='sigmoid'))

# Displaying the summary of the model architecture
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 2)           20000     
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                1120      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 21,153
Trainable params: 21,153
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training for binary sentiment classification:
# - binary cross-entropy as the loss,
# - Adam as the optimizer,
# - accuracy (reported under the key 'acc') as the monitored metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# Fit on the training reviews/labels for 5 passes over the data.
# The testing split is passed as validation data, so loss and accuracy on it are
# reported after every epoch. The returned History object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
