> There are 50,000 reviews, 25,000 for training and 25,000 for testing
> Each set consisting of 50% positive and 50% negative reviews

In [1]:
# LOAD DATA
from tensorflow.keras.datasets import imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
# num_words=10000 will keep top 10,000 most frequently occuring words in the training data

ModuleNotFoundError: No module named 'tensorflow'

In [6]:
# As we mentioned data is splitted into 25,000 for training and 25,000 for testing
print(len(train_data))
print(len(test_data))

25000
25000


In [8]:
# train_data and test_data are list of reviews, each review is a list of word indices
# train_lables and test_labels are list of 0's and 1's, where 0 is negative and 1 is positive

len(train_data[0]) # there are 218 words in 1st review

218

In [15]:
train_labels[0] # 1st review is positive review, 0 is negative and 1 is positive

1

In [17]:
# Because you’re restricting yourself to the top 10,000 most frequent words, no word index will exceed 10,000:

# train_data mai 25,000 reviews hain hum har review ka max lenge or uski list banayegen to 25,000 reviews ka 
# ..1 maximum hoga to 25,000 ki list banjaygi then hum us list ka max lenge to 9999 ayega because we are taking top 10,000 words

max([max(sequence) for sequence in train_data])

9999

In [33]:
# Decode review back to English

word_index = imdb.get_word_index() # it is a dict mapping words to an integer index

reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()] # Reverses it, mapping integer indices to words
)

decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]] # decodes the review
)

# indices are offset by 3 because 0, 1, and 2 are reserved indices for "padding", "start of sequence" and "unknown"
# word_index # 'fawn': 34701,
# reverse_word_index # 34701: 'fawn',
##############
decoded_review1 = []
' '.join(
    for i in train_data[0]:
        decoded_review1.append(reverse_word_index.get(i, '?'))
        print(decoded_review1)
        
)

print(type(decoded_review))
print(len(decoded_review))
# print(decoded_review)

<class 'str'>
1113


## Preparing the data

In [42]:
# Encoding the integer sequences into a binary matrix

import numpy as np

# def vectorize_sequences(sequences, dimension=10000):
#     results = np.zeros((len(sequences), dimension)) # creates all_zero_matrix of shape (len(sequences), dimension)
#     for i, sequence in enumerate(sequences):
#         results[i, sequence] = 1 # sets specific indices of results[i] to 1s
#     return results

# x_train = vectorize_sequences(train_data)
# x_test = vectorize_sequences(test_data)

results = np.zeros((len(train_data), 10000))

print(results)
print(len(train_data))

# for i, sequence in enumerate(train_data):
#   print(i, sequence)

for i, sequence in enumerate(train_data):
    print(results[i, sequence])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
25000


In [None]:
# Now samples look like now:
x_train[0]

In [None]:
# x_test[0]

In [None]:
# vectorize labels:

y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

# Now data is ready to be fed into Neural Network

### Defining a model

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers

In [None]:
model = models.Sequential()

model.add(layers.Dense(16, activation='relu', inpput_shape='10000',))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

### Compiling the model

In [None]:
# passing optimizer, loss, and metrics as string which is possible bcoz these are packaged as part of Keras

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

### * configure the parameters of optimizer
### * custom loss function and metrics

In [None]:
# Configuring the optimizer and using custom losses and metrics

####### DON'T RUN THIS CELL #########

from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics

model.compile(optimizer=optimizer.RMSprop(lr=0.001, loss=losses.binary_crossentropy, metrics=[metrics.binary_accuracy]))

## Setting aside a validation set

* In order to monitor the accuracy of the model during training on the data it has never seen before, we will create a validation set setting apart 10,000 samples from the original training data

In [None]:
x_val = x_train[:10000]   # from 0 to 9,999 = 10,000 samples
partial_x_train = x_train[10000:]  # from 10,000 to end 

y_val = y_train[:10000]
partial_y_train = y_train[10000:]

## Training the model

In [None]:
history = model.fit(partial_x_train, 
                    partial_y_train, 
                    epochs=20, 
                    batch_size=512, 
                    validation_data=(x_val, y_val))

* On CPU, this will take less than 2 seconds per epoch—training is over in 20 seconds.
* At the end of every epoch, there is a slight pause as the model computes its loss and accuracy on the 10,000 samples of the validation data.
* call to `model.fit()` returns a History object. This object has the history, which is a dictionary containing data about everything that happened during training

In [None]:
histroy_dict = history.history

history_dict

In [None]:
# This dict contains four entries
history_dict.keys()

## Plotting the training and validation loss

In [None]:
import matplotlib.pyplot as plt

In [None]:
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']

epochs = range(1, len(accuracy) + 1)

plt.plot(epochs, loss_values, 'bo', label='Training Loss') # "bo" is for "blue dot"
plt.plot(epochs, val_loss_values, 'b', label='Validation Loss') # "b" is for "blue solid line"
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

## Plotting the training and validation accuracy

In [None]:
plt.clf()  # clears the figure

accuracy_values = history_dict['accuracy']
val_accuracy_values = history_dict['val_accuracy']

plt.plot(epochs, accuracy, 'bo', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'b', label='Validation accuracy')
plt.title('Training and Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

* `Training loss` **decreases** with every epoch
* `Training accuracy` **increases** with every epoch
* Quantity we are trying to minimize should be `less with every iteration` but that is `not the case for validation loss and accuracy`, they seem to peak at fourth epoch.
* To prevent overfitting, we could stop training after three epochs

## Retraining a model from scratch
Let's train a network from scrach for four epochs and then evaluate it on the test data

In [None]:
model1 = models.Sequential()

model1.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model1.add(layers.Dense(16, activation='relu'))
model1.add(layers.Dense(1, activation='sigmoid'))

In [None]:
model1.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model1.fit(x_train, y_train, epochs=4, batch_size=512)

In [None]:
results = model1.evaluate(x_test, y_test)

# The final results are as follows
results

In [None]:
# This fairly naive approach achieves an accuracy of 88%. With state-of-the-art
# ..approaches, you should be able to get close to 95%.

## Generating predictions on the new data using a trained network
after we have trained a network, we will want to use it in a practical setting 

In [None]:
model1.predict(x_test)

## Further experiments
* You used two hidden layers. Try using `one or three hidden layers`, and see `how doing so affects validation and test accuracy`
* Try using layers with `more hidden units` or `fewer hidden units`: `32 units`, `64 units`, and so on.
* Try using the `mse loss function` instead of binary_crossentropy.
* Try using the `tanh activation` (an activation that was popular in the early days of neural networks) instead of relu.

In [None]:
model2 = models.Sequential()

model2.add(layers.Dense(32, activation='tanh', input_shape=(10000,)))
model2.add(layers.Dense(32, activation='tanh'))
model2.add(layers.Dense(32, activation='tanh'))
model2.add(layers.Dense(1, activation='sigmoid'))

In [None]:
model2.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])

In [None]:
model2.fit(x_train, y_train, epochs=20, batch_size=512)

In [None]:
results = model2.evaluate(x_test, y_test)

results