In [1]:
%pip install tensorflow.keras.datasets

from tensorflow.keras.datasets import imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)



Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement tensorflow.keras.datasets (from versions: none)

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\krish\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for tensorflow.keras.datasets


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 0us/step


In [None]:
# print(train_data, train_labels) 
# word_index is a dict mapping each word to an integer index.
word_index = imdb.get_word_index()

# Flip the mapping so an integer index can be turned back into its word.
reverse_word_index = {index: word for word, index in word_index.items()}

# Indices in the reviews are offset by 3 (reserved for "padding",
# "start of sequence" and "unknown"); anything without a word becomes "?".
decoded_review = " ".join(
    reverse_word_index.get(token - 3, "?") for token in train_data[0]
)

print(decoded_review)


? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you thi

In [20]:
# Preparing the data

# encoding the integer sequences via multi-hot encoding

# print(train_data)

import numpy as np
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a 2-D array.

    Args:
        sequences: Sized collection (supports ``len``) of iterables of integer
            indices. Each index must be a valid column index for a row of
            width ``dimension``.
        dimension: Number of columns in the output, i.e. the width of each
            encoded row.

    Returns:
        A NumPy array of shape ``(len(sequences), dimension)`` with NumPy's
        default float dtype, where entry ``[i, j]`` is 1.0 if ``j`` occurs in
        ``sequences[i]`` and 0.0 otherwise. Repeated indices within a sequence
        still yield 1.0.

    Raises:
        IndexError: If an index is out of bounds for ``dimension``.
    """
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        # A single fancy-indexed assignment sets every listed column of row i
        # at once, instead of one Python-level assignment per index.
        # list() accepts any iterable; an empty list is a valid no-op index.
        results[i, list(sequence)] = 1.0
    return results

# Multi-hot encode the training split first, then the test split, with the same helper.
x_train, x_test = map(vectorize_sequences, (train_data, test_data))

In [None]:
# Vectorize the labels: build NumPy arrays cast to float32 for compatibility and performance.
y_train = np.array(train_labels, dtype="float32")
y_test = np.array(test_labels, dtype="float32")

# Show the raw labels next to their float32 versions, plus the raw element type.
print(train_labels)
print(y_train)
print(type(train_labels[0]))

[1 0 0 ... 0 1 0]
[1. 0. 0. ... 0. 1. 0.]
<class 'numpy.int64'>


In [None]:
# Building your model

# input data is vectors and the labels are scalars(1s and 0s), type of model that performs well on such a problem is plain stack of densely connected(Dense) layers with relu activation
# for now choosing layers and units for each layer based on the book

# model definition

from tensorflow import keras
from tensorflow.keras import layers

# The first argument passed to each Dense layer is the number of units in the
# layer: the dimensionality of the representation space of the layer.
model = keras.Sequential()
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(16, activation="relu"))
# A single sigmoid unit produces the final score between 0 and 1.
model.add(layers.Dense(1, activation="sigmoid"))

# 16 units mean the weight matrix W will have shape (input_dimension, 16), dot product with W
# will project the input data onto a 16-dimensional representation space, then add the bias vector b and apply relu operation
# having more units (a higher-dimensional representation space) allows our model to learn more complex representations, but it makes the model more computationally expensive and may lead to learning
# unwanted patterns (patterns that will improve performance on the training data but not on the test data)

# intermediate layers use relu as their activation function and the final layer uses a sigmoid activation
# so as to output a probability (a score between 0 and 1 indicating how likely the sample is to have the target 1: how likely the review is to be positive)

# a relu(rectified linear unit) is a function meant to zero out negative values
# where a sigmoid "squashes" arbitrary values into the [0,1] interval(something interpreted as probability)

<!-- Activation functions and why are they necessary -->

Without an activation function (like relu), the Dense layer would consist of two linear operations - a dot product and an addition:
output = dot(input, W) + b

The layer could only learn linear transformations (affine transformations) of the input data:
the hypothesis space of the layer would be the set of all possible linear transformations of the input data into a 16-dim space. Such a hypothesis space is too restricted and wouldn't benefit from multiple layers of representations, as a deep stack of linear layers would still implement a linear operation.

In order to get access to a much richer hypothesis space that benefits from deep representations, we need a non-linearity, or activation function. relu is a popular choice (similar alternatives include prelu, elu and so on).

In [27]:
# now we choose a loss function and an optimizer
# binary_crossentropy is usually the best loss choice for models that output probabilities
# Crossentropy is a quantity from the field of information theory that measures the distance between probability
# distributions or in this case between the ground truth distribution and your predictions

# compiling the model

# Configure training: binary cross-entropy loss, the RMSprop optimizer,
# and accuracy reported as the monitoring metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)



In [None]:
# Validating approach