<a href="https://colab.research.google.com/github/vedants556/Collabs/blob/main/DL_PBLE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Vedant Shelar

22102A0027

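# DL PBLE: IMDB sentiment classification — sigmoid feed-forward network vs. bag-of-words logistic regression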


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# For reproducibility: keras.utils.set_random_seed seeds Python's `random`
# module, NumPy and the TensorFlow backend in a single call. Seeding only
# TensorFlow and NumPy leaves Python's `random` unseeded, which recent Keras
# versions draw from when generating default seeds for weight initializers.
SEED = 42
keras.utils.set_random_seed(SEED)

In [None]:
# Load the IMDB reviews through TensorFlow Datasets as plain text.
# No subword encoder is configured in this call; as_supervised=True requests
# (input, label) pairs rather than feature dictionaries.
# NOTE(review): train_data, test_data and info are not referenced by the later
# cells visible in this notebook, which read keras.datasets.imdb instead.
import tensorflow_datasets as tfds

imdb_splits, info = tfds.load(
    name="imdb_reviews",
    split=["train", "test"],
    as_supervised=True,
    with_info=True,
)
train_data, test_data = imdb_splits

# Status message only; nothing about the vocabulary is inspected here.
print("Subword encoder not used, default text")

Subword encoder not used, default text


In [None]:
# Integer-encoded IMDB reviews from the Keras built-in dataset
vocab_size = 10000   # passed as num_words when loading the reviews
maxlen = 200         # every review is padded/truncated to this many tokens

imdb_train, imdb_test = keras.datasets.imdb.load_data(num_words=vocab_size)
x_train, y_train = imdb_train
x_test, y_test = imdb_test

# Bring every sequence to exactly `maxlen` entries so the reviews can be
# stacked into fixed-width arrays for the network.
pad_to_maxlen = keras.preprocessing.sequence.pad_sequences
x_train = pad_to_maxlen(x_train, maxlen=maxlen)
x_test = pad_to_maxlen(x_test, maxlen=maxlen)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Feed-forward sentiment classifier:
# token ids -> 32-d embeddings -> flatten -> 16 sigmoid units -> 1 sigmoid output.
model = keras.Sequential([
    # Declaring the input shape explicitly replaces Embedding's deprecated
    # `input_length` argument and builds the model immediately, so
    # model.summary() can report output shapes and parameter counts.
    keras.Input(shape=(maxlen,), dtype="int32"),
    layers.Embedding(input_dim=vocab_size, output_dim=32),
    layers.Flatten(),
    layers.Dense(16, activation="sigmoid"),   # hidden layer with sigmoid neurons
    layers.Dense(1, activation="sigmoid")     # output layer with sigmoid
])

model.summary()



In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked as the only extra metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# validation_split=0.2 holds out a fifth of the training arrays for the
# per-epoch validation metrics.
training_options = {"epochs": 5, "batch_size": 512, "validation_split": 0.2}
history = model.fit(x_train, y_train, **training_options)

Epoch 1/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 56ms/step - accuracy: 0.5368 - loss: 0.7115 - val_accuracy: 0.7272 - val_loss: 0.6105
Epoch 2/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.7775 - loss: 0.5565 - val_accuracy: 0.8426 - val_loss: 0.4242
Epoch 3/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8621 - loss: 0.3841 - val_accuracy: 0.8594 - val_loss: 0.3610
Epoch 4/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - accuracy: 0.8967 - loss: 0.3028 - val_accuracy: 0.8650 - val_loss: 0.3328
Epoch 5/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.9162 - loss: 0.2541 - val_accuracy: 0.8678 - val_loss: 0.3162


In [None]:
# Score the trained network on the test split. The model was compiled with a
# single metric ("accuracy"), and index 1 of the returned values is reported
# as that accuracy.
results = model.evaluate(x=x_test, y=y_test)
print(f"Test accuracy: {results[1]*100:.2f}%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8654 - loss: 0.3198
Test accuracy: 86.78%


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# The integer-encoded reviews are already in memory as x_train / x_test (loaded
# and padded in an earlier cell), so the dataset is not loaded a second time
# here; only the word <-> id mapping is fetched.

# Build an id -> word lookup to turn integer sequences back into text
# (CountVectorizer needs strings, not token ids).
word_index = keras.datasets.imdb.get_word_index()
reverse_word_index = {v: k for k, v in word_index.items()}

def decode_review(text_ints):
    """Turn a sequence of integer token ids into a space-separated string.

    Each id is shifted down by 3 before being looked up in the module-level
    ``reverse_word_index`` mapping; ids with no entry after the shift are
    rendered as "?". The offset presumably mirrors the default ``index_from=3``
    of ``keras.datasets.imdb.load_data`` — confirm if the loader arguments change.

    Args:
        text_ints: Iterable of integer token ids.

    Returns:
        The decoded words joined by single spaces ("" for an empty input).
    """
    words = []
    for token_id in text_ints:
        words.append(reverse_word_index.get(token_id - 3, "?"))
    return " ".join(words)

# Decode the same padded/truncated sequences the neural network was trained
# on. Ids that have no word after decoding become "?".
train_text_decoded = [decode_review(s) for s in x_train]
test_text_decoded = [decode_review(s) for s in x_test]

# Bag of words. The vocabulary cap is tied to vocab_size (instead of a second
# hard-coded 10000) so the baseline follows the network if the vocabulary
# size is changed.
vectorizer = CountVectorizer(max_features=vocab_size)
x_train_bow = vectorizer.fit_transform(train_text_decoded)   # learn vocabulary on train only
x_test_bow = vectorizer.transform(test_text_decoded)         # reuse the fitted vocabulary

# Train logistic regression (max_iter is raised to 1000)
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(x_train_bow, y_train)

# Evaluate: score() returns mean accuracy on the given data
lr_accuracy = lr_model.score(x_test_bow, y_test)
print(f"Logistic Regression test accuracy: {lr_accuracy*100:.2f}%")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Logistic Regression test accuracy: 84.56%
