In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, MultiHeadAttention, LayerNormalization, Dropout, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import sent_tokenize
import re

In [12]:
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    return text

In [4]:
def load_imdb_data():
    imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

    train_data = imdb['train']
    train_sentences, train_labels = [], []

    for sentence, label in tfds.as_numpy(train_data):
        train_sentences.append(clean_text(sentence.decode('utf-8')))
        train_labels.append(int(label))

    return train_sentences, train_labels

In [5]:
def summarize_document(document, model, tokenizer, max_len, top_n=3):
    sentences = sent_tokenize(document)
    sequence_data = tokenizer.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequence_data, maxlen=max_len, padding='post')

    predictions = model.predict(padded_sequences)

    sentence_scores = [(index, score) for index, score in enumerate(predictions)]
    top_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)[:top_n]

    summary = [sentences[i] for i, _ in top_sentences]
    return ' '.join(summary)

In [6]:
train_sentences, train_labels = load_imdb_data()

max_len = 100
embedding_dim = 128
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_sentences)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')

model = tf.keras.Sequential([
tf.keras.layers.Embedding(max_words, embedding_dim, input_length=max_len),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(24, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.INIADG_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.INIADG_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.INIADG_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.




In [7]:
num_epochs = 5
model.fit(train_padded, np.array(train_labels), epochs=num_epochs, validation_split=0.2)

model.save("/content/drive/MyDrive/Temp/imdb_model.h5")

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - accuracy: 0.7308 - loss: 0.5099 - val_accuracy: 0.8268 - val_loss: 0.3835
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - accuracy: 0.9033 - loss: 0.2468 - val_accuracy: 0.8548 - val_loss: 0.3497
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.9372 - loss: 0.1726 - val_accuracy: 0.8454 - val_loss: 0.3871
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.9553 - loss: 0.1246 - val_accuracy: 0.8324 - val_loss: 0.4329
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9709 - loss: 0.0888 - val_accuracy: 0.8268 - val_loss: 0.6386




In [8]:
imdb_model = tf.keras.models.load_model('/content/drive/MyDrive/Temp/imdb_model.h5')

for layer in imdb_model.layers:
    layer.trainable = False

max_length = 100
input_shape = (max_length,)
inputs = tf.keras.Input(shape=input_shape)

x = imdb_model(inputs)

x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(32, activation='relu')(x)
x = tf.keras.layers.Dense(1, activation='sigmoid')(x)

summarization_model = tf.keras.Model(inputs=inputs, outputs=x)

summarization_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

summarization_model.summary()



In [19]:
# Example document for summarization with more sentences
example_document = (
  "This movie was fantastic! I loved the plot and the acting was superb. "
  "However, I felt that the ending was a bit rushed. The cinematography was stunning, "
  "and the music added a wonderful depth to the scenes. The character development was rich, "
  "and I appreciated how the film tackled complex themes. I found myself emotionally invested "
  "in the characters and their journeys. There were moments of humor that broke the tension, "
  "which I enjoyed. Overall, it was an enjoyable experience, but it could have been more impactful "
  "if the pacing had been better. I would recommend this film to anyone who loves a good story. "
  "It provided a perfect mix of humor and drama, and I think it will resonate well with many audiences. "
  "The performances were not only entertaining but also very moving, capturing the essence of the story."
)

summary = summarize_document(example_document, summarization_model, tokenizer, max_len)

print("Summary:")
print(summary)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425ms/step
Summary:
It provided a perfect mix of humor and drama, and I think it will resonate well with many audiences. The performances were not only entertaining but also very moving, capturing the essence of the story. I loved the plot and the acting was superb.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True