In [1]:
import re
import pandas as pd
import numpy as np
import nltk.corpus
from datasets import load_dataset
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load small slices of the CNN/DailyMail 3.0.0 dataset: the "train[:20]" and
# "validation[:5]" split expressions keep the notebook quick to run end to end.
train = load_dataset("cnn_dailymail", "3.0.0", split="train[:20]")
val = load_dataset("cnn_dailymail", "3.0.0", split="validation[:5]")
# Display the shape of each split as a sanity check
val.shape, train.shape

((5, 3), (20, 3))

In [3]:
def clean(text, stop_words=None):
    """Normalize raw text into a lowercase, space-separated string of content words.

    Processing order:
      1. Lowercase the text.
      2. Replace every whitespace character (tab, carriage return, newline, ...)
         with a plain space. This has to happen before punctuation is stripped,
         otherwise words separated only by a line break get fused together.
      3. Remove @mentions, URLs, a leading "rt" marker, stray "http" fragments,
         and every remaining character that is not an ASCII letter, digit or space.
      4. Drop stop words; splitting and re-joining also collapses repeated spaces.

    Parameters
    ----------
    text : str
        Raw input text.
    stop_words : iterable of str, optional
        Lowercase words to remove. When omitted, NLTK's English stop-word
        list is used.

    Returns
    -------
    str
        The cleaned text, with no leading/trailing whitespace.
    """
    # normalize characters
    text = text.lower()

    # turn all whitespace into spaces first so line breaks still separate words
    text = re.sub(r"\s", " ", text)

    # remove mentions, punctuation / non-ascii characters, URLs and other patterns
    # (the mention alternative must come before the catch-all character strip,
    # which would otherwise consume the leading "@" on its own)
    text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z ])|(\w+:\/\/\S+)|^rt|http.+?", "", text)

    # remove stopwords (set membership instead of scanning a list for every word)
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_set = set(stop_words)

    return " ".join(word for word in text.split() if word not in stop_set)

In [4]:
# Apply the cleaning routine to every article and every highlight of both splits
train_clean_articles = [clean(article) for article in train['article']]
train_clean_summaries = [clean(highlight) for highlight in train['highlights']]

val_clean_articles = [clean(article) for article in val['article']]
val_clean_summaries = [clean(highlight) for highlight in val['highlights']]



In [5]:
# Collect the cleaned texts into one two-column frame per split
train_clean = pd.DataFrame({
    'train_cleaned_article': train_clean_articles,
    'train_cleaned_summary': train_clean_summaries,
})

val_clean = pd.DataFrame({
    'val_cleaned_article': val_clean_articles,
    'val_cleaned_summary': val_clean_summaries,
})

In [6]:
# Fraction of cleaned articles that contain at most 1000 words
cnt = sum(1 for article in train_clean['train_cleaned_article']
          if len(article.split()) <= 1000)
print(cnt / len(train_clean['train_cleaned_article']))

# Fraction of cleaned summaries that contain at most 75 words
cnt = sum(1 for summary in train_clean['train_cleaned_summary']
          if len(summary.split()) <= 75)
print(cnt / len(train_clean['train_cleaned_summary']))

1.0
1.0


In [7]:
# Word-count ceilings used for filtering here and for padding further down
max_article_len = 1000
max_summary_len = 75

train_cleaned_article = np.array(train_clean['train_cleaned_article'])
train_cleaned_summary = np.array(train_clean['train_cleaned_summary'])

# Keep only the (article, summary) pairs where both texts fit within the limits
short_article = []
short_summary = []

for article, summary in zip(train_cleaned_article, train_cleaned_summary):
    summary_fits = len(summary.split()) <= max_summary_len
    if summary_fits and len(article.split()) <= max_article_len:
        short_article.append(article)
        short_summary.append(summary)

post_train_clean = pd.DataFrame({'article': short_article, 'summary': short_summary})

post_train_clean.head(2)


Unnamed: 0,article,summary
0,london england reuters harry potter star danie...,harry potter star daniel radcliffe gets 20m fo...
1,editors note behind scenes series cnn correspo...,mentally ill inmates miami housed forgotten fl...


In [8]:
# Wrap every training summary in explicit start / end marker tokens

post_train_clean['summary'] = post_train_clean['summary'].map(
    lambda summary: ' '.join(('senstart', summary, 'senend')))

post_train_clean.head(2)

Unnamed: 0,article,summary
0,london england reuters harry potter star danie...,senstart harry potter star daniel radcliffe ge...
1,editors note behind scenes series cnn correspo...,senstart mentally ill inmates miami housed for...


In [9]:
val_cleaned_article = np.array(val_clean['val_cleaned_article'])
val_cleaned_summary = np.array(val_clean['val_cleaned_summary'])

# Keep only the validation pairs where both texts fit within the length limits
short_article = []
short_summary = []

for article, summary in zip(val_cleaned_article, val_cleaned_summary):
    summary_fits = len(summary.split()) <= max_summary_len
    if summary_fits and len(article.split()) <= max_article_len:
        short_article.append(article)
        short_summary.append(summary)

post_val_clean = pd.DataFrame({'article': short_article, 'summary': short_summary})

post_val_clean.head(2)

Unnamed: 0,article,summary
0,cnnshare gift multiplied may sound like esoter...,zully broussard decided give kidney stranger n...
1,cnnon 6th april 1996 san jose clash dc united ...,20th mls season begins weekend league changed ...


In [10]:
# Wrap every validation summary in explicit start / end marker tokens

post_val_clean['summary'] = post_val_clean['summary'].map(
    lambda summary: ' '.join(('senstart', summary, 'senend')))

post_val_clean.head(2)

Unnamed: 0,article,summary
0,cnnshare gift multiplied may sound like esoter...,senstart zully broussard decided give kidney s...
1,cnnon 6th april 1996 san jose clash dc united ...,senstart 20th mls season begins weekend league...


In [11]:
# Article texts of each split as plain Python lists
x_train = post_train_clean['article'].tolist()
x_val = post_val_clean['article'].tolist()

# First pass: fit on the training articles only, to collect word frequencies
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(x_train)

In [12]:
# A word is "rare" when it occurs fewer than `thresh` times in the training articles
thresh = 3

tot_cnt = len(x_tokenizer.word_counts)
cnt = sum(1 for freq in x_tokenizer.word_counts.values() if freq < thresh)

print("% of rare words in vocabulary: ", (cnt / tot_cnt) * 100)

% of rare words in vocabulary:  77.8688524590164


In [13]:
# Prepare a tokenizer, again -- by not considering the rare words.
# Keras only emits word indices strictly below `num_words` (i.e. the
# num_words - 1 most frequent words), so add 1 to keep all of the
# `tot_cnt - cnt` non-rare words instead of silently dropping one of them.
x_tokenizer = Tokenizer(num_words=tot_cnt - cnt + 1)
x_tokenizer.fit_on_texts(x_train)

# Convert text sequences to integer sequences
x_train_seq = x_tokenizer.texts_to_sequences(x_train)
x_val_seq = x_tokenizer.texts_to_sequences(x_val)

# Pad with zeros (at the end) up to the maximum length
x_train = pad_sequences(x_train_seq, maxlen=max_article_len, padding='post')
x_val = pad_sequences(x_val_seq, maxlen=max_article_len, padding='post')

# Size of vocabulary: emitted indices are 1 .. num_words - 1 and index 0 is the
# padding token, so `num_words` distinct ids need an embedding row.
x_voc = x_tokenizer.num_words

print("Size of vocabulary in X = {}".format(x_voc))

Size of vocabulary in X = 595


In [14]:
# Summary texts (already wrapped in senstart / senend) of each split as plain lists
y_train = post_train_clean['summary'].tolist()
y_val = post_val_clean['summary'].tolist()

In [15]:
# Prepare a tokenizer on the training summaries to collect word frequencies
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(y_train)

# A word is "rare" when it occurs fewer than `thresh` times
thresh = 3
tot_cnt = len(y_tokenizer.word_counts)
cnt = sum(1 for freq in y_tokenizer.word_counts.values() if freq < thresh)

print("% of rare words in vocabulary:",(cnt / tot_cnt) * 100)

# Prepare a tokenizer, again -- by not considering the rare words.
# Keras only emits word indices strictly below `num_words` (i.e. the
# num_words - 1 most frequent words), so add 1 to keep all of the
# `tot_cnt - cnt` non-rare words instead of silently dropping one of them.
y_tokenizer = Tokenizer(num_words=tot_cnt - cnt + 1)
y_tokenizer.fit_on_texts(y_train)

# Convert text sequences to integer sequences
y_train_seq = y_tokenizer.texts_to_sequences(y_train)
y_val_seq = y_tokenizer.texts_to_sequences(y_val)

# Pad with zeros (at the end) up to the maximum length
y_train = pad_sequences(y_train_seq, maxlen=max_summary_len, padding='post')
y_val = pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

# Size of vocabulary: emitted indices are 1 .. num_words - 1 and index 0 is the
# padding token, so `num_words` distinct ids need an embedding / softmax slot.
y_voc = y_tokenizer.num_words

print("Size of vocabulary in Y = {}".format(y_voc))

% of rare words in vocabulary: 95.69377990430623
Size of vocabulary in Y = 19


### Model

In [16]:
# Training-time encoder/decoder graph: three stacked encoder LSTMs feed their
# final states into a single decoder LSTM, followed by a per-step softmax.
latent_dim = 200      # units in every LSTM layer
embedding_dim = 100   # width of both embedding layers

# Encoder: takes padded article id sequences of length max_article_len
encoder_inputs = Input(shape=(max_article_len, ))

# Embedding layer
enc_emb = Embedding(x_voc, embedding_dim,
                    trainable=True)(encoder_inputs)

# Encoder LSTM 1
encoder_lstm1 = LSTM(latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_output1, state_h1, state_c1) = encoder_lstm1(enc_emb)

# Encoder LSTM 2 (consumes the full output sequence of LSTM 1)
encoder_lstm2 = LSTM(latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_output2, state_h2, state_c2) = encoder_lstm2(encoder_output1)

# Encoder LSTM 3 -- its states (state_h, state_c) initialise the decoder below.
# encoder_outputs is not consumed by the decoder in this cell.
encoder_lstm3 = LSTM(latent_dim, return_state=True,
                     return_sequences=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_outputs, state_h, state_c) = encoder_lstm3(encoder_output2)

# Set up the decoder, using encoder_states as the initial state.
# shape=(None, ) leaves the number of decoder time steps unspecified.
decoder_inputs = Input(shape=(None, ))

# Embedding layer (kept as a layer object so it can be called again on other inputs)
dec_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM
# NOTE: decoder_fwd_state / decoder_back_state are the 2nd and 3rd return values
# of the LSTM call, i.e. the same positions as state_h / state_c above, despite
# their "fwd"/"back" names. They are not used again in this cell.
decoder_lstm = LSTM(latent_dim, return_sequences=True,
                    return_state=True, dropout=0.4,
                    recurrent_dropout=0.2)
(decoder_outputs, decoder_fwd_state, decoder_back_state) = \
    decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Dense layer: softmax over the y_voc target ids, applied at every time step
decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model: (article ids, decoder input ids) -> per-step word distribution
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [17]:

# Print the layer-by-layer overview (output shapes and parameter counts)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1000)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1000, 100)            59500     ['input_1[0][0]']             
                                                                                                  
 lstm (LSTM)                 [(None, 1000, 200),          240800    ['embedding[0][0]']           
                              (None, 200),                                                        
                              (None, 200)]                                                        
                                                                                              

In [18]:
# The sparse loss variant is used; the targets passed to fit() are integer word ids
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Early-stopping callback watching val_loss (lower is better) with patience=2
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

In [19]:
# Number of training examples that survived the length filter
len(x_train)

20

In [20]:
# Teacher forcing: the decoder is fed each summary without its last position
# (y[:, :-1]) and is trained to predict the same summary shifted one step to the
# left (y[:, 1:]); the targets are reshaped to carry a trailing axis of size 1.
model.fit(
    [x_train, y_train[:, :-1]],
    y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:, 1:],
    epochs=7,
    callbacks=[es],
    batch_size=128,
    validation_data=([x_val, y_val[:, :-1]],
                     y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:
                     , 1:]),)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x1b5f47fcc90>

In [21]:
# Lookup tables used when decoding model output back into text
reverse_target_word_index = y_tokenizer.index_word   # summary id -> word
reverse_source_word_index = x_tokenizer.index_word   # article id -> word
target_word_index = y_tokenizer.word_index           # summary word -> id

In [22]:
# Inspect the summary word -> id mapping (displays the whole dictionary)
target_word_index

{'senstart': 1,
 'senend': 2,
 'says': 3,
 'president': 4,
 'bush': 5,
 'new': 6,
 'first': 7,
 'found': 8,
 'us': 9,
 'cnn': 10,
 'colonoscopy': 11,
 'nfl': 12,
 'two': 13,
 'killed': 14,
 'snow': 15,
 'say': 16,
 'july': 17,
 'bees': 18,
 'potter': 19,
 'turns': 20,
 'monday': 21,
 'five': 22,
 'hes': 23,
 'driver': 24,
 'wednesday': 25,
 'powers': 26,
 'transferred': 27,
 'vice': 28,
 'routine': 29,
 'chief': 30,
 'falcons': 31,
 'without': 32,
 'pay': 33,
 'vick': 34,
 'violence': 35,
 'prostitution': 36,
 'group': 37,
 'children': 38,
 '2002': 39,
 'press': 40,
 'empty': 41,
 'weapon': 42,
 'home': 43,
 'london': 44,
 'friday': 45,
 'club': 46,
 '2004': 47,
 'since': 48,
 'oakland': 49,
 'police': 50,
 'miles': 51,
 '15': 52,
 'beckham': 53,
 'contract': 54,
 'collapse': 55,
 'disorder': 56,
 'bank': 57,
 'harry': 58,
 'star': 59,
 'daniel': 60,
 'radcliffe': 61,
 'gets': 62,
 '20m': 63,
 'fortune': 64,
 '18': 65,
 'young': 66,
 'actor': 67,
 'plans': 68,
 'fritter': 69,
 'cash': 

In [23]:
# Inference Models
# These reuse the layers created for training (dec_emb_layer, decoder_lstm,
# decoder_dense), so both models share the trained weights.

# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs,
                      state_h, state_c])

# Decoder setup

# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim, ))
decoder_state_input_c = Input(shape=(latent_dim, ))
# NOTE: this input is part of decoder_model's signature below, but no layer in
# this cell consumes it.
decoder_hidden_state_input = Input(shape=(max_article_len, latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2 = dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
# NOTE: state_h2 / state_c2 rebind the names first assigned by encoder LSTM 2
# in the model-definition cell.
(decoder_outputs2, state_h2, state_c2) = decoder_lstm(dec_emb2,
        initial_state=[decoder_state_input_h, decoder_state_input_c])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final decoder model:
#   inputs  -> [decoder token ids, encoder output sequence, previous h, previous c]
#   outputs -> [per-step word distribution, new h, new c]
decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input,
                      decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs2] + [state_h2, state_c2])

In [24]:
def decode_sequence(input_seq):
    """Greedily decode a summary for one encoded article.

    The article is encoded once; the decoder is then run one token at a time,
    starting from the 'senstart' token and feeding back the most probable
    token together with the updated LSTM states.

    Decoding stops when
      * the 'senend' token is predicted,
      * an id with no entry in ``reverse_target_word_index`` is predicted
        (notably the padding id 0, which is a valid softmax class but has no
        word; the original lookup raised ``KeyError`` for it), or
      * ``max_summary_len - 1`` words have been produced.

    Parameters
    ----------
    input_seq : array-like
        Padded article id sequence in the form expected by ``encoder_model``
        (the notebook passes an array reshaped to ``(1, max_article_len)``).

    Returns
    -------
    str
        The decoded words, each preceded by a single space (empty string when
        nothing was decoded).
    """
    # Encode the input as state vectors (verbose=0: no progress bar per call).
    (e_out, e_h, e_c) = encoder_model.predict(input_seq, verbose=0)

    # Target sequence of length 1, seeded with the start word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['senstart']

    decoded_words = []

    while True:
        (output_tokens, h, c) = decoder_model.predict([target_seq]
                + [e_out, e_h, e_c], verbose=0)

        # Greedy choice: most probable token at the last time step
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_token = reverse_target_word_index.get(sampled_token_index)

        # Padding / unknown id or the end marker: the summary is finished.
        if sampled_token is None or sampled_token == 'senend':
            break

        decoded_words.append(sampled_token)

        # Exit condition: hit max length.
        if len(decoded_words) >= max_summary_len - 1:
            break

        # Feed the sampled token back in as the next decoder input
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        (e_h, e_c) = (h, c)

    # Same format as before: every word is prefixed with one space
    return ''.join(' ' + word for word in decoded_words)

In [25]:
# To convert sequence to summary
def seq2summary(input_seq):
    """Turn a padded summary id sequence back into text.

    Padding (0) and the 'senstart' / 'senend' marker ids are skipped; every
    remaining id is looked up in ``reverse_target_word_index``. Each word in
    the returned string is followed by a single space.
    """
    pieces = []
    for token_id in input_seq:
        if token_id == 0:
            continue
        if token_id == target_word_index['senstart'] \
                or token_id == target_word_index['senend']:
            continue
        pieces.append(reverse_target_word_index[token_id] + ' ')

    return ''.join(pieces)


# To convert sequence to text
def seq2text(input_seq):
    """Turn a padded article id sequence back into text.

    Padding (0) is skipped; every other id is looked up in
    ``reverse_source_word_index``. Each word in the returned string is
    followed by a single space.
    """
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

In [26]:
# Show source text, reference summary and model prediction for the first two training rows
for row in range(2):
    article_ids = x_train[row]
    print('Review:', seq2text(article_ids))
    print('Original summary:', seq2summary(y_train[row]))
    print('Predicted summary:',
          decode_sequence(article_ids.reshape(1, max_article_len)))
    print('\n')

Review: london england reuters harry potter star radcliffe reported million million 18 monday money wont radcliffe harry potter harry potter order around world young says cash away cars celebrity dont one people turn 18 car told australian earlier month dont think ill things like things pounds 18 radcliffe able see part six number one movie uk hell birthday ill sort said none first five potter held fund able says feet ground people say star goes told reporters last month try hard go way would latest boy harry potter order last two watch give latest life potter movie called boy son later year also appear december boys australian four boys earlier year made playing meanwhile even closer hes think im going sort game told reuters email friend 2007 reuters may 
Original summary: says first 
Predicted summary:  senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart senstart sen

In [27]:
# Inspect the summary id -> word mapping (displays the whole dictionary)
reverse_target_word_index

{1: 'senstart',
 2: 'senend',
 3: 'says',
 4: 'president',
 5: 'bush',
 6: 'new',
 7: 'first',
 8: 'found',
 9: 'us',
 10: 'cnn',
 11: 'colonoscopy',
 12: 'nfl',
 13: 'two',
 14: 'killed',
 15: 'snow',
 16: 'say',
 17: 'july',
 18: 'bees',
 19: 'potter',
 20: 'turns',
 21: 'monday',
 22: 'five',
 23: 'hes',
 24: 'driver',
 25: 'wednesday',
 26: 'powers',
 27: 'transferred',
 28: 'vice',
 29: 'routine',
 30: 'chief',
 31: 'falcons',
 32: 'without',
 33: 'pay',
 34: 'vick',
 35: 'violence',
 36: 'prostitution',
 37: 'group',
 38: 'children',
 39: '2002',
 40: 'press',
 41: 'empty',
 42: 'weapon',
 43: 'home',
 44: 'london',
 45: 'friday',
 46: 'club',
 47: '2004',
 48: 'since',
 49: 'oakland',
 50: 'police',
 51: 'miles',
 52: '15',
 53: 'beckham',
 54: 'contract',
 55: 'collapse',
 56: 'disorder',
 57: 'bank',
 58: 'harry',
 59: 'star',
 60: 'daniel',
 61: 'radcliffe',
 62: 'gets',
 63: '20m',
 64: 'fortune',
 65: '18',
 66: 'young',
 67: 'actor',
 68: 'plans',
 69: 'fritter',
 70: 'cas

In [29]:
# Inspect the first encoded training summary, including its zero padding
y_train[0]

array([1, 3, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])