# Text Summarization

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset

import nltk
from nltk.tokenize import sent_tokenize

In [2]:
dataset = load_dataset("cnn_dailymail", version="3.0.0")

Downloading builder script:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/default to /root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
print(f"Features: {dataset['train'].column_names}")

Features: ['article', 'highlights', 'id']


In [4]:
print(f"Dataset shape: {dataset['train'].shape}")

Dataset shape: (287113, 3)


In [5]:
print(f"Number of Documents in Dataset: {len(dataset['train'])}")

Number of Documents in Dataset: 287113


In [6]:
docs_df = pd.DataFrame()
docs_df["text"] = dataset["train"]["article"]
docs_df["summary"] = dataset["train"]["highlights"]
docs_df["id"] = dataset["train"]["id"]
docs_df.sample(5)

Unnamed: 0,text,summary,id
81928,"ISLAMABAD, Pakistan (CNN) -- Shazia knows what...",Pakistan's eunuch community campaigns for prop...,e82d14139e2cdc530b31cd54d1a2e006674bc2ea
229962,By . Declan Warrington . ‘The Celtic Warrior’ ...,'The Celtic Warrior' Steve Collins is set to t...,b5c99d80232f1aa8387f3edf09663258ee3a3c81
283915,By . Daily Mail Reporter . UPDATED: . 12:47 ES...,The superhydrophobic spray is now available at...,fbd50db9f9f9da59f5cfc8c800dd9f7dfde91e10
112646,By . Lawrence Booth . Follow @@the_topspin . S...,The visitors were on eight for four at the beg...,1d5745894c14dc8d98f7845cc94053e35d0f528e
270437,By . Daily Mail Reporter . PUBLISHED: . 10:17 ...,"John Eastman of Waterbury, Connecticut is accu...",ea44e457f1e07d9ce969943ba302d70416451462


In [7]:
sample = docs_df.loc[:,'text':'summary'].head(1)

In [8]:
print(f"""Article (excerpt of 500 characters, total length: {len(sample["text"])}):""")
print(sample["text"][:500])
print(f'\nSummary (length: {len(sample["summary"])}):')
print(sample["summary"])

Article (excerpt of 500 characters, total length: 1):
0    It's official: U.S. President Barack Obama wan...
Name: text, dtype: object

Summary (length: 1):
0    Syrian official: Obama climbed to the top of t...
Name: summary, dtype: object


In [9]:
sample_text = dataset["train"][1]["article"][:2000]
# We'll collect the generated summaries of each model in a dictionary
summaries = {}

In [10]:
sample_text

'(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men\'s 4x100m relay. The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds. The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover. The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles. The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital. "I\'m proud of myself and I\'ll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intention to carry on until the 2016 Rio Olympics. Victory was never se

***
## **Baseline Summarization model**
***

In [11]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
string = "The U.S. are a country. The U.N. is an organization."
sent_tokenize(sample_text)

["(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.",
 'The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.',
 'The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover.',
 'The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles.',
 'The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital.',
 '"I\'m proud of myself and I\'ll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intention to carry on until the 2016 Rio Olympics.'

In [13]:
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

In [14]:
summaries["baseline"] = three_sentence_summary(sample_text)

In [15]:
summaries['baseline']

"(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.\nThe fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.\nThe U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover."

***
## Extractive Summarization
***

In [16]:
!pip install pytextrank

Collecting pytextrank
  Downloading pytextrank-3.2.4-py3-none-any.whl (30 kB)
Collecting networkx[default]>=2.6
  Downloading networkx-2.6.3-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting graphviz>=0.13
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting icecream>=2.1
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.2.1-py2.py3-none-any.whl (26 kB)
Collecting executing>=0.3.1
  Downloading executing-1.2.0-py2.py3-none-any.whl (24 kB)
Installing collected packages: executing, networkx, graphviz, asttokens, icecream, pytextrank
  Attempting uninstall: networkx
    Found existing installation: networkx 2.5
    Uninstalling networkx-2.5:
      Successfully uninstal

In [17]:
import spacy
import pytextrank

In [18]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('textrank')
#tr = pytextrank.TextRank()
#nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)

<pytextrank.base.BaseTextRankFactory at 0x7fa85497cb10>

In [19]:
doc = nlp(sample_text)

# examine the top-ranked phrases in the document
for p in doc._.phrases:
    print('{:.4f} {:5d}  {}'.format(p.rank, p.count, p.text))
    print(p.chunks)

0.0946     2  English Gardner
[English Gardner, English Gardner]
0.0890     2  Alexandria Anderson
[Alexandria Anderson, Alexandria Anderson]
0.0802     2  Bolt ag
[Bolt ag, Bolt ag]
0.0771     9  Bolt
[Bolt, Bolt, Bolt, Bolt, Bolt, Bolt, Bolt, Bolt, Bolt]
0.0751     2  Usain Bolt
[Usain Bolt, Usain Bolt]
0.0737     1  United States rival Justin Gatlin
[United States rival Justin Gatlin]
0.0729     1  second place
[second place]
0.0710     4  France
[France, France, France, France]
0.0672     1  United States
[United States]
0.0669     2  world championships
[world championships, world championships]
0.0632     1  third leg runner Rakieem Salaam
[third leg runner Rakieem Salaam]
0.0629     1  silver
[silver]
0.0607     1  American trio Carl Lewis
[American trio Carl Lewis]
0.0603     3  second
[second, second, second]
0.0600     2  Nickel Ashmeade
[Nickel Ashmeade, Nickel Ashmeade]
0.0571     2  Allyson Felix
[Allyson Felix, Allyson Felix]
0.0570     1  the second handover
[the second 

In [20]:
for sent in doc._.textrank.summary(limit_phrases=15, limit_sentences=10):
    print(sent)

Defending champions, the United States, were initially back in the bronze medal position after losing time on the second handover between Alexandria Anderson and English Gardner, but promoted to silver when France were subsequently disqualified for an illegal handover.
The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.
The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles.
Their quartet recorded a championship record of 41.29 seconds, well clear of France, who crossed the line in second place in 42.73 seconds.
Fraser-Pryce, like Bolt ag
"I'm proud of myself and I'll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intention to carry on u

***
## Seq2Seq model
***

In [21]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
import re
from nltk.corpus import stopwords   
from tensorflow.keras.layers import Input, LSTM, GRU, Embedding,Attention, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
import time
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

In [22]:
nlp = spacy.blank("en")

In [23]:
reviews_df = pd.read_csv("../input/amazon-fine-food-reviews/Reviews.csv")

In [24]:
reviews_df.head(1)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labr...


**a) Text Cleaning**

In [25]:
#Removes non-alphabetic characters:
def text_strip(column):
    for row in column:
        
        #ORDER OF REGEX IS VERY VERY IMPORTANT!!!!!!
        
        row=re.sub("(\\t)", ' ', str(row)).lower() #remove escape charecters
        row=re.sub("(\\r)", ' ', str(row)).lower() 
        row=re.sub("(\\n)", ' ', str(row)).lower()
        
        row=re.sub("(__+)", ' ', str(row)).lower()   #remove _ if it occors more than one time consecutively
        row=re.sub("(--+)", ' ', str(row)).lower()   #remove - if it occors more than one time consecutively
        row=re.sub("(~~+)", ' ', str(row)).lower()   #remove ~ if it occors more than one time consecutively
        row=re.sub("(\+\++)", ' ', str(row)).lower()   #remove + if it occors more than one time consecutively
        row=re.sub("(\.\.+)", ' ', str(row)).lower()   #remove . if it occors more than one time consecutively
        
        row=re.sub(r"[<>()|&©ø\[\]\'\",;?~*!]", ' ', str(row)).lower() #remove <>()|&©ø"',;?~*!
        
        row=re.sub("(mailto:)", ' ', str(row)).lower() #remove mailto:
        row=re.sub(r"(\\x9\d)", ' ', str(row)).lower() #remove \x9* in text
        row=re.sub("([iI][nN][cC]\d+)", 'INC_NUM', str(row)).lower() #replace INC nums to INC_NUM
        row=re.sub("([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM', str(row)).lower() #replace CM# and CHG# to CM_NUM
        
        
        row=re.sub("(\.\s+)", ' ', str(row)).lower() #remove full stop at end of words(not between)
        row=re.sub("(\-\s+)", ' ', str(row)).lower() #remove - at end of words(not between)
        row=re.sub("(\:\s+)", ' ', str(row)).lower() #remove : at end of words(not between)
        
        row=re.sub("(\s+.\s+)", ' ', str(row)).lower() #remove any single charecters hanging between 2 space
        
        row = re.sub("(\s+)",' ',str(row)).lower() #remove multiple spaces
        
        #Should always be last
        row=re.sub("(\s+.\s+)", ' ', str(row)).lower() #remove any single charecters hanging between 2 spaces

        yield row

In [26]:
clean_text = text_strip(reviews_df['Text'])
clean_summary = text_strip(reviews_df['Summary'])

In [27]:
t = time.time()

text = [str(doc) for doc in nlp.pipe(clean_text, batch_size=5000)]

#print('Time to clean up everything: {} mins'.format(round((time.time() - t) / 60, 2)))

In [28]:
t = time.time()

#Batch the data points into 5000 and run on all cores for faster preprocessing
summary = ['sostok _START_ '+ str(doc) + ' _END_ eostok' for doc in nlp.pipe(clean_summary, batch_size=5000)]

#print('Time to clean up everything: {} mins'.format(round((time.time() - t) / 60, 2)))

In [29]:
fe=pd.DataFrame()

fe['final_text'] = pd.Series(text)
fe['final_summary'] = pd.Series(summary)

In [30]:
fe.head(1)

Unnamed: 0,final_text,final_summary
0,i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like stew than processed meat and it smells better my labrador i...,sostok _START_ good quality dog food _END_ eostok


In [31]:
#Model to summarize the text between 0-15 words for Summary and 0-60 words for Text
max_text_length = 60  
max_summary_length = 8

In [32]:
txt=[]
summ=[]

clean_text =np.array(fe['final_text'])
clean_summary=np.array(fe['final_summary'])

for i in range(len(clean_text)):
    if(len(clean_summary[i].split())<=max_summary_length and len(clean_text[i].split())<=max_text_length):
        txt.append(clean_text[i])
        summ.append(clean_summary[i])
        
fe_df = pd.DataFrame({'text':txt,'summary':summ})

In [33]:
fe_df.head()

Unnamed: 0,text,summary
0,i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like stew than processed meat and it smells better my labrador i...,sostok _START_ good quality dog food _END_ eostok
1,product arrived labeled as jumbo salted peanuts the peanuts were actually small sized unsalted not sure if this was an error or if the vendor intended to represent the product as jumbo .,sostok _START_ not as advertised _END_ eostok
2,if you are looking for the secret ingredient in robitussin believe have found it got this in addition to the root beer extract ordered which was good and made some cherry soda the flavor is very m...,sostok _START_ cough medicine _END_ eostok
3,great taffy at great price there was wide assortment of yummy taffy delivery was very quick if your taffy lover this is deal.,sostok _START_ great taffy _END_ eostok
4,this taffy is so good it is very soft and chewy the flavors are amazing would definitely recommend you buying it very satisfying,sostok _START_ wonderful tasty taffy _END_ eostok


For training purpose with hardware constraint the size is reduced

In [34]:
fe_df = fe_df.loc[:0.4*len(fe_df)]

Split the data to TRAIN and VALIDATION datasets

In [35]:
from sklearn.model_selection import train_test_split
x_train,x_val,y_train,y_val=train_test_split(np.array(fe_df['text']),np.array(fe_df['summary']),test_size=0.2,random_state=0,shuffle=True)

In [36]:
x_train[0],y_train[0]

('great jerky spicy flavorful little pricy but worth it if you like jerky will be buying more but check out tillamook website cause it might be cheaper there ',
 'sostok _START_ good stuff  _END_ eostok')

In [37]:
len(x_train),len(x_val)

(77952, 19489)

***
**Preparing the Tokenizer**

A tokenizer builds the vocabulary and converts a word sequence to an integer sequence. Go ahead and build tokenizers for text and summary:

a) Text Tokenizer
***

In [38]:
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_train))

#convert text sequences into integer sequences
x_train    =   x_tokenizer.texts_to_sequences(x_train) 
x_val   =   x_tokenizer.texts_to_sequences(x_val)

#padding zero upto maximum length
x_train    =   pad_sequences(x_train,  maxlen=max_text_length, padding='post') 
x_val   =   pad_sequences(x_val, maxlen=max_text_length, padding='post')

In [39]:
#size of vocabulary ( +1 for padding token)
x_voc_size   =  len(x_tokenizer.word_index) +1

print("Size of vocabulary in X = {}".format(x_voc_size))

Size of vocabulary in X = 31212


b) Summary Tokenizer

In [40]:
#preparing a tokenizer for summary on training data 
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_train))

#convert summary sequences into integer sequences
y_train    =   y_tokenizer.texts_to_sequences(y_train) 
y_val   =   y_tokenizer.texts_to_sequences(y_val) 

#padding zero upto maximum length
y_train    =   pad_sequences(y_train, maxlen=max_summary_length, padding='post')
y_val   =   pad_sequences(y_val, maxlen=max_summary_length, padding='post')

In [41]:
y_voc_size  =   len(y_tokenizer.word_index) +1

## Seq2Seq Attention mechanism model building

In [42]:
latent_dim = 300 

# Encoder
encoder_inputs = Input(shape=(max_text_length,))

#embedding layer
enc_emb =  Embedding(x_voc_size, latent_dim,trainable=True)(encoder_inputs)

#encoder lstm 1
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

#encoder lstm 2
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

#encoder lstm 3
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))

#embedding layer
dec_emb_layer = Embedding(y_voc_size, latent_dim,trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.5,recurrent_dropout=0.3)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

#dense layer
decoder_dense =  TimeDistributed(Dense(y_voc_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model 
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

2023-01-20 03:27:01.529927: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-20 03:27:01.531686: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-20 03:27:01.532860: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-20 03:27:01.534268: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 60, 300)      9363600     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 60, 300), (N 721200      embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

In [43]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [44]:
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)
epochs = 20
BATCH_SIZE = 128

In [45]:
history=model.fit([x_train,y_train[:,:-1]], y_train.reshape(y_train.shape[0],y_train.shape[1], 1)[:,1:] ,epochs=epochs,callbacks=[early_stop],batch_size=BATCH_SIZE, validation_data=([x_val,y_val[:,:-1]], y_val.reshape(y_val.shape[0],y_val.shape[1], 1)[:,1:]))


2023-01-20 03:27:11.937620: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20


2023-01-20 03:27:17.945251: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 38373888 exceeds 10% of free system memory.
2023-01-20 03:27:17.986640: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 38373888 exceeds 10% of free system memory.
2023-01-20 03:27:18.045366: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 38373888 exceeds 10% of free system memory.
2023-01-20 03:27:18.075854: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 38373888 exceeds 10% of free system memory.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 00015: early stopping


In [46]:
model.save("seq2seq_model.h5")

2023-01-20 04:40:13.952736: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 37454400 exceeds 10% of free system memory.


 build the dictionary to convert the index to word for target and source vocabulary:

In [47]:
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

In [48]:
# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(max_text_length,latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs) 
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2) 

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

In [49]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)
    
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    
    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
      
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]
        
        if(sampled_token!='eostok'):
            decoded_sentence += ' '+sampled_token

        # Exit condition: either hit max length or find stop word.
        if (sampled_token == 'eostok'  or len(decoded_sentence.split()) >= (max_summary_length-1)):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        e_h, e_c = h, c

    return decoded_sentence

In [50]:
def seq2summary(input_seq):
    newString=''
    for i in input_seq:
        if((i!=0 and i!=target_word_index['sostok']) and i!=target_word_index['eostok']):
            newString=newString+reverse_target_word_index[i]+' '
    return newString

def seq2text(input_seq):
    newString=''
    for i in input_seq:
        if(i!=0):
            newString=newString+reverse_source_word_index[i]+' '
    return newString

In [51]:
for i in range(0,10):
    print("Review:",seq2text(x_train[i]))
    print("Original summary:",seq2summary(y_train[i]))
    print("Predicted summary:",decode_sequence(x_train[i].reshape(1,max_text_length)))
    print("\n")

Review: great jerky spicy flavorful little pricy but worth it if you like jerky will be buying more but check out tillamook website cause it might be cheaper there 
Original summary: start good stuff end 
Predicted summary:  start good stuff end


Review: i didn know what to expect when ordering these but gave it try anyway kinda like mix between cracker and pretzel chunk with sunflower seeds the salt and pepper flavoring is great and tends to be self limiting in consumption the pepper keeps you from eating too many at time very tasty 
Original summary: start very tasty end 
Predicted summary:  start very tasty end


Review: good coffee at very reasonable price approx 5 00 lb prefer the whole bean but this was extremely good for the price subscription price 
Original summary: start good coffee end 
Predicted summary:  start good coffee end


Review: i have big dogs and they love these treats in their everlasting balls my golden retriever can make one treat last days but the black lab c

***
## Abstractive Summarization
***

In [52]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

from transformers import pipeline, set_seed

## **Using T5 Transformer**

In [53]:
pipe = pipeline("summarization", model="t5-large")
pipe_out = pipe(sample_text)
summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.75G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [54]:
summaries['t5']

"usain bolt wins his third gold medal of the world championships in the men's 4x100m relay .\nthe 26-year-old anchored Jamaica to victory in the event in the Russian capital .\nhe has now collected eight gold medals at the championships, equaling the record ."

***
## **Using BART model**
***

BART also uses an encoder-decoder architecture and is trained to reconstruct corrupted inputs. It combines the pretraining schemes of BERT and GPT-2.

In [55]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

Downloading:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [56]:
summaries['bart']

"Usain Bolt wins his third gold of the world championships in Moscow.\nBolt anchors Jamaica to victory in the men's 4x100m relay.\nThe 26-year-old has now won eight gold medals at world championships.\nJamaica's women also win gold in the relay, beating France in the process."

***
## Using Pegasus Model
***

In [57]:
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [58]:
summaries['pegasus']

"Usain Bolt wins third gold of world championships.\nAnchors Jamaica to victory in men's 4x100m relay.\nEighth gold at the championships for Bolt.\nJamaica also win women's 4x100m relay ."