In [66]:
import pickle
import re
import string

import contractions
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import load_dataset

from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

In [67]:
# Sample input data: a one-element list holding a single raw news article.
# The text is passed to summarize() exactly as written here (no cleaning is
# applied in this notebook before tokenisation).
articles = [
    """BOGOTA, Colombia (CNN) -- A key rebel commander and fugitive from a U.S. drug trafficking indictment was killed over the weekend in an air attack on a guerrilla encampment, the Colombian military said Monday. Alleged cocaine trafficker and FARC rebel Tomas Medina Caracas in an Interpol photo. Tomas Medina Caracas, known popularly as "El Negro Acacio," was a member of the high command of the Fuerzas Armadas Revolucionarias de Colombia and, according to Colombian and U.S. officials, helped manage the group's extensive cocaine trafficking network. He had been in the cross-hairs of the U.S. Justice Department since 2002. He was charged with conspiracy to import cocaine into the United States and manufacturing and distributing cocaine within Colombia to fund the FARC's 42-year insurgency against the government. U.S. officials alleged Medina Caracas managed the rebel group's sales of cocaine to international drug traffickers, who in turn smuggled it into the United States. He was also indicted in the United States along with two other FARC commanders in November 2002 on charges of conspiring to kidnap two U.S. oil workers from neighboring Venezuela in 1997 and holding one of them for nine months until a $1 million ransom was paid. Officials said the army's Rapid Response Force, backed by elements of the Colombian Air Force, tracked Medina Caracas down at a FARC camp in the jungle in the south of the country. "After a bombardment, the troops occupied the camp, and they've found 14 dead rebels so far, along with rifles, pistols, communications equipment and ... four GPS systems," Defense Minister Juan Manuel Santos said at a news conference. "The death of 'El Negro Acacio' was confirmed by various sources, including members of FARC itself." Medina Caracas commanded FARC's 16th Front in the southern departments of Vichada and Guainia. 
Established in 1964 as the military wing of the Colombian Communist Party, FARC is Colombia's oldest, largest, most capable and best-equipped Marxist rebel group, according to the U.S. Department of State. E-mail to a friend . Journalist Fernando Ramos contributed to this report."""
]

# Summary text paired by position with `articles` (presumably the reference /
# ground-truth highlight for the article above — TODO confirm).
# NOTE(review): no cell visible in this notebook reads `summaries`; it appears
# to be kept only for eyeballing the generated output against.
summaries = [
    """Tomas Medina Caracas was a fugitive from a U.S. drug trafficking indictment . "El Negro Acacio" allegedly helped manage extensive cocaine network . U.S. Justice Department indicted him in 2002 . Colombian military: He was killed in an attack on a guerrilla encampment ."""
]

In [68]:
# Restore the tokenizer used for article text (presumably a Keras `Tokenizer`
# fitted at training time — it is used below via `texts_to_sequences` and
# `word_index`).
# SECURITY: unpickling can execute arbitrary code embedded in the file. Only
# load tokenizer pickles that were produced by a trusted training run.
with open('article_tokenizer.pickle', mode='rb') as handle:
    article_tokenizer = pickle.load(handle)

In [69]:
# Restore the tokenizer used for summary text; its `word_index` / `index_word`
# mappings drive decoding in summarize().
# SECURITY: unpickling can execute arbitrary code embedded in the file. Only
# load tokenizer pickles that were produced by a trusted training run.
with open('summary_tokenizer.pickle', mode='rb') as handle:
    summary_tokenizer = pickle.load(handle)

In [108]:
# --- Sequence lengths (in tokens) -------------------------------------------
# Articles are padded/truncated to this length before encoding.
article_max_len = 500
# Upper bound on generated summary length; also the decoder input length.
summary_max_len = 50

# --- Model width ------------------------------------------------------------
embedding_dim = 128

# --- Vocabulary sizes -------------------------------------------------------
# Each size is the tokenizer's word count plus 3 (written as `+ 1 + 2` when
# this notebook was first authored).  The 1 presumably covers the reserved
# padding index 0; the purpose of the extra 2 slots is not shown in this
# notebook — TODO confirm against the training code.  These values must match
# the ones used when the saved weights were produced.
article_vocab_size = len(article_tokenizer.word_index) + 3
summary_vocab_size = len(summary_tokenizer.word_index) + 3

In [109]:
# Encoder: article token ids -> embedding -> three stacked LSTM(128) layers.
# Statement order is kept as-is because Keras auto-names layers in creation
# order and the weight files are loaded later in this notebook.
encoder_inputs = Input(shape=(article_max_len,))
# mask_zero=True: index 0 is treated as padding and masked for downstream layers.
encoder_embedding = Embedding(article_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
# Layers 1 and 2 return the full per-timestep sequence; their states are discarded.
encoder_lstm1 = LSTM(128, return_sequences=True, return_state=True)
encoder_outputs1, _, _ = encoder_lstm1(encoder_embedding)
encoder_lstm2 = LSTM(128, return_sequences=True, return_state=True)
encoder_outputs2, _, _ = encoder_lstm2(encoder_outputs1)
# Layer 3 uses return_sequences=False, so `encoder_outputs` is only the output
# at the last timestep (no time axis), not a per-timestep sequence.
encoder_lstm3 = LSTM(128, return_sequences=False, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_outputs2)
# Final hidden and cell state of the top LSTM; used as the decoder's initial state.
encoder_states = [state_h, state_c]
# Inference-time encoder: article ids -> [state_h, state_c].
# NOTE(review): an identical `encoder_model` is built again in a later cell,
# so one of the two assignments is redundant.
encoder_model = Model(encoder_inputs, encoder_states)

In [110]:
# Training-time decoder: a full summary sequence (teacher forcing input) of
# length summary_max_len is embedded and run through one LSTM(128).
decoder_inputs = Input(shape=(summary_max_len,))
# The Embedding layer object is not kept in a variable, so it cannot be reused
# elsewhere by name; only its output tensor is retained.
decoder_embedding = Embedding(summary_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
# return_state=True so the same layer can also be stepped one token at a time
# for inference; the states are not needed on the training path.
decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final hidden/cell state.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

In [111]:
# Inference encoder: article ids -> [state_h, state_c].
# NOTE(review): this repeats the `encoder_model` definition from the encoder cell.
encoder_model = Model(encoder_inputs, encoder_states)

# Inference decoder: consumes ONE token per call plus the previous LSTM states.
decoder_inputs_single = Input(shape=(1,))
# NOTE(review): this constructs a *new* Embedding layer; it does not share
# weights with the embedding applied to `decoder_inputs` in the training
# decoder.  Its weights therefore come only from whatever is loaded into
# `decoder_model` later — confirm that file was saved from this same topology.
decoder_embedding_single = Embedding(summary_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs_single)
# Reuses the shared `decoder_lstm` layer (same weights as the training path).
decoder_outputs_single, state_h_single, state_c_single = decoder_lstm(decoder_embedding_single, initial_state=encoder_states)
decoder_states_single = [state_h_single, state_c_single]
# The model inputs are the token plus the `encoder_states` tensors (intermediate
# tensors of the encoder graph, not dedicated Input layers).
# NOTE(review): the first output is the raw LSTM output (128 units); no
# Dense/softmax over the summary vocabulary is applied here, so callers that
# take an arg-max over it get a hidden-unit index rather than a word index.
decoder_model = Model([decoder_inputs_single] + encoder_states, [decoder_outputs_single] + decoder_states_single)

In [112]:
# Attention mechanism: Keras Attention is called with [query, value] =
# [decoder_outputs, encoder_outputs].
# NOTE(review): `encoder_outputs` comes from an LSTM with
# return_sequences=False, so it has no time axis, whereas attention is normally
# computed over a per-timestep value sequence (`encoder_outputs2` is the last
# such sequence in this notebook) — confirm this is intended.
attention = tf.keras.layers.Attention()([decoder_outputs, encoder_outputs])
# Decoder output and attention result are joined along the feature axis.
context_vector = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs, attention])

# Pointer network: one sigmoid value per decoder timestep ...
pointer_probs = TimeDistributed(Dense(1, activation='sigmoid'))(context_vector)
# ... flattened to a single vector per example ...
pointer_probs = tf.keras.layers.Flatten()(pointer_probs)
# ... and repeated summary_max_len times.
# NOTE(review): after RepeatVector every decoder timestep appears to receive
# the same vector containing all timesteps' sigmoid values — verify that this
# is the intended pointer signal.
pointer_probs = tf.keras.layers.RepeatVector(summary_max_len)(pointer_probs)

# Final probabilities: per-timestep softmax over the summary vocabulary,
# computed from the decoder LSTM output only (not from `context_vector`).
vocab_probs = TimeDistributed(Dense(summary_vocab_size, activation='softmax'))(decoder_outputs)
# Backend concatenate (not a Keras layer) along the last axis, so the last axis
# holds the vocabulary softmax followed by the repeated pointer values.
# NOTE(review): the concatenated vector is presumably not normalised as a
# single distribution — confirm the loss used in training accounts for this.
final_probs = K.concatenate([vocab_probs, pointer_probs], axis=-1)

# Full training-time model: (article ids, summary ids) -> final_probs.
model = Model([encoder_inputs, decoder_inputs], final_probs)

In [113]:
# Load saved weights into the three models built above.
# Order matters: `encoder_model` and `model` share the encoder layers, and
# `decoder_model` and `model` share `decoder_lstm`, so for those shared layers
# the values from the last load (`seq2seq_pg.h5`) are the ones that remain.
# NOTE(review): the files must have been saved from models with the same
# topology as the ones defined in this notebook — not verifiable from here.
encoder_model.load_weights('encoder_weights.h5')
decoder_model.load_weights('decoder_weights.h5')
model.load_weights('seq2seq_pg.h5')

In [114]:
def summarize(article, verbose=True):
    """Generate a summary for one article by greedy (arg-max) decoding.

    The article is wrapped in the literal words 'start' and 'end', converted to
    ids with ``article_tokenizer``, post-padded to ``article_max_len`` and
    encoded with ``encoder_model``.  ``decoder_model`` is then stepped one
    token at a time, feeding back the arg-max index and the returned states.
    Decoding stops when the 'end' index is predicted, when the predicted index
    has no entry in ``summary_tokenizer.index_word`` (e.g. the reserved
    padding index 0), or after ``summary_max_len`` steps.

    Args:
        article: Raw article text (str).
        verbose: When True (default) print every intermediate value, as this
            function has always done.  When False, print nothing and also pass
            ``verbose=0`` to the Keras ``predict`` calls.

    Returns:
        str: The generated words joined by single spaces (may be empty).

    Raises:
        KeyError: If 'start' or 'end' is missing from
            ``summary_tokenizer.word_index``.

    NOTE(review): ``decoder_model`` as built in this notebook returns the
    decoder LSTM's output and states directly, without a projection onto the
    summary vocabulary.  The arg-max below is then taken over hidden units, not
    words, which would explain degenerate output such as one repeated token.
    A warning is emitted when the output width differs from
    ``summary_vocab_size``.

    NOTE(review): articles longer than ``article_max_len`` are truncated with
    ``pad_sequences``' default truncation side — confirm this matches what was
    done at training time.
    """
    import warnings  # local import: only needed for the sanity warning below

    # Leave Keras' own progress output at its default when verbose.
    predict_kwargs = {} if verbose else {'verbose': 0}

    # Wrap the article in the literal marker words 'start' / 'end'.
    article = 'start ' + article + ' end'
    if verbose:
        print(f"Preprocessed article: {article}")

    # Text -> list of token ids.
    sequence = article_tokenizer.texts_to_sequences([article])
    if verbose:
        print(f"Sequence: {sequence}")

    # Pad (at the end) to the fixed encoder input length.
    sequence = pad_sequences(sequence, maxlen=article_max_len, padding='post')
    if verbose:
        print(f"Padded sequence: {sequence}")

    # Encoder states seed the decoder; list() so they can be concatenated to
    # the decoder input list whatever sequence type predict() returns.
    states = list(encoder_model.predict(sequence, **predict_kwargs))
    if verbose:
        print(f"Initial states: {states}")

    generated_summary = []

    # The first decoder input is always the 'start' word.
    current_word = summary_tokenizer.word_index['start']
    if verbose:
        print(f"Current word: {current_word} ({summary_tokenizer.index_word[current_word]})")

    # Loop-invariant: index that terminates decoding.
    end_index = summary_tokenizer.word_index['end']

    for step in range(summary_max_len):
        # Explicit (batch=1, timesteps=1) shape, matching Input(shape=(1,)).
        decoder_input = np.array([[current_word]])
        probs, h, c = decoder_model.predict([decoder_input] + states, **predict_kwargs)
        if verbose:
            print(f"Next word probabilities: {probs}")
            print(f"Next states: {[h, c]}")

        # Surface (once) a decoder whose output is not vocabulary-sized.
        if step == 0 and probs.shape[-1] != summary_vocab_size:
            warnings.warn(
                f"decoder_model returned scores of width {probs.shape[-1]}, "
                f"expected summary_vocab_size={summary_vocab_size}; arg-max "
                "indices may not correspond to vocabulary words."
            )

        # Greedy choice: index with the highest score.
        next_word = int(np.argmax(probs[0, 0, :]))
        if verbose:
            print(f"Next word: {next_word} ({summary_tokenizer.index_word.get(next_word, '?')})")

        # Stop on the 'end' word.
        if next_word == end_index:
            break

        # Stop (instead of raising KeyError) when the index maps to no word,
        # e.g. the reserved padding index 0.
        word = summary_tokenizer.index_word.get(next_word)
        if word is None:
            break

        generated_summary.append(word)

        # Feed the prediction and the new states back into the next step.
        current_word = next_word
        states = [h, c]
        if verbose:
            print(f"Generated summary so far: {' '.join(generated_summary)}")

    return ' '.join(generated_summary)


In [115]:
# Smoke test: greedy-decode a summary for the first sample article.  The cell's
# displayed value is the returned summary string.
summarize(articles[0])

Preprocessed article: start BOGOTA, Colombia (CNN) -- A key rebel commander and fugitive from a U.S. drug trafficking indictment was killed over the weekend in an air attack on a guerrilla encampment, the Colombian military said Monday. Alleged cocaine trafficker and FARC rebel Tomas Medina Caracas in an Interpol photo. Tomas Medina Caracas, known popularly as "El Negro Acacio," was a member of the high command of the Fuerzas Armadas Revolucionarias de Colombia and, according to Colombian and U.S. officials, helped manage the group's extensive cocaine trafficking network. He had been in the cross-hairs of the U.S. Justice Department since 2002. He was charged with conspiracy to import cocaine into the United States and manufacturing and distributing cocaine within Colombia to fund the FARC's 42-year insurgency against the government. U.S. officials alleged Medina Caracas managed the rebel group's sales of cocaine to international drug traffickers, who in turn smuggled it into the Unite

'as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as as'