In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import copy
import matplotlib.pyplot as plt
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras import backend as K 
import gensim
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from keras.models import model_from_json
import tensorflow as tf
import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

path_train ='/content/drive/My Drive/Colab Notebooks/data/train_dataset.pkl'
path_test= '/content/drive/My Drive/Colab Notebooks/data/test_dataset.pkl'

stories_train = pickle.load(open(path_train, 'rb'))
stories_test = pickle.load(open(path_test, 'rb'))
stories = stories_train + stories_test
print("total number of CNN data: ",len(stories))

Mounted at /content/drive
total number of CNN data:  311956


In [3]:

# number of highlights to be used
NUM_HIGHLIGHT= 1
# number of storie sentences to be used
NUM_STORY = 2

# join sentence in both stories and highlights together for each data sample
processed_stories = copy.deepcopy(stories)

# join stories and highlights into 2 column pd dataframe
for each_story in processed_stories:
  # join highlights
  each_story['highlights'] = ' '.join(each_story['highlights'][0:NUM_HIGHLIGHT])
  # join story sentences
  each_story['story'] = ' '.join(each_story['story'][0:NUM_STORY])


stories[2],processed_stories[2]

({'highlights': ['northwestern university football players vote friday on whether to unionize',
   'supporters say athletes who generate huge money for schools deserve protections',
   'vote result wont be known until after national labor relations board reviews case'],
  'id': 'id_cnn_91222',
  'story': ['northwestern universitys football players voted friday on whether to form a workers union one of the most highprofile efforts by college athletes to demand more rights possibly including payment',
   'but the result of the vote might not be known for months',
   'the national labor relations board allowed the vote after its chicago office ruled in march that northwestern football players can unionize deeming them school employees because of the hours they put in the control the university has over them and the revenue they generate',
   'however northwestern asked the nrlb for a review and the nlrb said the results of fridays vote wont be made public until that is finished the review

In [4]:
# join stories and highlights into 2 column pd dataframe
df_cnn = pd.DataFrame(processed_stories)
# print(df_cnn.iloc[1,0])
# print(df_cnn.iloc[1,1])
# df_cnn

In [5]:

#Shorten the Summaries and Text between max len defined above
def count_word(sentence):
  return len(sentence.split(' '))

# count #word in highlight/summary
df_cnn['num_word_highlights'] = df_cnn.apply(lambda row: count_word(row['highlights']), axis=1)
# count #word in story
df_cnn['num_word_story'] = df_cnn.apply(lambda row: count_word(row['story']), axis=1)

print('max num_word_highlights:',max(df_cnn['num_word_highlights']),'max num_word_story:',max(df_cnn['num_word_story']))
print('mean num_word_highlights:',np.mean(df_cnn['num_word_highlights']),'max num_word_story:',np.mean(df_cnn['num_word_story']))
print('median num_word_highlights:',np.median(df_cnn['num_word_highlights']),'max num_word_story:',np.median(df_cnn['num_word_story']))


max num_word_highlights: 118 max num_word_story: 388
mean num_word_highlights: 12.434481144776827 max num_word_story: 53.134179179115
median num_word_highlights: 12.0 max num_word_story: 51.0


In [6]:
MAX_LEN_TEXT=100
MAX_LEN_SUM=30

MIN_LEN_TEXT = 10
MIN_LEN_SUM = 5

df_cnn=df_cnn[ (df_cnn['num_word_highlights']>MIN_LEN_SUM) & (df_cnn['num_word_highlights']<=MAX_LEN_SUM) \
             & (df_cnn['num_word_story']>MIN_LEN_TEXT) & (df_cnn['num_word_story']<=MAX_LEN_TEXT)]
print(df_cnn.shape)

(305544, 5)


In [7]:
# put _START_ and _END_ tokens at shorten_highlights
df_cnn.loc[:,'shorten_highlights']= df_cnn['highlights']
df_cnn.loc[:,'shorten_story']= df_cnn['story']

df_cnn['shorten_highlights'] = '_START_ '+df_cnn['shorten_highlights'].astype(str)+' _END_'
# df_cnn

In [8]:
# add sostok and eostok (token) to shorten_highlights
df_cnn['shorten_highlights'] = df_cnn.apply(lambda row : 'sostok '+ row['shorten_highlights'] + ' eostok',axis=1)

# df_cnn.head(5)

In [9]:
# train,vali, test: 266182,29576, 9786

## we don't use validation data ##
TRAIN_SIZE = 266182 # get from seq2seq ipynb training
VALI_SIZE = 29576

X_train,X_vali,Y_train,Y_vali=train_test_split(np.array(df_cnn['shorten_story']),np.array(df_cnn['shorten_highlights']),train_size=TRAIN_SIZE,random_state=0,shuffle=False)
X_vali,X_test,Y_vali,Y_test=train_test_split(X_vali,Y_vali,train_size=VALI_SIZE,random_state=0,shuffle=False)

# X_test, Y_test = np.array(df_cnn['shorten_story']) , np.array(df_cnn['shorten_highlights'])
print(X_train.shape,X_vali.shape,X_test.shape)

(266182,) (29576,) (9786,)


In [10]:
%%time

# https://stackoverflow.com/questions/51956000/what-does-keras-tokenizer-method-exactly-do
#prepare a tokenizer for reviews on test data (story)
X_tokenizer = Tokenizer() 
X_tokenizer.fit_on_texts(list(X_train))

# min number of words before being counted as infrequent
THRESHOLD=4

count=0
total_count=0
frequency=0
total_freq=0

for key,value in X_tokenizer.word_counts.items():
    total_count += 1
    total_freq += value
    if(value < THRESHOLD):
        count += 1
        frequency += value
print("total words = ",total_count)
print("total unique rare words = ", count)
print("% of rare words in vocabulary:",(count/total_count)*100)
print("Total Coverage of rare words:",(frequency/total_freq)*100)

##prepare a tokenizer for reviews on training data: keep only non-rare words
X_tokenizer = Tokenizer(num_words=total_count-count) 
X_tokenizer.fit_on_texts(list(X_train))

##convert text sequences into integer sequences (i.e one-hot encodeing all the words)
X_train_seq    =   X_tokenizer.texts_to_sequences(X_train) 
X_test_seq    =   X_tokenizer.texts_to_sequences(X_test) 

##padding zero upto maximum length since each sample have diff length of story
X_train    =   pad_sequences(X_train_seq,  maxlen=MAX_LEN_TEXT, padding='post')
X_test    =   pad_sequences(X_test_seq,  maxlen=MAX_LEN_TEXT, padding='post')

##size of vocabulary ( +1 for padding token)
X_voc   =  X_tokenizer.num_words + 1

print("Size of vocabulary in X = {}".format(X_voc))

total words =  174558
total unique rare words =  114273
% of rare words in vocabulary: 65.46420101055237
Total Coverage of rare words: 1.1496141966465028
Size of vocabulary in X = 60286
CPU times: user 38.4 s, sys: 59.9 ms, total: 38.4 s
Wall time: 38.5 s


In [11]:
%%time
#prepare a tokenizer for reviews on test data (highlights)
Y_tokenizer = Tokenizer()   
Y_tokenizer.fit_on_texts(list(Y_train))

THRESHOLD=6

count=0
total_count=0
frequency=0
total_freq=0

for key,value in Y_tokenizer.word_counts.items():
    total_count += 1
    total_freq += value
    if(value < THRESHOLD):
        count += 1
        frequency += value

print("total words = ",total_count)
print("total unique rare words = ", count)
print("% of rare words in vocabulary:",(count/total_count)*100)
print("Total Coverage of rare words:",(frequency/total_freq)*100)

#prepare a tokenizer for reviews on training data
Y_tokenizer = Tokenizer(num_words=total_count-count) 
Y_tokenizer.fit_on_texts(list(Y_train))

#convert text sequences into integer sequences (i.e one hot encode the text in Y)
Y_train_seq    =   Y_tokenizer.texts_to_sequences(Y_train)
Y_test_seq    =   Y_tokenizer.texts_to_sequences(Y_test)

#padding zero upto maximum length
Y_train    =   pad_sequences(Y_train_seq, maxlen=MAX_LEN_SUM, padding='post')
Y_test    =   pad_sequences(Y_test_seq, maxlen=MAX_LEN_SUM, padding='post')

#size of vocabulary
Y_voc  =   Y_tokenizer.num_words +1
print("Size of vocabulary in Y = {}".format(Y_voc))

total words =  114363
total unique rare words =  87200
% of rare words in vocabulary: 76.24843699448249
Total Coverage of rare words: 3.3273843982612212
Size of vocabulary in Y = 27164
CPU times: user 17.9 s, sys: 92.8 ms, total: 18 s
Wall time: 18 s


In [12]:
# We will remove "Summary" length of just 2, which has only START and END
idx=[]
for i in range(len(Y_train)):
    count=0
    for j in Y_train[i]:
        if j != 0:
            count += 1
    if(count == 2):
      idx.append(i)
Y_train=np.delete(Y_train,idx, axis=0)
X_train=np.delete(X_train,idx, axis=0)

idx=[]
for i in range(len(Y_test)):
    count=0
    for j in Y_test[i]:
        if j != 0:
            count += 1
    if(count == 2):
      idx.append(i)
Y_test=np.delete(Y_test,idx, axis=0)
X_test=np.delete(X_test,idx, axis=0)

In [15]:
print("Size of vocabulary from the w2v model = {}".format(X_voc))

K.clear_session()


Size of vocabulary from the w2v model = 60286


In [16]:
# get words and index mapping of both highlights and stories 
REVERSE_TARGET_WORD_INDEX = Y_tokenizer.index_word
REVERSE_SOURCE_WORD_INDEX = X_tokenizer.index_word
TARGET_WORD_INDEX = Y_tokenizer.word_index

# Y_tokenizer.index_word,target_word_index

**LOAD MODELS for decoding test dataset**


In [17]:
#### LOAD MODELS #######
save_loc = '/content/drive/My Drive/Colab Notebooks/data/trained_seq2seq_model'

def load_model(model_filename, model_weights_filename):
    with open(model_filename, 'r', encoding='utf8') as f:
        model = model_from_json(f.read())
    model.load_weights(model_weights_filename)
    print("Successfully load '{}' ! and its weights '{} ".format(model_filename,model_weights_filename))
    return model

ENCODER_MODEL = load_model(save_loc+'/encoder_model.json', save_loc+'/encoder_model_weights.h5')
DECODER_MODEL = load_model(save_loc+'/decoder_model.json', save_loc+'/decoder_model_weights.h5')

Successfully load '/content/drive/My Drive/Colab Notebooks/data/trained_seq2seq_model/encoder_model.json' ! and its weights '/content/drive/My Drive/Colab Notebooks/data/trained_seq2seq_model/encoder_model_weights.h5 
Successfully load '/content/drive/My Drive/Colab Notebooks/data/trained_seq2seq_model/decoder_model.json' ! and its weights '/content/drive/My Drive/Colab Notebooks/data/trained_seq2seq_model/decoder_model_weights.h5 


In [18]:

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    e_out, e_h, e_c = ENCODER_MODEL.predict(input_seq)
    
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    
    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = TARGET_WORD_INDEX['sostok']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
      
        output_tokens, h, c = DECODER_MODEL.predict([target_seq] + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # check if prediction is in
        sampled_token = REVERSE_TARGET_WORD_INDEX[sampled_token_index]

        if(sampled_token!='eostok'):
            decoded_sentence += ' '+sampled_token

        # Exit condition: either hit max length or find stop word.
        if (sampled_token == 'eostok'  or len(decoded_sentence.split()) >= (MAX_LEN_SUM-1)):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        e_h, e_c = h, c
    return decoded_sentence[len('start  '):-len('end')-1]
    # return decoded_sentence

    
def seq2summary(input_seq):
    new_string=''
    
    for i in input_seq:
        if((i != 0 and i != TARGET_WORD_INDEX['sostok']) and i != TARGET_WORD_INDEX['eostok']):
            new_string= new_string + REVERSE_TARGET_WORD_INDEX[i]+' '
    return new_string[len('start'):-len('end')-1]

def seq2text(input_seq):
    new_string=''

    for i in input_seq:
        if(i != 0):
            new_string = new_string + REVERSE_SOURCE_WORD_INDEX[i]+' '
    return new_string


In [19]:
# create new dataframe to store test data (tokenized data and predicted summaries)
df_test = df_cnn.iloc[TRAIN_SIZE+VALI_SIZE:,]
print(df_test.shape)

df_test['tokenized_story'] = '-'
df_test['tokenized_highlights'] = '-'
df_test['predicted_highlights'] = '-'

list_id = list(df_test['id'])
# df_test.loc[df_test.]
# df_test.head()

(9786, 7)


In [20]:
%%time
# for i in range(20,30):
for i in tqdm(range(5000,len(df_test))):
# for i in tqdm(range(0,5000)):


    # print("Story:",seq2text(X_test[i]))
    # print("Original summary:",seq2summary(Y_test[i]))
    # print("Predicted summary:",decode_sequence(X_test[i].reshape(1,MAX_LEN_TEXT)))
    # print("\n")

    df_test.loc[df_test['id']== list_id[i], 'tokenized_story'] = seq2text(X_test[i])
    df_test.loc[df_test['id']== list_id[i], 'tokenized_highlights'] = seq2summary(Y_test[i])
    df_test.loc[df_test['id']== list_id[i], 'predicted_highlights'] = decode_sequence(X_test[i].reshape(1,MAX_LEN_TEXT))

100%|██████████| 4786/4786 [1:05:32<00:00,  1.22it/s]

CPU times: user 1h 8min 27s, sys: 1min 14s, total: 1h 9min 41s
Wall time: 1h 5min 32s





In [None]:
# df_test = df_cnn.iloc[TRAIN_SIZE+VALI_SIZE:,]
# df_test.shape

In [None]:
# df_test.head(20)


In [21]:
# df.to_csv()
loc_save = '/content/drive/My Drive/Colab Notebooks/data'

df_test.to_csv(loc_save+'/saved_predicted_sum.csv')