#  Running our models on the How2/WikiHow/CNN data. 

Following are the high level steps we are following in this notebook:
* **Load Test Data :** Summary provided with the article.  
* **Use PreProcessed3 data  :**  Pre-Processed 3 data has following details:
 * Remove Special Characters from Text
 * Remove Stop Words from Text
 * Lemmatize Text
 * Remove invalid and non-english words. 
* **Execute following Models  :**  We are executing multiple models including:
 * Extractive Summary Model (BERT)
 * Abstractive Summary Model (BERT2BERT for CNN/Dailymail)
 * Abstractive T5 Model (pre-trained model that was trained on our data). 

In [1]:
%%capture
##############
## INSTALLS ##
##############

#!pip install bert-extractive-summarizer
#!pip install transformers
#!pip install neuralcoref
#!pip install datasets==1.0.2
#!pip install git-python==1.0.3
#!pip install sacrebleu==1.4.12
#!pip install rouge_score
#!pip install rouge-metric

#!pip install rouge
#!pip install py-rouge
#!pip install pyrouge
#!pip install torch
#!pip install sentencepiece
#!pip install nlp

#!python -m nltk.downloader all
#!python -m spacy download en_core_web_md
#!python -m spacy download en
#!python -m spacy download en_core_web_sm



In [2]:
###########
# IMPORTS #
###########

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from pprint import pprint
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_md
from bs4 import BeautifulSoup
import requests
import re
import string
import pandas as pd
import csv
import rouge
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
#from rouge_score import rouge_scorer
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  
from transformers import BertTokenizer, EncoderDecoderModel
from tqdm import tqdm_pandas
from tqdm import tqdm
from summarizer import Summarizer
from simplet5 import SimpleT5
from datetime import datetime

nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

Global seed set to 42
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sunitc/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/sunitc/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/sunitc/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
%%capture
###############
# GLOBAL VARS #
###############
start_time = datetime.now()

aggregator='Avg'
apply_avg = aggregator == 'Avg'
apply_best = aggregator == 'Best'
vectorizer = TfidfVectorizer()
tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")  
abstractive_summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
extractive_summarizer_model = Summarizer()
modelt5 = SimpleT5()
modelt5.from_pretrained(model_type="t5", model_name="t5-base")
nltk.download("stopwords")
stop_words = stopwords.words('english')

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[nltk_data] Downloading package stopwords to /home/sunitc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
nlp = en_core_web_md.load()

In [5]:
########
# DATA #
########


# setting number of rows to low number so notebook runs in minutes and not hours. 
num_rows_each_df = 10

cnn_dailymail_df = pd.read_csv(os.getcwd() + "/data/cnn_dm_df.csv",encoding = "utf-8")
wikihow_df = pd.read_csv(os.getcwd() + "/data/wikihow_df.csv",encoding = "utf-8")
how2_df = pd.read_csv(os.getcwd() + "/data/how2_df.csv",encoding = "utf-8")

wikihow_df = wikihow_df[(wikihow_df.article_pp1.str.len() < 3700) & (wikihow_df.summary.str.len() > 100)]
how2_df = how2_df[(how2_df.article_pp1.str.len() < 3700) & (how2_df.summary.str.len() > 100)]
cnn_dailymail_df = cnn_dailymail_df[(cnn_dailymail_df.article_pp1.str.len() > 250) & (cnn_dailymail_df.summary.str.len() > 100)]

if len(wikihow_df) > num_rows_each_df:
    wikihow_df = wikihow_df.head(num_rows_each_df)
    
if len(how2_df) > num_rows_each_df:
    how2_df = how2_df.head(num_rows_each_df)
    
if len(cnn_dailymail_df) > num_rows_each_df:
    cnn_dailymail_df = cnn_dailymail_df.head(num_rows_each_df)
    
merged_df = pd.concat([how2_df, wikihow_df,cnn_dailymail_df], axis=0)
merged_df = merged_df[merged_df.article_pp1.str.len() > 250]

merged_df.head(2)

Unnamed: 0.1,Unnamed: 0,summary,article,data_source,article_pp1,article_pp2,article_pp3,num_words_article,num_sentences_article,num_words_summary,num_sentences_summary,num_words_article_pp1,num_sentences_article_pp1,num_words_article_pp2,num_sentences_article_pp2,num_words_article_pp3,num_sentences_article_pp3
2,2,learn about how hand washing can help prevent ...,hi ! this is david jackel on behalf of expert ...,How2,cold come direct contact somebody else virus o...,most colds come from direct conotact that you ...,cold come direct contact somebody else virus o...,359,14,20,2,123,1,284,11,116,1
3,3,how to julienne cucumbers to make kimchi for k...,the other way we can do cucumbers which is als...,How2,way cucumber also nice pickling cucumber find ...,the other way we can do cucumbers which is als...,way cucumber also nice cucumber find work best...,171,6,26,2,62,1,169,6,56,1


In [6]:
print(len(merged_df))

29


In [7]:
####################
# HELPER FUNCTIONS #
####################


def prepare_results(p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


def RemoveIntroFromText(script):
    sentences = [x for x in script.sents]
    i=0
    new_text=""
    print("Original text: \n")
    displacy.render(script, jupyter=True, style='ent')
    print("Some preprocessing details: \n************\n")
    is_intro=False
    
    for sent in sentences:
        at_least_one_person=0
        print("Sentence ", i, ": ", sentences[i])
        d= dict([(str(x), x.label_) for x in nlp(str(sent)).ents])
        print(d)
        if len(d)>0:
            print(d)
            for key in d:
                #print("key:",key, "; value=", d[key])
                #print(sent)
                if (d[key]=="PERSON"):
                    at_least_one_person+=1
        if "expertvillage" in str(sent).lower() or "expert village" in str(sent).lower():
            is_intro=True
        if (at_least_one_person>0):
            print("the sentence has at least one person:")
            print("Sentence ", i, ": ", sentences[i])    
        if (i<4 and (at_least_one_person>0  or is_intro)):
            print("the sentence is likely an introduction")
            new_text=''
        else:
            new_text+=str(sent)
            if not (str(sent).strip()[-1] in string.punctuation): 
                print ("Missing punctuation at the end", sent, "; last char is ", str(sent).strip()[-1])
                new_text+=". "
        i+=1
    print("\n*************\nNew text, hopefully without person introduction:\n**********\n", new_text)
    return new_text


def RemoveIntroFromTextMiddle(text):
    script = nlp(text)
    sentences = [x for x in script.sents]
    #print("sentences.....")
    #print(sentences)
    i=0
    new_text=""
    print("Original text: \n")
    displacy.render(script, jupyter=True, style='ent')
    print("Some preprocessing details: \n************\n")
    is_intro=False
    
    for sent in sentences:
        at_least_one_person=0
        print("Sentence ", i, ": ", sentences[i])
        d= dict([(str(x), x.label_) for x in nlp(str(sent)).ents])
        print(d)
        if len(d)>0:
            print(d)
            for key in d:
                #print("key:",key, "; value=", d[key])
                #print(sent)
                if (d[key]=="PERSON"):
                    at_least_one_person+=1
        if "expertvillage" in str(sent).lower() or "expert village" in str(sent).lower():
            is_intro=True
        if (at_least_one_person>0):
            print("the sentence has at least one person:")
            print("Sentence ", i, ": ", sentences[i])    
        if (i<4 and (at_least_one_person>0  or is_intro)):
            print("skipping the sentence as it is likely an introduction")
            #new_text=''
        else:
            new_text+=str(sent)
            if not (str(sent).strip()[-1] in string.punctuation): 
                print ("Missing punctuation at the end", sent, "; last char is ", str(sent).strip()[-1])
                new_text+=". "
        i+=1
    print("\n*************\nNew text, hopefully without person introduction:\n**********\n", new_text)
    return new_text


def RemoveIntroFromTextNonVerbose(script):
    sentences = [x for x in script.sents]
    i=0
    new_text="" 
    #displacy.render(script, jupyter=True, style='ent') 
    is_intro=False
    
    for sent in sentences:
        at_least_one_person=0
        d= dict([(str(x), x.label_) for x in nlp(str(sent)).ents])
        if len(d)>0:
            for key in d:
                if (d[key]=="PERSON"):
                    at_least_one_person+=1
        if "expertvillage" in str(sent).lower() or "expert village" in str(sent).lower():
            is_intro=True
        if (i<4 and (at_least_one_person>0  or is_intro)):
             new_text=''
        else:
            new_text+=str(sent)
            if not (str(sent).strip()[-1] in string.punctuation): 
                 new_text+=". "
        i+=1
    return new_text


#Raw Text Summarization
def generate_abstractive_summary(raw_string, model = abstractive_summarizer_model, max_length=512):
    """This function produces an abstractive summary for a given article"
    Params:
    raw_string: an article string.
    model: An abstractive summarizer model"""
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(raw_string, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    # all special tokens including will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return output_str[0]


def generate_extractive_summary(raw_string, model = extractive_summarizer_model, min_summary_length = 50):
    """This function produces an extractive summary for a given article"
    Params:
    raw_string: an article string.
    model: An extractive summarizer model"""
    output_str = model(raw_string, min_length = min_summary_length)
    return output_str


def process_article(text):
    #print("proces article")
    article = text.split(".")
    sentences = []

    for sentence in article:
        #print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    
    return sentences

def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)
 
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix


def generate_summary(in_text, top_n=5):
    summarize_text = []
    try:
        # Step 1 - Read text anc split it
        sentences =  process_article(in_text)
        # Step 2 - Generate Similary Martix across sentences
        sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)
        # Step 3 - Rank sentences in similarity martix
        sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
        scores = nx.pagerank(sentence_similarity_graph)
        # Step 4 - Sort the rank and pick top sentences
        ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
        #print("Indexes of top ranked_sentence order are ", ranked_sentence)    
        for i in range(top_n):
            summarize_text.append(" ".join(ranked_sentence[i][1]))
        # Step 5 - Offcourse, output the summarize text
        #print("Summarize Text: \n", ". ".join(summarize_text))
    except:
        return ""
    finally:
        return ". ".join(summarize_text)

def generate_abstractive_summary_T5(raw_string):
    # using epoch 5
    modelt5.load_model("t5","outputs/simplet5-epoch-6-train-loss-1.5226", use_gpu=False)
    return modelt5.predict(raw_string)[0]

#def prepare_results(p, r, f):
#    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

def print_rogue_scores(hypo, refe):
    scores = evaluator.get_scores(hypo, refe)
    #scores = evaluator.get_scores(all_hypothesis, all_references)
    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
            for hypothesis_id, results_per_ref in enumerate(results):
                nb_references = len(results_per_ref['p'])
                for reference_id in range(nb_references):
                    print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                    print('\t' + '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * results_per_ref['p'][reference_id], 'R', 100.0 * results_per_ref['r'][reference_id], 'F1', 100.0 * results_per_ref['f'][reference_id]))
                    #print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
            print()
        else:
            print('\t' + '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * results['p'], 'R', 100.0 * results['r'], 'F1', 100.0 * results['f']))
            #print("x") #prepare_results(results['p'], results['r'], results['f']))

In [8]:
####################
# Rouge Evaluator  #
####################

evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)



In [9]:
# TEST Loop for Abstractive and Extractive summarization

icount = 0

for article in merged_df['article_pp3']:
    if len(article) > 200:
        print("Article Len=", len(article))
        print(article)
        e_summary = generate_extractive_summary(article, min_summary_length=50)
        a_summary = generate_abstractive_summary(article, model = abstractive_summarizer_model)
        t5_summary = generate_abstractive_summary_T5(article)
        all_summary = e_summary + "." + a_summary + "." + t5_summary + "."
        s_s_summary = generate_abstractive_summary(all_summary, model = abstractive_summarizer_model)
        
        print("----------------")    
        print("e-summary=",e_summary)
        print("----------------")    
        print("a-summary=",a_summary)
        print("----------------")    
        print("t5-summary=",t5_summary)
        print("----------------")    
        print("ss-summary=",s_s_summary)
        print("-------------------------------------------------------------------------------------------------") 
        icount +=1
    
    if icount > 3:
        break 
    


Article Len= 758
cold come direct contact somebody else virus often time like shaking hand somebody close quarter hugging touching anything close somebody else thing touching glass touching silverware touching food stuff like constantly washing hand especially traveling close quarter people exposed someone might sick always washing hand warm water soap wash vigorously least second make sure loosen germ wo always access warm water soap carry hand always keep hand car bag traveling something take whenever need put little bit hand shaking hand people public event know sick shaking hand someone else sick really worth getting sick use hand important thing remember hand vessel germ reach body always keep hand frequently possible also always wash hand eating touching face
----------------
e-summary= 
----------------
e2-summary= 
----------------
a-e-summary= hand vessel germs are common in the united states and the u. s. people are exposed to the cold come direct contact with a stranger. han

In [10]:
import warnings
warnings.filterwarnings('ignore')

e_list = []
a_list = []
t5_list = []
sum_sum_list = []

iCount = 0

for article in merged_df['article_pp3']:
    #print(article)
    print(iCount, end=",")
    iCount =  iCount + 1
    
    e_summary = generate_extractive_summary(article, min_summary_length=100)
    a_summary = generate_abstractive_summary(article, model = abstractive_summarizer_model)
    t5_summary = generate_abstractive_summary_T5(article)
    all_summary = e_summary + "." + e2_summary + "." + a_e_summary + "." + a_summary + "." + t5_summary + "."
    s_s_summary = generate_abstractive_summary(all_summary, model = abstractive_summarizer_model)
    
    e_list.append(e_summary)
    a_list.append(a_summary)
    t5_list.append(t5_summary)
    sum_sum_list.append(s_s_summary)
    
    #break 


print("e_summ len=", len(e_list))
print("a_summ len=", len(a_list))
print("t5_summ len=", len(t5_list))
print("ss_summ len=", len(sum_sum_list))

merged_df['e_summarization'] = e_list
merged_df['a_summarization'] = a_list
merged_df['t5_summarization'] = t5_list
merged_df['ss_summarization'] = sum_sum_list

merged_df.to_csv(os.getcwd() + "/data/merged_df_with_Summarization.csv")


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,

Token indices sequence length is longer than the specified maximum sequence length for this model (968 > 512). Running this sequence through the model will result in indexing errors


20,21,

Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors


22,

Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors


23,24,25,

Token indices sequence length is longer than the specified maximum sequence length for this model (900 > 512). Running this sequence through the model will result in indexing errors


26,

Token indices sequence length is longer than the specified maximum sequence length for this model (595 > 512). Running this sequence through the model will result in indexing errors


27,28,e_summ len= 29
e2_summ len= 29
a_e_summ len= 29
a_summ len= 29
t5_summ len= 29
ss_summ len= 29


In [11]:

hypo=merged_df['summary'].tolist()
refe1=merged_df['e_summarization'].tolist() #[reference]
refe2=merged_df['a_summarization'].tolist() #[reference]
refe3=merged_df['t5_summarization'].tolist() #[reference]
refe6=merged_df['ss_summarization'].tolist() #[reference]

print("Rogue for Extractive Summarization")
print_rogue_scores(hypo,refe1)    
print("Rogue for Abstractive Summarization")
print_rogue_scores(hypo,refe2)   
print("Rogue for T5 Summarization")
print_rogue_scores(hypo,refe3)        
print("Rogue for SS Summarization")
print_rogue_scores(hypo,refe6) 


Rogue for Extractive Summarization
		rouge-1:	P:  3.53	R:  2.14	F1:  2.51
		rouge-2:	P:  0.14	R:  0.15	F1:  0.14
		rouge-3:	P:  0.00	R:  0.00	F1:  0.00
		rouge-4:	P:  0.00	R:  0.00	F1:  0.00
		rouge-l:	P:  3.52	R:  2.24	F1:  2.63
		rouge-w:	P:  2.06	R:  0.53	F1:  0.81
Rogue for Extractive_2 Summarization
		rouge-1:	P:  0.00	R:  0.00	F1:  0.00
		rouge-2:	P:  0.00	R:  0.00	F1:  0.00
		rouge-3:	P:  0.00	R:  0.00	F1:  0.00
		rouge-4:	P:  0.00	R:  0.00	F1:  0.00
		rouge-l:	P:  0.00	R:  0.00	F1:  0.00
		rouge-w:	P:  0.00	R:  0.00	F1:  0.00
Rogue for Abstractive Summarization
		rouge-1:	P: 24.15	R: 19.61	F1: 20.55
		rouge-2:	P:  2.87	R:  2.34	F1:  2.45
		rouge-3:	P:  0.10	R:  0.14	F1:  0.12
		rouge-4:	P:  0.00	R:  0.00	F1:  0.00
		rouge-l:	P: 23.24	R: 18.93	F1: 20.15
		rouge-w:	P: 13.73	R:  4.76	F1:  6.80
Rogue for Abstractive of Extractive Summarization
		rouge-1:	P: 24.15	R: 19.61	F1: 20.55
		rouge-2:	P:  2.87	R:  2.34	F1:  2.45
		rouge-3:	P:  0.10	R:  0.14	F1:  0.12
		rouge-4:	P:  0.00	R: 

In [12]:
end_time = datetime.now()

In [13]:
print('Duration: {}'.format(end_time - start_time))

Duration: 0:12:29.004730
