# Simple Extractive Summarization based on word frequencies


# Install and import dependencies

In [None]:
!pip install unidecode
!pip install pandas
!pip install re
!pip install pickle
!pip install sklearn
!pip install numpy
!pip install spacy

In [None]:
import spacy
import pandas as pd
import numpy as np
import os
import re
from spacy.lang.pt.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import en_core_web_sm
# English spaCy pipeline; called on article text further down to obtain sentences.
nlp = en_core_web_sm.load()
import unidecode
from string import ascii_letters, punctuation
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Prepare data

In [None]:
wikihow = pd.read_csv(r'/content/drive/MyDrive/summarization/wikihowAll.csv')

In [None]:
# Drop rows with a missing article body or headline *before* casting to str:
# astype(str) turns NaN into the literal string 'nan', after which isnull()
# can no longer detect the missing value.
wikihow = wikihow.dropna(subset=['text', 'headline'])
wikihow = wikihow.astype(str)
# Kept from the original pipeline: discard bodies that are literally 'nan'.
wikihow = wikihow[wikihow['text'] != 'nan']
# Re-assign instead of inplace=True so we never mutate a filtered slice.
wikihow = wikihow.drop_duplicates(subset=['text']).copy()
# Rough length measures: number of single spaces plus one.
wikihow['word_count'] = wikihow['text'].str.count(' ') + 1
wikihow['headline_count'] = wikihow['headline'].str.count(' ') + 1

In [None]:
path = '/content/drive/MyDrive/summarization/wikihow_all.pkl'

In [None]:
# pd.to_pickle(wikihow, path)

In [None]:
# wikihow_ = pd.read_pickle(path)

In [None]:
# Lookup table mapping English contractions to their expanded forms.
# clean() looks up each space-separated token of the lowercased text in this
# table, so only lowercase, space-free keys can ever match there: the
# capitalised "I'..." entries and the "haven t" entry are effectively
# unreachable from clean().
contraction = { 
    "ain't": "is not", 
    "aren't": "are not", 
    "can't": "cannot", 
    "'cause": "because", 
    "could've": "could have", 
    "couldn't": "could not",
    "didn't": "did not", 
    "doesn't": "does not", 
    "don't": "do not", 
    "hadn't": "had not", 
    "hasn't": "has not", 
    "haven't": "have not",
    "haven t": "have not",
    "he'd": "he would", 
    "he'll": "he will", 
    "he's": "he is", 
    "how'd": "how did", 
    "how'd'y": "how do you", 
    "how'll": "how will", 
    "how's": "how is",
    "I'd": "I would", 
    "I'd've": "I would have", 
    "I'll": "I will", 
    "I'll've": "I will have", 
    "I'm": "I am", 
    "I've": "I have", 
    "i'd": "i would",
    "i'd've": "i would have", 
    "i'll": "i will", 
    "i'll've": "i will have", 
    "i'm": "i am", 
    "i've": "i have", 
    "isn't": "is not", 
    "it'd": "it would",
    "it'd've": "it would have", 
    "it'll": "it will", 
    "it'll've": "it will have", 
    "it's": "it is", 
    "let's": "let us", 
    "ma'am": "madam",
    "mayn't": "may not", 
    "might've": "might have", 
    "mightn't": "might not", 
    "mightn't've": "might not have", 
    "must've": "must have",
    "mustn't": "must not", 
    "mustn't've": "must not have", 
    "needn't": "need not", 
    "needn't've": "need not have", 
    "o'clock": "of the clock",
    "oughtn't": "ought not", 
    "oughtn't've": "ought not have", 
    "shan't": "shall not", 
    "sha'n't": "shall not", 
    "shan't've": "shall not have",
    "she'd": "she would", 
    "she'd've": "she would have", 
    "she'll": "she will", 
    "she'll've": "she will have", 
    "she's": "she is",
    "should've": "should have", 
    "shouldn't": "should not", 
    "shouldn't've": "should not have", 
    "so've": "so have", 
    "so's": "so as",
    "this's": "this is", 
    "that'd": "that would", 
    "that'd've": "that would have", 
    "that's": "that is", 
    "there'd": "there would",
    "there'd've": "there would have", 
    "there's": "there is", 
    "here's": "here is", 
    "they'd": "they would", 
    "they'd've": "they would have",
    "they'll": "they will", 
    "they'll've": "they will have", 
    "they're": "they are", 
    "they've": "they have", 
    "to've": "to have",
    "wasn't": "was not", 
    "we'd": "we would", 
    "we'd've": "we would have", 
    "we'll": "we will", 
    "we'll've": "we will have", 
    "we're": "we are",
    "we've": "we have", 
    "weren't": "were not", 
    "what'll": "what will", 
    "what'll've": "what will have", 
    "what're": "what are",
    "what's": "what is", 
    "what've": "what have", 
    "when's": "when is", 
    "when've": "when have", 
    "where'd": "where did", 
    "where's": "where is",
    "where've": "where have", 
    "who'll": "who will", 
    "who'll've": "who will have", 
    "who's": "who is", 
    "who've": "who have",
    "why's": "why is", 
    "why've": "why have", 
    "will've": "will have", 
    "won't": "will not", 
    "won't've": "will not have",
    "would've": "would have", 
    "wouldn't": "would not", 
    "wouldn't've": "would not have", 
    "y'all": "you all",
    "y'all'd": "you all would", 
    "y'all'd've": "you all would have", 
    "y'all're": "you all are", 
    "y'all've": "you all have",
    "you'd": "you would", 
    "you'd've": "you would have", 
    "you'll": "you will", 
    "you'll've": "you will have",
    "you're": "you are", 
    "you've": "you have"}

In [None]:
stop_words = set(stopwords.words('english')) 

In [None]:
def clean(text):
  """Normalise a raw text string for the frequency-based summariser.

  Steps, in order: lowercase, transliterate with ``unidecode``, spell out
  ``&`` and ``@``, flatten newlines to spaces, expand contractions found in
  the module-level ``contraction`` table, remove ``http...`` URLs, remove
  possessive ``'s``, remove standalone numbers, remove double quotes,
  collapse runs of spaces, and finally drop one trailing digit character if
  the text still ends with a digit.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string (possibly empty).
  """
  text = unidecode.unidecode(text.lower())
  text = text.replace('&', ' and ').replace('@', ' at ')
  # Flatten newlines *before* expanding contractions: tokens are obtained by
  # splitting on single spaces, so a contraction directly next to a newline
  # would otherwise never be looked up on its own.
  text = text.replace("\n", " ")
  text = ' '.join(contraction.get(token, token) for token in text.split(" "))
  text = re.sub(r"http\S+", "", text)
  text = re.sub(r"'s\b", "", text)
  # Standalone numbers at the start, in the middle or at the end of the text.
  text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
  text = text.strip(" ")
  text = text.replace('"', '')
  text = re.sub(' +', ' ', text).strip()

  # Drop a single trailing digit character that survived the steps above.
  if text and text[-1].isdigit():
    text = text[:-1]
  return text



In [None]:
# Store the cleaned version of every title in its own column.
wikihow["clean_title"] = wikihow['title'].map(clean)

In [None]:
# Store the cleaned version of every article body in its own column.
wikihow["clean_text"] = wikihow['text'].map(clean)

In [None]:
# Store the cleaned version of every headline in its own column.
wikihow["clean_headline"] = wikihow['headline'].map(clean)

## Pickle prepared data (test)

In [None]:
path = '/content/drive/MyDrive/summarization/wikihow_all_clean_extraction.pkl'
pd.to_pickle(wikihow, path)

In [None]:
path = '/content/drive/MyDrive/summarization/wikihow_all_clean_extraction.pkl'
wikihow = pd.read_pickle(path)

# Extract sentences based on simple frequencies

In [None]:
def write_to(data, filename):
    """Pickle ``data`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file is
    overwritten. The handle is closed when the function returns, even if
    pickling fails.

    Args:
        data: any picklable object.
        filename: path of the file to write.
    """
    # Local import: no `import pickle` is visible at the top of this file.
    import pickle

    with open(filename, 'wb') as filehandler:
        pickle.dump(data, filehandler)

In [4]:
def extract_summary_for(text):
    """Build an extractive summary from the highest-scoring sentences.

    Every non-stop-word is weighted by its count in the document divided by
    the count of the most frequent word. A sentence's score is the sum of the
    weights of its tokens. The (at most) three best-scoring sentences are
    returned in their original document order, separated by single spaces.
    Sentences without any weighted token are never selected.

    Args:
        text: a parsed document exposing ``.sents``; each sentence must
            provide ``.text`` and iterate over tokens that provide ``.text``
            (callers in this file pass the result of ``nlp(...)``).

    Returns:
        The summary as a single string.

    Raises:
        ValueError: propagated from ``CountVectorizer.fit_transform`` (callers
            in this file catch it), e.g. when no usable vocabulary is found.
    """
    sentences = list(text.sents)
    corpus = [sentence.text.lower() for sentence in sentences]

    # NOTE(review): STOP_WORDS is imported from spacy.lang.pt (Portuguese)
    # while the pipeline loaded in this file is English - confirm which stop
    # word list is intended.
    cv = CountVectorizer(stop_words=list(STOP_WORDS))
    cv_fit = cv.fit_transform(corpus)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old versions that lack get_feature_names_out().
    if hasattr(cv, "get_feature_names_out"):
        word_list = cv.get_feature_names_out()
    else:
        word_list = cv.get_feature_names()
    count_list = cv_fit.toarray().sum(axis=0)

    # Normalise counts by the highest count so weights fall in (0, 1].
    higher_frequency = count_list.max()
    word_frequency = {
        word: count / higher_frequency
        for word, count in zip(word_list, count_list)
    }

    # (position, score) for each sentence containing at least one weighted word.
    scored = []
    for position, sentence in enumerate(sentences):
        weights = [
            word_frequency[word.text.lower()]
            for word in sentence
            if word.text.lower() in word_frequency
        ]
        if weights:
            scored.append((position, sum(weights)))

    # The sort is stable, so ties are resolved in favour of earlier sentences
    # and the result never exceeds three sentences.
    best = sorted(scored, key=lambda item: item[1], reverse=True)[:3]
    chosen = sorted(position for position, _ in best)

    # Sentence text carries no trailing whitespace, so join with a space to
    # avoid fusing the last word of one sentence with the first of the next.
    return ' '.join(sentences[position].text for position in chosen)

# Summaries of short articles

In [None]:
shorty = wikihow["word_count"] < 800

short_articles = wikihow[shorty].copy()

In [None]:
path = '/content/drive/MyDrive/summarization/wikihow_short_clean_extraction.pkl'
pd.to_pickle(short_articles, path)

In [None]:
short_articles['summary'] = ""

In [None]:
# Summarise every short article, checkpointing the frame to Drive as we go.
path = '/content/drive/MyDrive/summarization/wikihow_short_clean_summary_extra.pkl'

for ind in short_articles.index:
  text = short_articles['clean_text'][ind]
  doc = nlp(text)

  try:
    s = extract_summary_for(doc)
  except ValueError:
    # Leave the summary empty for this article and move on.
    print('error')
    print(ind)
    continue

  # Label-based scalar assignment: the chained form df['summary'][ind] = s may
  # write to a temporary copy and is a no-op under pandas copy-on-write.
  short_articles.at[ind, 'summary'] = s

  # Periodic checkpoint, keyed on the index label.
  if ind > 0 and ind % 100 == 0:
    print('pickled')
    pd.to_pickle(short_articles, path)

pd.to_pickle(short_articles, path)

# Summaries of shorter articles

In [None]:
shortys = wikihow["word_count"] < 300

shorter_articles = wikihow[shortys].copy()

In [None]:
path = '/content/drive/MyDrive/summarization/wikihow_shorter_clean_extraction.pkl'
pd.to_pickle(shorter_articles, path)

In [None]:
shorter_articles['summary'] = ""

In [None]:
# Summarise every shorter article, checkpointing the frame to Drive as we go.
path = '/content/drive/MyDrive/summarization/wikihow_shorter_clean_summary_extra.pkl'

for ind in shorter_articles.index:
  text = shorter_articles['clean_text'][ind]
  doc = nlp(text)

  try:
    s = extract_summary_for(doc)
  except ValueError:
    # Leave the summary empty for this article and move on.
    print('error')
    print(ind)
    continue

  # Label-based scalar assignment: the chained form df['summary'][ind] = s may
  # write to a temporary copy and is a no-op under pandas copy-on-write.
  shorter_articles.at[ind, 'summary'] = s

  # Periodic checkpoint, keyed on the index label.
  if ind > 0 and ind % 100 == 0:
    print('pickled')
    pd.to_pickle(shorter_articles, path)

pd.to_pickle(shorter_articles, path)

# Evaluation




## Installation of nlg-eval and BLEURT, plus helper functions

In [None]:
import io

def write_list_to(filename, da_list):
  """Write each element of ``da_list`` to ``filename``, one per line.

  Elements are rendered with ``%s`` formatting and terminated by a newline.
  An existing file is overwritten.
  """
  rendered = ("%s\n" % entry for entry in da_list)
  with open(filename, 'w') as out:
    out.writelines(rendered)


def calculate_bleurt(filename):
  """Print and return the mean of the scores stored in ``filename``.

  The file is expected to hold one float per line; blank lines are ignored.

  Args:
    filename: path of the scores file.

  Returns:
    The arithmetic mean of all scores as a float.

  Raises:
    ValueError: if the file contains no scores, or a non-blank line cannot be
      parsed as a float.
  """
  with open(filename, 'r') as f:
    # Skip blank lines (e.g. a trailing empty line) instead of failing in float().
    b = [float(line) for line in f if line.strip()]

  if not b:
    raise ValueError('no scores found in ' + filename)

  score = sum(b)/len(b)
  print('score for ' + filename)
  print(score)
  return score


In [None]:
!pip3 install git+https://github.com/Maluuba/nlg-eval.git@master


In [None]:
!nlg-eval --setup

In [None]:
!git clone https://github.com/google-research/bleurt.git

In [None]:
cd bleurt/

In [None]:
!pip3 install .

In [None]:
!python -m unittest bleurt.score_test
!python -m unittest bleurt.score_not_eager_test
!python -m unittest bleurt.finetune_test

In [None]:
!wget https://storage.googleapis.com/bleurt-oss/bleurt-large-512.zip

In [None]:
!unzip bleurt-large-512.zip

## Short Articles summary evaluation



In [None]:
to_eval_hyp = short_articles['summary']
to_eval_ref = short_articles['clean_headline']

In [None]:
h_filename = '/content/drive/MyDrive/summarization/evaluation/short_articles_hypothesis.txt'
r_filename = '/content/drive/MyDrive/summarization/evaluation/short_articles_reference.txt'

In [None]:
write_list_to(h_filename, to_eval_hyp)
write_list_to(r_filename, to_eval_ref)

In [None]:
!nlg-eval --hypothesis=/content/drive/MyDrive/summarization/evaluation/short_articles_hypothesis.txt --references=/content/drive/MyDrive/summarization/evaluation/short_articles_reference.txt --no-skipthoughts --no-glove 

In [None]:
!python -m bleurt.score \
  -candidate_file=/content/drive/MyDrive/summarization/evaluation/short_articles_hypothesis.txt \
  -reference_file=/content/drive/MyDrive/summarization/evaluation/short_articles_reference.txt \
  -bleurt_checkpoint=/content/bleurt/bleurt-large-512\
  -scores_file=bleurt_scores_short_articles_summary.txt

In [None]:
calculate_bleurt('bleurt_scores_short_articles_summary.txt')

## Shorter Articles summary evaluation

In [None]:
to_eval_hyp = shorter_articles['summary']
to_eval_ref = shorter_articles['clean_headline']

In [None]:
h_filename = '/content/drive/MyDrive/summarization/evaluation/shorter_articles_hypothesis.txt'
r_filename = '/content/drive/MyDrive/summarization/evaluation/shorter_articles_reference.txt'

In [None]:
write_list_to(h_filename, to_eval_hyp)
write_list_to(r_filename, to_eval_ref)

In [None]:
!nlg-eval --hypothesis=/content/drive/MyDrive/summarization/evaluation/shorter_articles_hypothesis.txt --references=/content/drive/MyDrive/summarization/evaluation/shorter_articles_reference.txt --no-skipthoughts --no-glove 

In [None]:
!python -m bleurt.score \
  -candidate_file=/content/drive/MyDrive/summarization/evaluation/shorter_articles_hypothesis.txt \
  -reference_file=/content/drive/MyDrive/summarization/evaluation/shorter_articles_reference.txt \
  -bleurt_checkpoint=/content/bleurt/bleurt-large-512\
  -scores_file=bleurt_scores_shorter_articles_summary.txt

In [None]:
calculate_bleurt('bleurt_scores_shorter_articles_summary.txt')

# Manual evaluation

In [5]:
# Read the article line by line and glue the lines together with spaces.
with open("/content/guardian.txt", "r", encoding="utf-8") as f:
    guardian = " ".join(f)
# Parse the article and summarise it.
doc = nlp(guardian)
s = extract_summary_for(doc)