In [1]:
import os
# Switch the working directory to the project folder; relative paths used
# later in the notebook resolve against it.
PROJECT_DIR = '/content/drive/My Drive/Colab Notebooks/Summarization/'
os.chdir(PROJECT_DIR)

In [2]:
!pip install aylien-apiclient
!pip install bert-extractive-summarizer
!pip install spacy==2.3.0
!pip install transformers
!pip install neuralcoref



In [6]:
from models.summarizer_aylien import summarize_aylien2_text
from models.summarizer_bertext import summarizer_bert_get
from models.summarizer_tfidf import tfidf
from models.summarizer_textrank import text_rank, word_embeddings
from models.summarizer_hgf import hgf

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import pandas as pd
# CSV export of the published spreadsheet holding the benchmark articles;
# later cells read its `Text` and `Summary` columns.
DATA_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSOPwZMHK11TcvPxxVbAGsM2RAaoD0OR1qMA_PhjkhJOxr57kOuWrC2gDIBHYdv9xVz9Q2TLT1Nqbbp/pub?gid=0&single=true&output=csv"
data = pd.read_csv(DATA_URL)

In [8]:
# One (name, callable) pair per summarizer. Both lists are derived from this
# table so that position i in `models` always matches position i in `functions`.
_summarizer_table = [
    ('aylien', summarize_aylien2_text),
    ('bert_ext', summarizer_bert_get),
    ('tfidf', tfidf),
    ('text_rank', text_rank),
    ('hgf', hgf),
]
models = [name for name, _ in _summarizer_table]
functions = [fn for _, fn in _summarizer_table]

In [9]:
def get_result(text, idx, models = models, functions = functions):
  """Summarize `text` with the summarizer at position `idx` and return its 'summary' field.

  NOTE(review): a later cell defines `get_result` again; once that cell runs,
  it replaces this definition.

  Args:
    text: Input passed unchanged to the selected summarizer.
    idx: Position of the summarizer inside `functions`.
    models: Not used by this function; kept so the signature is unchanged.
    functions: Sequence of summarizer callables. The selected callable's
      result is indexed with the key 'summary'.

  Returns:
    The value stored under 'summary' in the summarizer's result, or None when
    `idx` does not select a summarizer or the summarizer call fails.
  """
  # Look the summarizer up separately from calling it, so that a bad index is
  # not confused with a failure inside the summarizer itself.
  try:
    function = functions[idx]
  except (LookupError, TypeError):
    print("Not a model. Try again")
    return None

  try:
    return function(text)['summary']
  except Exception as error:
    # Still best-effort (None on failure), but report the real cause and let
    # KeyboardInterrupt / SystemExit propagate instead of swallowing them.
    print("Summarization failed:", repr(error))
    return None

In [10]:
def get_result(text, idx, models = models, functions = functions):
    """Run the summarizer at position `idx` on `text` and return its 'sentences' field.

    Args:
        text: Input passed unchanged to the selected summarizer.
        idx: Position of the summarizer inside `functions`.
        models: Not used by this function.
        functions: Sequence of summarizer callables.

    Returns:
        The value stored under the 'sentences' key of the summarizer's result.
        Any exception raised by the lookup or by the summarizer propagates.
    """
    summarizer = functions[idx]
    output = summarizer(text)
    return output['sentences']

In [11]:
import warnings
# Ignore every warning raised from this point on in the session.
warnings.filterwarnings('ignore')

In [12]:
# Work on a copy: the summarizer columns written to `benchmark` in the next
# cells would otherwise also be added to `data`, because a plain assignment
# only creates a second name for the same DataFrame.
benchmark = data.copy()

In [13]:
import time

# Set to True to skip the summarization pass below.
trained = False
if not trained:
  # enumerate gives each summarizer's position directly, which is the index
  # get_result uses to pick the callable from `functions`.
  for idx, model in enumerate(models):
    start = time.time()
    print(model)
    # Series.apply forwards the extra keyword argument `idx` to get_result.
    benchmark['summarizer_' + model] = benchmark.Text.apply(get_result, idx = idx)
    # Wall-clock seconds spent summarizing every row with this model.
    print(time.time() - start)
    print("")

aylien
20.755327701568604

bert_ext
133.22255659103394

tfidf
0.7081794738769531

text_rank
8.881894826889038

hgf


Token indices sequence length is longer than the specified maximum sequence length for this model (1714 > 1024). Running this sequence through the model will result in indexing errors


Error: “Saving on
Error: Everyone s
Error: With many 
Error: Home
What 
Error: Pakistan?s
Error: The United
Error: He studied
489.63214802742004



In [14]:
# Write the benchmark frame to summarized.csv, relative to the current working directory.
benchmark.to_csv("summarized.csv")

ROUGE

In [15]:
# From here on `data` refers to the same frame as `benchmark` (no copy is made).
data = benchmark

In [16]:
# Keep the rows from position 20 onward. reset_index() renumbers them from 0
# and keeps the previous index as a regular column.
# NOTE(review): the reason for leaving out the first 20 rows is not stated in
# the notebook - confirm and document it.
benchmark = data.iloc[20:].reset_index()

In [17]:
!pip install rouge

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0


In [18]:
from rouge import Rouge 
# Scorer instance created once; get_score reads this module-level name.
rouge = Rouge()

In [19]:
def get_score(goal, summary, metric = 'rouge-2'):
  """Score `summary` against `goal` and return the chosen metric as [f, p, r].

  Args:
    goal: Reference text; passed as the second argument to rouge.get_scores.
    summary: Text being evaluated; passed as the first argument.
    metric: Key selecting which entry of the scorer's result to read.

  Returns:
    A three-element list holding the 'f', 'p' and 'r' values, in that order.
  """
  # get_scores returns a list; only its first entry is used.
  result = rouge.get_scores(summary, goal)[0][metric]
  return [result[key] for key in ('f', 'p', 'r')]

In [22]:
import numpy as np

def run_value(model_name = 'model_1', score = 'rouge-2', benchmark = benchmark):
  """Average one summarizer's ROUGE scores over the rows of `benchmark`.

  Args:
    model_name: Suffix of the column to evaluate; the column read is
      "summarizer_" + model_name.
    score: Metric key forwarded to get_score (the notebook uses 'rouge-1',
      'rouge-2' and 'rouge-l').
    benchmark: DataFrame with a `Summary` column (used as the reference) and
      the summarizer column. The default is the frame bound to `benchmark`
      at the moment this cell is executed.

  Returns:
    Array of the mean [f, p, r] values over the rows that could be scored, or
    three NaNs when no row could be scored.

  Raises:
    KeyError: if the "summarizer_" + model_name column does not exist.
    AttributeError: if `benchmark` has no `Summary` column.
  """
  references = benchmark.Summary
  candidates = benchmark["summarizer_" + model_name]

  current_score = []
  skipped = 0
  # Pair the two columns by position so the result does not depend on the
  # frame's index labels being 0..n-1.
  for goal, summary in zip(references, candidates):
    try:
      current_score.append(get_score(goal, summary, score))
    except Exception:
      # Best-effort: a row that cannot be scored is left out of the mean,
      # but it is counted so the omission is visible.
      skipped += 1

  if skipped:
    print("Skipped", skipped, "of", len(benchmark), "rows that could not be scored")
  if not current_score:
    return np.array([np.nan] * 3)
  return np.mean(np.array(current_score), axis = 0)

In [23]:
# Mean ROUGE-1 [f, p, r] for each summarizer.
for model in models:
  print(f"{model} results:")
  rouge_1_means = run_value(model, 'rouge-1')
  print(rouge_1_means)
  print()

aylien results:
[0.27414313 0.18299227 0.60500076]

bert_ext results:
[0.28960091 0.2115282  0.49285482]

tfidf results:
[0.13459062 0.233631   0.11504402]

text_rank results:
[0.26546771 0.18370732 0.55712367]

hgf results:
[0.38362214 0.42111492 0.35670044]



In [24]:
# Mean ROUGE-2 [f, p, r] for each summarizer.
for model in models:
  print(f"{model} results:")
  rouge_2_means = run_value(model, 'rouge-2')
  print(rouge_2_means)
  print()

aylien results:
[0.11624826 0.07792272 0.24950382]

bert_ext results:
[0.10308811 0.07506351 0.17651932]

tfidf results:
[0.02340788 0.04996485 0.01827213]

text_rank results:
[0.09709848 0.06626009 0.20796744]

hgf results:
[0.16011476 0.17562902 0.14921399]



In [25]:
# Mean ROUGE-L [f, p, r] for each summarizer.
for model in models:
  print(f"{model} results:")
  rouge_l_means = run_value(model, 'rouge-l')
  print(rouge_l_means)
  print()

aylien results:
[0.28288402 0.19766904 0.52963513]

bert_ext results:
[0.2677355  0.20134692 0.4187969 ]

tfidf results:
[0.12718482 0.2137435  0.10890169]

text_rank results:
[0.25383894 0.17964382 0.47456523]

hgf results:
[0.36014311 0.37942236 0.34781736]

