### **Installations and dependencies**

In [None]:
# Make the project folder on the mounted drive the working directory.
# NOTE(review): presumably this is where the `summary` package imported below
# lives, and where relative output files end up — confirm.
import os

PROJECT_DIR = "/content/drive/My Drive/Colab Notebooks/Demo capstone/Summarization"
os.chdir(PROJECT_DIR)

In [None]:
# install packages (in addition to Colab pre-installed packages like torch)
!pip install aylien-apiclient
!pip install bert-extractive-summarizer
!pip install spacy==2.3.0
!pip install transformers
!pip install neuralcoref



In [None]:
# importing packages

import nltk
# Download the NLTK resources one after the other; NLTK reports a package as
# "already up-to-date" when it is present.
# NOTE(review): presumably these are required by the `summary` modules — confirm.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

import warnings
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# import functions

from summary.summarizer_aylien_model import summarizer_aylien_get
from summary.summarizer_bertext_model import summarizer_bertext_get
from summary.summarizer_tfidf_model import summarizer_tfidf_get
from summary.summarizer_textrank_model import summarizer_textrank_get, word_embeddings
from summary.summarizer_hgf_model import summarizer_hgf_get

start loading vectorization


### **Create summary file**

In [None]:
# getting data (from Cornell Newsroom dataset)
DATA_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSOPwZMHK11TcvPxxVbAGsM2RAaoD0OR1qMA_PhjkhJOxr57kOuWrC2gDIBHYdv9xVz9Q2TLT1Nqbbp/pub?gid=0&single=true&output=csv"

# Read the published sheet, drop its first 20 rows and renumber the remaining
# rows from 0. The steps are chained so that `data` is a fresh frame rather
# than a slice that is subsequently mutated in place.
# NOTE(review): the reason for skipping the first 20 rows is not stated here.
data = pd.read_csv(DATA_URL).iloc[20:].reset_index(drop=True)

In [None]:
# Preview the first five rows of the loaded articles.
data.head(5)

Unnamed: 0,URL,Summary,Text
0,https://www.theguardian.com/world/2017/jul/31/...,The police has banned jogging in groups in Sie...,Police in Sierra Leone have banned group joggi...
1,https://www.theguardian.com/world/2017/jul/31/...,Canada is investigating reports that Saudi Ara...,The Canadian government is investigating repor...
2,https://www.theguardian.com/world/2017/jul/30/...,As many as 76 passengers were rescued from cab...,Up to 76 people were rescued from cable cars s...
3,https://www.theguardian.com/world/2017/jul/30/...,"More than 22,000 people were evacuated on Satu...","More than 22,000 people were evacuated on Satu..."
4,https://www.theguardian.com/world/2017/jul/30/...,Pakistan's Opposition leader Imran Khan has ca...,"Pakistan?s opposition leader, Imran Khan, has ..."


In [None]:
# Each summarization method's name paired with the function implementing it.
# `models` and `functions` are derived from this single list, so position i in
# one always corresponds to position i in the other.
summarizer_registry = [
    ('aylien', summarizer_aylien_get),
    ('bertext', summarizer_bertext_get),
    ('tfidf', summarizer_tfidf_get),
    ('textrank', summarizer_textrank_get),
    ('hgf', summarizer_hgf_get),
]
models = [name for name, _ in summarizer_registry]
functions = [summarizer for _, summarizer in summarizer_registry]

In [None]:
# function to get the corresponding summary with the method
def get_result(text, idx, models = models, functions = functions):
    """Summarize `text` with the summarizer stored at position `idx`.

    Args:
        text: The input passed as first argument to the summarizer.
        idx: Position of the summarizer to use inside `functions`.
        models: Not used by this function; kept in the signature as is.
        functions: Sequence of summarizer callables (defaults to the
            notebook-level `functions` list).

    Returns:
        The value stored under the 'sentences' key of the summarizer's result.
    """
    summarize = functions[idx]
    # NOTE(review): the meaning of the second argument (30) depends on the
    # summarizer implementations, which are not visible here — confirm.
    result = summarize(text, 30)
    return result['sentences']

In [None]:
import time

# Run every summarizer over all articles, store its output in a
# `summarizer_<model>` column and print the mean seconds spent per article.
# Set `trained` to True to skip this cell entirely.
trained = False
if not trained:
  # enumerate() yields each model's position directly, so there is no need to
  # look it up again with models.index(model).
  for idx, model in enumerate(models):
    # perf_counter() is a monotonic clock intended for measuring elapsed time.
    start = time.perf_counter()
    print(model)
    data['summarizer_' + model] = data.Text.apply(get_result, idx = idx)
    elapsed = time.perf_counter() - start
    # max(..., 1) keeps an empty frame from raising ZeroDivisionError.
    print("Average time:", round(elapsed / max(len(data), 1), 2))
    print("")

aylien
Average time: 0.16

bertext
Average time: 2.46

tfidf
Average time: 0.02

textrank
Average time: 0.15

hgf
Average time: 23.79



In [None]:
# Show the hgf summary stored for the row labelled 0.
first_hgf_summary = data.loc[0, 'summarizer_hgf']
print(first_hgf_summary)

Group jogging has taken on a political tinge in parts of Sierra Leone. TV celebrity Amadu Lamarana Bah said the restriction was a waste of police time. Presidential candidates have organised fun runs in Freetown and the northern town of Makeni, with participants wearing T-shirts bearing the name of the party. 


In [None]:
# Preview the first five rows, now including the generated summary columns.
data.head(5)

Unnamed: 0,URL,Summary,Text,summarizer_aylien,summarizer_bertext,summarizer_tfidf,summarizer_textrank,summarizer_hgf
0,https://www.theguardian.com/world/2017/jul/31/...,The police has banned jogging in groups in Sie...,Police in Sierra Leone have banned group joggi...,Police in Sierra Leone have banned group joggi...,Police in Sierra Leone have banned group joggi...,"Mohamed Kamara, a civil servant, disputed tha...",Kamara said.Jogging and playing football in la...,Group jogging has taken on a political tinge i...
1,https://www.theguardian.com/world/2017/jul/31/...,Canada is investigating reports that Saudi Ara...,The Canadian government is investigating repor...,The Canadian government is investigating repor...,allegations that have prompted renewed scrutin...,?We are looking at these claims very seriousl...,"In a statement issued last week, it highlighte...",Videos and photos posted on social media alleg...
2,https://www.theguardian.com/world/2017/jul/30/...,As many as 76 passengers were rescued from cab...,Up to 76 people were rescued from cable cars s...,Up to 76 people were rescued from cable cars s...,Up to 76 people were rescued from cable cars s...,"An official cause has not been announced, but...",German media reported that rescue teams in Col...,"A gondola crashed into a support pillar, leavi..."
3,https://www.theguardian.com/world/2017/jul/30/...,"More than 22,000 people were evacuated on Satu...","More than 22,000 people were evacuated on Satu...",,"More than 22,000 people were evacuated on Satu...",they said.,,Firefighters rushed to Tomorrowland festival i...
4,https://www.theguardian.com/world/2017/jul/30/...,Pakistan's Opposition leader Imran Khan has ca...,"Pakistan?s opposition leader, Imran Khan, has ...","in the country, following the dismissal of Naw...","in the country, following the dismissal of Naw...","He has been a true example,? Khan?s supporter...","said Ishfaq, referring to a claim by Khan that...",Opposition leader Imran Khan has called for a ...


In [None]:
# Write the frame, including the generated summary columns, to a CSV file.
# The path is relative, so the file is created in the current working directory.
output_path = "summarized.csv"
data.to_csv(output_path)