## Text Summarization — TextRank Algorithm



In [1]:
# data set: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail

In [2]:
# Mount Google Drive at /content/drive so the dataset CSV stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

In [4]:
# Read data: load test.csv from the mounted Drive into df and preview the first rows.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory being /content (where Drive is mounted) — confirm.
df = pd.read_csv('drive/My Drive/Colab Notebooks/test.csv')
df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [5]:
# Drop rows whose article text repeats an earlier row (the first occurrence is kept).
# The explicit .copy() makes df an independent frame rather than a slice of the
# original, so adding columns to it later does not raise SettingWithCopyWarning.
df = df[~df.article.duplicated()].copy()

In [6]:
# Store len() of every article next to it as a new column
df['article_length'] = df['article'].apply(len)

In [7]:
# How many distinct ids remain in df
len(df['id'].unique())

11488

In [None]:
# TextRank Summarization
import gensim
from gensim.summarization.summarizer import summarize

"""
Summarizes corpus with TextRank.
parameter
---------    
corpus: list - df["article"]    
ratio: length of the summary (20% of the article)
return    
list of summaries
"""

def textrank(corpus, ratio=0.2):
    """
    Summarize one or more articles with gensim's TextRank summarizer.

    NOTE: ``gensim.summarization`` is only available in gensim 3.x (it was
    removed in gensim 4.0), so this function needs ``gensim<4``.

    parameter
    ---------
    corpus: str or iterable of str - a single article, or a collection of
        articles such as a list or df["article"]
    ratio: float - forwarded to gensim's ``summarize`` as the summary-length
        ratio (0.2 means roughly 20% of the article)

    return
    ------
    list of summaries, one per article, in the same order as corpus
    """
    # A bare string is ONE article: wrap it so the loop below sees a single
    # document instead of iterating over its characters. Lists and other
    # iterables are used as-is (the old check wrapped lists instead, which
    # nested them and handed a whole list to summarize()).
    if isinstance(corpus, str):
        corpus = [corpus]
    return [gensim.summarization.summarize(article, ratio=ratio)
            for article in corpus]

## Apply the function to corpus: summarize every article in df, passing ratio=0.2.
## summary is the list returned by textrank, one entry per article.
summary = textrank(corpus=df["article"], ratio=0.2)

In [9]:
# Attach the summaries to df as a new column. summary is a plain list, so its
# values are matched to rows by position; then display the resulting frame.
df['TextRank_Summary'] = summary
df

Unnamed: 0,id,article,highlights,article_length,TextRank_Summary
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,2126,"More than squabbling over the arm rest, shrink..."
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,1753,"Next level drunk: Intoxicated Rahul Kumar, 17,..."
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,633,Dougie Freedman is set to sign a new deal at N...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,1790,Liverpool target Neto is also wanted by PSG an...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",4391,"The former Olympian and reality TV star, 65, w..."
...,...,...,...,...,...
11485,ed8674cc15b29a87d8df8de1efee353d71122272,Our young Earth may have collided with a body ...,Oxford scientists say a Mercury-like body stru...,3877,Our current theory of Earth’s formation involv...
11486,2f58d1a99e9c47914e4b1c31613e3a041cd9011e,A man facing trial for helping his former love...,Man accused of helping former lover kill woman...,4422,A man facing trial for helping his former love...
11487,411f6d57825161c3a037b4742baccd6cd227c0c3,A dozen or more metal implements are arranged ...,Marianne Power tried the tuning fork facial at...,7864,I'm here to try the wackiest face treatment to...
11488,b5683ef8342056b17b068e0d59bdbe87e3fe44ea,Brook Lopez dominated twin brother Robin with ...,Brooklyn Nets beat the Portland Trail Blazers ...,1107,Brook Lopez dominated twin brother Robin with ...


In [14]:
# Side-by-side look at one example (row label 1): generated summary vs. reference highlights
print(
    "Textrank Summary:",
    df.loc[1, 'TextRank_Summary'],
    "--------------------------------------------------------------------------",
    "Original Highlights:",
    df.loc[1, 'highlights'],
    sep="\n",
)

Textrank Summary:
Next level drunk: Intoxicated Rahul Kumar, 17, climbed into the lions' enclosure at a zoo in Ahmedabad and began running towards the animals shouting 'Today I kill a lion!' Mr Kumar had been sitting near the enclosure when he suddenly made a dash for the lions, surprising zoo security.
--------------------------------------------------------------------------
Original Highlights:
Drunk teenage boy climbed into lion enclosure at zoo in west India .
Rahul Kumar, 17, ran towards animals shouting 'Today I kill a lion!'
Fortunately he fell into a moat before reaching lions and was rescued .


## Evaluation of TextRank

In [11]:
# Install and import rouge
!pip install rouge
from rouge import Rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [12]:
# Score the TextRank summaries (first argument) against the reference highlights
# (second argument). With avg=True a single dict of averaged rouge-1 / rouge-2 /
# rouge-l recall, precision and F-scores is returned.
# NOTE(review): ignore_empty=True presumably skips pairs where either text is empty — confirm.
rouge = Rouge()
rouge.get_scores(summary, df.highlights, avg=True, ignore_empty=True)

{'rouge-1': {'r': 0.5165796733759895,
  'p': 0.24919757996768577,
  'f': 0.31428219244562355},
 'rouge-2': {'r': 0.22099304176471654,
  'p': 0.09517792911078259,
  'f': 0.12153481499816275},
 'rouge-l': {'r': 0.48139013410614534,
  'p': 0.23144944584797306,
  'f': 0.29221318382432016}}

## Reference

https://towardsdatascience.com/text-summarization-with-nlp-textrank-vs-seq2seq-vs-bart-474943efeb09
