In [1]:
from transformers import pipeline
import pandas as pd
from datasets import Dataset 
from transformers.pipelines.pt_utils import KeyDataset

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
news_articles_df = pd.read_csv("../Datasets/test_CNN_Article.csv")

In [33]:
# class labels
# 5 Sport
# 3 News
# 4 Politics
# 0 business
# 1 entertainment
news_articles_df.head(5)

Unnamed: 0,text,label
0,"(CNN)Kenya Goodson, a 46-year-old Black woman...",4
1,Story highlightsUK PM David Cameron promises t...,3
2,(CNN)Additional testing is planned after Kent...,5
3,Sign up for CNN's Wonder Theory science newsle...,3
4,London (CNN)Team GB sprinter Bianca Williams h...,5


In [34]:
news_articles_df['label'].value_counts()

label
3    2755
5    2302
4     344
0     122
2      95
1      68
Name: count, dtype: int64

In [35]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn", device= 0)

In [36]:
news_articles_df.dropna(subset=['text'], inplace=True)

# remove articles which are too short and dont require summarizations
very_short_news_articles = []
for ind in news_articles_df.index:
    if len(news_articles_df['text'][ind]) < 60:
        very_short_news_articles.append(ind)

news_articles_df.drop(index=very_short_news_articles,inplace= True)

# break very long articles and use the first 1024 tokens in the article
very_long_articles = []
for ind in news_articles_df.index:
    if len(news_articles_df['text'][ind]) > 1024:
        very_long_articles.append(ind)
        news_articles_df.loc[ind, "text"] = news_articles_df['text'][ind][:1024]

dataset = Dataset.from_pandas(news_articles_df)


In [37]:
len(very_long_articles)

5585

In [38]:
len(very_short_news_articles)

4

In [39]:
generated_summaries = []
news_wc_list = []
summary_wc_list = []
count = 0


for out in pipe(KeyDataset(dataset, "text"),max_length = 60, min_length = 20):
    entire_news_text = KeyDataset(dataset, "text")[0]
    summary = out[0]["summary_text"]
    
    generated_summaries.append(summary)
    news_wc_list.append(entire_news_text.count(" ") + 1)
    summary_wc_list.append(summary.count(" ") + 1)

    count += 1
    if count % 100 ==0: print(count)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800


Your max_length is set to 60, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)


1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500


Your max_length is set to 60, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700


Your max_length is set to 60, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)


4800
4900
5000
5100
5200
5300


Your max_length is set to 60, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)


5400
5500
5600


In [40]:
len(generated_summaries),len(news_wc_list), len(summary_wc_list)

(5680, 5680, 5680)

In [41]:
generated_summaries[0]

"Kenya Goodson, a 46-year-old Black woman from Tuscaloosa, Alabama, has voted in every election since she was 19. Last week's Supreme Court decision to allow Alabama's new congressional map has left Goodson discouraged about casting a ballot herself."

In [42]:
news_wc_list[0], summary_wc_list[0]

(169, 38)

In [43]:
# calculate summary length % based on news article length
summary_precentage_list = []
for idx in range(len(generated_summaries)):

    summary_precentage_list.append((summary_wc_list[idx]/news_wc_list[idx]) * 100)



In [45]:
summary_precentage_list[:10]

[22.485207100591715,
 21.893491124260358,
 20.118343195266274,
 21.301775147928996,
 16.56804733727811,
 12.42603550295858,
 13.017751479289942,
 17.159763313609467,
 27.218934911242602,
 18.34319526627219]

In [47]:
# average summary as percentage length of the original news article

sum(summary_precentage_list) / len(summary_precentage_list)

# on average the generated summary was around 20% in length of the original news article

20.119280773397783

In [49]:
maximum_percentage= max(summary_precentage_list)
summary_precentage_list.index(maximum_percentage)

217

In [50]:
dataset[217]

{'text': "A version of this story appeared in the January 21 edition of CNN's Royal News, a weekly dispatch bringing you the inside track on Britain's royal family. Sign up here.  London (CNN)After the Queen's actions to retire Prince Andrew last week, questions have been raised about his remaining constitutional duty.As Counsellor of State he may still be called upon to pick up some of the Queen's duties if she were temporarily out of action due to illness or if she was traveling.Under the Regency Act 1937, the group that the monarch can empower are the next four in line to the throne who are over the age of 21. Those royals are Princes Charles, William, Andrew and Harry (after skipping over the Cambridge and Sussex children due to their young age). The late Prince Philip, as the sovereign's spouse, would also have been included if he were still alive. Read MoreTo activate the counsellors, the Queen issues a directive known as Letters Patent to formally grant authority to two or more 

In [51]:
generated_summaries[217]

"Prince Andrew may still be called upon to pick up some of the Queen's duties if she were temporarily out of action due to illness or if she was traveling. Under the Regency Act 1937, the monarch can empower the next four in line to the throne who are over the age of"

In [None]:
# # summary config

# SAMPLE = False

# news_wc_list, summary_wc_list , summaries = [], [], []

# count = 0
# for ind in news_articles_df.index:
#     news_article = news_articles_df['text'][ind][:1024]
#     news_wc = news_article.count(" ") + 1

#     if news_wc >= 10:
#         MAX_LEN = min(40,news_wc)
#         MIN_LEN = 10

#         result_obj = pipe(news_article, max_length = MAX_LEN, min_length = MIN_LEN, do_sample = SAMPLE)
#         result_summary = result_obj[0]['summary_text']

        
#         summary_wc = result_summary.count(" ") + 1

#         summaries.append(result_summary)
#         news_wc_list.append(news_wc)
#         summary_wc_list.append(summary_wc)

#     else:
#         summaries.append(None)
#         news_wc_list.append(0)
#         summary_wc_list.append(0)

#     count +=1

#     if count % 100 ==0: print(count)


