# 8. Text Summarization

## Setup

In [22]:
%run __init__.py

INFO:root:Starting logger


In [3]:
import pandas as pd

# Path of the pickled protocols DataFrame inside NOTEBOOK_2_RESULTS_DIR.
# NOTE(review): `os` and NOTEBOOK_2_RESULTS_DIR are not defined in the visible cells;
# presumably they are provided by `%run __init__.py` -- confirm.
DF_FILE_PATH = os.path.join(NOTEBOOK_2_RESULTS_DIR, 'protocols_dataframe.pkl')

# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(DF_FILE_PATH)

In [4]:
# One text per protocol from the 'full_text_no_abstract_cleaned' column
# (abstract excluded, going by the column name), as a NumPy array.
protocols_no_abstract = df['full_text_no_abstract_cleaned'].values
# Preview the first 500 characters of the first protocol.
protocols_no_abstract[0][:500]

'Scratch Wound Healing Assay. Grow cells in DMEM supplemented with 10% FBS. Seed cells into 24-well tissue culture plate at a density that after 24 h of growth, they should reach ~70-80% confluence as a monolayer. Do not change the medium. Gently and slowly scratch the monolayer with a new 1 ml pipette tip across the center of the well. While scratching across the surface of the well, the long-axial of the tip should always be perpendicular to the bottom of the well. The resulting gap distance th'

## Extractive summary

In [5]:
from collections import Counter
from string import punctuation

import spacy
import en_core_sci_lg


# Load the en_core_sci_lg pipeline once at module level; extract_summary reads it as a global.
nlp = en_core_sci_lg.load()

def extract_summary(text, limit):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline.
    Tokens that are neither stop words nor punctuation and whose part of speech
    is PROPN, ADJ, NOUN or VERB count as keywords. Each keyword is weighted by
    its frequency relative to the most frequent keyword, and a sentence's score
    is the sum of the weights of the keyword tokens it contains.

    Args:
        text: Document to summarize.
        limit: Maximum number of sentences to return. A non-positive value
            yields an empty summary.

    Returns:
        The top ``limit`` sentences ordered from highest to lowest score (not
        in document order), each passed through ``str.capitalize`` and joined
        by single spaces. Returns ``''`` when the text contains no keywords.
    """
    pos_tag = {'PROPN', 'ADJ', 'NOUN', 'VERB'}
    stop_words = nlp.Defaults.stop_words
    doc = nlp(text.lower())

    keywords = [
        token.text
        for token in doc
        if token.text not in stop_words
        and token.text not in punctuation
        and token.pos_ in pos_tag
    ]
    # Without this guard, most_common(1)[0] raises IndexError on keyword-free text.
    if not keywords:
        return ''

    counts = Counter(keywords)
    max_freq = counts.most_common(1)[0][1]
    # Normalize so the most frequent keyword has weight 1.0.
    freq_word = {word: count / max_freq for word, count in counts.items()}

    # Sentences without any keyword never enter the dict and so are never selected.
    sent_strength = {}
    for sent in doc.sents:
        for word in sent:
            weight = freq_word.get(word.text)
            if weight is not None:
                sent_strength[sent] = sent_strength.get(sent, 0) + weight

    ranked = sorted(sent_strength.items(), key=lambda kv: kv[1], reverse=True)

    summary = []
    for sent, _ in ranked:
        # Check before appending so that limit <= 0 produces no sentences.
        if len(summary) >= limit:
            break
        summary.append(str(sent).capitalize())

    return ' '.join(summary)

In [7]:
# Number of sentences kept in each extractive summary.
num_sentences = 4

# Summarize every protocol; extract_summary parses each document with `nlp`.
summaries = [extract_summary(t, num_sentences)
             for t in protocols_no_abstract]
# Show the summary of the first protocol.
summaries[0]

'Grow cells for additional 48 h (or the time required if different cells are used). Wash the cells twice with 1x pbs, then fix the cells with 3.7% paraformaldehye for 30 min. Seed cells into 24-well tissue culture plate at a density that after 24 h of growth, they should reach ~70-80% confluence as a monolayer. After scratching, gently wash the well twice with medium to remove the detached cells.'

In [8]:
# Full text of the first protocol, for comparison with summaries[0].
protocols_no_abstract[0]

'Scratch Wound Healing Assay. Grow cells in DMEM supplemented with 10% FBS. Seed cells into 24-well tissue culture plate at a density that after 24 h of growth, they should reach ~70-80% confluence as a monolayer. Do not change the medium. Gently and slowly scratch the monolayer with a new 1 ml pipette tip across the center of the well. While scratching across the surface of the well, the long-axial of the tip should always be perpendicular to the bottom of the well. The resulting gap distance therefore equals to the outer diameter of the end of the tip. The gap distance can be adjusted by using different types of tips. Scratch a straight line in one direction. Scratch another straight line perpendicular to the first line to create a cross in each well. After scratching, gently wash the well twice with medium to remove the detached cells. Replenish the well with fresh medium. Note: Medium may contain ingredients of interest that you want to test, e.g., chemicals that inhibit/promote ce

## Abstractive summary

In [4]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,pr_id,title,abstract,materials,procedure,equipment,background,categories,authors,full_text,full_text_no_abstract,full_text_cleaned,full_text_no_abstract_cleaned,num_chars_text
0,e100,Scratch Wound Healing Assay,The scratch wound healing assay has been widel...,"Human MDA-MB-231 cell line (ATCC, catalog numb...",Grow cells in DMEM supplemented with 10% FBS.|...,BD Falcon 24-well tissue culture plate (Fisher...,,Cancer Biology|Invasion & metastasis|Cell biol...,Yanling Chen,Scratch Wound Healing Assay. The scratch wound...,Scratch Wound Healing Assay. Grow cells in DME...,Scratch Wound Healing Assay. The scratch wound...,Scratch Wound Healing Assay. Grow cells in DME...,2583
1,e1029,ADCC Assay Protocol,Antibody-dependent cell-mediated cytotoxicity ...,Raji cells (ATCC)|A/California/04/2009 (H1N1) ...,Preperation of Target Cells\n\n\t\t\n\n\n\t\t\...,Round bottomed 96-well plate|Temperature contr...,,Immunology|Immune cell function|Cytotoxicity|C...,Vikram Srivastava|Zheng Yang|Ivan Fan Ngai...,ADCC Assay Protocol. Antibody-dependent cell-m...,ADCC Assay Protocol. Preperation of Target Cel...,ADCC Assay Protocol. Antibody-dependent cell-m...,ADCC Assay Protocol. Preperation of Target Cel...,3824
2,e1072,Catalase Activity Assay in Candida glabrata,Commensal and pathogenic fungi are exposed to ...,Yeast strains \nNote: BG14 was used as the C. ...,Preparation of total soluble extracts\n\t\t\n\...,Orbital incubator shaker|Microfuge tubes|50 ml...,,Microbiology|Microbial biochemistry|Protein|Ac...,Emmanuel Orta-Zavalza|Marcela Briones-Martin...,Catalase Activity Assay in Candida glabrata. C...,Catalase Activity Assay in Candida glabrata. P...,Catalase Activity Assay in Candida glabrata. C...,Catalase Activity Assay in Candida glabrata. P...,4207
3,e1077,RNA Isolation and Northern Blot Analysis,The northern blot is a technique used in molec...,Vero cells (kidney epithelial cells extracted ...,RNA extraction\n\t\t\n\n\t\t\t\tCells were see...,"100 mm cell culture dishes (Corning, catalog n...",,Microbiology|Microbial genetics|RNA|RNA extrac...,Ying Liao|To Sing Fung|Mei Huang|Shouguo Fang|...,RNA Isolation and Northern Blot Analysis. The ...,RNA Isolation and Northern Blot Analysis. RNA ...,RNA Isolation and Northern Blot Analysis. The ...,RNA Isolation and Northern Blot Analysis. RNA ...,6890
4,e1090,Flow Cytometric Analysis of Autophagic Activit...,Flow cytometry allows very sensitive and relia...,"Cells lines of interest (HepG2, HUH7, CMK, K56...",Maintain cells under standard tissue culture c...,"37 °C, 5% CO2 humidified incubator|Centrifuge|...",,Microbiology|Antimicrobial assay|Autophagy ass...,Metodi Stankov|Diana Panayotova-Dimitrova|Ma...,Flow Cytometric Analysis of Autophagic Activit...,Flow Cytometric Analysis of Autophagic Activit...,Flow Cytometric Analysis of Autophagic Activit...,Flow Cytometric Analysis of Autophagic Activit...,5890


In [5]:
# Model inputs: the same column that protocols_no_abstract was built from.
x = df['full_text_no_abstract_cleaned'].values
# The 'abstract' column, intended as the reference summaries (hence the name y_true).
y_true = df['abstract'].values

In [40]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

# Baseline: summarize the first protocol with the stock 'facebook/bart-large-cnn' checkpoint.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# truncation=True clips the input to the tokenizer's maximum length; without it a
# protocol longer than the model's position limit fails inside generate().
inputs = tokenizer(x[0], return_tensors='pt', truncation=True)
summary_ids = model.generate(inputs['input_ids'])
result = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
         for g in summary_ids]
result

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json from cache at C:\Users\alex/.cache\torch\transformers\5f0de1d2bbb8eb1a3b69656622293b3328b06b701663a9d4109359751cb4e739.5e72c6158467741b29afbcad014cd97414f17a191d39253eef90d7bfe969cc1f
INFO:transformers.configuration_utils:Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "id2label":

['Scratch Wound Healing Assay. Grow cells in DMEM supplemented with 10% FBS. Seed cells into 24-well tissue culture plate at a density that after 24 h of growth, they should reach ~70-80% confluence as a monolayer. Do not change the medium.']

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Sub-directories of base_model_dir, one per summarization model.
# NOTE(review): presumably checkpoints produced outside this notebook -- confirm.
model_dirs = ['bart_cnn', 'bart_xsum',
              'distillbart_cnn_protocols',
              'distillbart_xsum_protocols']
# base_model_dir is read again by the generation cell further down.
# NOTE(review): `os` and DATA_DIR are not defined in the visible cells; presumably
# they are provided by `%run __init__.py` -- confirm.
base_model_dir = os.path.join(DATA_DIR, 'text_summarization_models')

# Load each model from its 'best_tfmr' sub-directory.
# NOTE(review): this cell has no execution count and the loop body ends in `model...`,
# which is not valid Python -- the cell looks unfinished and needs completing or removing.
for model_dir in model_dirs:
    complete_model_path = os.path.join(os.path.join(base_model_dir, model_dir), 'best_tfmr')
    model = AutoModelForSeq2SeqLM.from_pretrained(complete_model_path)
    model...

In [49]:
import torch

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice has length ``n`` except possibly the last, which holds
    whatever remains.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in offsets)

def trim_batch(input_ids, pad_token_id, attention_mask=None):
    """Drop every column of ``input_ids`` that contains only ``pad_token_id``.

    When ``attention_mask`` is given, the same columns are dropped from it and
    the pair ``(input_ids, attention_mask)`` is returned; otherwise only the
    trimmed ``input_ids`` is returned.
    """
    # True for each column that holds at least one non-pad token.
    has_content = input_ids.ne(pad_token_id).any(dim=0)
    trimmed_ids = input_ids[:, has_content]
    if attention_mask is not None:
        return (trimmed_ids, attention_mask[:, has_content])
    return trimmed_ids


# Summarize every protocol with the 'distillbart_xsum_protocols' model under base_model_dir.
complete_model_path = os.path.join(base_model_dir, 'distillbart_xsum_protocols', 'best_tfmr')
model = AutoModelForSeq2SeqLM.from_pretrained(complete_model_path).to(DEFAULT_DEVICE)
tokenizer = AutoTokenizer.from_pretrained(complete_model_path)

results = []
for doc in x:
    # Pad and truncate to the tokenizer's maximum length, then strip the all-padding columns.
    batch = tokenizer(doc, return_tensors="pt", truncation=True, padding="max_length").to(DEFAULT_DEVICE)
    input_ids, attention_mask = trim_batch(**batch, pad_token_id=tokenizer.pad_token_id)
    # Named summary_ids rather than `summaries` so the extractive `summaries` list is not overwritten.
    summary_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_start_token_id=None
    )
    dec = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    # One document in should give exactly one summary out; check before storing so a
    # malformed result is never appended (and an empty one cannot raise IndexError).
    if len(dec) != 1:
        print('Expected exactly 1 generated summary per document, got', len(dec))
        break
    results.append(dec[0])

INFO:transformers.configuration_utils:loading configuration file E:\hercules\hercules-challenge-protocols\data\text_summarization_models\distillbart_xsum_protocols\best_tfmr\config.json
INFO:transformers.configuration_utils:Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "eos_token_ids": [
    2
  ],
  "extra_pos_embeddings": 2,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_

In [55]:
# Abstractive summary generated for the first protocol.
results[0]

' Scratch Wound Healing Assay is a scratch-wound healing assay that can be used to determine whether a wound can be healed or not.'

In [56]:
# Reference abstract of the first protocol, for comparison with results[0].
# The abstracts are stored in `y_true`; no variable named `y` is defined in this notebook.
y_true[0]

'The scratch wound healing assay has been widely adapted and modified to study the effects of a variety of experimental conditions, for instance, gene knockdown or chemical exposure, on mammalian cell migration and proliferation. In a typical scratch wound healing assay, a “wound gap” in a cell monolayer is created by scratching, and the “healing” of this gap by cell migration and growth towards the center of the gap is monitored and often quantitated.  Factors that alter the motility and/or growth of the cells can lead to increased or decreased rate of “healing” of the gap (Lampugnani, 1999). This assay is simple, inexpensive, and experimental conditions can be easily adjusted for different purposes. The assay can also be used for a high-throughput screen platform if an automated system is used (Yarrow and Perlman, 2004).'

## Evaluation

In [61]:
import numpy as np

from rouge_score import rouge_scorer

def compute_rouge_fscores(results, y):
    """Compute the mean ROUGE-1 and ROUGE-L F-measures over a set of summaries.

    Args:
        results: Iterable of generated summaries (strings).
        y: Iterable of reference summaries, aligned with ``results``.

    Returns:
        Tuple ``(mean_rouge1_fmeasure, mean_rougeL_fmeasure)``. Pairs are
        formed with ``zip``, so surplus items in the longer input are ignored.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rougel_scores = []

    for y_pred, y_true in zip(results, y):
        # RougeScorer.score expects (target, prediction). The F-measure is the
        # same in either order, but precision and recall are swapped if reversed.
        rouge_score = scorer.score(y_true, y_pred)
        rouge1_scores.append(rouge_score['rouge1'].fmeasure)
        rougel_scores.append(rouge_score['rougeL'].fmeasure)

    return (np.mean(rouge1_scores), np.mean(rougel_scores))


In [63]:
# Score the abstractive summaries against the reference abstracts. The assignment is
# needed for a fresh-kernel run: final_score_rougel is not set in any other cell.
final_score_rouge1, final_score_rougel = compute_rouge_fscores(results, y_true)
final_score_rougel

0.16371107501329363