# 8. Text Summarization

## Setup

In [1]:
%run __init__.py

INFO:root:Starting logger


In [2]:
import pandas as pd

# Location of the pickled protocols dataframe.
# NOTE(review): `os` and NOTEBOOK_2_RESULTS_DIR are not imported in this cell;
# presumably they are brought into scope by `%run __init__.py` - confirm.
DF_FILE_PATH = os.path.join(NOTEBOOK_2_RESULTS_DIR, 'protocols_dataframe.pkl')

# NOTE: unpickling can execute arbitrary code, so only load a file that this
# project produced itself.
df = pd.read_pickle(DF_FILE_PATH)

In [3]:
# Array of the 'full_text_no_abstract_cleaned' column, one entry per dataframe row.
protocols_no_abstract = df['full_text_no_abstract_cleaned'].values
# Show the first 500 characters of the first entry as the cell output.
protocols_no_abstract[0][:500]

'Scratch Wound Healing Assay. Grow cells in DMEM supplemented with 10% FBS. Seed cells into 24-well tissue culture plate at a density that after 24 h of growth, they should reach ~70-80% confluence as a monolayer. Do not change the medium. Gently and slowly scratch the monolayer with a new 1 ml pipette tip across the center of the well. While scratching across the surface of the well, the long-axial of the tip should always be perpendicular to the bottom of the well. The resulting gap distance th'

## Extractive summary

In [4]:
from collections import Counter
from string import punctuation

import spacy
import en_core_sci_lg


# Load the en_core_sci_lg pipeline once at module level; extract_summary()
# reads this `nlp` object on every call.
nlp = en_core_sci_lg.load()

def extract_summary(text, limit):
    """Build an extractive summary from the highest-scoring sentences of *text*.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline.
    Tokens tagged PROPN, ADJ, NOUN or VERB that are neither stop words nor
    punctuation count as keywords. Each keyword is weighted by its frequency
    relative to the most frequent keyword, and a sentence's score is the sum
    of the weights of the keyword tokens it contains. Sentences without any
    keyword are never selected.

    Args:
        text: Document to summarise.
        limit: Maximum number of sentences to return (an integer).

    Returns:
        Up to ``limit`` sentences ordered by descending score (ties keep
        document order), each passed through ``str.capitalize`` and joined
        with single spaces. Returns an empty string when ``limit`` is not
        positive or when the text contains no keywords.
    """
    keyword_tags = {'PROPN', 'ADJ', 'NOUN', 'VERB'}
    stop_words = nlp.Defaults.stop_words
    doc = nlp(text.lower())

    keyword_counts = Counter(
        token.text
        for token in doc
        if token.text not in stop_words
        and token.text not in punctuation
        and token.pos_ in keyword_tags
    )
    # Without keywords there is nothing to rank; the previous implementation
    # raised IndexError on ``most_common(1)[0]`` here.
    if not keyword_counts or limit <= 0:
        return ''

    max_count = max(keyword_counts.values())
    weights = {word: count / max_count
               for word, count in keyword_counts.items()}

    # Collect (sentence, score) pairs in document order so that the stable
    # sort below resolves ties in favour of the earlier sentence.
    scored_sentences = []
    for sent in doc.sents:
        score = 0.0
        has_keyword = False
        for token in sent:
            weight = weights.get(token.text)
            if weight is None:
                continue
            # Plain left-to-right accumulation keeps scores bit-identical to
            # the earlier implementation.
            score += weight
            has_keyword = True
        if has_keyword:
            scored_sentences.append((sent, score))

    ranked = sorted(scored_sentences, key=lambda pair: pair[1], reverse=True)
    return ' '.join(str(sent).capitalize() for sent, _ in ranked[:limit])

In [5]:
# Maximum number of sentences kept in each extractive summary.
num_sentences = 4

# One extractive summary per protocol text, in the same order as the input.
extractive_summaries = [extract_summary(t, num_sentences)
                        for t in protocols_no_abstract]
# Show the first summary as the cell output.
extractive_summaries[0]

'Grow cells for additional 48 h (or the time required if different cells are used). Wash the cells twice with 1x pbs, then fix the cells with 3.7% paraformaldehye for 30 min. Seed cells into 24-well tissue culture plate at a density that after 24 h of growth, they should reach ~70-80% confluence as a monolayer. After scratching, gently wash the well twice with medium to remove the detached cells.'

## Abstractive summary

In [6]:
# Model inputs: the 'full_text_no_abstract_cleaned' column.
x = df['full_text_no_abstract_cleaned'].values
# Reference summaries used for evaluation: the 'abstract' column.
y_true = df['abstract'].values

In [7]:
import torch

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

def trim_batch(input_ids, pad_token_id, attention_mask=None):
    """Drop every column of ``input_ids`` that holds only ``pad_token_id``.

    Returns the trimmed ``input_ids`` alone when no ``attention_mask`` is
    given; otherwise returns a ``(input_ids, attention_mask)`` tuple with the
    same columns removed from both tensors.
    """
    has_real_token = (input_ids != pad_token_id).any(dim=0)
    trimmed_ids = input_ids[:, has_real_token]
    if attention_mask is not None:
        return (trimmed_ids, attention_mask[:, has_real_token])
    return trimmed_ids

def get_model_predictions(model, tokenizer, x):
    """Return one ``_predict`` result per document in ``x``, in input order."""
    predictions = []
    for doc in x:
        predictions.append(_predict(model, tokenizer, doc))
    return predictions

def _predict(model, tokenizer, doc):
    """Generate a summary string for a single document.

    The document is tokenized with truncation and max-length padding, moved
    to ``DEFAULT_DEVICE``, stripped of all-padding columns via ``trim_batch``,
    and passed to ``model.generate``. The first decoded sequence is returned.
    """
    encoded = tokenizer(
        doc,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
    )
    encoded = encoded.to(DEFAULT_DEVICE)

    # The encoding is unpacked as keyword arguments for trim_batch.
    trimmed_ids, trimmed_mask = trim_batch(
        **encoded, pad_token_id=tokenizer.pad_token_id
    )

    generated = model.generate(
        input_ids=trimmed_ids,
        attention_mask=trimmed_mask,
        decoder_start_token_id=None,
    )
    decoded = tokenizer.batch_decode(
        generated,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]


INFO:transformers.file_utils:PyTorch version 1.6.0+cpu available.


In [None]:
# Directory that holds the locally stored summarization models.
base_model_dir = os.path.join(DATA_DIR, 'text_summarization_models')
models_names = [
    'facebook/bart-large-cnn',
    'distillbart_cnn_protocols',
    'distillbart_xsum_protocols',
]

# Maps each model name to its list of generated summaries for `x`.
model_results = {}
for name in models_names:
    if 'distillbart' in name:
        # These models are loaded from <base_model_dir>/<name>/best_tfmr.
        model_path = os.path.join(base_model_dir, name, 'best_tfmr')
    else:
        # Any other name is handed to from_pretrained unchanged.
        model_path = name
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEFAULT_DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model_results[name] = get_model_predictions(model, tokenizer, x)

# Include the extractive summaries alongside the model outputs.
model_results['tfidf_extractive'] = extractive_summaries

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json from cache at C:\Users\alex/.cache\torch\transformers\5f0de1d2bbb8eb1a3b69656622293b3328b06b701663a9d4109359751cb4e739.5e72c6158467741b29afbcad014cd97414f17a191d39253eef90d7bfe969cc1f
INFO:transformers.configuration_utils:Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "id2label":

In [None]:
import random

# random.randint() requires both bounds, so the earlier single-argument call
# raised TypeError. randrange(n) draws from [0, n); bounding n by the shortest
# result list keeps the index valid for every model.
num_comparable = min(len(results) for results in model_results.values())
result_sample_idx = random.randrange(num_comparable)

# Print the same randomly chosen document's summary for every model.
for model_name, results in model_results.items():
    print(model_name)
    print(results[result_sample_idx])
    print('\n')

## Evaluation

In [None]:
import numpy as np

from rouge_score import rouge_scorer

def _compute_mean(scores):
    precision = [score.precision for score in scores]
    recall = [score.recall for score in scores]
    fmeasure = [score.fmeasure for score in scores]
    return {
        'precision': np.mean(precision),
        'recall': np.mean(recall),
        'fmeasure': np.mean(fmeasure)
    }

def compute_rouge_scores(results, y):
    """Compute mean ROUGE-1 and ROUGE-L scores of predictions against references.

    Args:
        results: Iterable of predicted summaries.
        y: Iterable of reference summaries, aligned with ``results``.

    Returns:
        A dict with keys 'rouge1' and 'rougeL'; each value is the dict built
        by ``_compute_mean`` (mean precision, recall and fmeasure).
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rougel_scores = []

    for prediction, reference in zip(results, y):
        # RougeScorer.score expects (target, prediction). Passing them the
        # other way round swaps the reported precision and recall.
        pair_scores = scorer.score(reference, prediction)
        rouge1_scores.append(pair_scores['rouge1'])
        rougel_scores.append(pair_scores['rougeL'])

    return {
        'rouge1': _compute_mean(rouge1_scores),
        'rougeL': _compute_mean(rougel_scores)
    }


In [None]:
# ROUGE scores per model, keyed by model name.
model_scores = {}
for model_name, predictions in model_results.items():
    # The earlier empty subscripts (`predictions[]`, `y_true[]`) were a
    # SyntaxError; score every prediction against its reference abstract.
    model_scores[model_name] = compute_rouge_scores(predictions, y_true)

In [None]:
# Print each model's name and score dict, followed by blank lines.
for model_name, scores in model_scores.items():
    print(model_name, scores, '\n', sep='\n')

## Saving the results

### Predictions

In [None]:
# Keep the identifying columns and attach one column of summaries per entry
# of model_results, in the dict's insertion order.
results_df = df[['pr_id', 'title']].assign(**model_results)
results_df.head()

In [None]:
# Destination CSV for the per-document summaries of every model.
PREDICTIONS_OUTPUT_PATH = os.path.join(NOTEBOOK_8_RESULTS_DIR, 'predictions.csv')
# index=False leaves the dataframe index out of the file.
results_df.to_csv(PREDICTIONS_OUTPUT_PATH, index=False)

### Rouge scores

In [None]:
# (output column, ROUGE variant, statistic) for every score column, in the
# order the columns appear in the table.
_score_columns = (
    ('rouge1_prec', 'rouge1', 'precision'),
    ('rouge1_recall', 'rouge1', 'recall'),
    ('rouge1_fmeasure', 'rouge1', 'fmeasure'),
    ('rougeL_prec', 'rougeL', 'precision'),
    ('rougeL_recall', 'rougeL', 'recall'),
    ('rougeL_fmeasure', 'rougeL', 'fmeasure'),
)

# One row per model: its name followed by the six mean ROUGE statistics.
scores_df = pd.DataFrame({
    'model': list(model_scores),
    **{
        column: [entry[variant][stat] for entry in model_scores.values()]
        for column, variant, stat in _score_columns
    },
})
scores_df

In [None]:
# Destination CSV for the per-model ROUGE score table.
SCORES_OUTPUT_PATH = os.path.join(NOTEBOOK_8_RESULTS_DIR, 'scores.csv')
# Write to the scores path. The earlier call targeted PREDICTIONS_OUTPUT_PATH,
# which overwrote the predictions file and never created scores.csv.
scores_df.to_csv(SCORES_OUTPUT_PATH, index=False)