<a href="https://colab.research.google.com/github/ngockhanh5110/nlp-vietnamese-text-summarization/blob/main/notebooks/inferencing_vietnews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount Google Drive**

In [None]:
# Mount Google Drive at /content/drive; every path used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Install libraries**

In [None]:
# Install the core dependencies. `datasets` and `dill` are pinned to exact versions.
# NOTE(review): `vncorenlp` and `transformers` are unpinned, so the installed versions depend on
# when this cell is run — consider pinning them to the versions the checkpoint was trained with.
!pip install vncorenlp datasets==1.0.2
!pip install transformers
!pip install dill==0.3.5.1





In [None]:
# Extra dependencies: git-python (pinned) and rouge_score (unpinned).
# NOTE(review): rouge_score is presumably the backend of the "rouge" metric loaded later — confirm.
!pip install git-python==1.0.3
# !pip install sacrebleu==1.4.12
!pip install rouge_score



In [None]:
import os
# Make the project folder on Drive the working directory.
# NOTE(review): the later `from seq2seq_trainer import Seq2SeqTrainer` presumably resolves
# against this directory — confirm that seq2seq_trainer.py lives here.
new_path = "/content/drive/MyDrive/LLM_TEXT/"
os.chdir(new_path)
# Read the working directory back and print it to confirm the change took effect.
current_path = os.getcwd()
print("current path:", current_path)

current path: /content/drive/MyDrive/LLM_TEXT


In [None]:
import glob
import pandas as pd
import concurrent.futures
from datasets import *
import datasets
import transformers

from seq2seq_trainer import Seq2SeqTrainer
from transformers import TrainingArguments
from dataclasses import dataclass, field
from typing import Optional

# **Processing data**

In [None]:
def listPaths(path, limit=20):
  """
  Input: A glob pattern (e.g. '/some/dir/*') and an optional cap on the number of results
  Output: A list holding at most `limit` of the paths that match the pattern

  `limit` defaults to 20, which is the cap this function always applied.
  Pass `limit=None` to return every match; a zero or negative limit returns
  an empty list. Paths are returned in the order `glob.glob` yields them.
  """
  matches = glob.glob(path)
  if limit is None:
    return matches
  # Clamp at 0 so a negative limit cannot turn into a "drop the last N" slice.
  return matches[:max(limit, 0)]

# Gather the tokenized article files of each split; listPaths caps the size of every list.
train_paths, val_paths, test_paths = map(listPaths, (
    '/content/drive/MyDrive/LLM_TEXT/dataset/train_tokenized/*',
    '/content/drive/MyDrive/LLM_TEXT/dataset/val_tokenized/*',
    '/content/drive/MyDrive/LLM_TEXT/dataset/test_tokenized/*',
))

In [None]:
def read_content(pathfile):
  """
  Input: Path of txt file
  Output: A dictionary has keys 'file', 'original' and 'summary'

  'summary' is the third line of the file (index 2) and 'original' is everything
  from the fifth line (index 4) onwards. In both, each newline is replaced by a
  single space, so a trailing space is kept where the text ended with a newline.

  Raises IndexError when the file has fewer than three lines.
  """
  # The articles are Vietnamese text, so decode explicitly as UTF-8 instead of
  # relying on the locale's default encoding.
  with open(pathfile, encoding='utf-8') as f:
    rows = f.readlines()

  original = ''.join(rows[4:]).replace('\n', ' ')
  summary = rows[2].replace('\n', ' ')

  return {'file' : pathfile,
          'original': original,
          'summary': summary}

In [None]:
def get_dataframe(pathfiles, random_state=None):
  """
  Input: An iterable of txt file paths, and an optional seed for the shuffle
  Output: A shuffled DataFrame built from the dictionaries returned by read_content

  Every path is parsed with read_content in a pool of worker processes. Rows
  containing missing values are dropped, the remaining rows are shuffled and the
  index is reset to 0..n-1.

  `random_state` is forwarded to `DataFrame.sample`. The default (None) keeps the
  unseeded shuffle; pass an int to make the row order reproducible.
  """
  # Collect the results inside the `with` block, while the pool is still alive,
  # rather than iterating the lazy map object after the executor has shut down.
  with concurrent.futures.ProcessPoolExecutor() as executor:
    records = list(executor.map(read_content, pathfiles))

  data_df = pd.DataFrame(records).dropna()
  return data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
# Parse the test-split files collected in `test_paths` into a shuffled DataFrame.
test_df = get_dataframe(test_paths)

In [None]:
from transformers import RobertaTokenizerFast,AutoTokenizer


In [None]:
# Convert the DataFrame into a `Dataset` (from the `datasets` star-import) so the evaluation cell can call `.map` on it.
test_data =  Dataset.from_pandas(test_df)

In [None]:
# Load the ROUGE metric once; `compute_metrics` and the final scoring cell both read it as a module-level global.
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Score generated summaries against their references with ROUGE-2.

    Reads the module-level `tokenizer` and `rouge` objects. `pred` must expose
    `label_ids` and `predictions`.

    Note: label positions equal to -100 are overwritten *in place* with the
    tokenizer's pad token id before decoding, so `pred.label_ids` is modified.

    Returns a dict with the mid aggregate ROUGE-2 precision, recall and
    F-measure, each rounded to 4 decimal places.
    """
    reference_ids, generated_ids = pred.label_ids, pred.predictions

    # Decode the predictions, dropping special tokens.
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Swap the -100 entries for the pad id so the labels can be decoded too.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id
    decoded_refs = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    scores = rouge.compute(
        predictions=decoded_preds,
        references=decoded_refs,
        rouge_types=["rouge2"],
    )
    rouge2_mid = scores["rouge2"].mid

    precision = round(rouge2_mid.precision, 4)
    recall = round(rouge2_mid.recall, 4)
    fmeasure = round(rouge2_mid.fmeasure, 4)
    return {
        "rouge2_precision": precision,
        "rouge2_recall": recall,
        "rouge2_fmeasure": fmeasure,
    }

### **Evaluation**


In [None]:
import datasets
from transformers import RobertaTokenizer, EncoderDecoderModel, AutoTokenizer
from sklearn.model_selection import train_test_split


# PhoBERT tokenizer, loaded with the slow (non-fast) implementation.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

# Encoder-decoder checkpoint saved on Drive; inference in this cell runs on CPU.
model = EncoderDecoderModel.from_pretrained('/content/drive/MyDrive/LLM_TEXT/training/checkpoint-30000')
model.to("cpu")

# `test_data` was built in an earlier cell from the local test split.

# Number of articles handed to generate_summary per `map` call.
batch_size = 32  # change to 64 for full evaluation

# Batched map function: adds a generated summary for every article in the batch.
def generate_summary(batch):
    """Generate summaries for one batch and store them under the "pred" key.

    Reads the module-level `tokenizer` and `model`. `batch["original"]` holds the
    article texts; the same `batch` object is returned with "pred" added.
    """
    # Pad or truncate every article to exactly 256 tokens; the tokenizer adds
    # its own special tokens around the text.
    encoded = tokenizer(
        batch["original"],
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )

    generated_ids = model.generate(
        encoded.input_ids.to("cpu"),
        attention_mask=encoded.attention_mask.to("cpu"),
    )

    # Special tokens are stripped from the decoded text.
    batch["pred"] = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return batch

# Generate a summary for every test article; the raw article text is dropped from the result.
results = test_data.map(
    generate_summary,
    batched=True,
    batch_size=batch_size,
    remove_columns=["original"],
)

pred_str, label_str = results["pred"], results["summary"]

# Show the first example: the reference summary (once read from test_data, once
# from results) and the generated summary.
print("test_data: ", test_data[0]["summary"])
print("pred: ", pred_str[0])
print("summary: ", label_str[0])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']


  0%|          | 0/1 [00:00<?, ?ba/s]

test_data:  Thống_nhất với đề_nghị của Chính_phủ , Uỷ_ban Các vấn_đề xã_hội đề_nghị cho_phép người lao_động nếu có nguyện_vọng thì được nhận lại số tiền đã tham_gia bảo_hiểm_xã_hội . 
pred:  Theo vừa được Quốc_hội thông_qua, người lao_động khi chưa đủ 60 tuổi có_thể lựa_chọn nhận bảo_hiểm_xã_hội một lần hoặc tiếp_tục sử_dụng thời_gian đóng bảo_hiểm_xã_hội để hưởng lương hưu.
summary:  Thống_nhất với đề_nghị của Chính_phủ , Uỷ_ban Các vấn_đề xã_hội đề_nghị cho_phép người lao_động nếu có nguyện_vọng thì được nhận lại số tiền đã tham_gia bảo_hiểm_xã_hội . 


In [None]:
# Compute ROUGE-1, ROUGE-2 and ROUGE-L over all generated/reference summary pairs.
rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge1","rouge2","rougeL"])

In [None]:
# Print each metric name followed, on the next line, by its mid aggregate score.
for metric_name, aggregate in rouge_output.items():
  print(metric_name, aggregate.mid, sep="\n")

rouge1
Score(precision=0.6313042451553816, recall=0.5803276901865263, fmeasure=0.600764769512887)
rouge2
Score(precision=0.25755639803296626, recall=0.23811261357096036, fmeasure=0.24565291107588214)
rougeL
Score(precision=0.3798055090604461, recall=0.3471518136851943, fmeasure=0.3603825762213886)
