# Deep learning using labelled data from GA-ExtractiveOracle for long text summarization

## Start of [5']: Build model and train

In [1]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Enable synchronous CUDA execution for debugging

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm.notebook import tqdm  # For Jupyter-compatible progress bar
import logging
from rouge_score import rouge_scorer
rougescorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f'Using device: {device}')

# Build vocabulary from training data
def build_vocab(csv_file, min_freq=2):
    try:
        data = pd.read_csv(csv_file, delimiter=';;;;;;', engine='python', encoding='iso-8859-1')
    except UnicodeDecodeError:
        logger.warning("ISO-8859-1 encoding failed, trying latin1")
        data = pd.read_csv(csv_file, delimiter=';;;;;;', engine='python', encoding='latin1')
    texts = data['content'].astype(str).tolist()
    all_words = []
    for text in texts:
        words = word_tokenize(text.lower())
        all_words.extend(words)
    word_counts = Counter(all_words)
    vocab = {word: idx + 2 for idx, (word, count) in enumerate(word_counts.items()) if count >= min_freq}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

# Custom Dataset
class SummaryDataset(Dataset):
    def __init__(self, csv_file, vocab, max_length=40, max_sentences=7000):
        try:
            self.data = pd.read_csv(csv_file, delimiter=';;;;;;', engine='python', encoding='iso-8859-1')
        except UnicodeDecodeError:
            logger.warning("ISO-8859-1 encoding failed, trying latin1")
            self.data = pd.read_csv(csv_file, delimiter=';;;;;;', engine='python', encoding='latin1')
        
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.max_length = max_length
        self.max_sentences = max_sentences
        self.chapters = []
        self.chapter_data = []
        
        # Group data by chapter
        grouped = self.data.groupby('chapter')
        for chapter_name, group in grouped:
            # Sort by sentence ID to ensure correct order
            group = group.sort_values('sentence', ascending=True)
            texts = []
            labels = []
            sentence_ids = []
            tokenized_indices = []
            
            for idx, row in group.iterrows():
                text = str(row['content'])
                label = row['in_summary']
                sentence_id = row['sentence']
                try:
                    label_int = int(label)
                    if label_int not in [0, 1]:
                        logger.warning(f"Skipping row {idx}: Invalid label value {label}")
                        continue
                    sentence_id_int = int(sentence_id)
                except (ValueError, TypeError):
                    logger.warning(f"Skipping row {idx}: Invalid label {label} or sentence ID {sentence_id}")
                    continue
                
                # Pre-tokenize
                words = word_tokenize(text.lower())
                indices = [self.vocab.get(word, self.vocab['<UNK>']) for word in words]
                if len(indices) > max_length:
                    indices = indices[:max_length]
                else:
                    indices += [self.vocab['<PAD>']] * (max_length - len(indices))
                
                # Validate indices
                indices = [max(0, min(idx, self.vocab_size - 1)) for idx in indices]
                if any(idx < 0 or idx >= self.vocab_size for idx in indices):
                    logger.warning(f"Invalid indices in text at index {idx}: {indices}")
                    indices = [self.vocab['<PAD>']] * max_length
                
                texts.append(text)
                labels.append(label_int)
                sentence_ids.append(sentence_id_int)
                tokenized_indices.append(indices)
            
            if texts:
                # Truncate at initialization to avoid oversized chapters
                if len(tokenized_indices) > max_sentences:
                    logger.warning(f"Chapter {chapter_name} has {len(tokenized_indices)} sentences, truncating to {max_sentences}")
                    tokenized_indices = tokenized_indices[:max_sentences]
                    labels = labels[:max_sentences]
                    texts = texts[:max_sentences]
                    sentence_ids = sentence_ids[:max_sentences]
                self.chapters.append(chapter_name)
                self.chapter_data.append({
                    'texts': texts,
                    'labels': labels,
                    'sentence_ids': sentence_ids,
                    'tokenized_indices': tokenized_indices
                })
        
        if not self.chapters:
            raise ValueError("No valid chapters found after cleaning")

    def __len__(self):
        return len(self.chapters)

    def __getitem__(self, idx):
        chapter = self.chapter_data[idx]
        tokenized_indices = chapter['tokenized_indices']
        labels = chapter['labels']
        num_sentences = len(tokenized_indices)  # Number of sentences after init truncation
        
        # Convert to tensors without padding to max_sentences
        input_ids = torch.tensor(tokenized_indices, dtype=torch.long)  # Shape: [num_sentences, max_length]
        labels = torch.tensor(labels, dtype=torch.long)               # Shape: [num_sentences]
        
        # Verify tensor sizes
        assert num_sentences <= self.max_sentences, \
            f"num_sentences {num_sentences} > max_sentences {self.max_sentences} for chapter {self.chapters[idx]}"
        assert input_ids.size() == (num_sentences, self.max_length), \
            f"input_ids size {input_ids.size()} != [{num_sentences}, {self.max_length}] for chapter {self.chapters[idx]}"
        assert labels.size() == (num_sentences,), \
            f"labels size {labels.size()} != [{num_sentences}] for chapter {self.chapters[idx]}"
        
        logger.debug(f"Chapter {self.chapters[idx]}: {num_sentences} sentences, input_ids shape {input_ids.size()}")
        
        return {
            'input_ids': input_ids,  # Shape: [num_sentences, max_length]
            'labels': labels,        # Shape: [num_sentences]
            'num_sentences': num_sentences  # Actual number of sentences
        }

# Define the LSTM model
class SummaryLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=64, num_layers=1):
        super(SummaryLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x, num_sentences):
        # x shape: [batch_size, max_sentences, max_length]
        batch_size = x.size(0)
        outputs = []
        
        for i in range(batch_size):
            # Process sentences for one chapter
            chapter_input = x[i, :num_sentences[i]]  # Shape: [num_sentences, max_length]
            embedded = self.embedding(chapter_input)  # Shape: [num_sentences, max_length, embedding_dim]
            # Average embeddings over words to get sentence embeddings
            sentence_embeds = embedded.mean(dim=1)    # Shape: [num_sentences, embedding_dim]
            lstm_out, (hidden, _) = self.lstm(sentence_embeds.unsqueeze(0))  # Shape: [1, num_sentences, hidden_dim*2]
            lstm_out = lstm_out.squeeze(0)  # Shape: [num_sentences, hidden_dim*2]
            lstm_out = self.dropout(lstm_out)
            output = self.fc(lstm_out)  # Shape: [num_sentences, 2]
            # Pad output to max_sentences
            if num_sentences[i] < x.size(1):
                pad_size = x.size(1) - num_sentences[i]
                output = torch.cat([
                    output,
                    torch.zeros(pad_size, 2, device=output.device)
                ], dim=0)
            outputs.append(output)
        
        return torch.stack(outputs)  # Shape: [batch_size, max_sentences, 2]

# Build vocabulary
vocab = build_vocab('Dataset/train_GAlabelled.csv')
vocab_size = len(vocab)

# Initialize model
model = SummaryLSTM(vocab_size=vocab_size, embedding_dim=100, hidden_dim=64, num_layers=1)
model.to(device)

# Load datasets
train_dataset = SummaryDataset('Dataset/train_GAlabelled.csv', vocab, max_length=40, max_sentences=7000)
val_dataset = SummaryDataset('Dataset/val_GAlabelled.csv', vocab, max_length=40, max_sentences=7000)

# Custom collate function to handle variable-length chapters
def custom_collate_fn(batch):
    max_sentences = 7000  # Same as max_sentences in SummaryDataset
    max_length = 40     # Same as max_length in SummaryDataset

    # Find the maximum number of sentences in this batch (capped at max_sentences)
    batch_max_sentences = min(max(item['num_sentences'] for item in batch), max_sentences)
    logger.debug(f"Batch max sentences: {batch_max_sentences}")

    input_ids_list = []
    labels_list = []
    num_sentences_list = []

    for item in batch:
        input_ids = item['input_ids']  # Shape: [num_sentences, max_length]
        labels = item['labels']        # Shape: [num_sentences]
        num_sentences = item['num_sentences']

        # Truncate if necessary
        if num_sentences > batch_max_sentences:
            logger.debug(f"Truncating chapter from {num_sentences} to {batch_max_sentences} sentences")
            input_ids = input_ids[:batch_max_sentences]
            labels = labels[:batch_max_sentences]
            num_sentences = batch_max_sentences
        # Pad if necessary
        elif num_sentences < batch_max_sentences:
            pad_size = batch_max_sentences - num_sentences
            logger.debug(f"Padding chapter from {num_sentences} to {batch_max_sentences} sentences")
            input_ids = torch.cat([
                input_ids,
                torch.zeros(pad_size, max_length, dtype=torch.long)
            ], dim=0)
            labels = torch.cat([
                labels,
                torch.full((pad_size,), -1, dtype=torch.long)
            ], dim=0)

        # Verify tensor size before stacking
        assert input_ids.size() == (batch_max_sentences, max_length), \
            f"input_ids size {input_ids.size()} != [{batch_max_sentences}, {max_length}]"
        assert labels.size() == (batch_max_sentences,), \
            f"labels size {labels.size()} != [{batch_max_sentences}]"

        input_ids_list.append(input_ids)
        labels_list.append(labels)
        num_sentences_list.append(num_sentences)

    # Stack into batch tensors
    input_ids_batch = torch.stack(input_ids_list)  # Shape: [batch_size, batch_max_sentences, max_length]
    labels_batch = torch.stack(labels_list)        # Shape: [batch_size, batch_max_sentences]
    num_sentences_batch = torch.tensor(num_sentences_list, dtype=torch.long)

    logger.debug(f"Batch shapes: input_ids {input_ids_batch.size()}, labels {labels_batch.size()}")

    return {
        'input_ids': input_ids_batch,
        'labels': labels_batch,
        'num_sentences': num_sentences_batch
    }

# DataLoaders with custom collate function
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, collate_fn=custom_collate_fn)

# Optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=-1)  # Ignore padding labels

# Training function
def train_epoch(model, data_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    tp, fp, tn, fn = 0, 0, 0, 0  # Confusion matrix counts

    for batch in tqdm(data_loader, desc="Training Batches"):
        input_ids = batch['input_ids'].to(device)  # Shape: [batch_size, max_sentences, max_length]
        labels = batch['labels'].to(device)        # Shape: [batch_size, max_sentences]
        num_sentences = batch['num_sentences'].to(device)

        outputs = model(input_ids, num_sentences)   # Shape: [batch_size, max_sentences, 2]
        # Reshape for loss computation
        outputs = outputs.view(-1, 2)              # Shape: [batch_size * max_sentences, 2]
        labels = labels.view(-1)                   # Shape: [batch_size * max_sentences]
        loss = criterion(outputs, labels)

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Compute batch metrics
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        labels_np = labels.cpu().numpy()
        valid_mask = labels_np != -1  # Exclude padding
        batch_preds = preds[valid_mask]
        batch_labels = labels_np[valid_mask]

        # Update confusion matrix counts
        for pred, label in zip(batch_preds, batch_labels):
            if pred == 1 and label == 1:
                tp += 1
            elif pred == 1 and label == 0:
                fp += 1
            elif pred == 0 and label == 0:
                tn += 1
            elif pred == 0 and label == 1:
                fn += 1

    avg_loss = total_loss / len(data_loader)
    # Compute metrics
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return avg_loss, precision, recall, f1

# Validation function
def eval_model(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    tp, fp, tn, fn = 0, 0, 0, 0  # Confusion matrix counts

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            num_sentences = batch['num_sentences'].to(device)

            outputs = model(input_ids, num_sentences)
            outputs = outputs.view(-1, 2)
            labels = labels.view(-1)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels_np = labels.cpu().numpy()
            valid_mask = labels_np != -1
            batch_preds = preds[valid_mask]
            batch_labels = labels_np[valid_mask]

            # Update confusion matrix counts
            for pred, label in zip(batch_preds, batch_labels):
                if pred == 1 and label == 1:
                    tp += 1
                elif pred == 1 and label == 0:
                    fp += 1
                elif pred == 0 and label == 0:
                    tn += 1
                elif pred == 0 and label == 1:
                    fn += 1

    avg_loss = total_loss / len(data_loader)
    # Compute metrics
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return avg_loss, precision, recall, f1

# Training loop with early stopping and epoch printing
patience = 21
best_val_loss = float('inf')
patience_counter = 0
max_epochs = 100
epoch = 0

while epoch < max_epochs:
    epoch += 1
    print(f'\nStarting Epoch {epoch}/{max_epochs}')
    train_loss, train_precision, train_recall, train_f1 = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f'Epoch {epoch} - Train Loss: {train_loss:.3f}, Precision: {train_precision:.3f}, Recall: {train_recall:.3f}, F1: {train_f1:.3f}')
    logger.info(f'Epoch {epoch}/{max_epochs}')
    logger.info(f'Train Loss: {train_loss:.3f}, Precision: {train_precision:.3f}, Recall: {train_recall:.3f}, F1: {train_f1:.3f}')

    val_loss, val_precision, val_recall, val_f1 = eval_model(model, val_loader, criterion, device)
    print(f'Epoch {epoch} - Val Loss: {val_loss:.3f}, Precision: {val_precision:.3f}, Recall: {val_recall:.3f}, F1: {val_f1:.3f}')
    logger.info(f'Val Loss: {val_loss:.3f}, Precision: {val_precision:.3f}, Recall: {val_recall:.3f}, F1: {val_f1:.3f}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pt')
        print('Model saved with best validation loss!')
        logger.info('Model saved!')
        patience_counter = 0
    else:
        patience_counter += 1
        print(f'Patience counter: {patience_counter}/{patience}')
        logger.info(f'Patience counter: {patience_counter}/{patience}')
        if patience_counter >= patience:
            print('Early stopping triggered!')
            logger.info('Early stopping triggered!')
            break

[nltk_data] Downloading package punkt to C:\Users\Viet-Dung
[nltk_data]     Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Viet-Dung
[nltk_data]     Nguyen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



Starting Epoch 1/100


Training Batches:   0%|          | 0/1200 [00:00<?, ?it/s]

Epoch 1 - Train Loss: 0.319, Precision: 0.505, Recall: 0.002, F1: 0.003
Epoch 1 - Val Loss: 0.330, Precision: 0.497, Recall: 0.002, F1: 0.005
Model saved with best validation loss!

Starting Epoch 2/100


Training Batches:   0%|          | 0/1200 [00:00<?, ?it/s]

Epoch 2 - Train Loss: 0.298, Precision: 0.547, Recall: 0.019, F1: 0.036
Epoch 2 - Val Loss: 0.327, Precision: 0.544, Recall: 0.012, F1: 0.024
Model saved with best validation loss!

Starting Epoch 3/100


Training Batches:   0%|          | 0/1200 [00:00<?, ?it/s]

Epoch 3 - Train Loss: 0.289, Precision: 0.561, Recall: 0.042, F1: 0.079
Epoch 3 - Val Loss: 0.333, Precision: 0.562, Recall: 0.013, F1: 0.025
Patience counter: 1/21

Starting Epoch 4/100


Training Batches:   0%|          | 0/1200 [00:00<?, ?it/s]

Epoch 4 - Train Loss: 0.283, Precision: 0.576, Recall: 0.061, F1: 0.110
Epoch 4 - Val Loss: 0.334, Precision: 0.548, Recall: 0.015, F1: 0.028
Patience counter: 2/21

Starting Epoch 5/100


Training Batches:   0%|          | 0/1200 [00:00<?, ?it/s]

Epoch 5 - Train Loss: 0.279, Precision: 0.586, Recall: 0.072, F1: 0.128
Epoch 5 - Val Loss: 0.335, Precision: 0.533, Recall: 0.022, F1: 0.042
Patience counter: 3/21

Starting Epoch 6/100


Training Batches:   0%|          | 0/1200 [00:00<?, ?it/s]

KeyboardInterrupt: 

## End of [5']: Build model and train

## Start of [6']: Run test & evaluate

In [2]:
test = pd.read_csv("Dataset/test.csv")

In [3]:
test_chapters = test['chapter'].tolist()
test_chapters_paragraphs = [test_chapters[i].split("\n\n") for i in range(len(test_chapters))]
for i in range(len(test_chapters_paragraphs)):
    test_chapters_paragraphs[i] = [test_chapters_paragraphs[i][j].replace("\n", " ") for j in range(len(test_chapters_paragraphs[i]))]
test_chapters = [test_chapters[i].replace("\n", " ") for i in range(len(test_chapters))]

In [4]:
test_chapters_sentences = [sent_tokenize(test_chapters[i]) for i in range(len(test_chapters))]
test_chapters_paragraphs_sentences = []
for i in range(len(test_chapters_paragraphs)): # chapter i
    chapter_list = []
    for j in range(len(test_chapters_paragraphs[i])): # paragraph j
        chapter_list.append(sent_tokenize(test_chapters_paragraphs[i][j]))
    test_chapters_paragraphs_sentences.append(chapter_list)

In [5]:
test_reference_summaries = test['summary_text'].tolist()
test_reference_summaries = [test_reference_summaries[i].replace("\n", " ") for i in range(len(test_reference_summaries))]

In [6]:
# Test summary generation
test_generated_summaries = []

In [7]:
# Load the best model
model.load_state_dict(torch.load('best_model.pt'))
model.eval()

# Function to generate summary for a chapter
def generate_summary(chapter_sentences, vocab, model, device, max_length=40, max_sentences=7000, target_ratio=0.15):
    model.eval()
    scores = []
    # Calculate total word count of the chapter
    chapter_words = sum(len(word_tokenize(sentence)) for sentence in chapter_sentences)
    target_word_count = int(chapter_words * target_ratio)
    
    # Prepare input for the model
    tokenized_indices = []
    for sentence in chapter_sentences:
        words = word_tokenize(sentence.lower())
        indices = [vocab.get(word, vocab['<UNK>']) for word in words]
        if len(indices) > max_length:
            indices = indices[:max_length]
        else:
            indices += [vocab['<PAD>']] * (max_length - len(indices))
        indices = [max(0, min(idx, len(vocab) - 1)) for idx in indices]
        tokenized_indices.append(indices)
    
    # Pad or truncate to max_sentences
    num_sentences = len(tokenized_indices)
    if num_sentences > max_sentences:
        tokenized_indices = tokenized_indices[:max_sentences]
        chapter_sentences = chapter_sentences[:max_sentences]
        num_sentences = max_sentences
    elif num_sentences < max_sentences:
        tokenized_indices += [[vocab['<PAD>']] * max_length] * (max_sentences - num_sentences)
    
    input_ids = torch.tensor([tokenized_indices], dtype=torch.long).to(device)  # Shape: [1, max_sentences, max_length]
    
    with torch.no_grad():
        outputs = model(input_ids, torch.tensor([num_sentences], device=device))  # Shape: [1, max_sentences, 2]
        scores = torch.softmax(outputs[0], dim=1)[:, 1].cpu().numpy()  # Probability of class 1 (in_summary)
    
    # Pair sentences with scores and original indices
    sentence_scores = [(sentence, score, len(word_tokenize(sentence)), idx) 
                      for idx, (sentence, score) in enumerate(zip(chapter_sentences, scores[:num_sentences]))]
    
    # Sort by score to select top sentences
    sentence_scores.sort(key=lambda x: x[1], reverse=True)
    
    # Select sentences until target word count is reached
    selected = []
    current_word_count = 0
    for sentence, score, word_count, sentence_id in sentence_scores:
        if current_word_count + word_count <= target_word_count or len(selected) < 1:
            selected.append((sentence, sentence_id))
            current_word_count += word_count
        else:
            break
    
    # Sort selected sentences by sentence_id to maintain original order
    selected.sort(key=lambda x: x[1])
    selected_sentences = [sentence for sentence, _ in selected]
    
    return ' '.join(selected_sentences)

# Generate summaries for test set
test_generated_summaries = []
for chapter_sentences in test_chapters_sentences:
    summary = generate_summary(chapter_sentences, vocab, model, device, max_length=40, max_sentences=7000, target_ratio=0.15)
    test_generated_summaries.append(summary)

In [8]:
# Test ROUGE score
test_rouge1 = 0
test_rouge2 = 0
test_rougeL = 0
for i in range(len(test_generated_summaries)):
    print(i)
    scores = rougescorer.score(test_reference_summaries[i], test_generated_summaries[i])
    for key in scores:
        print("{}: {}".format(key, scores[key]))
        if key == "rouge1":
            test_rouge1 += scores[key][2] # take fmeasure value
        elif key == "rouge2":
            test_rouge2 += scores[key][2] # take fmeasure value
        else:
            test_rougeL += scores[key][2] # take fmeasure value
test_rouge1 /= len(test_generated_summaries)
test_rouge2 /= len(test_generated_summaries)
test_rougeL /= len(test_generated_summaries)
print("Test: rouge1 = {}, rouge2 = {}, rougeL = {}".format(test_rouge1, test_rouge2, test_rougeL))

0
rouge1: Score(precision=0.49233716475095785, recall=0.4312080536912752, fmeasure=0.4597495527728086)
rouge2: Score(precision=0.07869481765834933, recall=0.06890756302521009, fmeasure=0.07347670250896059)
rougeL: Score(precision=0.1685823754789272, recall=0.1476510067114094, fmeasure=0.15742397137745978)
1
rouge1: Score(precision=0.5592592592592592, recall=0.506145251396648, fmeasure=0.5313782991202345)
rouge2: Score(precision=0.11866501854140915, recall=0.10738255033557047, fmeasure=0.11274221961244862)
rougeL: Score(precision=0.19382716049382717, recall=0.17541899441340783, fmeasure=0.1841642228739003)
2
rouge1: Score(precision=0.5768261964735516, recall=0.4673469387755102, fmeasure=0.5163472378804961)
rouge2: Score(precision=0.12626262626262627, recall=0.10224948875255624, fmeasure=0.11299435028248588)
rougeL: Score(precision=0.20906801007556675, recall=0.16938775510204082, fmeasure=0.18714768883878238)
3
rouge1: Score(precision=0.4915254237288136, recall=0.13063063063063063, fmeas

## End of [6']: Run test & evaluate