In [None]:
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""


import json
import os
import warnings
from functools import lru_cache
from typing import Optional, Tuple

import regex as re

!pip install transformers
from transformers.tokenization_utils import (
    PreTrainedTokenizer,
    AddedToken)


#from tokenization_utils import (
 #     AddedToken, 
  #    PreTrainedTokenizer,
from transformers.tokenization_utils import logging

#!pip install logger
#import logger
logger = logging.get_logger(__name__)

# File names of the two artefacts that make up a saved tokenizer. Exposed on
# GPT2Tokenizer as `vocab_files_names` and used by `save_vocabulary` to build
# the output paths.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

# Download URLs of the vocabulary and merges files, keyed first by file kind
# and then by checkpoint name. Exposed on GPT2Tokenizer as
# `pretrained_vocab_files_map`.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json",
        "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json",
        "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json",
        "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json",
        "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json",
    },
    "merges_file": {
        "gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt",
        "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt",
        "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt",
        "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt",
        "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt",
    },
}

# Per-checkpoint size value, exposed on GPT2Tokenizer as
# `max_model_input_sizes`; every listed checkpoint uses 1024.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "gpt2": 1024,
    "gpt2-medium": 1024,
    "gpt2-large": 1024,
    "gpt2-xl": 1024,
    "distilgpt2": 1024,
}


@lru_cache()
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode-character table used by byte-level BPE.

    Every one of the 256 byte values is assigned exactly one character. Bytes in
    the three ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with
    the same code point. All remaining bytes (whitespace, control characters and
    the gaps between those ranges) are assigned, in increasing byte order, the
    code points 256, 257, ... so that no byte ever maps to a whitespace/control
    character the BPE code cannot handle.

    Working on such a table lets the BPE vocabulary cover arbitrary UTF-8 input
    without spending a significant share of the vocabulary on rare unicode
    characters and without needing an UNK token.

    Returns:
        dict: mapping from byte value (int, 0-255) to a single-character string.
        The result is memoised by ``lru_cache``, so callers share one dict.
    """
    kept_as_is = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    already_mapped = set(kept_as_is)

    byte_values = list(kept_as_is)
    code_points = list(kept_as_is)

    # Remaining bytes are shifted past the 8-bit range, in ascending order.
    next_code_point = 256
    for byte in range(256):
        if byte in already_mapped:
            continue
        byte_values.append(byte)
        code_points.append(next_code_point)
        next_code_point += 1

    return {byte: chr(code_point) for byte, code_point in zip(byte_values, code_points)}


def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    Args:
        word: sequence of symbols (e.g. a tuple of variable-length strings).

    Returns:
        set: every ``(word[i], word[i + 1])`` pair. An empty or single-symbol
        word has no adjacent pairs and yields an empty set (previously an empty
        word raised ``IndexError``).
    """
    # zip stops at the shorter input, so short words naturally produce no pairs.
    return set(zip(word, word[1:]))

class GPT2Tokenizer(PreTrainedTokenizer):
    """
    Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ::

        >>> from transformers import GPT2Tokenizer
        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        >>> tokenizer("Hello world")['input_ids']
        [15496, 995]
        >>> tokenizer(" Hello world")['input_ids']
        [18435, 995]

    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.

    .. note::

        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
        one).

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        merges_file (:obj:`str`):
            Path to the merges file.
        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
            The beginning of sequence token.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
            The end of sequence token.
        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (GPT2 tokenizer detect beginning of words by the preceding space).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # "input_ids" has to be listed, and listed first: transformers 4.x treats the
    # first entry as the key of the main model input when padding batches.
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs
    ):
        """Load the vocabulary and merge table, then initialise the base tokenizer."""
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token

        # All tokenizer state is set up BEFORE calling the base constructor:
        # newer transformers releases query the vocabulary from inside
        # PreTrainedTokenizer.__init__, and older releases do not care about the order.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        with open(merges_file, encoding="utf-8") as merges_handle:
            # The first line is the "#version" header. Blank lines are skipped
            # instead of blindly dropping the last line, so the final merge is
            # kept even when the file does not end with a newline.
            merge_lines = merges_handle.read().split("\n")[1:]
        bpe_merges = [tuple(line.split()) for line in merge_lines if line.strip()]
        # Lower rank == learned earlier == merged first.
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        self.add_prefix_space = add_prefix_space

        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Number of entries in the base vocabulary (added tokens excluded)."""
        return len(self.encoder)

    def get_vocab(self):
        """Return the base vocabulary merged with the added tokens as one token -> id dict."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        """
        Apply the learned BPE merges to a single pre-token.

        Args:
            token (str): pre-token already mapped to the byte-level unicode alphabet.

        Returns:
            str: the resulting sub-tokens joined by single spaces. A token with fewer
            than two characters is returned unchanged.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)

        # Nothing can be merged in an empty or single-symbol word.
        if len(word) < 2:
            return token
        pairs = get_pairs(word)

        while True:
            # Pick the adjacent pair that was learned earliest; unknown pairs rank last.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # No further occurrence of `first`: copy the tail verbatim.
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        text = "".join(tokens)
        # Undo the byte -> unicode mapping, then decode the raw bytes as UTF-8.
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Optional[Tuple[str, str]]:
        """
        Write the vocabulary and the merges table into ``save_directory``.

        Args:
            save_directory (str): existing directory to write into.
            filename_prefix (str, optional): when given, ``"<prefix>-"`` is prepended to both file names.

        Returns:
            ``(vocab_file, merge_file)`` paths on success, or ``None`` (after logging an error) when
            ``save_directory`` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            # Merges are written in rank order; a gap in the ranks is reported but not fatal.
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!".format(merge_file)
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """
        Optionally prepend a space to ``text`` before tokenization.

        A space is added when ``is_split_into_words`` is true or when ``add_prefix_space``
        (taken from ``kwargs``, defaulting to the instance setting) is true. The deprecated
        ``is_pretokenized`` keyword is still accepted and overrides ``is_split_into_words``.

        Returns:
            tuple: ``(text, kwargs)`` with the consumed keywords removed from ``kwargs``.
        """
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")

        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)


Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 516 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 56.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read their
# resources from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# This notebook follows the guide at: https://github.com/VincentK1991/BERT_summarization_1/blob/master/notebook/generate-summary-with-BERT-or-GPT2.ipynb

In [None]:
#set up environment
#pipenv install
#pip install -r requirements.txt



In [None]:
#installed transformers . tokenizations. 

In [None]:
#@title Setup Environment and helper function
#@markdown Pip install Huggingface transformers

#@markdown if cuda is available, set device = 'cuda'

#@markdown setup pytorch environment

!pip install transformers
import transformers
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, DistilBertModel, DistilBertTokenizer, BertTokenizer, BertForTokenClassification
import numpy as np

import nltk
nltk.download('punkt')
from nltk import sent_tokenize

!pip install tensorflow
%tensorflow_version 1.x
from keras.preprocessing.sequence import pad_sequences

from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

import json
import matplotlib.pyplot as plt
import timeit
import torch
import textwrap
wrapper = textwrap.TextWrapper(width=70)
SEED = 1234
torch.manual_seed(SEED)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
TensorFlow is already loaded. Please restart the runtime to change versions.


<torch._C.Generator at 0x7f2cc4b846b0>

In [None]:
!ls 

drive  sample_data


In [None]:
!#@title change to directory

#@markdown change directory to where to models are kept
#@markdown make sure this dir contain sub dirs for fine-tuned BERT and GPT2 models

%cd '/content/drive/My Drive/'

/content/drive/My Drive


In [None]:
#@title Choose Model Config and Weights

#@markdown Distil version is fine for this task
# Name of the pre-trained BERT checkpoint chosen in the form.
# NOTE(review): the model-loading cell in this notebook hard-codes
# 'distilbert-base-uncased' and does not read this variable — confirm intent.
BERT_pretrained_weights = 'distilbert-base-uncased' #@param ["distilbert-base-uncased", "bert-base-uncased", "bert-base-cased"] {allow-input: true}

#@markdown for token classification we used 
# NOTE(review): despite the name this holds a documentation URL, not a
# directory, and it is only referenced by a commented-out print below.
BERTforTokenClassification_config_directory = 'https://huggingface.co/transformers/v3.1.0/_modules/transformers/modeling_bert.html#BertForTokenClassification' #@param {type:"string"}
# Path of the JSON file that is loaded as `POS2idx` by the model-loading cell.
token_label_files = '/content/drive/MyDrive/GPT2/resources/POS_tagging/POS2idx.json' #@param {type:"string"}

# Location string for GPT-2; in the visible cells it is only echoed by the print below.
GPT2_config_directory = 'https://huggingface.co/gpt2' #@param {type:"string"}

#print('which BERT pre-trained ? ',BERT_pretrained_weights)
#print('where is BERT token classifier dir ? ',BERTforTokenClassification_config_directory)
print('where is GPT2 dir ? ',GPT2_config_directory)

where is GPT2 dir ?  https://huggingface.co/gpt2


In [None]:
#print(token_label_files)

In [None]:
# Load models and tokenizers.
# The models are big, so this may take a few minutes; see
# https://huggingface.co/transformers/serialization.html for more information.

# Sentence encoder used to embed each input sentence for clustering.
BERT_pretrained = DistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer_pretrained = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

print('----loading token labels----')
with open(token_label_files, 'r') as fp:
    POS2idx = json.load(fp)

# Label names, looked up by predicted class index in the keyword-extraction cell.
# NOTE(review): this relies on the JSON key order matching the label indices — confirm.
POS_values = list(POS2idx.keys())

print('----loading BERT token classifier----')
# The classification head needs one output per label in POS2idx; without
# `num_labels` the head is created with the library default size.
# NOTE(review): 'bert-base-uncased' carries no trained token-classification
# head (see the "newly initialized" warning), so predictions are only
# meaningful once fine-tuned weights are loaded.
BERT_token_classifier = BertForTokenClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(POS2idx))
tokenizer_token_classifier = BertTokenizer.from_pretrained('bert-base-uncased')

#BERT_token_classifier.load_state_dict(torch.load('bert-base-uncased'))
print('----loading GPT2 summary generator----')
tokenizer_GPT2 = GPT2Tokenizer.from_pretrained("gpt2")
special_tokens = {'bos_token':'<|startoftext|>','eos_token':'<|endoftext|>','pad_token':'<pad>','additional_special_tokens':['<|keyword|>','<|summarize|>']}
tokenizer_GPT2.add_special_tokens(special_tokens)
GPT2_generator = GPT2DoubleHeadsModel.from_pretrained("gpt2")
# The tokenizer now knows ids beyond the checkpoint's vocabulary; grow the
# embedding matrix accordingly, otherwise feeding those ids to the model is an
# out-of-range lookup.
GPT2_generator.resize_token_embeddings(len(tokenizer_GPT2))

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

----loading token labels----
----loading BERT token classifier----


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

----loading GPT2 summary generator----


Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Some weights of GPT2DoubleHeadsModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['multiple_choice_head.summary.bias', 'multiple_choice_head.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#@title use GPU?

#@markdown check the box to indicate if GPU to be used for running any model?

# Requested placement of each model; only honoured when CUDA is present.
use_GPU_BERT_pre_trained = False #@param {type:"boolean"}
use_GPU_BERT_token_classifier = False #@param {type:"boolean"}
use_GPU_GPT_generator = True #@param {type:"boolean"}

if torch.cuda.is_available():
  print('cuda is available')
  device = 'cuda'
  print('device is set to cuda')
else:
  print('cuda is not available')
  device = 'cpu'
  print('device is set to cpu')
  # Without CUDA every GPU request is overridden.
  use_GPU_BERT_pre_trained = use_GPU_BERT_token_classifier = use_GPU_GPT_generator = False

# Report the effective settings.
print(' ')
print('use GPU for pre-trained BERT?', use_GPU_BERT_pre_trained)
print('use GPU for BERT token classifier ?', use_GPU_BERT_token_classifier)
print('use GPU for GPT2?', use_GPU_GPT_generator)

cuda is available
device is set to cuda
 
use GPU for pre-trained BERT? False
use GPU for BERT token classifier ? False
use GPU for GPT2? True


In [None]:
input_file = 'GPT2/resources/hotelreview1.txt'


In [None]:
#@title Main text file
#@markdown indicate the text file to be summarized
#use_input_text = False

#input_file = 'GPT2/resources/hotelreview1.txt' #@param {type:"string"}
# Padded length of every sentence. This used to be commented out, which made
# the cell fail with a NameError on a fresh kernel.
max_len = 500 #@param {type:"integer",max:512}


#@markdown or copy paste your input here and check the box
use_input_text = True #@param {type:"boolean"}
# NOTE(review): the text starts and ends with a literal apostrophe inside the
# string, which ends up being tokenized as part of the input — confirm intent.
input_text = "'My sister came into town so we decided to go to The National Harbor for a night. Checking in, the hotel staff was very pleasant. Parking is in a garage behind the hotel, for 18. Our room was clean with nice views. The bed was very comfortable. We were able to walk comfortably to the harbor and sightsee. There are also restaurants and bars within walking distance. Breakfast was great, with canadian bacon and sausage patty meat options. There was also a waffle station, eggs, potatoes, fruit, yogurt, cereal, muffins, and other pastries. Checkout was a breeze, and the front desk staff again, was very pleasant. I very much enjoyed my stay and I will definitely return!'" #{type:"string"}

#if not use_input_text:
   #open the txt file that is included
 # with open(input_file, 'r') as file:
  #  input_text = file.read().replace('\n', '')
    #type:"string"

# split text to sentences
paragraph_split = sent_tokenize(input_text)

print('input text has',len(paragraph_split) ,'sentences.')

print('tokenizing sentences')

# One list of token ids (with special tokens) per sentence.
input_tokens = [
  tokenizer_pretrained.encode(sentence, add_special_tokens=True)
  for sentence in paragraph_split
]

# Longest tokenized sentence; 0 when the text contains no sentence at all.
sentence_lengths = [len(token_ids) for token_ids in input_tokens]
longest_sentence = max(sentence_lengths, default=0)

# Warn first: behind the raise this message could never be shown for any
# max_len of 512 or less.
if longest_sentence > 512:
  print('warning: sentence longer than 512')
  print('suggest to change max_len to 512, the remainder will be truncated')
if longest_sentence > max_len:
  raise ValueError('sentence longer than the max_len')

# Right-pad every sentence with 0 up to max_len.
input_ids = pad_sequences(input_tokens,
                          maxlen=max_len, dtype="long",
                          value=0,
                          truncating="post",
                          padding="post")

print('creating attention masks')

# 1 for real tokens, 0 for padding (padding id is 0).
attention_masks = [
  [int(token_id > 0) for token_id in sent]
  for sent in input_ids
]

input_ids = torch.tensor(input_ids)
attention_mask = torch.tensor(attention_masks)

input text has 11 sentences.
tokenizing sentences
creating attention masks


In [None]:
input_ids


tensor([[ 101, 1005, 2026,  ...,    0,    0,    0],
        [ 101, 9361, 1999,  ...,    0,    0,    0],
        [ 101, 5581, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2045, 2001,  ...,    0,    0,    0],
        [ 101, 4638, 5833,  ...,    0,    0,    0],
        [ 101, 1045, 2200,  ...,    0,    0,    0]])

In [None]:
print(input_file)

GPT2/resources/hotelreview1.txt


In [None]:
#@title Extracting parameters

#@markdown make sure that the number_extract < number of sentences in input text
number_extract = 2 #@param {type:"slider", min:1, max:20, step:1}

# Put the encoder and its inputs on the same device.
if use_GPU_BERT_pre_trained:
  input_ids = input_ids.to(device)
  BERT_pretrained = BERT_pretrained.to(device)
  attention_mask = attention_mask.to(device)
else:
  input_ids = input_ids.to('cpu')
  BERT_pretrained = BERT_pretrained.to('cpu')
  attention_mask = attention_mask.to('cpu')

with torch.no_grad():
  last_hidden_states = BERT_pretrained(input_ids, attention_mask=attention_mask)

# Hidden state at position 0 of every sentence is used as its feature vector.
first_token_states = last_hidden_states[0][:, 0, :]
sentence_features = first_token_states.detach().cpu().numpy()

print('performing k-medoid clustering with ', number_extract, ' clusters')

# Cluster the sentence vectors, then pick the sentence nearest to each centre.
kmeans = KMeans(n_clusters=number_extract, random_state=0)
kmeans.fit(sentence_features)
cluster_center = kmeans.cluster_centers_

nbrs = NearestNeighbors(n_neighbors=1, algorithm='brute')
nbrs.fit(sentence_features)
distances, indices = nbrs.kneighbors(cluster_center.reshape(number_extract, -1))

# Sort so the picked sentences keep their order of appearance in the text.
indices = np.sort(indices.reshape(1, -1))
topic_answer = [paragraph_split[sentence_index] for sentence_index in indices[0]]

print('result:')

print('the ', number_extract, ' extracted sentences are')
for sentence in topic_answer:
  print(sentence)

# Every sentence is prefixed with one space (including the first one).
topic_answer_string = ''.join(' ' + topic for topic in topic_answer)

performing k-medoid clustering with  2  clusters
result:
the  2  extracted sentences are
Checking in, the hotel staff was very pleasant.
There are also restaurants and bars within walking distance.


In [None]:
distances, indices = nbrs.kneighbors(
                  cluster_center.reshape(number_extract,-1))
indices

array([[6],
       [1]])

In [None]:
print (topic_answer_string)

 Checking in, the hotel staff was very pleasant. There are also restaurants and bars within walking distance.


In [None]:
# @title Contrast the input text
wrapper.wrap(input_text)

["'My sister came into town so we decided to go to The National Harbor",
 'for a night. Checking in, the hotel staff was very pleasant. Parking',
 'is in a garage behind the hotel, for 18. Our room was clean with nice',
 'views. The bed was very comfortable. We were able to walk comfortably',
 'to the harbor and sightsee. There are also restaurants and bars within',
 'walking distance. Breakfast was great, with canadian bacon and sausage',
 'patty meat options. There was also a waffle station, eggs, potatoes,',
 'fruit, yogurt, cereal, muffins, and other pastries. Checkout was a',
 'breeze, and the front desk staff again, was very pleasant. I very much',
 "enjoyed my stay and I will definitely return!'"]

In [None]:
#@title Keyword extraction

# Tags whose words are kept as keywords (nouns and verbs).
list_to_pick = ['NN','NNP','NNPS','NNS','VBD','VB','VBZ','VBP']

# Let the tokenizer truncate to the 512-token limit. Slicing the encoded ids
# afterwards (the former `[:510]`) cut off the closing special token of long inputs.
tokenized_sentence = tokenizer_token_classifier.encode(
                      topic_answer_string, truncation=True, max_length=512)
input_ids2 = torch.tensor([tokenized_sentence])

# Put the classifier and its input on the same device.
if use_GPU_BERT_token_classifier:
  BERT_token_classifier = BERT_token_classifier.to(device)
  input_ids2 = input_ids2.to(device)
else:
  BERT_token_classifier = BERT_token_classifier.to('cpu')
  input_ids2 = input_ids2.to('cpu')

with torch.no_grad():
  output2 = BERT_token_classifier(input_ids2)
# Most likely label index for every token position.
label_indices = np.argmax(output2[0].to('cpu').numpy(), axis=2)

tokens = tokenizer_token_classifier.convert_ids_to_tokens(
                        input_ids2.to('cpu').numpy()[0])

# Glue "##" word pieces back onto the preceding token; a word keeps the label
# predicted for its first piece.
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##") and new_tokens:
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(POS_values[label_idx])
        new_tokens.append(token)

print('finished keyword extraction ...')
print('the keywords are')

# Special tokens and bare punctuation are never keywords.
ignored_tokens = ['[CLS]','[SEP]','?','/','-','.','_','!','@','[',']']
list_keywords = [
    token
    for token, label in zip(new_tokens, new_labels)
    if label in list_to_pick and token not in ignored_tokens
]

list_keywords_str = ' '.join(list_keywords)
wrapper.wrap(list_keywords_str)

finished keyword extraction ...
the keywords are


['checking in , the staff was very pleasant are also restaurants and',
 'within']

In [None]:
print(input_ids2)

tensor([[  101,  1005,  2026,  2905,  2234,  2046,  2237,  2061,  2057,  2787,
          2000,  2175,  2000,  1996,  2120,  6496,  2005,  1037,  2305,  1012,
          9361,  1999,  1010,  1996,  3309,  3095,  2001,  2200,  8242,  1012,
          5581,  2003,  1999,  1037,  7381,  2369,  1996,  3309,  1010,  2005,
          2324,  1012,  6350,  2001,  2307,  1010,  2007,  3010, 11611,  1998,
         24165, 17798,  6240,  7047,  1012,  2045,  2001,  2036,  1037, 11333,
         18142,  2276,  1010,  6763,  1010, 14629,  1010,  5909,  1010, 10930,
         27390,  2102,  1010, 20943,  1010, 14163, 15379,  2015,  1010,  1998,
          2060,  2627,  5134,  1012,   102]])


In [None]:
list_keywords_str

'my sister came into town so we decided go to the national for a checking in , the staff was very pleasant is a behind the , breakfast was great , with canadian bacon and sausage patty meat options a waffle station , eggs , potatoes , fruit , yogurt , cereal , muffins , and other pastries'

In [None]:
wrapper.wrap(topic_answer_string)

[" 'My sister came into town so we decided to go to The National Harbor",
 'for a night. Checking in, the hotel staff was very pleasant. Parking',
 'is in a garage behind the hotel, for 18. Breakfast was great, with',
 'canadian bacon and sausage patty meat options. There was also a waffle',
 'station, eggs, potatoes, fruit, yogurt, cereal, muffins, and other',
 'pastries.']

In [None]:
##GPT2

In [None]:
list_keywords_str2 = 'My sister came into town so we decided to go to The National Harbor for a night. Checking in, the hotel staff was very pleasant. Parking is in a garage behind the hotel, for 18. Our room was clean with nice views. The bed was very comfortable. We were able to walk comfortably to the harbor and sightsee. There are also restaurants and bars within walking distance. Breakfast was great, with canadian bacon and sausage patty meat options. There was also a waffle station, eggs, potatoes, fruit, yogurt, cereal, muffins, and other pastries. Checkout was a breeze, and the front desk staff again, was very pleasant. I very much enjoyed my stay and I will definitely return!'


In [None]:
title = 'Awesome Stay'


In [None]:
#@title GPT2 input preparation

# Prompt layout: <|startoftext|> title keywords <|summarize|>
# The title and the keywords are now separated by a space; previously they
# were fused into a single word (e.g. "Awesome Staychecking").
# NOTE(review): a '<|keyword|>' special token is registered on the tokenizer
# but not used here — confirm whether it should separate title and keywords.
GPT2_input = tokenizer_GPT2.encode(
      '<|startoftext|> ' + title + ' ' + list_keywords_str + ' <|summarize|> ')
GPT2_input_torch = torch.tensor(GPT2_input, dtype=torch.long)

print("the keyword input :")
wrapper.wrap(tokenizer_GPT2.decode(GPT2_input_torch))

the keyword input :


['<|startoftext|> Awesome Staychecking in, the staff was very pleasant',
 'are also restaurants and within <|summarize|>']

In [None]:
wrapper.wrap(title+list_keywords_str2)

['Awesome StayMy sister came into town so we decided to go to The',
 'National Harbor for a night. Checking in, the hotel staff was very',
 'pleasant. Parking is in a garage behind the hotel, for 18. Our room',
 'was clean with nice views. The bed was very comfortable. We were able',
 'to walk comfortably to the harbor and sightsee. There are also',
 'restaurants and bars within walking distance. Breakfast was great,',
 'with canadian bacon and sausage patty meat options. There was also a',
 'waffle station, eggs, potatoes, fruit, yogurt, cereal, muffins, and',
 'other pastries. Checkout was a breeze, and the front desk staff again,',
 'was very pleasant. I very much enjoyed my stay and I will definitely',
 'return!']

In [None]:
print (GPT2_input_torch)


tensor([50257, 49061, 16160, 41004,   287,   837,   262,  3085,   373,   845,
        15497,   389,   635, 10808,   290,  1626, 50260])


In [None]:
#position_ids = torch.stack([torch.arange(config.max_position_embeddings) for a in range(GPT2_input_torch)]).to(device)


In [None]:
# Prints a single empty line; the cell has no other effect.
print()




In [None]:
#@title GPT2 paraphrase generation

#@markdown This step may take a few minutes without a GPU.

# NOTE: CUDA_LAUNCH_BLOCKING is an environment variable, not a generation
# argument. To debug CUDA errors, set os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# before CUDA is initialised instead of passing it to generate().

# Decoding hyper-parameters (Colab form fields).
temperature = 1 #@param {type:"number"}
greedy_search = False #@param {type:"boolean"}
top_k = 50 #@param {type:"integer",min:1}
top_p = 0.8 #@param {type:"number",max:1}
# max_length / min_length count NEWLY generated tokens; the prompt length is
# added to both below because generate() measures prompt + continuation.
max_length = 200 #@param {type:"integer",min:1}

min_length = 20 #@param {type:"integer",min:0}
num_return_sequences = 3 #@param {type:"integer",min:1}

if use_GPU_GPT_generator:
  GPT2_generator = GPT2_generator.to(device)
  GPT2_input_torch = GPT2_input_torch.to(device)

# Greedy decoding is deterministic, so asking for several sequences would only
# produce duplicates (and generate() rejects it); fall back to one sequence.
do_sample = not greedy_search
if not do_sample:
  num_return_sequences = 1

# Only real generation arguments are passed:
#  * decoder_start_token_id was given a string and applies to encoder-decoder
#    models only, so it is dropped for this decoder-only model;
#  * add_special_tokens / truncation are tokenizer options and
#    CUDA_LAUNCH_BLOCKING is an environment variable -- recent transformers
#    releases raise on such unused keyword arguments;
#  * pad_token_id is set explicitly to the value generate() would otherwise
#    fall back to, which silences the "Setting `pad_token_id`" warning.
# Optional: add no_repeat_ngram_size=2 to discourage repeated bigrams.
sampling_output = GPT2_generator.generate(
      input_ids=GPT2_input_torch.unsqueeze(0),  # add the batch dimension
      max_length=max_length + len(GPT2_input_torch),
      min_length=min_length + len(GPT2_input_torch),
      temperature=temperature,
      top_k=top_k,
      top_p=top_p,
      do_sample=do_sample,
      num_return_sequences=num_return_sequences,
      pad_token_id=GPT2_generator.config.eos_token_id)

print('finish generating')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


finish generating


In [None]:
# Debug print of `input_ids2`.
# NOTE(review): `input_ids2` is not assigned anywhere in the visible part of the
# notebook and this cell shows no output -- confirm it is defined earlier, or the
# cell will raise NameError on a fresh Restart & Run All.
print(input_ids2)

In [None]:
# Debug print of `input_ids`.
# NOTE(review): `input_ids` is not assigned anywhere in the visible part of the
# notebook and this cell shows no output -- confirm it is defined earlier, or the
# cell will raise NameError on a fresh Restart & Run All.
print(input_ids)

In [None]:
# Dump the raw token ids returned by generate() (prompt followed by the
# continuation, one row per returned sequence).
# NOTE(review): this prints the whole tensor; printing sampling_output.shape or
# a short slice would keep the notebook output readable.
print(sampling_output)

tensor([[50256,  1212,  4130,   318,  3194,   287,   262,  4437,   286,   674,
           890,  2106,   355,   281,  8233,  4130,    13,   198,   198,  1890,
           883,   286,   345,   326,   389,   649,   284,  2106,    11,   428,
          4130,   481,   307,   845,  7613,    13,   314,   423,  3194,   546,
           428,  7243,   329,   257,   890,   640,   783,    11,   523,   314,
          1183,   307,  3599,   284,   467,   832,   616,  6461,   351,   262,
          2426,    13,   198,   198,  8421,   314,   651,   656,   617,   286,
           262,  2106,   326,   314,  1053,  3194,   546,    11,   314,   765,
           284, 11589,  1561,   546,   257,  1178,   584, 10233,   326,   314,
           423,  3194,   546,    11,   475,   314,  2911,   326,   345,  1183,
           307,  4609,   287,  3555,   832,   606,    13,   198,   198,   818,
          3090,   284,   257,  1049,  1730,   286,  6754,  2267,   326,   314,
          1053,  1760,    11,   314,  1101,   635,  

In [None]:
#@title GPT2 generated output

which_output = 0 #@param {type:"slider", min:0, max:10, step:1}

# Select one generated row, drop the first len(GPT2_input_torch) positions
# (the prompt), decode the remainder without special tokens, keep at most the
# first 5000 characters, and line-wrap the result for display.
# NOTE(review): the slider goes up to 10, but indexing only works while
# which_output < sampling_output.shape[0] -- presumably num_return_sequences;
# confirm and tighten the slider range if so.
wrapper.wrap(
    tokenizer_GPT2.decode(
        sampling_output[which_output][len(GPT2_input_torch):],
        skip_special_tokens=True,
    )[:5000]
)

['  For those of you that are new to history, this blog will be very',
 "helpful. I have written about this topic for a long time now, so I'll",
 'be starting to go through my experiences with the subject.  Before I',
 "get into some of the history that I've written about, I want to",
 'briefly talk about a few other topics that I have written about, but I',
 "hope that you'll be interested in reading through them.  In addition",
 "to a great deal of historical research that I've done, I'm also an",
 "activist, so I'd like to share some of the reasons that I am involved",
 'with the blog, as well as some of the things that I think are',
 'important.  History  In this blog, I will be looking at how the',
 "history of the United States came about. I'm also going to be looking",
 'at how the United States was founded. This will be the first blog to',
 'take the idea of history and take it into the world']

In [None]:
#@title GPT2 generated output

which_output = 0 #@param {type:"slider", min:0, max:10, step:1}

# Same display as the longer preview, but truncated to the first 500
# characters: pick one generated row, skip the first len(GPT2_input_torch)
# positions (the prompt), decode without special tokens, then line-wrap.
# NOTE(review): the slider goes up to 10, but indexing only works while
# which_output < sampling_output.shape[0] -- presumably num_return_sequences;
# confirm and tighten the slider range if so.
wrapper.wrap(
    tokenizer_GPT2.decode(
        sampling_output[which_output][len(GPT2_input_torch):],
        skip_special_tokens=True,
    )[:500]
)

['  For those of you that are new to history, this blog will be very',
 "helpful. I have written about this topic for a long time now, so I'll",
 'be starting to go through my experiences with the subject.  Before I',
 "get into some of the history that I've written about, I want to",
 'briefly talk about a few other topics that I have written about, but I',
 "hope that you'll be interested in reading through them.  In addition",
 "to a great deal of historical research that I've done, I'm also an",
 "activist, so I'd like t"]

In [None]:
# Show the title immediately followed by the reference text, line-wrapped.
# NOTE(review): `gold_label` is not assigned in the visible part of the notebook
# and this cell shows no output -- confirm it is defined earlier.
wrapper.wrap(''.join((title, gold_label)))


In [None]:
# Show the title immediately followed by the full review text, line-wrapped,
# for side-by-side comparison with the generated output
# (no separator is inserted between the two strings).
wrapper.wrap(''.join((title, list_keywords_str2)))
