# Install

In [1]:
# Optional setup: uncomment to install PyTorch (with torchvision and torchaudio)
# when it is not already available in the kernel's environment.
# !pip3 install torch torchvision torchaudio

In [2]:
# Optional setup: uncomment to install Hugging Face Transformers, which provides
# the tokenizer and model classes imported below.
#!pip install transformers

# Import and Load Tokenizers

## EN

In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [4]:
# Tokenizer for the English summarizer. It uses the same checkpoint id as
# `en_model`, so the token ids it produces match that model's vocabulary.
en_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

## KO

In [5]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

In [6]:
# Tokenizer for the Korean summarizer. It uses the same checkpoint id as
# `ko_model`, so the token ids it produces match that model's vocabulary.
ko_tokenizer = PreTrainedTokenizerFast.from_pretrained("ainize/kobart-news")

# Load Model

## EN

In [7]:
# English summarization model, loaded from the same checkpoint id as `en_tokenizer`.
en_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

## KO

In [8]:
# Korean summarization model, loaded from the same checkpoint id as `ko_tokenizer`.
ko_model = BartForConditionalGeneration.from_pretrained("ainize/kobart-news")

# Perform Abstractive Summarization

## EN

In [9]:
# Sample English input: a multi-paragraph passage about grammatical articles across languages.
wiki_article = """Articles are found in many Indo-European languages, Semitic languages (only the definite article), and Polynesian languages; however, they are formally absent from many of the world's major languages including: Chinese, Japanese, Korean, Mongolian, many Turkic languages (incl. Tatar, Bashkir, Tuvan and Chuvash), many Uralic languages (incl. Finnic[a] and Saami languages), Indonesian, Hindi-Urdu, Punjabi, Tamil, the Baltic languages, the majority of Slavic languages, the Bantu languages (incl. Swahili) and Yoruba. In some languages that do have articles, such as some North Caucasian languages, the use of articles is optional; however, in others like English and German it is mandatory in all cases.

Linguists believe the common ancestor of the Indo-European languages, Proto-Indo-European, did not have articles. Most of the languages in this family do not have definite or indefinite articles: there is no article in Latin or Sanskrit, nor in some modern Indo-European languages, such as the families of Slavic languages (except for Bulgarian and Macedonian, which are rather distinctive among the Slavic languages in their grammar, and some Northern Russian dialects[7]), Baltic languages and many Indo-Aryan languages. Although Classical Greek had a definite article (which has survived into Modern Greek and which bears strong functional resemblance to the German definite article, which it is related to), the earlier Homeric Greek used this article largely as a pronoun or demonstrative, whereas the earliest known form of Greek known as Mycenaean Greek did not have any articles. Articles developed independently in several language families.

Not all languages have both definite and indefinite articles, and some languages have different types of definite and indefinite articles to distinguish finer shades of meaning: for example, French and Italian have a partitive article used for indefinite mass nouns, whereas Colognian has two distinct sets of definite articles indicating focus and uniqueness, and Macedonian uses definite articles in a demonstrative sense, with a tripartite distinction (proximal, medial, distal) based on distance from the speaker or interlocutor. The words this and that (and their plurals, these and those) can be understood in English as, ultimately, forms of the definite article the (whose declension in Old English included thaes, an ancestral form of this/that and these/those).

In many languages, the form of the article may vary according to the gender, number, or case of its noun. In some languages the article may be the only indication of the case. Many languages do not use articles at all, and may use other ways of indicating old versus new information, such as topic–comment constructions."""

In [10]:
# Encode the English article for the model: PyTorch tensors, truncation enabled,
# and padding up to the longest sequence in the batch.
tokens = en_tokenizer(
    wiki_article,
    return_tensors="pt",
    padding="longest",
    truncation=True,
)

In [11]:
# Inspect the encoded batch returned by the tokenizer (token ids as a PyTorch tensor).
tokens

{'input_ids': tensor([[16643,   127,   374,   115,   223, 29504,   121, 17859,  4482,   108,
           110, 42237,  4482,   143,  6026,   109, 13745,   974,   312,   111,
         51585,  4482,   206,   802,   108,   157,   127, 11830, 12001,   135,
           223,   113,   109,   278,   131,   116,   698,  4482,   330,   151,
          1950,   108,  2769,   108,  5829,   108, 42911,   108,   223, 34664,
          2288,  4482,   143, 62587,   107, 71491,   108, 25720, 35332,   108,
         10374, 10313,   111, 15379,  2075, 15007,   312,   223, 75098,  2288,
          4482,   143, 62587,   107, 18533,  2288,  4101,   304,  1100,   111,
          8375, 13291,  4482,   312, 15828,   108, 12499,   121, 40549,  9423,
           108, 32603,   108, 11635,   108,   109, 21201,  4482,   108,   109,
          2198,   113, 66815,  4482,   108,   109,   596, 63661,  4482,   143,
         62587,   107, 68119,   158,   111, 74321,   107,   222,   181,  4482,
           120,   171,   133,  2391,  

In [12]:
# `**tokens` unpacks the encoding into plain key/value pairs; this is exactly the
# set of keyword arguments that `en_model.generate(**tokens)` receives.
{**tokens}

{'input_ids': tensor([[16643,   127,   374,   115,   223, 29504,   121, 17859,  4482,   108,
            110, 42237,  4482,   143,  6026,   109, 13745,   974,   312,   111,
          51585,  4482,   206,   802,   108,   157,   127, 11830, 12001,   135,
            223,   113,   109,   278,   131,   116,   698,  4482,   330,   151,
           1950,   108,  2769,   108,  5829,   108, 42911,   108,   223, 34664,
           2288,  4482,   143, 62587,   107, 71491,   108, 25720, 35332,   108,
          10374, 10313,   111, 15379,  2075, 15007,   312,   223, 75098,  2288,
           4482,   143, 62587,   107, 18533,  2288,  4101,   304,  1100,   111,
           8375, 13291,  4482,   312, 15828,   108, 12499,   121, 40549,  9423,
            108, 32603,   108, 11635,   108,   109, 21201,  4482,   108,   109,
           2198,   113, 66815,  4482,   108,   109,   596, 63661,  4482,   143,
          62587,   107, 68119,   158,   111, 74321,   107,   222,   181,  4482,
            120,   171,   1

In [13]:
# Generate summary token ids. No generation options are passed, so the model's
# default generation settings apply; the encoded inputs go in as keyword arguments.
summary = en_model.generate(**tokens)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [14]:
# Raw generated token ids, shown before they are decoded to text.
summary

tensor([[    0, 16643,   127,   374,   115,   223,  4482,   108,   155,   127,
         11830, 12001,   135,   223,   113,   109,   278,   131,   116,   698,
          4482,   107,     1]])

In [15]:
# Decode the first generated sequence back into text.
# `summary` contains special-token ids (the leading 0 and trailing 1 shown above);
# skip_special_tokens=True drops them so the result stays plain text even when
# decode() would otherwise render them as literal marker strings. This matches
# how the Korean summary is decoded later in the notebook.
en_tokenizer.decode(summary[0], skip_special_tokens=True)

"Articles are found in many languages, but are formally absent from many of the world's major languages."

## KO

In [16]:
# Sample Korean input: a short parenting-advice passage used as the text to summarize.
ko_article = """자신이 원하는 것을 하려 하는 아이에게 ‘하지마’라는 말은 강한 부정적 감정을 불러일으킨다. 아이가 본인의 결정이 존중받지 못했다고 받아들일 수 있는 것. 이 기질의 아이들에게는 왜 하면 안 되는지에 대한 설명을 해줄 필요가 있다. 반면 자기주도적인 아이들은 자신이 하겠다고 한 말은 잘 지킨다. 예를 들어 아이가 태블릿을 갖고 놀고 싶어 한다면 “몇 시까지 할거야?”라고 묻고, 아이가 ‘30분’이라고 대답했다면 하게 두는 것도 좋다. 시간이 끝나기 전에 ‘10분 남았어’, ‘5분 남았어’라며 시간을 상기시켜주면 아이는 순순히 하던 것을 멈추고 부모에게 태블릿을 돌려줄 것이다."""

In [17]:
# Encode the Korean article as a PyTorch tensor of token ids.
# Truncation is capped at the model's positional-embedding limit so that an
# article longer than the model can accept is clipped here instead of failing
# inside generate(). The sample article is well under that limit, so its
# encoding is unchanged. This mirrors the truncation used on the English path.
input_ids = ko_tokenizer.encode(
    ko_article,
    return_tensors="pt",
    truncation=True,
    max_length=ko_model.config.max_position_embeddings,
)

In [19]:
# Run beam-search generation to obtain the Korean summary's token ids.
summary_text_ids = ko_model.generate(
    input_ids=input_ids,
    # Search settings.
    num_beams=4,
    length_penalty=2.0,
    # Bounds on the generated length.
    min_length=56,
    max_length=142,
    # Start/stop token ids are read from the loaded checkpoint's own config.
    bos_token_id=ko_model.config.bos_token_id,
    eos_token_id=ko_model.config.eos_token_id,
)

In [20]:
# Convert the first generated sequence back to text (special tokens removed),
# then print it.
ko_summary_text = ko_tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)
print(ko_summary_text)

자 원하는 것을 하려 하는 아이에게 ‘하지마’라는 말은 강한 부정적 감정을 불러일으키는데 이 기질의 아이들에게는 왜 하면 안 되는지에 대한 설명을 해줄 필요가 있고 자기주도적인 아이들은 자신이 하겠다고 한 말은 잘 지킨다고 알려지며 시간이 끝나기 전에 시간이 끝나기 전에 ‘10분 남았어’, ‘5분 남았어’라며 시간을 상기시켜주면 아이는 순순히 하던 것을 멈추고 부모에게 태블릿을 돌려줄 것이다.
