In [1]:
# import required libraries

import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import XLMWithLMHeadModel, XLMTokenizer

In [2]:

text = """
       Machine learning (ML) is the study of computer algorithms that improve automatically through experience. 
       It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model 
       based on sample data, known as "training data", in order to make predictions or decisions without being explicitly 
       programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as email filtering 
       and computer vision, where it is difficult or infeasible to develop conventional algorithms to perform the needed tasks.
       """ 

In [3]:
text

'\n       Machine learning (ML) is the study of computer algorithms that improve automatically through experience. \n       It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model \n       based on sample data, known as "training data", in order to make predictions or decisions without being explicitly \n       programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as email filtering \n       and computer vision, where it is difficult or infeasible to develop conventional algorithms to perform the needed tasks.\n       '

# T5 - Summarization

In [4]:
# instantiating the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-large')
tokenizer = T5Tokenizer.from_pretrained('t5-large')

In [5]:
# to run T5 for summarization we concatenate "summarize" with original text

t5_text = "summarize" + text
t5_text

'summarize\n       Machine learning (ML) is the study of computer algorithms that improve automatically through experience. \n       It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model \n       based on sample data, known as "training data", in order to make predictions or decisions without being explicitly \n       programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as email filtering \n       and computer vision, where it is difficult or infeasible to develop conventional algorithms to perform the needed tasks.\n       '

In [6]:
# next, we use T5 encoder-decoder to generate the input sequence as input ids

input_ids = tokenizer.encode(t5_text, return_tensors='pt')
input_ids

tensor([[21603,  5879,  1036,    41,  6858,    61,    19,     8,   810,    13,
          1218, 16783,    24,  1172,  3269,   190,   351,     5,    94,    19,
           894,    38,     3,     9,   769,  2244,    13,  7353,  6123,     5,
          5879,  1036, 16783,   918,     3,     9, 18913,   825,     3,   390,
            30,  3106,   331,     6,   801,    38,    96, 13023,   331,  1686,
            16,   455,    12,   143, 20099,    42,  3055,   406,   271, 21119,
          2486,    26,    12,   103,    78,     5,  5879,  1036, 16783,    33,
           261,    16,     3,     9,  1148,  1196,    13,  1564,     6,   224,
            38,   791,  4191,    53,    11,  1218,  2267,     6,   213,    34,
            19,  1256,    42,    16,    89,    15,     9,     7,  2317,    12,
          1344,  7450, 16783,    12,  1912,     8,   906,  4145,     5,     1]])

In [7]:
# next, pass input_ids to function generate() to return sequence ids corresponding to the summary

summary_ids = model.generate(input_ids)
summary_ids

tensor([[    0, 32099,     5, 32098,     5,  5879,  1036,    19,     8,   810,
            13,  1218, 16783,    24,  1172,  3269,   190,   351,     5, 32097]])

In [8]:
# next, we use decode function to generate summary text from these ids

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
summary

'.. Machine learning is the study of computer algorithms that improve automatically through experience.'

# BART - Summarization

In [9]:
# instantiating the model and tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [10]:
text

'\n       Machine learning (ML) is the study of computer algorithms that improve automatically through experience. \n       It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model \n       based on sample data, known as "training data", in order to make predictions or decisions without being explicitly \n       programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as email filtering \n       and computer vision, where it is difficult or infeasible to develop conventional algorithms to perform the needed tasks.\n       '

In [11]:
# next, we use tokenizer to generate the inputs

inputs = tokenizer.batch_encode_plus([text], return_tensors='pt')
inputs

{'input_ids': tensor([[    0, 50118,  1437,  1437,  1437,  1437,  1437,  1437, 14969,  2239,
            36, 10537,    43,    16,     5,   892,     9,  3034, 16964,    14,
          1477,  6885,   149,   676,     4,  1437, 50118,  1437,  1437,  1437,
          1437,  1437,  1437,    85,    16,   450,    25,    10, 37105,     9,
          7350,  2316,     4, 14969,  2239, 16964,  1119,    10, 30412,  1421,
          1437, 50118,  1437,  1437,  1437,  1437,  1437,  1437,   716,    15,
          7728,   414,     6,   684,    25,    22, 32530,   414,  1297,    11,
           645,     7,   146, 12535,    50,  2390,   396,   145, 16369,  1437,
         50118,  1437,  1437,  1437,  1437,  1437,  1437, 30825,     7,   109,
            98,     4, 14969,  2239, 16964,    32,   341,    11,    10,  1810,
          3143,     9,  2975,     6,   215,    25,  1047, 35060,  1437, 50118,
          1437,  1437,  1437,  1437,  1437,  1437,     8,  3034,  3360,     6,
           147,    24,    16,  1202,  

In [12]:
# next, we pass input_ids to model.generate() function

summary_ids = model.generate(inputs['input_ids'], early_stopping=True)
summary_ids

tensor([[    2,     0, 46100,  2239,    36, 10537,    43,    16,     5,   892,
             9,  3034, 16964,    14,  1477,  6885,   149,   676,     4,    85,
            16,   450,    25,    10, 37105,     9,  7350,  2316,     4, 14969,
          2239, 16964,    32,   341,    11,    10,  1810,  3143,     9,  2975,
             6,   215,    25,  1047, 35060,     8,  3034,  3360,     6,   147,
            24,    16,  1202,    50,  4047, 29358,  4748,     7,  2179,  9164,
         16964,     4,     2]])

In [13]:
# next, we use decode function to generate summary text from these ids

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
summary

'Machine learning (ML) is the study of computer algorithms that improve automatically through experience. It is seen as a subset of artificial intelligence. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or infeasible to develop conventional algorithms.'

# GPT-2 Summarization

In [14]:
# instantiating the model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.config.max_length = 512

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [15]:
# next, we use tokenizer to generate the inputs

inputs = tokenizer.batch_encode_plus([text], return_tensors='pt')
inputs

{'input_ids': tensor([[  198,   220,   220,   220,   220,   220,   220, 10850,  4673,   357,
          5805,     8,   318,   262,  2050,   286,  3644, 16113,   326,  2987,
          6338,   832,  1998,    13,   220,   198,   220,   220,   220,   220,
           220,   220,   632,   318,  1775,   355,   257, 24637,   286, 11666,
          4430,    13, 10850,  4673, 16113,  1382,   257, 18069,  2746,   220,
           198,   220,   220,   220,   220,   220,   220,  1912,   319,  6291,
          1366,    11,  1900,   355,   366, 34409,  1366,  1600,   287,  1502,
           284,   787, 16277,   393,  5370,  1231,   852, 11777,   220,   198,
           220,   220,   220,   220,   220,   220, 27402,   284,   466,   523,
            13, 10850,  4673, 16113,   389,   973,   287,   257,  3094,  4996,
           286,  5479,    11,   884,   355,  3053, 25431,   220,   198,   220,
           220,   220,   220,   220,   220,   290,  3644,  5761,    11,   810,
           340,   318,  2408,   393,  

In [16]:
# next, we pass input_ids to model.generate() function

summary_ids = model.generate(inputs['input_ids'], early_stopping=True)
summary_ids

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[  198,   220,   220,   220,   220,   220,   220, 10850,  4673,   357,
          5805,     8,   318,   262,  2050,   286,  3644, 16113,   326,  2987,
          6338,   832,  1998,    13,   220,   198,   220,   220,   220,   220,
           220,   220,   632,   318,  1775,   355,   257, 24637,   286, 11666,
          4430,    13, 10850,  4673, 16113,  1382,   257, 18069,  2746,   220,
           198,   220,   220,   220,   220,   220,   220,  1912,   319,  6291,
          1366,    11,  1900,   355,   366, 34409,  1366,  1600,   287,  1502,
           284,   787, 16277,   393,  5370,  1231,   852, 11777,   220,   198,
           220,   220,   220,   220,   220,   220, 27402,   284,   466,   523,
            13, 10850,  4673, 16113,   389,   973,   287,   257,  3094,  4996,
           286,  5479,    11,   884,   355,  3053, 25431,   220,   198,   220,
           220,   220,   220,   220,   220,   290,  3644,  5761,    11,   810,
           340,   318,  2408,   393,  1167, 30412,  

In [17]:
# next, we use decode function to generate summary text from these ids

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
summary

'\n       Machine learning (ML) is the study of computer algorithms that improve automatically through experience. \n       It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model \n       based on sample data, known as "training data", in order to make predictions or decisions without being explicitly \n       programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as email filtering \n       and computer vision, where it is difficult or infeasible to develop conventional algorithms to perform the needed tasks.\n                                                                                                                                                                                                                                                                                                                                                                                      '

# XLM Summarization

In [25]:
# instantiating the model and tokenizer
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
model.config.max_length = 512

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-en-2048 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# next, we use tokenizer to generate the inputs

inputs = tokenizer.batch_encode_plus([text], return_tensors='pt')
inputs

{'input_ids': tensor([[    0,  2346,  2674,    36, 11111,    35,    26,    14,  1011,    17,
          1711, 15507,    30,  4093,  7540,   107, 28389,   314,  1972,    16,
            32,    26,   534,    28,    22, 19859,    17,  7499, 16910,  1636,
          1972,    16,  2346,  2674, 15507,  2281,    22,  8188,  1193,   348,
            27,  6841,  1302,    15,   159,    28,    19,   920,  1302,    15,
            19,    20,   412,    21,   223, 23825,    54,  5658,   351,   133,
         12392, 17692,    21,    99,   691,    16,  2346,  2674, 15507,    49,
           137,    20,    22,  1096,  1816,    17,  4028,    15,   131,    28,
          9453, 27195,    18,  1711,  2611,    15,   103,    32,    26,  2006,
            54,  6932,  3487, 11286,    21,  3011,  6060, 15507,    21,  3252,
            14,   852,  8307,    16,     1]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [27]:
# next, we pass input_ids to model.generate() function

summary_ids = model.generate(inputs['input_ids'], early_stopping=True)
summary_ids

KeyboardInterrupt: 

In [None]:
# next, we use decode function to generate summary text from these ids

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
summary