# Natural Language Processing Techniques Notebook

*Author: Hoang Anh Tuan*

In [None]:
# Installation
!pip install spacy



In [1]:
import spacy
from spacy import displacy    # Spacy Visualization
# Text classification using SVC
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# Topic Modelling
import gensim
from gensim import corpora

# Load the English language model
nlp = spacy.load("en_core_web_sm")

## **Tokenization**

### Example 1: Simple Sentence

In [None]:
token_1 = "NLP can improve on keyword matching search for document and FAQ retrieval by disambiguating word senses based on context"
doc_token_1 = nlp(token_1)

In [None]:
for token in doc_token_1:
  # Print the id and text
  print(token.i + 1, token.text)

1 NLP
2 can
3 improve
4 on
5 keyword
6 matching
7 search
8 for
9 document
10 and
11 FAQ
12 retrieval
13 by
14 disambiguating
15 word
16 senses
17 based
18 on
19 context


### Example 2: Complex Sentence with Punctuation and Numbers

In [None]:
token_2 = "spaCy ver 3 introduces 1 transformer-based pipelines that bring spaCy's accuracy right up to the current state-of-the-art."
doc_token_2 = nlp(token_2)

In [None]:
for token in doc_token_2:
  # Print the id, text and boolean indicating the digits
  print(token.i + 1, token.text, token.is_digit)

1 spaCy False
2 ver False
3 3 True
4 introduces False
5 1 True
6 transformer False
7 - False
8 based False
9 pipelines False
10 that False
11 bring False
12 spaCy False
13 's False
14 accuracy False
15 right False
16 up False
17 to False
18 the False
19 current False
20 state False
21 - False
22 of False
23 - False
24 the False
25 - False
26 art False
27 . False


## **Uppercasing/Lowercasing**

### Example 1: Simple Casing

In [2]:
cas_1 = "Hello World! This is tuanx18"
doc_cas_1 = nlp(cas_1)

In [6]:
for token in doc_cas_1:
  print(f"Upper case of: {token.text} is {token.text.upper()} and its Lowercase is {token.text.lower()} ")

Upper case of: Hello is HELLO and its Lowercase is hello 
Upper case of: World is WORLD and its Lowercase is world 
Upper case of: ! is ! and its Lowercase is ! 
Upper case of: This is THIS and its Lowercase is this 
Upper case of: is is IS and its Lowercase is is 
Upper case of: tuanx18 is TUANX18 and its Lowercase is tuanx18 


### Example 2: Casing for the whole sentence

In [7]:
cas_2 = "Now, I and my family that consists of 6 family members, are going to play basketball at the nearby park until midnight."
doc_cas_2 = nlp(cas_2)

In [8]:
# Print the whole sentence in uppercase
upper_cas_2 = [token.text.upper() for token in doc_cas_2]
upper_text = " ".join(upper_cas_2)
print(upper_text)

NOW , I AND MY FAMILY THAT CONSISTS OF 6 FAMILY MEMBERS , ARE GOING TO PLAY BASKETBALL AT THE NEARBY PARK UNTIL MIDNIGHT .


In [9]:
# Take the uppercased sentence, make it lowercase
upper_doc = nlp(upper_text)
lower_cas_2 = [token.text.lower() for token in upper_doc]
lower_text = " ".join(lower_cas_2)
print(lower_text)

now , i and my family that consists of 6 family members , are going to play basketball at the nearby park until midnight .


## **Part-of-Speech Tagging (POS)**

### Example 1: Simple POS Tagging + Explanation

In [None]:
pos_1 = "The quick brown fox jumps over the 2 lazy dogs."
doc_pos_1 = nlp(pos_1)

In [None]:
for token in doc_pos_1:
  # Print id, text, POS and full POS
  print(f"{token.i} {token.text} {token.pos_}: {spacy.explain(token.pos_)}")

0 The DET: determiner
1 quick ADJ: adjective
2 brown ADJ: adjective
3 fox NOUN: noun
4 jumps VERB: verb
5 over ADP: adposition
6 the DET: determiner
7 2 NUM: numeral
8 lazy ADJ: adjective
9 dogs NOUN: noun
10 . PUNCT: punctuation


### Example 2: POS Tagging and Dependency Parsing

In [None]:
pos_2 = "The 666 cats sat on the 666 mats."
doc_pos_2 = nlp(pos_2)

In [None]:
for token in doc_pos_2:
  # Print the id, text and its dependency parsing
  print(f"{token.i} {token.text} - {token.dep_} - {token.head.text}")

0 The - det - cats
1 666 - nummod - cats
2 cats - nsubj - sat
3 sat - ROOT - sat
4 on - prep - sat
5 the - det - mats
6 666 - nummod - mats
7 mats - pobj - on
8 . - punct - sat


## **Stop Words Removal**

### Example 1: Define the Stop Words

In [None]:
sw_1 = "Wow, there are more than a billion people on the world, but none of them are perfect."
doc_sw_1 = nlp(sw_1)

In [None]:
# Check the type of words
for token in doc_sw_1:
  if token.is_stop == True:
    print(f"{token.text} is a stop word")
  elif token.is_digit == True:
    print(f"Oh look, a number: ")
  else:
    print(f"{token.text} is just a normal word, no need to care")

Wow is just a normal word, no need to care
, is just a normal word, no need to care
there is a stop word
are is a stop word
more is a stop word
than is a stop word
a is a stop word
billion is just a normal word, no need to care
people is just a normal word, no need to care
on is a stop word
the is a stop word
world is just a normal word, no need to care
, is just a normal word, no need to care
but is a stop word
none is a stop word
of is a stop word
them is a stop word
are is a stop word
perfect is just a normal word, no need to care
. is just a normal word, no need to care


##**Lemmatization**

### Example 1: Simple Word Lemmatization

In [None]:
lemma_1 = ["were", "happiness", "largest", "mice", "bought", "times"]
for x in lemma_1:
  doc_lemma = nlp(x)
  print(doc_lemma[0].lemma_)

be
happiness
large
mouse
buy
time


### Example 2: Create a function for Lemmatizing the whole text

In [None]:
# Define a function for lemmatization
def lemmatize_text(text):
  doc = nlp(text)
  lemmatized_tokens = [token.lemma_ for token in doc]
  return " ".join(lemmatized_tokens)

In [None]:
# Example text
lemma_2 = "The biggest cats are running around the football fields where we belonged to"
lemmatized_2 = lemmatize_text(lemma_2)
print(f"Lemmatized Text: {lemmatized_2}")

Lemmatized Text: the big cat be run around the football field where we belong to


### Example 3: Lemmatization with Sentence Structure

In [None]:
lemma_3 = "She has always wanted playing with one of my best friends, Jeremy - The blackest hunter. And she also wanted to pet all my dogs within the tree houses"
doc_lemma_3 = nlp(lemma_3)

In [None]:
# Loop through the sentences and then the tokens to print the text/lemmatized text
for sent in doc_lemma_3.sents:
  print(f"\nNumber of letter of this sentence is {sent.end_char - sent.start_char}")
  for token in sent:
    if token.text == token.lemma_:
      print(f"Index: {token.i + 1} // Unchanged Lemma")
    else:
      print(f"Index: {token.i + 1}, Text: {token.text}, Lemmatized: {token.lemma_}")


Number of letter of this sentence is 88
Index: 1, Text: She, Lemmatized: she
Index: 2, Text: has, Lemmatized: have
Index: 3 // Unchanged Lemma
Index: 4, Text: wanted, Lemmatized: want
Index: 5, Text: playing, Lemmatized: play
Index: 6 // Unchanged Lemma
Index: 7 // Unchanged Lemma
Index: 8 // Unchanged Lemma
Index: 9 // Unchanged Lemma
Index: 10, Text: best, Lemmatized: good
Index: 11, Text: friends, Lemmatized: friend
Index: 12 // Unchanged Lemma
Index: 13 // Unchanged Lemma
Index: 14 // Unchanged Lemma
Index: 15, Text: The, Lemmatized: the
Index: 16, Text: blackest, Lemmatized: black
Index: 17 // Unchanged Lemma
Index: 18 // Unchanged Lemma

Number of letter of this sentence is 61
Index: 19, Text: And, Lemmatized: and
Index: 20 // Unchanged Lemma
Index: 21 // Unchanged Lemma
Index: 22, Text: wanted, Lemmatized: want
Index: 23 // Unchanged Lemma
Index: 24 // Unchanged Lemma
Index: 25 // Unchanged Lemma
Index: 26 // Unchanged Lemma
Index: 27, Text: dogs, Lemmatized: dog
Index: 28 // U

## **Named Entity Recognition (NER)**

### Example 1: Simple Named Entity Recognition

In [None]:
ner_1 = "Apple is headquartered in 2555 Southern Street, Cupertino, California, USA."
doc_ner_1 = nlp(ner_1)

In [None]:
for ent in doc_ner_1.ents:
  # Print the entity text and label
  print(f"Text: {ent.text}, Named Entity Label: {ent.label_}")

Text: Apple, Named Entity Label: ORG
Text: 2555, Named Entity Label: DATE
Text: Southern Street, Named Entity Label: LOC
Text: Cupertino, Named Entity Label: GPE
Text: California, Named Entity Label: GPE
Text: USA, Named Entity Label: GPE


### Example 2: Named Entity Recognition with Detailed Labels

In [None]:
ner_2 = "Bill Gates founded Microsoft in 1975 and Elon Musk is the CEO of SpaceX and Tesla."
doc_ner_2 = nlp(ner_2)

In [None]:
for ent in doc_ner_2.ents:
  # Print entity text, label and detailed label
  print(f"Text: {ent.text} - Label: {ent.label_}  \n\tDetailed Label: {spacy.explain(ent.label_)}\n")

Text: Bill Gates - Label: PERSON  
	Detailed Label: People, including fictional

Text: Microsoft - Label: ORG  
	Detailed Label: Companies, agencies, institutions, etc.

Text: 1975 - Label: DATE  
	Detailed Label: Absolute or relative dates or periods

Text: Elon Musk - Label: PERSON  
	Detailed Label: People, including fictional

Text: SpaceX - Label: NORP  
	Detailed Label: Nationalities or religious or political groups

Text: Tesla - Label: ORG  
	Detailed Label: Companies, agencies, institutions, etc.



### Example 3: Named Entity Recognition and Sentence Structure:

In [None]:
ner_3 = "Barack Obama was born in Hawaii. He served as the 44th President of the U.S.A. He is such a great African-American. His website is obama.com"
doc_ner_3 = nlp(ner_3)

In [None]:
# Print named entities and their labels, along with sentence structure
s_num = 0
for sent in doc_ner_3.sents:
  s_num += 1
  print(f"Sentence {s_num}: {sent}")

Sentence 1: Barack Obama was born in Hawaii.
Sentence 2: He served as the 44th President of the U.S.A.
Sentence 3: He is such a great African-American.
Sentence 4: His website is obama.com


In [None]:
for ent in doc_ner_3.ents:
  print(ent.text, ent.label_)

Barack Obama PERSON
Hawaii GPE
44th ORDINAL
the U.S.A. ORG
African-American NORP


## **Chunking**

### Example 1: Chunks of Nouns

In [10]:
chk_1 = "I have 3 black balls, 2 yellow cats, a playful pomeranian and the 5 great family members"
doc_chk_1 = nlp(chk_1)

In [12]:
noun_chk_1 = list(doc_chk_1.noun_chunks)
for chunk in noun_chk_1:
  print(chunk.text)

I
3 black balls
2 yellow cats
a playful pomeranian
the 5 great family members


### Example 2: Chunks of Nouns with Sentence Structures

In [13]:
chk_2 = "Elon Musk is a great entrepreneur who was born in 1970. He is an White African-American and his origin is South Africa. Currently, he is the wealthiest, richest man on the Planet Earth"
doc_chk_2 = nlp(chk_2)

In [19]:
for sent in doc_chk_2.sents:
  noun_chk_2 = list(doc_chk_2.noun_chunks)
  for chunk in noun_chk_2:
    print(chunk.text)
  print()

Elon Musk
a great entrepreneur
who
He
his origin
South Africa
he
the wealthiest, richest man
the Planet Earth

Elon Musk
a great entrepreneur
who
He
his origin
South Africa
he
the wealthiest, richest man
the Planet Earth

Elon Musk
a great entrepreneur
who
He
his origin
South Africa
he
the wealthiest, richest man
the Planet Earth



## **Dependency Parsing**

### Example 1: Simple Dependency Parsing

In [None]:
dep_1 = "The quick brown fox jumps over the lazy dog."
doc_dep_1 = nlp(dep_1)

In [None]:
for token in doc_dep_1:
  # Print the ID, dependency and text
  print(f"ID: {token.i + 1}, Dependency: {token.dep_} / {spacy.explain(token.dep_)}, Text: {token.text}\n")

ID: 1, Dependency: det / determiner, Text: The

ID: 2, Dependency: amod / adjectival modifier, Text: quick

ID: 3, Dependency: amod / adjectival modifier, Text: brown

ID: 4, Dependency: nsubj / nominal subject, Text: fox

ID: 5, Dependency: ROOT / root, Text: jumps

ID: 6, Dependency: prep / prepositional modifier, Text: over

ID: 7, Dependency: det / determiner, Text: the

ID: 8, Dependency: amod / adjectival modifier, Text: lazy

ID: 9, Dependency: pobj / object of preposition, Text: dog

ID: 10, Dependency: punct / punctuation, Text: .



### Example 2: Dependency Parsing and POS tagging of sentences and their tokens

In [None]:
dep_2 = "Shrek premiered at the Mann Village Theatre In Westwood. It was shown at the 2001 Cannes Film Festival, where it competed for the P'Or. Making it the first animated film."
doc_dep_2 = nlp(dep_2)

In [None]:
# Print the word/sentence index and their text + dependency parsing
a = 0
for sent in doc_dep_2.sents:
  print(f"\nThe sentence has total [ {sent.end_char - sent.start_char} ] letters")
  a += 1
  b = 0
  for token in sent:
    b += 1
    print(f"Word {b} in sentence {a} is: {token.text}, its dependency parsing is [{token.dep_} / {spacy.explain(token.dep_)}], its POS Tagging is {spacy.explain(token.pos_)}")


The sentence has total [ 56 ] letters
Word 1 in sentence 1 is: Shrek, its dependency parsing is [nsubj / nominal subject], its POS Tagging is noun
Word 2 in sentence 1 is: premiered, its dependency parsing is [ROOT / root], its POS Tagging is verb
Word 3 in sentence 1 is: at, its dependency parsing is [prep / prepositional modifier], its POS Tagging is adposition
Word 4 in sentence 1 is: the, its dependency parsing is [det / determiner], its POS Tagging is determiner
Word 5 in sentence 1 is: Mann, its dependency parsing is [compound / compound], its POS Tagging is proper noun
Word 6 in sentence 1 is: Village, its dependency parsing is [compound / compound], its POS Tagging is proper noun
Word 7 in sentence 1 is: Theatre, its dependency parsing is [pobj / object of preposition], its POS Tagging is proper noun
Word 8 in sentence 1 is: In, its dependency parsing is [prep / prepositional modifier], its POS Tagging is adposition
Word 9 in sentence 1 is: Westwood, its dependency parsing is 

### Example 3: Dependency Parsing Visualization

In [None]:
from spacy import displacy

In [None]:
dep_3 = "The quick brown fox jumps over the lazy dog."
doc_dep_3 = nlp(dep_3)

In [None]:
# Visualize the dependency tree
displacy.render(doc_dep_3, style="dep", jupyter=True)

## **Text Similarity**

### Example 1: Simple Similarity

In [None]:
# Compute Similarity between 2 sentences
sim_1 = nlp("The cat sat on the mat.")
sim_2 = nlp("The dog lay on the rug.")
similarity_12 = sim_1.similarity(sim_2)
print(f"Similarity: {round(similarity_12 * 100, 2)} % ")

Similarity: 88.09 % 


  similarity_12 = sim_1.similarity(sim_2)


### Example 2: Document Similarity

In [None]:
# Compute Similarity between 2 documents
sim_3 = nlp("Shrek is a 2001 American animated fantasy comedy film loosely based on the 1990 children's picture book of the same name by William Steig.")
sim_4 = nlp("Shrek premiered at the Mann Village Theatre In Westwood, and at the 2001 Cannes Film Festival, making it the first animated film since Disney's Peter Pan to be chosen to do so.")
similarity_34 = sim_3.similarity(sim_4)
print(f"Similarity: {round(similarity_34 * 100, 2)} % ")

Similarity: 73.29 % 


  similarity_34 = sim_3.similarity(sim_4)


### Example 3: Word Similarity

In [None]:
# Compute Similarity between 2 words
sim_5 = nlp("elephant")
sim_6 = nlp("giraffe")
similarity_56 = sim_5.similarity(sim_6)
print(f"Similarity: {round(similarity_56 * 100, 2)} % ")

Similarity: 57.39 % 


  similarity_56 = sim_5.similarity(sim_6)


## **Text Classification**

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Define a simple text classification pipeline
text_classification_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", SVC(kernel="linear"))
])

### Example 1: Simple Binary Classification

In [None]:
# Sample Training data
X_train = ["This is a positive review.", "This is a negative review."]
y_train = ["positive", "negative"]

# Fit the text classification model
text_classification_pipeline.fit(X_train, y_train)

# Sample test data
X_test = ["This movie is positive!", "This movie is negative!", "This movie must be deleted at all costs because it has so many negative review"]

# Predict categories for test data
predictions = text_classification_pipeline.predict(X_test)
print("Predictions:", predictions)

Predictions: ['positive' 'negative' 'negative']


### Example 2: Custom Text Classification

In [None]:
# Define a custom text classification model using spaCy features
def spacy_text_classifier(text):
  doc = nlp(text)
  # Example: classifying based on presence of certain entities
  if any(ent.label_ == "PERSON" for ent in doc.ents):
    return "PERSON"
  elif any(ent.label_ == "ORGANIZATION" for ent in doc.ents):
    return "ORGANIZATION"
  else:
    return "OTHER"

# Test the custom text classifier
text_1 = "Barack Obama was the President of the United States"
text_2 = "Amazon's Revenue this year is 200 billion USD"
text_3 = "My dad was my hero"

print("Text1 Classification:", spacy_text_classifier(text_1))
print("Text2 Classification:", spacy_text_classifier(text_2))
print("Text3 Classification:", spacy_text_classifier(text_3))

Text1 Classification: PERSON
Text2 Classification: OTHER
Text3 Classification: OTHER


##**Word Vectorization**

### Example 1: Single-word Vectorization

In [20]:
word_list = ["cat", "dog", "apple", "lemonade", "toy", "dinosaurs", "animal", "Martin", "Chinese", "Vietnamese", "black", "autumn"]

In [22]:
for word in word_list:
  doc_word = nlp(word)
  vector = doc_word.vector[:3]
  print(f"Vector of {word} is {vector}")

Vector of cat is [-1.3749216  -0.9209707  -0.48585108]
Vector of dog is [-1.6806675  -1.2663747  -0.71255565]
Vector of apple is [-1.2599487  -0.87038326 -1.0834986 ]
Vector of lemonade is [-1.6263449  -0.2931102  -0.51256025]
Vector of toy is [-1.4421012 -0.928125  -1.1336758]
Vector of dinosaurs is [-0.52412915  0.67276394  0.14787653]
Vector of animal is [-1.848459  -1.1953701 -0.4190052]
Vector of Martin is [-1.907685   -0.09650317 -0.3161859 ]
Vector of Chinese is [-1.1079756  -0.37069964 -0.9406413 ]
Vector of Vietnamese is [-1.2938046  -0.18000111 -1.0216382 ]
Vector of black is [-1.8262893  -1.787104   -0.47396478]
Vector of autumn is [-1.590935  -1.3176191 -1.4198819]


### Example 2: Word Vectorization in Sentence Structure

In [25]:
sent_vec = "I want to be the best students on the planet. With this title, I can be so famous and earn a lot of money"
doc_sent_vec = nlp(sent_vec)

In [26]:
for sent in doc_sent_vec.sents:
  for token in sent:
    print(f"Token ID: {token.i}, token text: {token.text}, first 2 vectors: {token.vector[:2]}")

Token ID: 0, token text: I, first 2 vectors: [-1.2726469  -0.55084884]
Token ID: 1, token text: want, first 2 vectors: [ 0.24857768 -0.3973046 ]
Token ID: 2, token text: to, first 2 vectors: [-1.0477109 -0.9864377]
Token ID: 3, token text: be, first 2 vectors: [-1.5539343  -0.02412109]
Token ID: 4, token text: the, first 2 vectors: [0.77335215 0.83330375]
Token ID: 5, token text: best, first 2 vectors: [-0.97872555 -1.7390311 ]
Token ID: 6, token text: students, first 2 vectors: [-0.7634615  1.4236249]
Token ID: 7, token text: on, first 2 vectors: [ 0.7723517  -0.04488764]
Token ID: 8, token text: the, first 2 vectors: [1.9377446  0.48467484]
Token ID: 9, token text: planet, first 2 vectors: [-0.913363   -0.47031605]
Token ID: 10, token text: ., first 2 vectors: [-0.22269934 -0.883154  ]
Token ID: 11, token text: With, first 2 vectors: [-1.2187836 -0.9501754]
Token ID: 12, token text: this, first 2 vectors: [ 1.5580131  -0.29074976]
Token ID: 13, token text: title, first 2 vectors: [-0

##**Topic Modelling**

### Example: Simple Topic Modelling

In [30]:
import gensim
from gensim import corpora

In [2]:
# Sample Documents
documents = [
    "Machine Learning is such an exciting field of technology",
    "Artificial Intelligence is the innovation of the century",
    "Deep Learning Models have achived state-of-the-arts results",
    "The greatest movie of all time is Breaking Bad. It is available in 150 regions"
]

In [3]:
# Tokenize documents
tokenized_doc = [doc.lower().split() for doc in documents]

In [4]:
# Create Dictionaries
dictionaries_t = corpora.Dictionary(tokenized_doc)
corpus = [dictionaries_t.doc2bow(doc) for doc in tokenized_doc]

In [7]:
# Apply LDA (Latent Dirichlet Allocation) for topic modelling
lda_model = gensim.models.LdaModel(corpus, num_topics=4, id2word = dictionaries_t)



In [8]:
# Print topics
for topic in lda_model.print_topics():
    print(topic)

(0, '0.033*"of" + 0.033*"learning" + 0.033*"is" + 0.032*"the" + 0.032*"such" + 0.032*"achived" + 0.032*"models" + 0.032*"machine" + 0.032*"have" + 0.032*"state-of-the-arts"')
(1, '0.084*"learning" + 0.084*"state-of-the-arts" + 0.084*"deep" + 0.084*"results" + 0.084*"have" + 0.084*"models" + 0.084*"achived" + 0.017*"the" + 0.017*"of" + 0.017*"is"')
(2, '0.103*"is" + 0.084*"of" + 0.081*"the" + 0.036*"learning" + 0.036*"exciting" + 0.036*"intelligence" + 0.036*"technology" + 0.036*"an" + 0.036*"century" + 0.036*"field"')
(3, '0.073*"is" + 0.054*"bad." + 0.054*"the" + 0.049*"greatest" + 0.047*"movie" + 0.046*"breaking" + 0.046*"of" + 0.045*"it" + 0.045*"regions" + 0.044*"all"')


### **Text Summarization**

In [9]:
from transformers import pipeline

# Example text
text = "Text summarization is the process of automatically generating a concise and coherent summary of a longer document while preserving its key information. There are two main approaches to text summarization: extraction-based and abstraction-based summarization."

# Use the transformers library for summarization
summarizer = pipeline("summarization")
summary = summarizer(text, max_length=50, min_length=10, do_sample=False)

# Print summary
print(summary[0]['summary_text'])

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Your max_length is set to 50, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


 Text summarization is the process of automatically generating a concise and coherent summary of a longer document . There are two main approaches to text summarization: extraction-based and abstraction-based summarization .
