In [1]:
# adopted from https://textblob.readthedocs.io/en/dev/quickstart.html#quickstart
# create a TextBlob

%pip install textblob
from textblob import TextBlob

wiki = TextBlob("Python is a high-level, general-purpose programming language.")
wiki2 = TextBlob("Python은 고급 범용 프로그래밍 언어입니다.")


Note: you may need to restart the kernel to use updated packages.


In [2]:
# console에서 python -m textblob.download_corpora 먼저 실행
# Part-of-speech Tagging¶

wiki.tags

[('Python', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('high-level', 'JJ'),
 ('general-purpose', 'JJ'),
 ('programming', 'NN'),
 ('language', 'NN')]

In [3]:
wiki2.tags

[('Python은', 'NNP'),
 ('고급', 'NNP'),
 ('범용', 'NNP'),
 ('프로그래밍', 'NNP'),
 ('언어입니다', 'NNP')]

In [4]:
# KoNLPy는 한국어 자연어 처리를 위한 Python 라이브러리
# POS 태깅을 포함해 한국어 텍스트 분석에 적합한 여러 도구에 대한 액세스를 제공
# KoNLPy는 Mecab, Komoran, Hannanum, Kkma, and Twitter (Okt)등 다양한 한국어 NLP 도구를 지원

%pip install konlpy

from konlpy.tag import Okt
okt = Okt()
wiki2 = "Python은 고급 범용 프로그래밍 언어입니다."
pos_tags = okt.pos(wiki2)
print(pos_tags)

Note: you may need to restart the kernel to use updated packages.
[('Python', 'Alpha'), ('은', 'Noun'), ('고급', 'Noun'), ('범용', 'Noun'), ('프로그래밍', 'Noun'), ('언어', 'Noun'), ('입니다', 'Adjective'), ('.', 'Punctuation')]


In [5]:
# Noun Phrase Extraction¶

wiki.noun_phrases

WordList(['python'])

In [6]:
# Sentiment Analysis¶
# Sentiment(polarity, subjectivity)
# The polarity score is a float within the range [-1.0, 1.0].
# The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective

testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
testimonial.sentiment

Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)

In [7]:
# Tokenization

zen = TextBlob(
    "Beautiful is better than ugly. "
    "Explicit is better than implicit. "
    "Simple is better than complex."
)
zen.words

WordList(['Beautiful', 'is', 'better', 'than', 'ugly', 'Explicit', 'is', 'better', 'than', 'implicit', 'Simple', 'is', 'better', 'than', 'complex'])

In [8]:
zen.sentences

[Sentence("Beautiful is better than ugly."),
 Sentence("Explicit is better than implicit."),
 Sentence("Simple is better than complex.")]

In [9]:
# Words Inflection and Lemmatization¶

sentence = TextBlob("Use 4 spaces per indentation level.")
sentence.words

WordList(['Use', '4', 'spaces', 'per', 'indentation', 'level'])

In [10]:
sentence.words[2].singularize()

'space'

In [11]:
sentence.words[-1].pluralize()

'levels'

In [12]:
# Words can be lemmatized by calling the lemmatize method.

from textblob import Word
w = Word("octopi")
w.lemmatize()

'octopus'

In [13]:
w = Word("went")
w.lemmatize("v")  # Pass in WordNet part of speech (verb)

'go'

In [14]:
# WordNet Integration¶
# You can access the synsets for a Word via the synsets property or the get_synsets method, optionally passing in a part of speech.

from textblob import Word
from textblob.wordnet import VERB
word = Word("octopus")
word.synsets

[Synset('octopus.n.01'), Synset('octopus.n.02')]

In [15]:
Word("hack").get_synsets(pos=VERB)

[Synset('chop.v.05'),
 Synset('hack.v.02'),
 Synset('hack.v.03'),
 Synset('hack.v.04'),
 Synset('hack.v.05'),
 Synset('hack.v.06'),
 Synset('hack.v.07'),
 Synset('hack.v.08')]

In [16]:
# You can access the definitions for each synset via the definitions property or the define() method, which can also take an optional part-of-speech argument.

Word("octopus").definitions


['tentacles of octopus prepared as food',
 'bottom-living cephalopod having a soft oval body with eight long tentacles']

In [17]:
Word("hack").definitions


['one who works hard at boring tasks',
 'a politician who belongs to a small clique that controls a political party for private rather than public ends',
 'a mediocre and disdained writer',
 'a tool (as a hoe or pick or mattock) used for breaking up the surface of the soil',
 'a car driven by a person whose job is to take passengers where they want to go in exchange for money',
 'an old or over-worked horse',
 'a horse kept for hire',
 'a saddle horse used for transportation rather than sport etc.',
 'cut with a hacking tool',
 'be able to manage or manage successfully',
 'cut away',
 'kick on the arms',
 'kick on the shins',
 'fix a computer program piecemeal until it works',
 'significantly cut up a manuscript',
 'cough spasmodically']

In [18]:
from textblob.wordnet import Synset
octopus = Synset("octopus.n.02")
shrimp = Synset("shrimp.n.03")
octopus.path_similarity(shrimp)

0.1111111111111111

In [19]:
Word("dog").get_synsets()

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [20]:
dog = Synset("doc.n.01")
octopus.path_similarity(dog)

0.07692307692307693

In [21]:
# A WordList is a Python list with additional methods.

animals = TextBlob("cat dog octopus")
animals.words

WordList(['cat', 'dog', 'octopus'])

In [22]:
animals.words.pluralize()

WordList(['cats', 'dogs', 'octopodes'])

In [23]:
# Spelling Correction
# Use the correct() method to attempt spelling correction.

b = TextBlob("I havv goood speling!")
print(b.correct())


I have good spelling!


In [24]:
# Word objects have a spellcheck() Word.spellcheck() method that returns a list of (word, confidence) tuples with spelling suggestions.

from textblob import Word
w = Word("falibility")
w.spellcheck()

[('fallibility', 1.0)]

In [25]:
# Get Word and Noun Phrase Frequencies¶
# The first is through the word_counts dictionary.


monty = TextBlob("We are no longer the Knights who say Ni. "
                    "We are now the Knights who say Ekki ekki ekki PTANG.")
monty.word_counts['ekki']

3

In [26]:
# The second way is to use the count() method.

monty.words.count('ekki')

3

In [27]:
# You can specify whether or not the search should be case-sensitive (default is False).

monty.words.count('ekki', case_sensitive=True)

2

In [28]:
# Each of these methods can also be used with noun phrases.

wiki.noun_phrases.count('python')

1

In [29]:
# Parsing
# By default, TextBlob uses pattern’s parser

b = TextBlob("And now for something completely different.")
print(b.parse())

And/CC/O/O now/RB/B-ADVP/O for/IN/B-PP/B-PNP something/NN/B-NP/I-PNP completely/RB/B-ADJP/O different/JJ/I-ADJP/O ././O/O


In [30]:
# n-grams
# The TextBlob.ngrams() method returns a list of tuples of n successive words.

blob = TextBlob("Now is better than never.")
blob.ngrams(n=3)

[WordList(['Now', 'is', 'better']),
 WordList(['is', 'better', 'than']),
 WordList(['better', 'than', 'never'])]