# Using Textblob for text processing

In [1]:
# pip install textblob
from textblob import TextBlob

string= 'Thank you. God bles you. And God bless the United States of America.'
blob = TextBlob(string)                   # Note the typo

In [2]:
blob.sentences

[Sentence("Thank you."),
 Sentence("God bles you."),
 Sentence("And God bless the United States of America.")]

In [3]:
blob.words

WordList(['Thank', 'you', 'God', 'bles', 'you', 'And', 'God', 'bless', 'the', 'United', 'States', 'of', 'America'])

In [4]:
blob.tags

[('Thank', 'NNP'),
 ('you', 'PRP'),
 ('God', 'NNP'),
 ('bles', 'VBZ'),
 ('you', 'PRP'),
 ('And', 'CC'),
 ('God', 'NNP'),
 ('bless', 'VBD'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('of', 'IN'),
 ('America', 'NNP')]

In [5]:
blob.words[0].spellcheck()  

[('Thank', 1.0)]

In [6]:
# spellcheck() returns a list of suggested corrections and corresponding confidence level

blob.words[3].spellcheck() 

[('bees', 0.32),
 ('les', 0.24),
 ('blew', 0.14666666666666667),
 ('bless', 0.13333333333333333),
 ('bled', 0.06666666666666667),
 ('blebs', 0.05333333333333334),
 ('bales', 0.02666666666666667),
 ('blest', 0.013333333333333334)]

In [7]:
blob.correct()                            # Corrects typographical errors

TextBlob("Thank you. God bees you. And God bless the United States of America.")

In [8]:
for word, pos in blob.tags:
    if pos == 'NNP':
        print (word.pluralize())

Thanks
Gods
Gods
Uniteds
Americas


In [10]:
from textblob import Word      
w = Word('running')                       # Converts strings into blob words
w.lemmatize('v')                          # Lemmatize as verb

'run'

In [11]:
for ngram in blob.ngrams(3):
    print(ngram)

['Thank', 'you', 'God']
['you', 'God', 'bles']
['God', 'bles', 'you']
['bles', 'you', 'And']
['you', 'And', 'God']
['And', 'God', 'bless']
['God', 'bless', 'the']
['bless', 'the', 'United']
['the', 'United', 'States']
['United', 'States', 'of']
['States', 'of', 'America']


In [13]:
import random
blob = TextBlob('The cat is in the box. The cat likes the box. The box is over the cat.')
nouns = list()
for word, tag in blob.tags:
    if tag == 'NN':
        nouns.append(word.lemmatize())

print("This text is about...")
for item in random.sample(nouns, 3):
    word = Word(item)
    print (word.pluralize())


This text is about...
boxes
cats
boxes


In [14]:
# Textblob has a translation feature that uses ISO 639-1 language codes: 
# https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes

blob.detect_language()

'en'

In [15]:
blob.translate(from_lang='en', to ='es')

TextBlob("El gato está en la caja. Al gato le gusta la caja. La caja está sobre el gato.")

In [16]:
blob.translate(from_lang='en', to ='zh-CN')

TextBlob("猫在盒子里。猫喜欢盒子。盒子在猫身上。")

In [17]:
blob.translate(from_lang='en', to ='ar')

TextBlob("القطة في الصندوق. القط يحب الصندوق. الصندوق فوق القطة.")

In [18]:
blob.translate(from_lang='en', to ='hi')

TextBlob("बिल्ली बक्से में है। बिल्ली को बॉक्स पसंद है। बॉक्स बिल्ली के ऊपर है।")

In [19]:
blob.translate(from_lang='en', to ='te')

TextBlob("పిల్లి పెట్టెలో ఉంది. పిల్లికి పెట్టె ఇష్టం. పెట్టె పిల్లి మీద ఉంది.")

In [20]:
blob.translate(from_lang='en', to ='ta')

TextBlob("பூனை பெட்டியில் உள்ளது. பூனைக்கு பெட்டி பிடிக்கும். பெட்டி பூனைக்கு மேல் உள்ளது.")

In [21]:
# Sentiment analysis using Textblob
# Polarity: [-1,1] negative to positive 
# Subjectivity: [0,1] Factual information (objectivty) or personal opinion, emotion or judgment

blob.sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [22]:
blob.sentiment.polarity

0.0

In [23]:
blob.subjectivity

0.0

In [24]:
# Building your own sentiment analysis tool using Textblob and Machine Learning
# Note: Textblob's classifiers require input formatted as a list of tuples (X, Y)

from textblob.classifiers import NaiveBayesClassifier, DecisionTreeClassifier

train = [
('Tom Holland is a terrible spiderman.','neg'),
('a terrible Javert (Russell Crowe) ruined Les Miserables for me...','neg'),
('The Dark Knight Rises is the greatest superhero movie ever!','pos'),
('Fantastic Four should have never been made.','neg'),
('Wes Anderson is my favorite director!','pos'),
('Captain America 2 is pretty awesome.','pos'),
('Let\'s pretend that "Batman and Robin" never happened..','neg'),
]

test = [
('Superman was never an interesting character.','neg'),
('Fantastic Mr Fox is an awesome film!','pos'),
('Dragonball Evolution is simply terrible!!','neg')
]

nb_classifier = NaiveBayesClassifier(train)
print(nb_classifier.accuracy(test))

1.0


In [25]:
nb_classifier.show_informative_features(3)

Most Informative Features
            contains(is) = True              pos : neg    =      2.9 : 1.0
         contains(never) = False             pos : neg    =      1.8 : 1.0
             contains(a) = False             pos : neg    =      1.8 : 1.0


In [26]:
blob = TextBlob("The weather is terrible!", classifier=nb_classifier)
print(blob.classify())

pos


In [27]:
blob = TextBlob("Today's weather is terrible!", classifier=nb_classifier)
print(blob.classify())

neg


In [28]:
dt_classifier = DecisionTreeClassifier(train)
print(nb_classifier.accuracy(test))

1.0


In [29]:
nb_classifier.show_informative_features(3)

Most Informative Features
            contains(is) = True              pos : neg    =      2.9 : 1.0
         contains(never) = False             pos : neg    =      1.8 : 1.0
             contains(a) = False             pos : neg    =      1.8 : 1.0


In [30]:
blob = TextBlob("The weather is terrible!", classifier=dt_classifier)
print(blob.classify())

pos


In [31]:
blob = TextBlob("Today's weather is terrible!", classifier=dt_classifier)
print(blob.classify())

pos


In [32]:
# So you can see how dangerous it is to build a classifier with insufficient data input