# Text Analysis

In this module, we will use the Natural Language Toolkit Library (NLTK) to look at individual words and sentences in a text and clean unneccessary features from the text data to prepare for sentiment analysis. Then using the textblob library, we will analyze the sentiment of opinioned data to give a numerical value for use in a predictive model.

#### Tokenizing Words and Sentences

Recall in the "Python Dictionaries and String Manipulation" notebook, we used the .split() function to break a sentence apart.

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#this is sample data
from nltk.corpus import names  

from string import punctuation

#if the next cell does not work
#remove number symbol on following lines and re-run this cell
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('names')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\GBTC408006ur\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
eng_stopwords = stopwords.words('english')
#eng_stopwords

In [3]:
#[create an empty list here to hold the tokens at the end]
new_words = []  #list to hold new words

with open("datasets/12dancingprincesses.txt", 'r') as f:
    for line in f:
        cline = line.strip() #get rid of newline character
        #print(cline)

        if cline == '': pass  #this will skip over lines that only had a newline and are now blank
        else:
            tknls = word_tokenize(cline)   # [write the function to tokenize the words]
            #print(tknls)
        for token in tknls:
            new_words.append(token) #[write the function to append each token to the empty list you created at the start of this code]

    #for word in tknls:
    #    if word not in eng_stopwords:
    #        new_words.append(word)
    #else: rm_count += 1


The NLTK library was built to separate punctuation from words when tokenizing (splitting into parts).

In [4]:
new_words

['THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'There',
 'was',
 'a',
 'king',
 'who',
 'had',
 'twelve',
 'beautiful',
 'daughters',
 '.',
 'They',
 'slept',
 'in',
 'twelve',
 'beds',
 'all',
 'in',
 'one',
 'room',
 ';',
 'and',
 'when',
 'they',
 'went',
 'to',
 'bed',
 ',',
 'the',
 'doors',
 'were',
 'shut',
 'and',
 'locked',
 'up',
 ';',
 'but',
 'every',
 'morning',
 'their',
 'shoes',
 'were',
 'found',
 'to',
 'be',
 'quite',
 'worn',
 'through',
 'as',
 'if',
 'they',
 'had',
 'been',
 'danced',
 'in',
 'all',
 'night',
 ';',
 'and',
 'yet',
 'nobody',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 ',',
 'or',
 'where',
 'they',
 'had',
 'been',
 '.',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 ',',
 'or',
 'where',
 'they',
 'had',
 'been',
 '.',
 'Then',
 'the',
 'king',
 'made',
 'it',
 'known',
 'to',
 'all',
 'the',
 'land',
 ',',
 'that',
 'if',
 'any',
 'person',
 'could',
 'discover',
 'the',
 'secre

In [5]:
len(new_words)

1970

In [6]:
#remove the puntuation tokens from the list
for word in new_words:
    if word in punctuation:
        new_words.remove(word)

In [7]:
new_words

['THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'There',
 'was',
 'a',
 'king',
 'who',
 'had',
 'twelve',
 'beautiful',
 'daughters',
 'They',
 'slept',
 'in',
 'twelve',
 'beds',
 'all',
 'in',
 'one',
 'room',
 'and',
 'when',
 'they',
 'went',
 'to',
 'bed',
 'the',
 'doors',
 'were',
 'shut',
 'and',
 'locked',
 'up',
 'but',
 'every',
 'morning',
 'their',
 'shoes',
 'were',
 'found',
 'to',
 'be',
 'quite',
 'worn',
 'through',
 'as',
 'if',
 'they',
 'had',
 'been',
 'danced',
 'in',
 'all',
 'night',
 'and',
 'yet',
 'nobody',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 'or',
 'where',
 'they',
 'had',
 'been',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 'or',
 'where',
 'they',
 'had',
 'been',
 'Then',
 'the',
 'king',
 'made',
 'it',
 'known',
 'to',
 'all',
 'the',
 'land',
 'that',
 'if',
 'any',
 'person',
 'could',
 'discover',
 'the',
 'secret',
 'and',
 'find',
 'out',
 'where',
 'it',
 'was',
 'that

In [8]:
len(new_words)

1763

In [9]:
#put the word lemmatization function into a variable
wnl = WordNetLemmatizer()

In [10]:
sentence= ' '.join(new_words)

In [11]:
#tokenize the sentence into a list
#this is before we lemmatize it
lems = word_tokenize(sentence)
lems

['THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'There',
 'was',
 'a',
 'king',
 'who',
 'had',
 'twelve',
 'beautiful',
 'daughters',
 'They',
 'slept',
 'in',
 'twelve',
 'beds',
 'all',
 'in',
 'one',
 'room',
 'and',
 'when',
 'they',
 'went',
 'to',
 'bed',
 'the',
 'doors',
 'were',
 'shut',
 'and',
 'locked',
 'up',
 'but',
 'every',
 'morning',
 'their',
 'shoes',
 'were',
 'found',
 'to',
 'be',
 'quite',
 'worn',
 'through',
 'as',
 'if',
 'they',
 'had',
 'been',
 'danced',
 'in',
 'all',
 'night',
 'and',
 'yet',
 'nobody',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 'or',
 'where',
 'they',
 'had',
 'been',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 'or',
 'where',
 'they',
 'had',
 'been',
 'Then',
 'the',
 'king',
 'made',
 'it',
 'known',
 'to',
 'all',
 'the',
 'land',
 'that',
 'if',
 'any',
 'person',
 'could',
 'discover',
 'the',
 'secret',
 'and',
 'find',
 'out',
 'where',
 'it',
 'was',
 'that

In [12]:
fd_nw = FreqDist(lems)
fd_nw.most_common(10)

[('the', 138),
 ('and', 74),
 ('to', 47),
 ('he', 34),
 ('they', 31),
 ('’', 31),
 ('of', 29),
 ('was', 25),
 ('in', 25),
 ('all', 24)]

In [13]:
rm_count = 0
new_no_stopwords = []  #list to hold new words

for word in lems:
    if word not in eng_stopwords:
        new_no_stopwords.append(word)
    else: rm_count += 1

In [14]:
new_no_stopwords

['THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'There',
 'king',
 'twelve',
 'beautiful',
 'daughters',
 'They',
 'slept',
 'twelve',
 'beds',
 'one',
 'room',
 'went',
 'bed',
 'doors',
 'shut',
 'locked',
 'every',
 'morning',
 'shoes',
 'found',
 'quite',
 'worn',
 'danced',
 'night',
 'yet',
 'nobody',
 'could',
 'find',
 'happened',
 'could',
 'find',
 'happened',
 'Then',
 'king',
 'made',
 'known',
 'land',
 'person',
 'could',
 'discover',
 'secret',
 'find',
 'princesses',
 'danced',
 'night',
 'one',
 'liked',
 'best',
 'wife',
 'king',
 'death',
 'whoever',
 'tried',
 'succeed',
 'three',
 'days',
 'nights',
 'put',
 'death',
 'succeed',
 'three',
 'days',
 'nights',
 'put',
 'death',
 'A',
 'king',
 '’',
 'son',
 'soon',
 'came',
 'He',
 'well',
 'entertained',
 'evening',
 'taken',
 'chamber',
 'next',
 'one',
 'princesses',
 'lay',
 'twelve',
 'beds',
 'There',
 'sit',
 'watch',
 'went',
 'dance',
 'order',
 'nothing',
 'might

In [15]:
rm_count

928

In [16]:
len(new_no_stopwords)

844

In [17]:
last = []
for word in lems:
    if word != '‘' and word != '.' and word != '’':
        last.append(word)

In [18]:
last

['THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'THE',
 'TWELVE',
 'DANCING',
 'PRINCESSES',
 'There',
 'was',
 'a',
 'king',
 'who',
 'had',
 'twelve',
 'beautiful',
 'daughters',
 'They',
 'slept',
 'in',
 'twelve',
 'beds',
 'all',
 'in',
 'one',
 'room',
 'and',
 'when',
 'they',
 'went',
 'to',
 'bed',
 'the',
 'doors',
 'were',
 'shut',
 'and',
 'locked',
 'up',
 'but',
 'every',
 'morning',
 'their',
 'shoes',
 'were',
 'found',
 'to',
 'be',
 'quite',
 'worn',
 'through',
 'as',
 'if',
 'they',
 'had',
 'been',
 'danced',
 'in',
 'all',
 'night',
 'and',
 'yet',
 'nobody',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 'or',
 'where',
 'they',
 'had',
 'been',
 'could',
 'find',
 'out',
 'how',
 'it',
 'happened',
 'or',
 'where',
 'they',
 'had',
 'been',
 'Then',
 'the',
 'king',
 'made',
 'it',
 'known',
 'to',
 'all',
 'the',
 'land',
 'that',
 'if',
 'any',
 'person',
 'could',
 'discover',
 'the',
 'secret',
 'and',
 'find',
 'out',
 'where',
 'it',
 'was',
 'that

In [19]:
fd_nw = FreqDist(lems)
fd_nw.most_common(10)

[('the', 138),
 ('and', 74),
 ('to', 47),
 ('he', 34),
 ('they', 31),
 ('’', 31),
 ('of', 29),
 ('was', 25),
 ('in', 25),
 ('all', 24)]