# Text Data Cleaning and Preprocessing Assignment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

### Read the O'Reilly RSS plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [4]:
PATH = '/content/drive/MyDrive/Thinkful/NLP/oreily_rss/'

DOC_PATTERN = r'.*\.txt'

corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [5]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

### Sentence tokenize each document in the list of documents.

In [6]:
doc_sents = [sent_tokenize(doc) for doc in docs]

In [7]:
doc_sents[0]

['2020 has been a year of great challenges for so many, but it’s not all negative.',
 'Around the world, organizations and their workforces have risen to the occasion, recognizing the importance of expanding their knowledge, taking on new tasks, and bettering themselves both personally and professionally.',
 'With the uptick in virtual conferencing, remote work, and, for some, reentering the job market, new technology adoption was accelerated, driving the workforce to build new skills.',
 'While 2020 was the year of the global COVID-19 pandemic, it will also be commemorated as the year online learning prevailed.',
 'As vaccine development persists and life gets back to normal, with it will come a more future-proof workforce ready to share their new knowledge with the world.',
 'Since the onset of the pandemic, online courses and programs have seen dramatic spikes in consumption and enrollment, and O’Reilly has been no different.',
 'A big contributor to O’Reilly’s continued success dur

In [8]:
doc = docs[0]

In [9]:
print(doc)

2020 has been a year of great challenges for so many, but it’s not all negative. Around the world, organizations and their workforces have risen to the occasion, recognizing the importance of expanding their knowledge, taking on new tasks, and bettering themselves both personally and professionally. With the uptick in virtual conferencing, remote work, and, for some, reentering the job market, new technology adoption was accelerated, driving the workforce to build new skills. While 2020 was the year of the global COVID-19 pandemic, it will also be commemorated as the year online learning prevailed. As vaccine development persists and life gets back to normal, with it will come a more future-proof workforce ready to share their new knowledge with the world.  Since the onset of the pandemic, online courses and programs have seen dramatic spikes in consumption and enrollment, and O’Reilly has been no different. A big contributor to O’Reilly’s continued success during these unprecedented t

In [10]:
sents = sent_tokenize(doc)
sents

['2020 has been a year of great challenges for so many, but it’s not all negative.',
 'Around the world, organizations and their workforces have risen to the occasion, recognizing the importance of expanding their knowledge, taking on new tasks, and bettering themselves both personally and professionally.',
 'With the uptick in virtual conferencing, remote work, and, for some, reentering the job market, new technology adoption was accelerated, driving the workforce to build new skills.',
 'While 2020 was the year of the global COVID-19 pandemic, it will also be commemorated as the year online learning prevailed.',
 'As vaccine development persists and life gets back to normal, with it will come a more future-proof workforce ready to share their new knowledge with the world.',
 'Since the onset of the pandemic, online courses and programs have seen dramatic spikes in consumption and enrollment, and O’Reilly has been no different.',
 'A big contributor to O’Reilly’s continued success dur

### Word tokenize each sentence within each document.

You should end up with a nested list structure: the outer list contains one entry per document, each document is a list of its sentences, and each sentence is a list of word tokens.

In [11]:
tokened = [[word_tokenize(sent) for sent in doc] for doc in doc_sents]
print(tokened)



### Tag each token with its part of speech.

In [12]:
part_of_speech = [[pos_tag(sent) for sent in doc] for doc in tokened]
print(part_of_speech)



### Word tokenize the raw text of each document and remove stop words.

In [13]:
# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the comprehension re-creates the list for every token and tests membership
# with a linear scan; a set gives the same result with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

# One list of lower-cased, non-stop-word tokens per document.
no_stopwords = [
    [token.lower() for token in word_tokenize(doc)
     if token.lower() not in english_stopwords]
    for doc in docs
]

In [14]:
print(no_stopwords)



### For every document, stem all the words in the document.

In [15]:
stemmer = SnowballStemmer('english')
stemmed = [[stemmer.stem(token) for token in word_tokenize(doc)] for doc in docs]
print(stemmed)

[['2020', 'has', 'been', 'a', 'year', 'of', 'great', 'challeng', 'for', 'so', 'mani', ',', 'but', 'it', '’', 's', 'not', 'all', 'negat', '.', 'around', 'the', 'world', ',', 'organ', 'and', 'their', 'workforc', 'have', 'risen', 'to', 'the', 'occas', ',', 'recogn', 'the', 'import', 'of', 'expand', 'their', 'knowledg', ',', 'take', 'on', 'new', 'task', ',', 'and', 'better', 'themselv', 'both', 'person', 'and', 'profession', '.', 'with', 'the', 'uptick', 'in', 'virtual', 'conferenc', ',', 'remot', 'work', ',', 'and', ',', 'for', 'some', ',', 'reenter', 'the', 'job', 'market', ',', 'new', 'technolog', 'adopt', 'was', 'acceler', ',', 'drive', 'the', 'workforc', 'to', 'build', 'new', 'skill', '.', 'while', '2020', 'was', 'the', 'year', 'of', 'the', 'global', 'covid-19', 'pandem', ',', 'it', 'will', 'also', 'be', 'commemor', 'as', 'the', 'year', 'onlin', 'learn', 'prevail', '.', 'as', 'vaccin', 'develop', 'persist', 'and', 'life', 'get', 'back', 'to', 'normal', ',', 'with', 'it', 'will', 'come

In [16]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to lemmatize() for ``word``.

    The word is tagged on its own with ``pos_tag``; the upper-cased first
    character of that tag selects adjective (J), verb (V) or adverb (R).
    Noun tags, and every tag that matches none of these, yield the noun
    constant.
    """
    first_letter = pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and anything unrecognised both map to noun.
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every document, one list of lemmas per document.
# NOTE(review): get_wordnet_pos tags each token in isolation (no sentence
# context), and tokens without an adjective/verb/adverb tag are lemmatized as
# nouns -- presumably why 'as' comes out as 'a' in the printed result.
lemmatized = []
for raw_text in docs:
    doc_lemmas = []
    for token in word_tokenize(raw_text):
        # The POS is looked up from the original-case token; the lemma is
        # computed from the lower-cased token.
        doc_lemmas.append(lemmatizer.lemmatize(token.lower(), get_wordnet_pos(token)))
    lemmatized.append(doc_lemmas)
print(lemmatized)

[['2020', 'have', 'be', 'a', 'year', 'of', 'great', 'challenge', 'for', 'so', 'many', ',', 'but', 'it', '’', 's', 'not', 'all', 'negative', '.', 'around', 'the', 'world', ',', 'organization', 'and', 'their', 'workforce', 'have', 'risen', 'to', 'the', 'occasion', ',', 'recognize', 'the', 'importance', 'of', 'expand', 'their', 'knowledge', ',', 'take', 'on', 'new', 'task', ',', 'and', 'bettering', 'themselves', 'both', 'personally', 'and', 'professionally', '.', 'with', 'the', 'uptick', 'in', 'virtual', 'conferencing', ',', 'remote', 'work', ',', 'and', ',', 'for', 'some', ',', 'reentering', 'the', 'job', 'market', ',', 'new', 'technology', 'adoption', 'be', 'accelerate', ',', 'drive', 'the', 'workforce', 'to', 'build', 'new', 'skill', '.', 'while', '2020', 'be', 'the', 'year', 'of', 'the', 'global', 'covid-19', 'pandemic', ',', 'it', 'will', 'also', 'be', 'commemorate', 'a', 'the', 'year', 'online', 'learn', 'prevail', '.', 'a', 'vaccine', 'development', 'persists', 'and', 'life', 'get'

### Iterate through each document, computing and printing the following document statistics for each.

- Number of sentences
- Average words per sentence
- Vocabulary
- Lexical Diversity

In [17]:
def doc_stats(doc):
  """Print summary statistics for one raw document string.

  Prints four lines: the number of sentences, the average number of tokens
  per sentence, the count of distinct lower-cased tokens (vocabulary), and
  the lexical diversity (vocabulary divided by total token count).

  Tokens are whatever ``word_tokenize`` returns, so punctuation tokens are
  counted as words. An empty document prints 0 / 0.0 for every statistic
  instead of raising ZeroDivisionError.

  Args:
    doc: raw text of a single document.

  Returns:
    None; the statistics are written to stdout.
  """
  sents = sent_tokenize(doc)
  total_sents = len(sents)
  words_per_sent = [len(word_tokenize(sent)) for sent in sents]

  # Tokenize the whole document once and reuse it for both the vocabulary
  # and the lexical-diversity denominator.
  doc_tokens = word_tokenize(doc)
  vocab = len({word.lower() for word in doc_tokens})

  # Guard both divisions so a document with no sentences/tokens is reported
  # rather than crashing the surrounding loop.
  avg_words_sent = sum(words_per_sent) / total_sents if total_sents else 0.0
  lexicon_diversity = vocab / len(doc_tokens) if doc_tokens else 0.0

  print(f'Number of sentences: {total_sents}')
  print(f'Avg. number of words per sentence: {avg_words_sent}')
  print(f'Unique word count (vocab): {vocab}')
  print(f'Lexicon diversity: {lexicon_diversity}')

In [18]:
# Report the statistics for each document, numbering documents from 1.
for doc_number, doc in enumerate(docs, start=1):
  print(f'Stats for document {doc_number}')
  doc_stats(doc)
  print('---------')

Stats for document 1
Number of sentences: 15
Avg. number of words per sentence: 27.466666666666665
Unique word count (vocab): 198
Lexicon diversity: 0.48058252427184467
---------
Stats for document 2
Number of sentences: 77
Avg. number of words per sentence: 24.25974025974026
Unique word count (vocab): 555
Lexicon diversity: 0.29710920770877947
---------
Stats for document 3
Number of sentences: 21
Avg. number of words per sentence: 14.476190476190476
Unique word count (vocab): 183
Lexicon diversity: 0.6019736842105263
---------
Stats for document 4
Number of sentences: 42
Avg. number of words per sentence: 23.595238095238095
Unique word count (vocab): 380
Lexicon diversity: 0.3834510595358224
---------
Stats for document 5
Number of sentences: 7
Avg. number of words per sentence: 19.571428571428573
Unique word count (vocab): 91
Lexicon diversity: 0.6642335766423357
---------
Stats for document 6
Number of sentences: 2
Avg. number of words per sentence: 27.0
Unique word count (vocab): 

# Lecture Notes

In [19]:
PATH = '/content/drive/MyDrive/Thinkful/NLP/cnn_articles/'

DOC_PATTERN = r'.*\.txt'

corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

In [20]:
corpus.fileids()

['article_0.txt',
 'article_1.txt',
 'article_10.txt',
 'article_11.txt',
 'article_12.txt',
 'article_13.txt',
 'article_2.txt',
 'article_3.txt',
 'article_4.txt',
 'article_5.txt',
 'article_6.txt',
 'article_7.txt',
 'article_8.txt',
 'article_9.txt']

In [21]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

In [22]:
doc = docs[0]

In [23]:
sents = sent_tokenize(doc)
sents

['Pink taking a break to focus on familyUpdated 10:18 AM ET, Thu November 14, 2019(CNN) - Pink has been working pretty hard and it sounds like she will be taking a step back in 2020.Speaking with "Entertainment Tonight" on the Country Music Association Awards red carpet, the singer was joined by her husband, Carey Hart, and their kids Willow, 8, and Jameson, 2.Pink was there to perform her song "Love Me Anyway" with country star Chris Stapleton, and she  talked about how hectic things have been.',
 '"We did two and a half years of [music] and Willow\'s back in school now, Jameson\'s going to start pre-school soon," Pink said.',
 '"It\'s kind of the year of the family.',
 '"The star also praised her husband, with whom she will celebrate 14 years of marriage in January.',
 '"Carey has a lot going on as well," she said of Hart, who went from being a professional motocross competitor to racing off-road trucks.',
 '"He\'s super supportive, he follows me around the world and now it\'s his tu

In [24]:
tokenized = word_tokenize(sents[0])
tokenized

['Pink',
 'taking',
 'a',
 'break',
 'to',
 'focus',
 'on',
 'familyUpdated',
 '10:18',
 'AM',
 'ET',
 ',',
 'Thu',
 'November',
 '14',
 ',',
 '2019',
 '(',
 'CNN',
 ')',
 '-',
 'Pink',
 'has',
 'been',
 'working',
 'pretty',
 'hard',
 'and',
 'it',
 'sounds',
 'like',
 'she',
 'will',
 'be',
 'taking',
 'a',
 'step',
 'back',
 'in',
 '2020.Speaking',
 'with',
 '``',
 'Entertainment',
 'Tonight',
 "''",
 'on',
 'the',
 'Country',
 'Music',
 'Association',
 'Awards',
 'red',
 'carpet',
 ',',
 'the',
 'singer',
 'was',
 'joined',
 'by',
 'her',
 'husband',
 ',',
 'Carey',
 'Hart',
 ',',
 'and',
 'their',
 'kids',
 'Willow',
 ',',
 '8',
 ',',
 'and',
 'Jameson',
 ',',
 '2.Pink',
 'was',
 'there',
 'to',
 'perform',
 'her',
 'song',
 '``',
 'Love',
 'Me',
 'Anyway',
 "''",
 'with',
 'country',
 'star',
 'Chris',
 'Stapleton',
 ',',
 'and',
 'she',
 'talked',
 'about',
 'how',
 'hectic',
 'things',
 'have',
 'been',
 '.']

In [25]:
word_tokenize(doc)

['Pink',
 'taking',
 'a',
 'break',
 'to',
 'focus',
 'on',
 'familyUpdated',
 '10:18',
 'AM',
 'ET',
 ',',
 'Thu',
 'November',
 '14',
 ',',
 '2019',
 '(',
 'CNN',
 ')',
 '-',
 'Pink',
 'has',
 'been',
 'working',
 'pretty',
 'hard',
 'and',
 'it',
 'sounds',
 'like',
 'she',
 'will',
 'be',
 'taking',
 'a',
 'step',
 'back',
 'in',
 '2020.Speaking',
 'with',
 '``',
 'Entertainment',
 'Tonight',
 "''",
 'on',
 'the',
 'Country',
 'Music',
 'Association',
 'Awards',
 'red',
 'carpet',
 ',',
 'the',
 'singer',
 'was',
 'joined',
 'by',
 'her',
 'husband',
 ',',
 'Carey',
 'Hart',
 ',',
 'and',
 'their',
 'kids',
 'Willow',
 ',',
 '8',
 ',',
 'and',
 'Jameson',
 ',',
 '2.Pink',
 'was',
 'there',
 'to',
 'perform',
 'her',
 'song',
 '``',
 'Love',
 'Me',
 'Anyway',
 "''",
 'with',
 'country',
 'star',
 'Chris',
 'Stapleton',
 ',',
 'and',
 'she',
 'talked',
 'about',
 'how',
 'hectic',
 'things',
 'have',
 'been',
 '.',
 '``',
 'We',
 'did',
 'two',
 'and',
 'a',
 'half',
 'years',
 'of',

In [26]:
pos_tag(tokenized)

[('Pink', 'NNP'),
 ('taking', 'VBG'),
 ('a', 'DT'),
 ('break', 'NN'),
 ('to', 'TO'),
 ('focus', 'VB'),
 ('on', 'IN'),
 ('familyUpdated', 'JJ'),
 ('10:18', 'CD'),
 ('AM', 'NNP'),
 ('ET', 'NNP'),
 (',', ','),
 ('Thu', 'NNP'),
 ('November', 'NNP'),
 ('14', 'CD'),
 (',', ','),
 ('2019', 'CD'),
 ('(', '('),
 ('CNN', 'NNP'),
 (')', ')'),
 ('-', ':'),
 ('Pink', 'NN'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('working', 'VBG'),
 ('pretty', 'RB'),
 ('hard', 'JJ'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('sounds', 'VBZ'),
 ('like', 'IN'),
 ('she', 'PRP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('taking', 'VBG'),
 ('a', 'DT'),
 ('step', 'NN'),
 ('back', 'RB'),
 ('in', 'IN'),
 ('2020.Speaking', 'VBG'),
 ('with', 'IN'),
 ('``', '``'),
 ('Entertainment', 'JJ'),
 ('Tonight', 'NNP'),
 ("''", "''"),
 ('on', 'IN'),
 ('the', 'DT'),
 ('Country', 'NNP'),
 ('Music', 'NNP'),
 ('Association', 'NNP'),
 ('Awards', 'NNP'),
 ('red', 'JJ'),
 ('carpet', 'NN'),
 (',', ','),
 ('the', 'DT'),
 ('singer', 'NN'),
 ('was', 'VBD'),
 ('joi

In [27]:
sents = sent_tokenize(doc)
tokenized = [word_tokenize(sent) for sent in sents]

In [28]:
tagged = [pos_tag(tokens) for tokens in tokenized]

In [29]:
print(tagged)

[[('Pink', 'NNP'), ('taking', 'VBG'), ('a', 'DT'), ('break', 'NN'), ('to', 'TO'), ('focus', 'VB'), ('on', 'IN'), ('familyUpdated', 'JJ'), ('10:18', 'CD'), ('AM', 'NNP'), ('ET', 'NNP'), (',', ','), ('Thu', 'NNP'), ('November', 'NNP'), ('14', 'CD'), (',', ','), ('2019', 'CD'), ('(', '('), ('CNN', 'NNP'), (')', ')'), ('-', ':'), ('Pink', 'NN'), ('has', 'VBZ'), ('been', 'VBN'), ('working', 'VBG'), ('pretty', 'RB'), ('hard', 'JJ'), ('and', 'CC'), ('it', 'PRP'), ('sounds', 'VBZ'), ('like', 'IN'), ('she', 'PRP'), ('will', 'MD'), ('be', 'VB'), ('taking', 'VBG'), ('a', 'DT'), ('step', 'NN'), ('back', 'RB'), ('in', 'IN'), ('2020.Speaking', 'VBG'), ('with', 'IN'), ('``', '``'), ('Entertainment', 'JJ'), ('Tonight', 'NNP'), ("''", "''"), ('on', 'IN'), ('the', 'DT'), ('Country', 'NNP'), ('Music', 'NNP'), ('Association', 'NNP'), ('Awards', 'NNP'), ('red', 'JJ'), ('carpet', 'NN'), (',', ','), ('the', 'DT'), ('singer', 'NN'), ('was', 'VBD'), ('joined', 'VBN'), ('by', 'IN'), ('her', 'PRP$'), ('husband',

In [30]:
lowercase = [token.lower() for token in tokenized[0]]
print(lowercase)

['pink', 'taking', 'a', 'break', 'to', 'focus', 'on', 'familyupdated', '10:18', 'am', 'et', ',', 'thu', 'november', '14', ',', '2019', '(', 'cnn', ')', '-', 'pink', 'has', 'been', 'working', 'pretty', 'hard', 'and', 'it', 'sounds', 'like', 'she', 'will', 'be', 'taking', 'a', 'step', 'back', 'in', '2020.speaking', 'with', '``', 'entertainment', 'tonight', "''", 'on', 'the', 'country', 'music', 'association', 'awards', 'red', 'carpet', ',', 'the', 'singer', 'was', 'joined', 'by', 'her', 'husband', ',', 'carey', 'hart', ',', 'and', 'their', 'kids', 'willow', ',', '8', ',', 'and', 'jameson', ',', '2.pink', 'was', 'there', 'to', 'perform', 'her', 'song', '``', 'love', 'me', 'anyway', "''", 'with', 'country', 'star', 'chris', 'stapleton', ',', 'and', 'she', 'talked', 'about', 'how', 'hectic', 'things', 'have', 'been', '.']


In [31]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [32]:
# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the comprehension re-creates the list for every token and tests membership
# with a linear scan; a set gives the same result with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

# Lower-cased tokens of the document with stop words removed.
no_stopwords = [token.lower() for token in word_tokenize(doc)
                if token.lower() not in english_stopwords]

In [33]:
print(no_stopwords)

['pink', 'taking', 'break', 'focus', 'familyupdated', '10:18', 'et', ',', 'thu', 'november', '14', ',', '2019', '(', 'cnn', ')', '-', 'pink', 'working', 'pretty', 'hard', 'sounds', 'like', 'taking', 'step', 'back', '2020.speaking', '``', 'entertainment', 'tonight', "''", 'country', 'music', 'association', 'awards', 'red', 'carpet', ',', 'singer', 'joined', 'husband', ',', 'carey', 'hart', ',', 'kids', 'willow', ',', '8', ',', 'jameson', ',', '2.pink', 'perform', 'song', '``', 'love', 'anyway', "''", 'country', 'star', 'chris', 'stapleton', ',', 'talked', 'hectic', 'things', '.', '``', 'two', 'half', 'years', '[', 'music', ']', 'willow', "'s", 'back', 'school', ',', 'jameson', "'s", 'going', 'start', 'pre-school', 'soon', ',', "''", 'pink', 'said', '.', '``', "'s", 'kind', 'year', 'family', '.', '``', 'star', 'also', 'praised', 'husband', ',', 'celebrate', '14', 'years', 'marriage', 'january', '.', '``', 'carey', 'lot', 'going', 'well', ',', "''", 'said', 'hart', ',', 'went', 'professio

In [34]:
len(no_stopwords)

158

In [35]:
len(word_tokenize(doc))

241

In [36]:
# Keep only tokens made up entirely of letters, lower-cased. Tokens containing
# any non-letter character (punctuation, digits, "'s", hyphenated words) are
# dropped. str.isalpha() already returns a bool, so no `== True` is needed.
no_punct = [token.lower() for token in word_tokenize(doc)
            if token.isalpha()]

In [37]:
print(no_punct)

['pink', 'taking', 'a', 'break', 'to', 'focus', 'on', 'familyupdated', 'am', 'et', 'thu', 'november', 'cnn', 'pink', 'has', 'been', 'working', 'pretty', 'hard', 'and', 'it', 'sounds', 'like', 'she', 'will', 'be', 'taking', 'a', 'step', 'back', 'in', 'with', 'entertainment', 'tonight', 'on', 'the', 'country', 'music', 'association', 'awards', 'red', 'carpet', 'the', 'singer', 'was', 'joined', 'by', 'her', 'husband', 'carey', 'hart', 'and', 'their', 'kids', 'willow', 'and', 'jameson', 'was', 'there', 'to', 'perform', 'her', 'song', 'love', 'me', 'anyway', 'with', 'country', 'star', 'chris', 'stapleton', 'and', 'she', 'talked', 'about', 'how', 'hectic', 'things', 'have', 'been', 'we', 'did', 'two', 'and', 'a', 'half', 'years', 'of', 'music', 'and', 'willow', 'back', 'in', 'school', 'now', 'jameson', 'going', 'to', 'start', 'soon', 'pink', 'said', 'it', 'kind', 'of', 'the', 'year', 'of', 'the', 'family', 'the', 'star', 'also', 'praised', 'her', 'husband', 'with', 'whom', 'she', 'will', 'ce

In [38]:
len(no_punct)

181

In [39]:
# Drop non-alphabetic tokens from the stop-word-filtered list. The tokens in
# no_stopwords were already lower-cased when that list was built, so they are
# used as-is; str.isalpha() already returns a bool, so no `== True` is needed.
no_punct_stop = [token for token in no_stopwords
                 if token.isalpha()]

In [40]:
len(no_punct_stop)

98

In [41]:
stemmer = SnowballStemmer('english')
stemmed = [stemmer.stem(token) for token in word_tokenize(doc)]
print(stemmed)

['pink', 'take', 'a', 'break', 'to', 'focus', 'on', 'familyupd', '10:18', 'am', 'et', ',', 'thu', 'novemb', '14', ',', '2019', '(', 'cnn', ')', '-', 'pink', 'has', 'been', 'work', 'pretti', 'hard', 'and', 'it', 'sound', 'like', 'she', 'will', 'be', 'take', 'a', 'step', 'back', 'in', '2020.speak', 'with', '``', 'entertain', 'tonight', "''", 'on', 'the', 'countri', 'music', 'associ', 'award', 'red', 'carpet', ',', 'the', 'singer', 'was', 'join', 'by', 'her', 'husband', ',', 'carey', 'hart', ',', 'and', 'their', 'kid', 'willow', ',', '8', ',', 'and', 'jameson', ',', '2.pink', 'was', 'there', 'to', 'perform', 'her', 'song', '``', 'love', 'me', 'anyway', "''", 'with', 'countri', 'star', 'chris', 'stapleton', ',', 'and', 'she', 'talk', 'about', 'how', 'hectic', 'thing', 'have', 'been', '.', '``', 'we', 'did', 'two', 'and', 'a', 'half', 'year', 'of', '[', 'music', ']', 'and', 'willow', "'s", 'back', 'in', 'school', 'now', ',', 'jameson', "'s", 'go', 'to', 'start', 'pre-school', 'soon', ',', "

In [42]:
len(stemmed)

241

In [43]:
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token.lower()) for token in word_tokenize(doc)]
print(lemmatized)

['pink', 'taking', 'a', 'break', 'to', 'focus', 'on', 'familyupdated', '10:18', 'am', 'et', ',', 'thu', 'november', '14', ',', '2019', '(', 'cnn', ')', '-', 'pink', 'ha', 'been', 'working', 'pretty', 'hard', 'and', 'it', 'sound', 'like', 'she', 'will', 'be', 'taking', 'a', 'step', 'back', 'in', '2020.speaking', 'with', '``', 'entertainment', 'tonight', "''", 'on', 'the', 'country', 'music', 'association', 'award', 'red', 'carpet', ',', 'the', 'singer', 'wa', 'joined', 'by', 'her', 'husband', ',', 'carey', 'hart', ',', 'and', 'their', 'kid', 'willow', ',', '8', ',', 'and', 'jameson', ',', '2.pink', 'wa', 'there', 'to', 'perform', 'her', 'song', '``', 'love', 'me', 'anyway', "''", 'with', 'country', 'star', 'chris', 'stapleton', ',', 'and', 'she', 'talked', 'about', 'how', 'hectic', 'thing', 'have', 'been', '.', '``', 'we', 'did', 'two', 'and', 'a', 'half', 'year', 'of', '[', 'music', ']', 'and', 'willow', "'s", 'back', 'in', 'school', 'now', ',', 'jameson', "'s", 'going', 'to', 'start',

In [44]:
len(lemmatized)

241

In [45]:
len(word_tokenize(doc))

241

In [46]:
lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(word) for word in ['taking', 'take']]

['taking', 'take']

In [47]:
sents = sent_tokenize(doc)
tokenized = [word_tokenize(sent) for sent in sents]

In [48]:
total_sents = len(sents)
total_sents

7

In [49]:
avg_words_sent = sum([len(sent) for sent in tokenized])/total_sents
avg_words_sent

34.42857142857143

In [50]:
vocab = len(set([word.lower() for word in word_tokenize(doc)])) #filter first (isalpha and stopwords)
vocab

140

In [51]:
lexicon_diversity = vocab / len(word_tokenize(doc))
lexicon_diversity

0.5809128630705395

In [52]:
def doc_stats(doc):
  """Print summary statistics for one raw document string.

  Prints four lines: the number of sentences, the average number of tokens
  per sentence, the count of distinct lower-cased tokens (vocabulary), and
  the lexical diversity (vocabulary divided by total token count).

  Tokens are whatever ``word_tokenize`` returns, so punctuation tokens are
  counted as words. An empty document prints 0 / 0.0 for every statistic
  instead of raising ZeroDivisionError.

  Args:
    doc: raw text of a single document.

  Returns:
    None; the statistics are written to stdout.
  """
  sents = sent_tokenize(doc)
  total_sents = len(sents)
  words_per_sent = [len(word_tokenize(sent)) for sent in sents]

  # Tokenize the whole document once and reuse it for both the vocabulary
  # and the lexical-diversity denominator.
  doc_tokens = word_tokenize(doc)
  vocab = len({word.lower() for word in doc_tokens})

  # Guard both divisions so a document with no sentences/tokens is reported
  # rather than raising.
  avg_words_sent = sum(words_per_sent) / total_sents if total_sents else 0.0
  lexicon_diversity = vocab / len(doc_tokens) if doc_tokens else 0.0

  print(f'Number of sentences: {total_sents}')
  print(f'Avg. number of words per sentence: {avg_words_sent}')
  print(f'Unique word count (vocab): {vocab}')
  print(f'Lexicon diversity: {lexicon_diversity}')

In [53]:
doc_stats(doc)

Number of sentences: 7
Avg. number of words per sentence: 34.42857142857143
Unique word count (vocab): 140
Lexicon diversity: 0.5809128630705395


In [54]:
wnl = WordNetLemmatizer()
print(wnl.lemmatize('feet'))

foot


In [55]:
lemmatizer = WordNetLemmatizer()

In [56]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to lemmatize() for ``word``.

    The word is tagged on its own with ``pos_tag``; the upper-cased first
    character of that tag selects adjective (J), verb (V) or adverb (R).
    Noun tags, and every tag that matches none of these, yield the noun
    constant.
    """
    first_letter = pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and anything unrecognised both map to noun.
    return wordnet.NOUN

In [57]:
word = 'feet'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

foot


In [58]:
sentence = "The striped bats are hanging on their feet for best"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']


In [59]:
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token.lower(), get_wordnet_pos(token)) for token in word_tokenize(doc)]
print(lemmatized)

['pink', 'take', 'a', 'break', 'to', 'focus', 'on', 'familyupdated', '10:18', 'am', 'et', ',', 'thu', 'november', '14', ',', '2019', '(', 'cnn', ')', '-', 'pink', 'have', 'be', 'work', 'pretty', 'hard', 'and', 'it', 'sound', 'like', 'she', 'will', 'be', 'take', 'a', 'step', 'back', 'in', '2020.speaking', 'with', '``', 'entertainment', 'tonight', "''", 'on', 'the', 'country', 'music', 'association', 'award', 'red', 'carpet', ',', 'the', 'singer', 'be', 'join', 'by', 'her', 'husband', ',', 'carey', 'hart', ',', 'and', 'their', 'kid', 'willow', ',', '8', ',', 'and', 'jameson', ',', '2.pink', 'be', 'there', 'to', 'perform', 'her', 'song', '``', 'love', 'me', 'anyway', "''", 'with', 'country', 'star', 'chris', 'stapleton', ',', 'and', 'she', 'talk', 'about', 'how', 'hectic', 'thing', 'have', 'be', '.', '``', 'we', 'do', 'two', 'and', 'a', 'half', 'year', 'of', '[', 'music', ']', 'and', 'willow', "'s", 'back', 'in', 'school', 'now', ',', 'jameson', "'s", 'go', 'to', 'start', 'pre-school', 's

In [60]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to lemmatize() for ``word``.

    The word is tagged on its own with ``pos_tag``; the upper-cased first
    character of that tag selects adjective (J), verb (V) or adverb (R).
    Noun tags, and every tag that matches none of these, yield the noun
    constant.
    """
    # NOTE(review): this function is defined more than once in this notebook
    # with the same behavior; the latest definition executed wins.
    first_letter = pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and anything unrecognised both map to noun.
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token.lower(), get_wordnet_pos(token)) for token in word_tokenize(doc)]
print(lemmatized)

['pink', 'take', 'a', 'break', 'to', 'focus', 'on', 'familyupdated', '10:18', 'am', 'et', ',', 'thu', 'november', '14', ',', '2019', '(', 'cnn', ')', '-', 'pink', 'have', 'be', 'work', 'pretty', 'hard', 'and', 'it', 'sound', 'like', 'she', 'will', 'be', 'take', 'a', 'step', 'back', 'in', '2020.speaking', 'with', '``', 'entertainment', 'tonight', "''", 'on', 'the', 'country', 'music', 'association', 'award', 'red', 'carpet', ',', 'the', 'singer', 'be', 'join', 'by', 'her', 'husband', ',', 'carey', 'hart', ',', 'and', 'their', 'kid', 'willow', ',', '8', ',', 'and', 'jameson', ',', '2.pink', 'be', 'there', 'to', 'perform', 'her', 'song', '``', 'love', 'me', 'anyway', "''", 'with', 'country', 'star', 'chris', 'stapleton', ',', 'and', 'she', 'talk', 'about', 'how', 'hectic', 'thing', 'have', 'be', '.', '``', 'we', 'do', 'two', 'and', 'a', 'half', 'year', 'of', '[', 'music', ']', 'and', 'willow', "'s", 'back', 'in', 'school', 'now', ',', 'jameson', "'s", 'go', 'to', 'start', 'pre-school', 's