# Text Data Cleaning and Preprocessing Assignment

In [2]:
# Mount Google Drive inside the Colab runtime; the corpus path used later
# in this notebook lives under this mount point.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [17]:
import nltk
# Fetch the NLTK data packages this notebook relies on. Each call prints its
# progress, and the value of the final call is what the cell displays.
nltk.download('punkt')  # presumably the models behind sent_tokenize / word_tokenize
nltk.download('averaged_perceptron_tagger')  # presumably the model behind pos_tag
nltk.download('stopwords')  # word lists read through nltk.corpus.stopwords
nltk.download('wordnet')  # presumably for WordNetLemmatizer (imported above)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Read the O'Reilly RSS plain-text article files into a corpus using NLTK's PlaintextCorpusReader.

In [3]:
# Directory on the mounted Drive that holds the RSS articles, plus a regex
# that selects every file whose name ends in ".txt".
PATH = '/content/drive/MyDrive/web scrap/rss/'
DOC_PATTERN = r'.*\.txt'

# Corpus reader over all plain-text files under PATH that match the pattern.
corpus = PlaintextCorpusReader(root=PATH, fileids=DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [None]:
# Raw text of every file in the corpus, one string per fileid, in fileid order.
docs = list(map(corpus.raw, corpus.fileids()))

### Sentence tokenize each document in the list of documents.

In [9]:
# Pick the first document of the corpus as the working example; the bare
# name on the last line makes the notebook display its raw text.
doc = docs[0]
doc

'2020 has been a year of great challenges for so many, but it’s not all negative. Around the world, organizations and their workforces have risen to the occasion, recognizing the importance of expanding their knowledge, taking on new tasks, and bettering themselves both personally and professionally. With the uptick in virtual conferencing, remote work, and, for some, reentering the job market, new technology adoption was accelerated, driving the workforce to build new skills. While 2020 was the year of the global COVID-19 pandemic, it will also be commemorated as the year online learning prevailed. As vaccine development persists and life gets back to normal, with it will come a more future-proof workforce ready to share their new knowledge with the world.\xa0 Since the onset of the pandemic, online courses and programs have seen dramatic spikes in consumption and enrollment, and O’Reilly has been no different. A big contributor to O’Reilly’s continued success during these unprecedent

In [11]:
# Split the working document into a list of sentence strings.
# 'english' is sent_tokenize's default language; it is spelled out here.
sents = sent_tokenize(doc, language='english')
print(sents)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['2020 has been a year of great challenges for so many, but it’s not all negative.', 'Around the world, organizations and their workforces have risen to the occasion, recognizing the importance of expanding their knowledge, taking on new tasks, and bettering themselves both personally and professionally.', 'With the uptick in virtual conferencing, remote work, and, for some, reentering the job market, new technology adoption was accelerated, driving the workforce to build new skills.', 'While 2020 was the year of the global COVID-19 pandemic, it will also be commemorated as the year online learning prevailed.', 'As vaccine development persists and life gets back to normal, with it will come a more future-proof workforce ready to share their new knowledge with the world.', 'Since the onset of the pandemic, online courses and programs have seen dramatic spikes in consumption and enro

### Word tokenize each sentence within each document.

You should end up with a nested list structure where the outer list contains all the sentences in each document and each inner list contains the word tokens of one sentence.

In [27]:
# Word-tokenize every sentence: the outer list has one entry per sentence,
# and each entry is that sentence's list of tokens.
tokenized = list(map(word_tokenize, sents))
print(tokenized)
# Spot check: number of tokens in the sentence at index 10.
print('words in 11th sentence: ', len(tokenized[10]))

[['2020', 'has', 'been', 'a', 'year', 'of', 'great', 'challenges', 'for', 'so', 'many', ',', 'but', 'it', '’', 's', 'not', 'all', 'negative', '.'], ['Around', 'the', 'world', ',', 'organizations', 'and', 'their', 'workforces', 'have', 'risen', 'to', 'the', 'occasion', ',', 'recognizing', 'the', 'importance', 'of', 'expanding', 'their', 'knowledge', ',', 'taking', 'on', 'new', 'tasks', ',', 'and', 'bettering', 'themselves', 'both', 'personally', 'and', 'professionally', '.'], ['With', 'the', 'uptick', 'in', 'virtual', 'conferencing', ',', 'remote', 'work', ',', 'and', ',', 'for', 'some', ',', 'reentering', 'the', 'job', 'market', ',', 'new', 'technology', 'adoption', 'was', 'accelerated', ',', 'driving', 'the', 'workforce', 'to', 'build', 'new', 'skills', '.'], ['While', '2020', 'was', 'the', 'year', 'of', 'the', 'global', 'COVID-19', 'pandemic', ',', 'it', 'will', 'also', 'be', 'commemorated', 'as', 'the', 'year', 'online', 'learning', 'prevailed', '.'], ['As', 'vaccine', 'development'

### Tag each token with its part of speech.

In [18]:
# Part-of-speech tag each sentence's tokens. The result keeps the nested
# shape: one list per sentence, holding (token, tag) pairs.
tagged = list(map(pos_tag, tokenized))
tagged

[[('2020', 'CD'),
  ('has', 'VBZ'),
  ('been', 'VBN'),
  ('a', 'DT'),
  ('year', 'NN'),
  ('of', 'IN'),
  ('great', 'JJ'),
  ('challenges', 'NNS'),
  ('for', 'IN'),
  ('so', 'RB'),
  ('many', 'JJ'),
  (',', ','),
  ('but', 'CC'),
  ('it', 'PRP'),
  ('’', 'NNP'),
  ('s', 'VBZ'),
  ('not', 'RB'),
  ('all', 'DT'),
  ('negative', 'JJ'),
  ('.', '.')],
 [('Around', 'IN'),
  ('the', 'DT'),
  ('world', 'NN'),
  (',', ','),
  ('organizations', 'NNS'),
  ('and', 'CC'),
  ('their', 'PRP$'),
  ('workforces', 'NNS'),
  ('have', 'VBP'),
  ('risen', 'VBN'),
  ('to', 'TO'),
  ('the', 'DT'),
  ('occasion', 'NN'),
  (',', ','),
  ('recognizing', 'VBG'),
  ('the', 'DT'),
  ('importance', 'NN'),
  ('of', 'IN'),
  ('expanding', 'VBG'),
  ('their', 'PRP$'),
  ('knowledge', 'NN'),
  (',', ','),
  ('taking', 'VBG'),
  ('on', 'IN'),
  ('new', 'JJ'),
  ('tasks', 'NNS'),
  (',', ','),
  ('and', 'CC'),
  ('bettering', 'VBG'),
  ('themselves', 'PRP'),
  ('both', 'DT'),
  ('personally', 'RB'),
  ('and', 'CC'),
  (

### Word tokenize the raw text of each document and remove stop words.

In [26]:
# Load the English stop-word list once, as a set, instead of re-reading it
# for every token of the document.
english_stopwords = set(stopwords.words('english'))

# Lower-case every token of the whole document and drop the stop words.
# The result is a flat list of tokens (not grouped by sentence).
no_stopwords = [
    word
    for word in map(str.lower, word_tokenize(doc))
    if word not in english_stopwords
]
print(no_stopwords)

# Because `no_stopwords` is flat, indexing it yields a single token, so
# len(no_stopwords[10]) would be a character count. To report the number of
# remaining words in the 11th sentence, filter that sentence's own token
# list (`tokenized`, built in the sentence/word tokenization cell above).
eleventh_sentence = [
    token
    for token in tokenized[10]
    if token.lower() not in english_stopwords
]
print('words in 11th sentence: ', len(eleventh_sentence))

['2020', 'year', 'great', 'challenges', 'many', ',', '’', 'negative', '.', 'around', 'world', ',', 'organizations', 'workforces', 'risen', 'occasion', ',', 'recognizing', 'importance', 'expanding', 'knowledge', ',', 'taking', 'new', 'tasks', ',', 'bettering', 'personally', 'professionally', '.', 'uptick', 'virtual', 'conferencing', ',', 'remote', 'work', ',', ',', ',', 'reentering', 'job', 'market', ',', 'new', 'technology', 'adoption', 'accelerated', ',', 'driving', 'workforce', 'build', 'new', 'skills', '.', '2020', 'year', 'global', 'covid-19', 'pandemic', ',', 'also', 'commemorated', 'year', 'online', 'learning', 'prevailed', '.', 'vaccine', 'development', 'persists', 'life', 'gets', 'back', 'normal', ',', 'come', 'future-proof', 'workforce', 'ready', 'share', 'new', 'knowledge', 'world', '.', 'since', 'onset', 'pandemic', ',', 'online', 'courses', 'programs', 'seen', 'dramatic', 'spikes', 'consumption', 'enrollment', ',', '’', 'reilly', 'different', '.', 'big', 'contributor', '’',

### For every document, stem all the words in the document.

In [28]:
# English Snowball stemmer applied to every lower-cased token of the
# working document; the result is a flat list of stems.
stemmer = SnowballStemmer('english')
stemmed = [stemmer.stem(word) for word in map(str.lower, word_tokenize(doc))]

print(stemmed)

['2020', 'has', 'been', 'a', 'year', 'of', 'great', 'challeng', 'for', 'so', 'mani', ',', 'but', 'it', '’', 's', 'not', 'all', 'negat', '.', 'around', 'the', 'world', ',', 'organ', 'and', 'their', 'workforc', 'have', 'risen', 'to', 'the', 'occas', ',', 'recogn', 'the', 'import', 'of', 'expand', 'their', 'knowledg', ',', 'take', 'on', 'new', 'task', ',', 'and', 'better', 'themselv', 'both', 'person', 'and', 'profession', '.', 'with', 'the', 'uptick', 'in', 'virtual', 'conferenc', ',', 'remot', 'work', ',', 'and', ',', 'for', 'some', ',', 'reenter', 'the', 'job', 'market', ',', 'new', 'technolog', 'adopt', 'was', 'acceler', ',', 'drive', 'the', 'workforc', 'to', 'build', 'new', 'skill', '.', 'while', '2020', 'was', 'the', 'year', 'of', 'the', 'global', 'covid-19', 'pandem', ',', 'it', 'will', 'also', 'be', 'commemor', 'as', 'the', 'year', 'onlin', 'learn', 'prevail', '.', 'as', 'vaccin', 'develop', 'persist', 'and', 'life', 'get', 'back', 'to', 'normal', ',', 'with', 'it', 'will', 'come'

### Iterate through each document, computing and printing the following document statistics for each.

- Number of sentences
- Average words per sentence
- Vocabulary
- Lexical Diversity

In [29]:
# Tokenize the document once and reuse the result for both the vocabulary
# and the lexical-diversity denominator.
doc_tokens = word_tokenize(doc)

# Number of sentences in the working document.
sentences = len(sents)

# Mean tokens per sentence. The tokenizer emits punctuation as tokens, so
# punctuation is included in these counts. An empty document yields 0.0
# instead of raising ZeroDivisionError.
avg_words_sent = (
    sum(len(sent) for sent in tokenized) / sentences if sentences else 0.0
)

# Vocabulary: number of distinct lower-cased tokens in the document.
vocab = len({word.lower() for word in doc_tokens})

# Lexical diversity: distinct tokens divided by total tokens (0.0 if empty).
lex_div = vocab / len(doc_tokens) if doc_tokens else 0.0

print('Number of Sentences: ', sentences)
print('Avg. words per sentence: ', avg_words_sent)
print('Unique words (vocabulary): ', vocab)
print('Lexical Diversity: ', lex_div)

Number of Sentences:  15
Avg. words per sentence:  27.466666666666665
Unique words (vocabulary):  198
Lexical Diversity:  0.48058252427184467
