# Text Data Cleaning and Preprocessing Assignment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Read the O'Reilly RSS plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [4]:
PATH = 'drive/MyDrive/content/rss/'

DOC_PATTERN = r'.*\.txt'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [9]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

### Sentence tokenize each document in the list of documents.

In [10]:
# sentence tokenization
sents = [sent_tokenize(doc) for doc in docs ]
print(sents)

[['2020 has been a year of great challenges for so many, but it’s not all negative.', 'Around the world, organizations and their workforces have risen to the occasion, recognizing the importance of expanding their knowledge, taking on new tasks, and bettering themselves both personally and professionally.', 'With the uptick in virtual conferencing, remote work, and, [&#8230;]'], ['It has long seemed to me that functional programming is, essentially, programming viewed as mathematics.', 'Many ideas in functional programming came from Alonzo Church&#8217;s Lambda Calculus, which significantly predates anything that looks remotely like a modern computer.', 'Though the actual history of computing runs differently: in the early days of computing, Von Neumann’s ideas were [&#8230;]'], ['Advanced System on a Chip Lecture Notes (2016) &#8212; Topics: 1.', 'Basic Processor &#38; Memory hierarchy; 2.', 'Advanced Out-of-Order Processor; 3.', 'Data-parallel processors; 4.', 'Micro-controller intro

### Word tokenize each sentence within each document.

You should end up with a nested list structure where the outer list contains all the sentences in each document and the inner list contains the tokenized sentences.

In [23]:
sents[0][0]

'2020 has been a year of great challenges for so many, but it’s not all negative.'

In [29]:
len(sents)

60

In [33]:
i = 0
tokenized = []
for i in range(len(docs)-1):
  tokenized += [word_tokenize(sent) for sent in sents[i]]
print(tokenized)

[['2020', 'has', 'been', 'a', 'year', 'of', 'great', 'challenges', 'for', 'so', 'many', ',', 'but', 'it', '’', 's', 'not', 'all', 'negative', '.'], ['Around', 'the', 'world', ',', 'organizations', 'and', 'their', 'workforces', 'have', 'risen', 'to', 'the', 'occasion', ',', 'recognizing', 'the', 'importance', 'of', 'expanding', 'their', 'knowledge', ',', 'taking', 'on', 'new', 'tasks', ',', 'and', 'bettering', 'themselves', 'both', 'personally', 'and', 'professionally', '.'], ['With', 'the', 'uptick', 'in', 'virtual', 'conferencing', ',', 'remote', 'work', ',', 'and', ',', '[', '&', '#', '8230', ';', ']'], ['It', 'has', 'long', 'seemed', 'to', 'me', 'that', 'functional', 'programming', 'is', ',', 'essentially', ',', 'programming', 'viewed', 'as', 'mathematics', '.'], ['Many', 'ideas', 'in', 'functional', 'programming', 'came', 'from', 'Alonzo', 'Church', '&', '#', '8217', ';', 's', 'Lambda', 'Calculus', ',', 'which', 'significantly', 'predates', 'anything', 'that', 'looks', 'remotely', 

### Tag each token with its part of speech.

In [37]:
tags = [pos_tag(token) for token in tokenized]
print(tags)

[[('2020', 'CD'), ('has', 'VBZ'), ('been', 'VBN'), ('a', 'DT'), ('year', 'NN'), ('of', 'IN'), ('great', 'JJ'), ('challenges', 'NNS'), ('for', 'IN'), ('so', 'RB'), ('many', 'JJ'), (',', ','), ('but', 'CC'), ('it', 'PRP'), ('’', 'NNP'), ('s', 'VBZ'), ('not', 'RB'), ('all', 'DT'), ('negative', 'JJ'), ('.', '.')], [('Around', 'IN'), ('the', 'DT'), ('world', 'NN'), (',', ','), ('organizations', 'NNS'), ('and', 'CC'), ('their', 'PRP$'), ('workforces', 'NNS'), ('have', 'VBP'), ('risen', 'VBN'), ('to', 'TO'), ('the', 'DT'), ('occasion', 'NN'), (',', ','), ('recognizing', 'VBG'), ('the', 'DT'), ('importance', 'NN'), ('of', 'IN'), ('expanding', 'VBG'), ('their', 'PRP$'), ('knowledge', 'NN'), (',', ','), ('taking', 'VBG'), ('on', 'IN'), ('new', 'JJ'), ('tasks', 'NNS'), (',', ','), ('and', 'CC'), ('bettering', 'VBG'), ('themselves', 'PRP'), ('both', 'DT'), ('personally', 'RB'), ('and', 'CC'), ('professionally', 'RB'), ('.', '.')], [('With', 'IN'), ('the', 'DT'), ('uptick', 'NN'), ('in', 'IN'), ('v

### Word tokenize the raw text of each document and remove stop words.

In [44]:
doc_tokens = [word_tokenize(doc) for doc in docs]
print(doc_tokens)

[['2020', 'has', 'been', 'a', 'year', 'of', 'great', 'challenges', 'for', 'so', 'many', ',', 'but', 'it', '’', 's', 'not', 'all', 'negative', '.', 'Around', 'the', 'world', ',', 'organizations', 'and', 'their', 'workforces', 'have', 'risen', 'to', 'the', 'occasion', ',', 'recognizing', 'the', 'importance', 'of', 'expanding', 'their', 'knowledge', ',', 'taking', 'on', 'new', 'tasks', ',', 'and', 'bettering', 'themselves', 'both', 'personally', 'and', 'professionally', '.', 'With', 'the', 'uptick', 'in', 'virtual', 'conferencing', ',', 'remote', 'work', ',', 'and', ',', '[', '&', '#', '8230', ';', ']'], ['It', 'has', 'long', 'seemed', 'to', 'me', 'that', 'functional', 'programming', 'is', ',', 'essentially', ',', 'programming', 'viewed', 'as', 'mathematics', '.', 'Many', 'ideas', 'in', 'functional', 'programming', 'came', 'from', 'Alonzo', 'Church', '&', '#', '8217', ';', 's', 'Lambda', 'Calculus', ',', 'which', 'significantly', 'predates', 'anything', 'that', 'looks', 'remotely', 'like'

In [45]:
i = 0
no_stopwords = []
for i in range(len(doc_tokens)-1):
  no_stopwords += [token.lower() for token in doc_tokens[i] 
                if token.lower() not in stopwords.words('english')]
print(no_stopwords)

['2020', 'year', 'great', 'challenges', 'many', ',', '’', 'negative', '.', 'around', 'world', ',', 'organizations', 'workforces', 'risen', 'occasion', ',', 'recognizing', 'importance', 'expanding', 'knowledge', ',', 'taking', 'new', 'tasks', ',', 'bettering', 'personally', 'professionally', '.', 'uptick', 'virtual', 'conferencing', ',', 'remote', 'work', ',', ',', '[', '&', '#', '8230', ';', ']', 'long', 'seemed', 'functional', 'programming', ',', 'essentially', ',', 'programming', 'viewed', 'mathematics', '.', 'many', 'ideas', 'functional', 'programming', 'came', 'alonzo', 'church', '&', '#', '8217', ';', 'lambda', 'calculus', ',', 'significantly', 'predates', 'anything', 'looks', 'remotely', 'like', 'modern', 'computer', '.', 'though', 'actual', 'history', 'computing', 'runs', 'differently', ':', 'early', 'days', 'computing', ',', 'von', 'neumann', '’', 'ideas', '[', '&', '#', '8230', ';', ']', 'advanced', 'system', 'chip', 'lecture', 'notes', '(', '2016', ')', '&', '#', '8212', ';',

### For every document, stem all the words in the document.

In [47]:
# stemming
stemmer = SnowballStemmer('english')
stemmed = []
for i in range(len(docs)-1):
  stemmed += [stemmer.stem(token.lower()) for token in word_tokenize(docs[i])]

print(stemmed)

['2020', 'has', 'been', 'a', 'year', 'of', 'great', 'challeng', 'for', 'so', 'mani', ',', 'but', 'it', '’', 's', 'not', 'all', 'negat', '.', 'around', 'the', 'world', ',', 'organ', 'and', 'their', 'workforc', 'have', 'risen', 'to', 'the', 'occas', ',', 'recogn', 'the', 'import', 'of', 'expand', 'their', 'knowledg', ',', 'take', 'on', 'new', 'task', ',', 'and', 'better', 'themselv', 'both', 'person', 'and', 'profession', '.', 'with', 'the', 'uptick', 'in', 'virtual', 'conferenc', ',', 'remot', 'work', ',', 'and', ',', '[', '&', '#', '8230', ';', ']', 'it', 'has', 'long', 'seem', 'to', 'me', 'that', 'function', 'program', 'is', ',', 'essenti', ',', 'program', 'view', 'as', 'mathemat', '.', 'mani', 'idea', 'in', 'function', 'program', 'came', 'from', 'alonzo', 'church', '&', '#', '8217', ';', 's', 'lambda', 'calculus', ',', 'which', 'signific', 'predat', 'anyth', 'that', 'look', 'remot', 'like', 'a', 'modern', 'comput', '.', 'though', 'the', 'actual', 'histori', 'of', 'comput', 'run', 'di

### Iterate through each document, computing and printing the following document statistics for each.

- Number of sentences
- Average words per sentence
- Vocabulary
- Lexical Diversity

In [51]:
# document statistics
def stats(doc, title):
  sents = sent_tokenize(doc)
  tokenized = word_tokenize(doc)
  sentences = len(sents)
  avg_words_sent = sum([len(sent) for sent in tokenized]) / sentences
  vocab = len(set([word.lower() for word in word_tokenize(doc)]))
  lex_div = vocab / len(word_tokenize(doc))

  print(title)
  print('Number of Sentences: ', sentences)
  print('Avg. words per sentence: ', avg_words_sent)
  print('Unique words (vocabulary): ', vocab)
  print('Lexical Diversity: ', lex_div)
  print('---')

In [52]:
for doc in docs:
  stats(doc, doc[0:3])

202
Number of Sentences:  3
Avg. words per sentence:  104.66666666666667
Unique words (vocabulary):  57
Lexical Diversity:  0.7808219178082192
---
It 
Number of Sentences:  3
Avg. words per sentence:  113.66666666666667
Unique words (vocabulary):  58
Lexical Diversity:  0.7733333333333333
---
Adv
Number of Sentences:  13
Avg. words per sentence:  29.615384615384617
Unique words (vocabulary):  57
Lexical Diversity:  0.5757575757575758
---
The
Number of Sentences:  3
Avg. words per sentence:  117.0
Unique words (vocabulary):  53
Lexical Diversity:  0.5578947368421052
---
Hyp
Number of Sentences:  3
Avg. words per sentence:  90.0
Unique words (vocabulary):  53
Lexical Diversity:  0.7066666666666667
---
Dea
Number of Sentences:  4
Avg. words per sentence:  84.75
Unique words (vocabulary):  53
Lexical Diversity:  0.654320987654321
---
The
Number of Sentences:  2
Avg. words per sentence:  146.0
Unique words (vocabulary):  53
Lexical Diversity:  0.7571428571428571
---
Per
Number of Sentences: