# Text Data Cleaning and Preprocessing Assignment

In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

### Read the O'Reilly RSS plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [0]:
DOC_PATTERN = r'.*\.txt'
corpus = PlaintextCorpusReader('/content/drive/My Drive/Colab Notebooks/Assignments/PairProgramming/NLP/oreilly', DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [0]:
corpus.raw('0.txt')

'Most of the technical news this month continues to swirl around coronavirus. Many things are happening under that rubric\u2060—for example, delivery during a pandemic could be the killer app for autonomous vehicles. And money being (literally) dirty, the pandemic could drive the development of new payment systems that are more inclusive. I haven’t seen as much interesting news from the AI world this month.\xa0 There are some hints that we’re reaching the point of diminishing returns for large models, and that we’re turning the corner from research into deployment and production. COVID-19 A startup is developing a CRISPR-based fast and accurate COVID-19 test that can be used at home. At-home testing raises questions about reporting, but since the test gives quick results, it could also be used by airlines, offices, and other crowded locations. One consequence of the coronavirus pandemic may be that there aren’t enough people to work in call centers.\xa0 Will AI take up the slack?\xa0 N

In [0]:
big_list = []

for item in corpus.fileids():
  big_list.append(corpus.raw(item))

In [0]:
len(big_list)

60

### Sentence tokenize each document in the list of documents.

In [0]:
tokenized_list = []

for doc in big_list:
  tokenized_list.append(sent_tokenize(doc))

In [0]:
tokenized_list[0]

['Most of the technical news this month continues to swirl around coronavirus.',
 'Many things are happening under that rubric\u2060—for example, delivery during a pandemic could be the killer app for autonomous vehicles.',
 'And money being (literally) dirty, the pandemic could drive the development of new payment systems that are more inclusive.',
 'I haven’t seen as much interesting news from the AI world this month.',
 'There are some hints that we’re reaching the point of diminishing returns for large models, and that we’re turning the corner from research into deployment and production.',
 'COVID-19 A startup is developing a CRISPR-based fast and accurate COVID-19 test that can be used at home.',
 'At-home testing raises questions about reporting, but since the test gives quick results, it could also be used by airlines, offices, and other crowded locations.',
 'One consequence of the coronavirus pandemic may be that there aren’t enough people to work in call centers.',
 'Will AI

### Word tokenize each sentence within each document.

You should end up with a nested list structure where the outer list contains all the sentences in each document and the inner list contains the tokenized sentences.

In [0]:
outer_list = []

for doc in tokenized_list:
  doc_list = []
  for sentence in doc:
    doc_list.append(word_tokenize(sentence))

  outer_list.append(doc_list)

In [0]:
outer_list[0][0]

['Most',
 'of',
 'the',
 'technical',
 'news',
 'this',
 'month',
 'continues',
 'to',
 'swirl',
 'around',
 'coronavirus',
 '.']

### Tag each token with its part of speech.

In [0]:
part_of_speech = []

for doc in outer_list:
  doc_list = []
  for tokenized_sentence in doc:
    doc_list.append(pos_tag(tokenized_sentence))

  part_of_speech.append(doc_list)

In [0]:
part_of_speech[0][0]

[('Most', 'JJS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('technical', 'JJ'),
 ('news', 'NN'),
 ('this', 'DT'),
 ('month', 'NN'),
 ('continues', 'VBZ'),
 ('to', 'TO'),
 ('swirl', 'VB'),
 ('around', 'RP'),
 ('coronavirus', 'NN'),
 ('.', '.')]

### Word tokenize the raw text of each document and remove stop words.

In [0]:
no_stops = []

for document in big_list:
  no_stopwords = [token.lower() for token in word_tokenize(document) if not token.lower() in stopwords.words('english')]
  no_stops.append(no_stopwords)

In [0]:
no_stops[0][:10]

['technical',
 'news',
 'month',
 'continues',
 'swirl',
 'around',
 'coronavirus',
 '.',
 'many',
 'things']

### For every document, stem all the words in the document.

In [0]:
stemmed = []
stemmer = SnowballStemmer('english')

for document in outer_list:
  doc_list = []
  for sentence in document:
    stemmed_sentence = [stemmer.stem(token) for token in sentence]
    doc_list.append(stemmed_sentence)
  
  stemmed.append(stemmed_sentence)

In [0]:
stemmed[0][:10]

['it', 'may', 'also', 'be', 'an', 'attempt', 'to', 'exclud', 'china', ',']

### Iterate through each document, computing and printing the following document statistics for each.

- Number of sentences
- Average words per sentence
- Vocabulary
- Lexical Diversity

In [0]:
for i in range(0, len(tokenized_list)):
  sentences = len(tokenized_list[i])
  avg_words_sent = round(sum([len(sent) for sent in outer_list[i]]) / sentences, 2)
  vocab = len(set([w.lower() for w in word_tokenize(big_list[i])]))
  lex_div = round(vocab / len(word_tokenize(big_list[i])), 2)

  print(f'\nDocument #{i}')
  print('Number of sentences: ', sentences)
  print('Avg words per sentence: ', avg_words_sent)
  print('Unique words (vocabulary: ', vocab)
  print('Lexical diversity: ', lex_div)


Document #0
Number of sentences:  62
Avg words per sentence:  19.53
Unique words (vocabulary:  538
Lexical diversity:  0.44

Document #1
Number of sentences:  14
Avg words per sentence:  22.93
Unique words (vocabulary:  189
Lexical diversity:  0.59

Document #2
Number of sentences:  115
Avg words per sentence:  26.77
Unique words (vocabulary:  1004
Lexical diversity:  0.33

Document #3
Number of sentences:  13
Avg words per sentence:  29.0
Unique words (vocabulary:  226
Lexical diversity:  0.6

Document #4
Number of sentences:  6
Avg words per sentence:  43.83
Unique words (vocabulary:  177
Lexical diversity:  0.67

Document #5
Number of sentences:  12
Avg words per sentence:  24.75
Unique words (vocabulary:  175
Lexical diversity:  0.59

Document #6
Number of sentences:  8
Avg words per sentence:  19.62
Unique words (vocabulary:  100
Lexical diversity:  0.64

Document #7
Number of sentences:  16
Avg words per sentence:  24.06
Unique words (vocabulary:  204
Lexical diversity:  0.53

D