In [1]:
import numpy as np, nltk, re # regular expression
from sklearn.datasets import fetch_20newsgroups # importing dataset from sklearn, we have a couple of text books in NLTK as well "from nltk.book import *"
# pip install scikit-learn  (the PyPI package is "scikit-learn"; it is imported as "sklearn")
from nltk.tokenize import word_tokenize, sent_tokenize # for tokenization
# nltk.download('punkt')
from nltk.corpus import stopwords # stopwords
# nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer # stemmer
from nltk.stem.wordnet import WordNetLemmatizer # lemmatizer
# nltk.download('wordnet')

In [2]:
# Fetch every NLTK resource the cells below rely on (already-cached packages are skipped).
nltk.download('wordnet')    # data for WordNetLemmatizer
nltk.download('omw-1.4')    # companion WordNet data that some NLTK releases require for lemmatization
nltk.download('punkt')      # sentence/word tokenizer models (older NLTK releases)
nltk.download('punkt_tab')  # the same tokenizer models in the format recent NLTK releases load
nltk.download('stopwords')  # stopwords.words('english')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Loading the data
# fetch_20newsgroups() is called with its default arguments. On first use it
# downloads the corpus (see the log below), so a fresh machine needs network access.
text_data = fetch_20newsgroups()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
type(text_data) # an sklearn Bunch container (see output); the documents are read from its `.data` attribute in the next cell

sklearn.utils.Bunch

In [5]:
# `.data` holds the posts as raw strings; the preview shows that the e-mail style
# headers (From:, Subject:, ...) are still part of each string.
raw_data = text_data.data
raw_data[:10]  # preview the first 10 posts

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [6]:
# Keep only the first 4 posts (presumably to keep the walkthrough small).
# NOTE: this rebinds `raw_data`; the full corpus is still reachable via `text_data.data`.
raw_data = raw_data[:4]
print(type(raw_data)) # should be in list format
raw_data

<class 'list'>


["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [7]:
# applying word normalization

def word_normalize(_list):
  """
  Lower-case every string in `_list` (word normalization).

  Args:
    _list: iterable of strings, e.g. the list of raw documents.

  Returns:
    A new list holding the lower-cased version of each element, in the
    original order. The input is not modified.

  Raises:
    AttributeError: if an element has no `.lower()` method (i.e. is not a string).
    TypeError: if `_list` is not iterable.
  """
  # Errors are deliberately allowed to propagate: returning the exception object
  # in place of a list would only make the failure surface later, in another cell.
  return [text.lower() for text in _list]

# Lower-case the sample posts; both tokenization cells below read `normalized_text`.
normalized_text = word_normalize(raw_data)
normalized_text

["from: lerxst@wam.umd.edu (where's my thing)\nsubject: what car is this!?\nnntp-posting-host: rac3.wam.umd.edu\norganization: university of maryland, college park\nlines: 15\n\n i was wondering if anyone out there could enlighten me on this car i saw\nthe other day. it was a 2-door sports car, looked to be from the late 60s/\nearly 70s. it was called a bricklin. the doors were really small. in addition,\nthe front bumper was separate from the rest of the body. this is \nall i know. if anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nthanks,\n- il\n   ---- brought to you by your neighborhood lerxst ----\n\n\n\n\n",
 "from: guykuo@carson.u.washington.edu (guy kuo)\nsubject: si clock poll - final call\nsummary: final call for si clock reports\nkeywords: si,acceleration,clock,upgrade\narticle-i.d.: shelley.1qvfo9innc3s\norganization: university of washington\nlines: 

In [8]:
# Tokenization

# `tokenized_words` is reset here too; it is filled in by the word-tokenization cell below.
tokenized_words = []

# Sentence Tokenization: one list of sentences per document.
# A named comprehension variable is used instead of a module-level `_`, which
# IPython uses to hold the previous cell's output.
tokenized_sentences = [sent_tokenize(document) for document in normalized_text]

print(type(tokenized_sentences))
print("\n-------------------------------- SENTENCE TOKENIZATION ---------------------------------\n")
tokenized_sentences

<class 'list'>

-------------------------------- SENTENCE TOKENIZATION ---------------------------------



[["from: lerxst@wam.umd.edu (where's my thing)\nsubject: what car is this!?",
  'nntp-posting-host: rac3.wam.umd.edu\norganization: university of maryland, college park\nlines: 15\n\n i was wondering if anyone out there could enlighten me on this car i saw\nthe other day.',
  'it was a 2-door sports car, looked to be from the late 60s/\nearly 70s.',
  'it was called a bricklin.',
  'the doors were really small.',
  'in addition,\nthe front bumper was separate from the rest of the body.',
  'this is \nall i know.',
  'if anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
  'thanks,\n- il\n   ---- brought to you by your neighborhood lerxst ----'],
 ['from: guykuo@carson.u.washington.edu (guy kuo)\nsubject: si clock poll - final call\nsummary: final call for si clock reports\nkeywords: si,acceleration,clock,upgrade\narticle-i.d.',
  ': shelley.1qvfo9innc3s\norganization

In [9]:
# Word Tokenization: one list of word/punctuation tokens per document.
# The list is rebuilt from scratch (rather than appended to) so that re-running
# this cell cannot duplicate documents. The variable is named rather than `_`,
# which IPython uses to hold the previous cell's output.
tokenized_words = [word_tokenize(document) for document in normalized_text]

print(type(tokenized_words))
print("\n-------------------------------- WORD TOKENIZATION ---------------------------------\n")
print(tokenized_words[:10])

<class 'list'>

-------------------------------- WORD TOKENIZATION ---------------------------------

[['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park', 'lines', ':', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', '.', 'it', 'was', 'a', '2-door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s', '.', 'it', 'was', 'called', 'a', 'bricklin', '.', 'the', 'doors', 'were', 'really', 'small', '.', 'in', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'this', 'is', 'all', 'i', 'know', '.', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', ',', 'engine', 'specs', ',', 'years', 'of

In [10]:
# Strip symbols and punctuation from every token; tokens that become empty are dropped.

# Matches any character that is neither a word character nor whitespace.
non_word_chars = re.compile(r'[^\w\s]')

clean_data = [
  [
    stripped
    for stripped in (non_word_chars.sub('', token) for token in doc_tokens)
    if stripped != ''
  ]
  for doc_tokens in tokenized_words
]

print("\n-------------------------------- CLEAN DATA ---------------------------------\n")
print(clean_data)


-------------------------------- CLEAN DATA ---------------------------------

[['from', 'lerxst', 'wamumdedu', 'where', 's', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntppostinghost', 'rac3wamumdedu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', 'it', 'was', 'a', '2door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', '60s', 'early', '70s', 'it', 'was', 'called', 'a', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'i', 'know', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'lookin

In [11]:
# Removing the stopwords
stopwords_list = stopwords.words('english')
# Membership tests against a set are O(1); testing against the list would
# rescan every stopword for every token.
stopwords_set = set(stopwords_list)

# Keep a handle on the punctuation-free tokens before `clean_data` is rebound.
clean_data_temp = clean_data
clean_data = [
  [word for word in _list_of_words if word not in stopwords_set]
  for _list_of_words in clean_data_temp
]

print("\n-------------------------------- CLEAN DATA ---------------------------------\n")
print(clean_data[:20])


-------------------------------- CLEAN DATA ---------------------------------

[['lerxst', 'wamumdedu', 'thing', 'subject', 'car', 'nntppostinghost', 'rac3wamumdedu', 'organization', 'university', 'maryland', 'college', 'park', 'lines', '15', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', '2door', 'sports', 'car', 'looked', 'late', '60s', 'early', '70s', 'called', 'bricklin', 'doors', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'email', 'thanks', 'il', 'brought', 'neighborhood', 'lerxst'], ['guykuo', 'carsonuwashingtonedu', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'articleid', 'shelley1qvfo9innc3s', 'organization', 'university', 'washin

In [12]:
# Stemming on the CLEAN DATA -- strips common word endings (suffixes) to reduce
# each token to a stem, e.g. 'walking' -> 'walk', 'streets' -> 'street'.

porter_stemmer = PorterStemmer()

# Example (named so that the built-in `input()` is not shadowed for the rest of the session)
example_sentence = "G. Washington is walking down the streets of Washington and is dreaming of one of the greatest economies building itself to glory"
example_stems = [porter_stemmer.stem(token) for token in example_sentence.split()]
print(example_stems)

# Keep the un-stemmed tokens reachable, then rebind `clean_data` to the stemmed version.
clean_data_temp1 = clean_data
clean_data = [
  [porter_stemmer.stem(word) for word in _list_of_words]
  for _list_of_words in clean_data_temp1
]

clean_data # stemmed data

['G.', 'washington', 'is', 'walk', 'down', 'the', 'street', 'of', 'washington', 'and', 'is', 'dream', 'of', 'one', 'of', 'the', 'greatest', 'economi', 'build', 'itself', 'to', 'glori']


[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organ',
  'univers',
  'maryland',
  'colleg',
  'park',
  'line',
  '15',
  'wonder',
  'anyon',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sport',
  'car',
  'look',
  'late',
  '60',
  'earli',
  '70',
  'call',
  'bricklin',
  'door',
  'realli',
  'small',
  'addit',
  'front',
  'bumper',
  'separ',
  'rest',
  'bodi',
  'know',
  'anyon',
  'tellm',
  'model',
  'name',
  'engin',
  'spec',
  'year',
  'product',
  'car',
  'made',
  'histori',
  'whatev',
  'info',
  'funki',
  'look',
  'car',
  'pleas',
  'email',
  'thank',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['guykuo',
  'carsonuwashingtonedu',
  'guy',
  'kuo',
  'subject',
  'si',
  'clock',
  'poll',
  'final',
  'call',
  'summari',
  'final',
  'call',
  'si',
  'clock',
  'report',
  'keyword',
  'si',
  'acceler',
  'clock',
  'upgrad',
  'articleid',
  'shelley1qvfo9innc3',


In [13]:
# Lemmatization

lemmatizer = WordNetLemmatizer()

# NOTE(review): the tokens lemmatized here are the already-stemmed `clean_data`
# from the previous cell, and lemmatize() is called without a part-of-speech
# argument. The printed result contains 'plea' (apparently from the stem 'pleas'
# of 'please') -- confirm that lemmatizing stems rather than the original words
# is intended.
clean_data_temp1 = clean_data
clean_data = [
  [lemmatizer.lemmatize(token) for token in doc_tokens]
  for doc_tokens in clean_data_temp1
]

In [14]:
print(clean_data[1]) # second document after the whole pipeline (lower-casing, punctuation and stopword removal, stemming, lemmatization)

['guykuo', 'carsonuwashingtonedu', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summari', 'final', 'call', 'si', 'clock', 'report', 'keyword', 'si', 'acceler', 'clock', 'upgrad', 'articleid', 'shelley1qvfo9innc3', 'organ', 'univers', 'washington', 'line', '11', 'nntppostinghost', 'carsonuwashingtonedu', 'fair', 'number', 'brave', 'soul', 'upgrad', 'si', 'clock', 'oscil', 'share', 'experi', 'poll', 'plea', 'send', 'brief', 'messag', 'detail', 'experi', 'procedur', 'top', 'speed', 'attain', 'cpu', 'rate', 'speed', 'add', 'card', 'adapt', 'heat', 'sink', 'hour', 'usag', 'per', 'day', 'floppi', 'disk', 'function', '800', '14', 'floppi', 'especi', 'request', 'summar', 'next', 'two', 'day', 'plea', 'add', 'network', 'knowledg', 'base', 'done', 'clock', 'upgrad', 'nt', 'answer', 'poll', 'thank', 'guy', 'kuo', 'guykuo', 'uwashingtonedu']
