## Example of preprocessing raw corpus text

In [1]:
import nltk
#nltk.download('stopwords') # download and update package stopwords
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.porter import PorterStemmer

from nltk.tokenize import RegexpTokenizer

In [2]:
# Porter stemmer instance; it is used by the stemming cell near the end of the notebook.
stemmer = PorterStemmer()

# Sample paragraph that serves as the raw input for every preprocessing step below.
# It contains mixed case, punctuation, curly quotes and an abbreviation ("E.g."),
# and its leading indentation / line breaks are part of the string.
raw_text = """ The next preprocessing step is breaking up the units of text into individual words or tokens... As mentioned 
                earlier, stopwords are very common words. Words like “we” and “are” probably do not help at all in NLP tasks
                such as sentiment analysis or text classifications. Hence, we can remove stopwords to save computing time and
                efforts in processing large volumes of text. In our case, we used spaCy’s inbuilt stopwords,
                but we should be cautious and modify the stopwords list accordingly. E.g., 
                for sentiment analysis, the word “not” is important in the meaning of a text such as “not good”.
                However, spaCy included “not” as a stopword. 
            """

In [3]:
# Lower-case the whole text so that tokens differing only in case
# (e.g. "Words" / "words") end up as the same token in the later steps.
text = raw_text.lower()
print("text :", text)
print("\n")  # extra blank lines after the printed text

text :  the next preprocessing step is breaking up the units of text into individual words or tokens... as mentioned 
                earlier, stopwords are very common words. words like “we” and “are” probably do not help at all in nlp tasks
                such as sentiment analysis or text classifications. hence, we can remove stopwords to save computing time and
                efforts in processing large volumes of text. in our case, we used spacy’s inbuilt stopwords,
                but we should be cautious and modify the stopwords list accordingly. e.g., 
                for sentiment analysis, the word “not” is important in the meaning of a text such as “not good”.
                however, spacy included “not” as a stopword. 
            




In [4]:
# Tokenization: the pattern r'\w+' matches runs of word characters, so
# punctuation is dropped instead of being returned as separate tokens.
# Side effect visible in the output: "spacy’s" is split into 'spacy', 's'
# and "e.g." into 'e', 'g'.
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
print("tokens :", tokens)
print("\n")
print("tokens[0] :", tokens[0])

tokens : ['the', 'next', 'preprocessing', 'step', 'is', 'breaking', 'up', 'the', 'units', 'of', 'text', 'into', 'individual', 'words', 'or', 'tokens', 'as', 'mentioned', 'earlier', 'stopwords', 'are', 'very', 'common', 'words', 'words', 'like', 'we', 'and', 'are', 'probably', 'do', 'not', 'help', 'at', 'all', 'in', 'nlp', 'tasks', 'such', 'as', 'sentiment', 'analysis', 'or', 'text', 'classifications', 'hence', 'we', 'can', 'remove', 'stopwords', 'to', 'save', 'computing', 'time', 'and', 'efforts', 'in', 'processing', 'large', 'volumes', 'of', 'text', 'in', 'our', 'case', 'we', 'used', 'spacy', 's', 'inbuilt', 'stopwords', 'but', 'we', 'should', 'be', 'cautious', 'and', 'modify', 'the', 'stopwords', 'list', 'accordingly', 'e', 'g', 'for', 'sentiment', 'analysis', 'the', 'word', 'not', 'is', 'important', 'in', 'the', 'meaning', 'of', 'a', 'text', 'such', 'as', 'not', 'good', 'however', 'spacy', 'included', 'not', 'as', 'a', 'stopword']


tokens[0] : the


In [5]:
# Build the English stop-word set from NLTK's 'stopwords' corpus.
# The corpus must already be installed locally (see the commented-out
# nltk.download('stopwords') call in the import cell).
# A set is used because the filtering cell only needs membership tests.
stopWords = {*stopwords.words('english')}
# Bare expression so the notebook displays the resulting set.
stopWords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [6]:
# Stop-word removal: keep, in their original order, only the tokens that
# are absent from the stop-word set.
# Note: 'not' is part of NLTK's English list, so negations are dropped here too.
tokens_clean = list(filter(lambda token: token not in stopWords, tokens))
print("\n tokens_clean : ", tokens_clean)


 tokens_clean :  ['next', 'preprocessing', 'step', 'breaking', 'units', 'text', 'individual', 'words', 'tokens', 'mentioned', 'earlier', 'stopwords', 'common', 'words', 'words', 'like', 'probably', 'help', 'nlp', 'tasks', 'sentiment', 'analysis', 'text', 'classifications', 'hence', 'remove', 'stopwords', 'save', 'computing', 'time', 'efforts', 'processing', 'large', 'volumes', 'text', 'case', 'used', 'spacy', 'inbuilt', 'stopwords', 'cautious', 'modify', 'stopwords', 'list', 'accordingly', 'e', 'g', 'sentiment', 'analysis', 'word', 'important', 'meaning', 'text', 'good', 'however', 'spacy', 'included', 'stopword']


In [7]:
# Stemming: map every remaining token to its Porter stem.
# Stems are truncated forms, not necessarily dictionary words
# (e.g. 'individual' -> 'individu', 'probably' -> 'probabl').
stems = list(map(stemmer.stem, tokens_clean))
print("\n stems : ", stems)


 stems :  ['next', 'preprocess', 'step', 'break', 'unit', 'text', 'individu', 'word', 'token', 'mention', 'earlier', 'stopword', 'common', 'word', 'word', 'like', 'probabl', 'help', 'nlp', 'task', 'sentiment', 'analysi', 'text', 'classif', 'henc', 'remov', 'stopword', 'save', 'comput', 'time', 'effort', 'process', 'larg', 'volum', 'text', 'case', 'use', 'spaci', 'inbuilt', 'stopword', 'cautiou', 'modifi', 'stopword', 'list', 'accordingli', 'e', 'g', 'sentiment', 'analysi', 'word', 'import', 'mean', 'text', 'good', 'howev', 'spaci', 'includ', 'stopword']
