# Introduction to NLP - Text Preprocessing

In [1]:
import nltk

## Lemmatization vs. Stemming

In [2]:
# Download the WordNet corpus (presumably the data WordNetLemmatizer reads — confirm against NLTK docs).
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# One instance of each tool, kept at notebook level so display_lemma_porter can use them.
porter_stemmer = PorterStemmer()
wordnet_lemma = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def display_lemma_porter(text):
    """Print a three-column table comparing lemmatization with stemming.

    For every word the table shows the word itself, the result of
    ``wordnet_lemma.lemmatize`` (called without a part-of-speech argument)
    and the result of ``porter_stemmer.stem``.

    Args:
        text: Iterable of word strings; one table row is printed per item.
    """
    # Header and separator first; every column is padded to 12 characters.
    print(f"{'word':<12} \t {'lemma':<12} \t {'stem':<12}")
    print('-'*50)

    for token in text:
        lemma = wordnet_lemma.lemmatize(token)
        stem = porter_stemmer.stem(token)
        print(f'{token:12} \t {lemma:12} \t {stem:12}')

In [4]:
# Several inflected forms of "fly", to compare what the lemmatizer and the stemmer return for each.
word_list = ['fly', 'flies', 'flying', 'flew', 'flown']
display_lemma_porter(word_list)

word         	 lemma        	 stem        
--------------------------------------------------
fly          	 fly          	 fli         
flies        	 fly          	 fli         
flying       	 flying       	 fli         
flew         	 flew         	 flew        
flown        	 flown        	 flown       


In [5]:
# Three different words that share a prefix: the printed table shows the stemmer
# reducing all of them to 'univers' while the lemmatizer leaves each one unchanged.
word_list = ['universe', 'university', 'universal']
display_lemma_porter(word_list)

word         	 lemma        	 stem        
--------------------------------------------------
universe     	 universe     	 univers     
university   	 university   	 univers     
universal    	 universal    	 univers     


In [6]:
# Tokenise a full sentence by whitespace only, so punctuation stays attached
# to its word (the last token in the printed table is 'errors.').
word_list = "The formatting operations described here exhibit a variety of quirks the lead to a number of common errors.".split()
display_lemma_porter(word_list)

word         	 lemma        	 stem        
--------------------------------------------------
The          	 The          	 the         
formatting   	 formatting   	 format      
operations   	 operation    	 oper        
described    	 described    	 describ     
here         	 here         	 here        
exhibit      	 exhibit      	 exhibit     
a            	 a            	 a           
variety      	 variety      	 varieti     
of           	 of           	 of          
quirks       	 quirk        	 quirk       
the          	 the          	 the         
lead         	 lead         	 lead        
to           	 to           	 to          
a            	 a            	 a           
number       	 number       	 number      
of           	 of           	 of          
common       	 common       	 common      
errors.      	 errors.      	 errors.     


## Stopword Removal

In [7]:
import nltk

In [8]:
# Download the stopword corpus, then load the English list into a notebook-level variable.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Preview the first ten entries (displayed as this cell's output).
stopwords[0:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [9]:
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stopwords.

    Matching is case-insensitive: both the tokens and the stopword entries are
    lower-cased before comparison, so capitalised tokens such as 'The' or 'Is'
    are filtered as well. Tokens that are kept are returned unchanged, in
    their original order.

    Args:
        text: Iterable of token strings.
        stop_words: Optional iterable of stopword strings. When omitted, the
            notebook-level ``stopwords`` list is used.

    Returns:
        A new list containing the tokens that are not stopwords.
    """
    if stop_words is None:
        # Fall back to the list loaded at notebook level.
        stop_words = stopwords

    # A set gives constant-time membership tests instead of scanning the list per token.
    lookup = {entry.lower() for entry in stop_words}
    return [token for token in text if token.lower() not in lookup]

In [10]:
# Whitespace split only, so punctuation stays attached to tokens ('(Thai:', 'Thailand,', 'Asia.').
# NOTE(review): the sample sentence has no space in 'inSoutheast', so it is treated as a
# single token and appears unchanged in the output — confirm whether that is intentional.
text = 'Thailand (Thai: ประเทศไทย), known formerly as Siam and officially as the Kingdom of Thailand, is a country inSoutheast Asia.'.split()
print('Original text:', text)
print('Remove stopword:', remove_stopwords(text))

Original text: ['Thailand', '(Thai:', 'ประเทศไทย),', 'known', 'formerly', 'as', 'Siam', 'and', 'officially', 'as', 'the', 'Kingdom', 'of', 'Thailand,', 'is', 'a', 'country', 'inSoutheast', 'Asia.']
Remove stopword: ['Thailand', '(Thai:', 'ประเทศไทย),', 'known', 'formerly', 'Siam', 'officially', 'Kingdom', 'Thailand,', 'country', 'inSoutheast', 'Asia.']


## Normalisation

In [11]:
# Lookup table mapping informal spellings, abbreviations and emoticons to a normalised form.
norm_dict = {
    # All shorthand spellings of "tomorrow" share one target.
    **dict.fromkeys(['2moro', '2mrrw', '2morrow', '2mrw', 'tomrw'], 'tomorrow'),
    'b4': 'before',
    'otw': 'on the way',
    # Both emoticons map to the same word.
    **dict.fromkeys([':)', ';-)'], 'smile'),
}

In [12]:
def normalise(text):
    """Replace every token that has an entry in ``norm_dict`` with its mapped form.

    Args:
        text: Iterable of tokens.

    Returns:
        A new list, in the same order as the input, where tokens found in
        ``norm_dict`` are replaced by their mapped value and all other tokens
        are passed through unchanged.
    """
    # dict.get with the token itself as default leaves unknown tokens as they are.
    return [norm_dict.get(token, token) for token in text]

In [14]:
# Informal spellings that all have an entry in norm_dict; the normalised list is the cell's output.
word_list = ['2moro', '2mrrw', '2morrow', '2mrw', 'tomrw', 'b4']
normalise(word_list) 

['tomorrow', 'tomorrow', 'tomorrow', 'tomorrow', 'tomorrow', 'before']

## Noise Removal

In [15]:
import pandas as pd
import re

In [16]:
def scrub_words(text):
    """Strip markup-like tags, non-word characters and digits from a string.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Only word characters that are not digits remain
        (letters and underscores); punctuation, whitespace and digits are removed.
    """
    # Remove anything shaped like <...>; the non-greedy match stops at the first '>'.
    without_tags = re.sub("(<.*?>)", "", text)

    # Remove every non-word character (\W covers punctuation, whitespace and any
    # leftover '<' or '>') and every digit. For str patterns \W is Unicode-aware,
    # so letters outside ASCII count as word characters and are kept.
    word_chars_only = re.sub("(\\W|\\d)", "", without_tags)

    # Trim surrounding whitespace. The substitution above has already removed
    # whitespace, so this is a safety net rather than an active step.
    return word_chars_only.strip()

In [17]:
# Noisy variants of the same word: surrounding dots, a stray '<', punctuation, a tag pair, a numeric prefix.
raw_words = ["..trouble..", "trouble<", "trouble!", "<a>trouble</a>", '1.trouble'] 
cleaned_words = [scrub_words(w) for w in raw_words] 
# Put raw and cleaned forms side by side for comparison.
stemdf = pd.DataFrame({ 'raw_word' : raw_words, 'cleaned_word' : cleaned_words }) 
# Select the two columns explicitly so the display order is raw first, cleaned second.
stemdf = stemdf[['raw_word', 'cleaned_word']] 
stemdf 

Unnamed: 0,raw_word,cleaned_word
0,..trouble..,trouble
1,trouble<,trouble
2,trouble!,trouble
3,<a>trouble</a>,trouble
4,1.trouble,trouble


## Text Enrichment / Augmentation

In [19]:
from nltk.corpus import wordnet

In [20]:
# Look up every WordNet synset for "program"; the output lists both noun (.n.) and verb (.v.) entries.
syns = wordnet.synsets("program")
syns

[Synset('plan.n.01'),
 Synset('program.n.02'),
 Synset('broadcast.n.02'),
 Synset('platform.n.02'),
 Synset('program.n.05'),
 Synset('course_of_study.n.01'),
 Synset('program.n.07'),
 Synset('program.n.08'),
 Synset('program.v.01'),
 Synset('program.v.02')]

In [21]:
# Name of the first lemma of each synset; names repeat in the output because
# several synsets have the same first lemma.
[s.lemmas()[0].name() for s in syns]

['plan',
 'program',
 'broadcast',
 'platform',
 'program',
 'course_of_study',
 'program',
 'program',
 'program',
 'program']