# Text cleaning

In [41]:
sample_txt = "&&& ^@@^ 11111     Hey! did you know that the summer      break is coming? Amazing right!! It's only 5 more days!! Let's go hiking! I'd like to know how do you think about it!" 

In [2]:
### Lowercasing
lower_txt = sample_txt.lower()
lower_txt

'&&& ^@@^ hey, did you know that the summer break is coming? amazing right!! it’s only 5 more days!! let’s go hiking!'

In [3]:
### Removing Punctuation
### There are multiple ways to remove punctuations

# Opeion 1 （it cannot deal with non-English punctuations)
import string
punc_removed = sample_txt.translate(str.maketrans('', '', string.punctuation))
punc_removed

'  Hey did you know that the summer break is coming Amazing right It’s only 5 more days Let’s go hiking'

In [4]:
# Option 2
import re
punc_removed = re.sub(r'[^\w\s]', '', sample_txt)
punc_removed

'  Hey did you know that the summer break is coming Amazing right Its only 5 more days Lets go hiking'

In [11]:
# Option 3
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~’‘”“'''
punc_removed = [s for s in sample_txt if s not in punc ]
punc_removed = ''.join(p for p in punc_removed)
punc_removed

'  Hey did you know that the summer break is coming Amazing right Its only 5 more days Lets go hiking'

In [14]:
### Removing numbers

# Option 1
import string
translation_table = str.maketrans('', '', string.digits)
num_removed = sample_txt.translate(translation_table)
num_removed

'&&& ^@@^  Hey, did you know that the summer break is coming? Amazing right!! It’s only  more days!! Let’s go hiking!'

In [15]:
# Option 2
import re
pattern = r'[0-9]'
num_removed = re.sub(pattern,'',sample_txt)
num_removed

'&&& ^@@^  Hey, did you know that the summer break is coming? Amazing right!! It’s only  more days!! Let’s go hiking!'

In [59]:
# Option 3
num_removed = ''.join((s for s in sample_txt if not s.isdigit()))
num_removed

"&&& ^@@^  Hey! did you know that the summer break is coming? Amazing right!! It's only  more days!! Let's go hiking! I'd like to know how do you think about it!"

In [21]:
### Expanding contractions

import contractions
contr_expanded = contractions.fix(sample_txt)
contr_expanded

'&&& ^@@^ 11111 Hey, did you know that the summer break is coming? Amazing right!! It is only 5 more days!! Let us go hiking! I would like to know how do you think about it!'

In [45]:
### Removing extra white spaces

wt_space_removed = ' '.join(s for s in sample_txt.split())
wt_space_removed

"&&& ^@@^ 11111 Hey! did you know that the summer break is coming? Amazing right!! It's only 5 more days!! Let's go hiking! I'd like to know how do you think about it!"

In [64]:
### Transfering a cleaned sentence to a list of words will make the following steps easier
### Because the following steps are exected at the word level

lower_txt = sample_txt.lower()
punc_removed = re.sub(r'[^\w\s]', '', lower_txt)
num_removed = ''.join((s for s in punc_removed if not s.isdigit()))
contr_expanded = contractions.fix(num_removed)
wt_space_removed = ' '.join(s for s in contr_expanded.split())

word_lst = [w for w in wt_space_removed.split()]
word_lst

['hey',
 'did',
 'you',
 'know',
 'that',
 'the',
 'summer',
 'break',
 'is',
 'coming',
 'amazing',
 'right',
 'its',
 'only',
 'more',
 'days',
 'let',
 'us',
 'go',
 'hiking',
 'id',
 'like',
 'to',
 'know',
 'how',
 'do',
 'you',
 'think',
 'about',
 'it']

# Removing stopwords

In [66]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
en_stopwords = stopwords.words('english')

# you can expand the stopwords list:
en_stopwords += ['go','day']

stw_removed = [s for s in word_lst if s.lower() not in en_stopwords]
stw_removed

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['hey',
 'know',
 'summer',
 'break',
 'coming',
 'amazing',
 'right',
 'days',
 'let',
 'us',
 'hiking',
 'id',
 'like',
 'know',
 'think']

# Stemming & Lemmatization

In [67]:
### Stemming

from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_word_lst = [ps.stem(w) for w in word_lst]
stemmed_word_lst

['hey',
 'did',
 'you',
 'know',
 'that',
 'the',
 'summer',
 'break',
 'is',
 'come',
 'amaz',
 'right',
 'it',
 'onli',
 'more',
 'day',
 'let',
 'us',
 'go',
 'hike',
 'id',
 'like',
 'to',
 'know',
 'how',
 'do',
 'you',
 'think',
 'about',
 'it']

In [70]:
### Lemmatization

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
wnl = WordNetLemmatizer()
lemmatized_word_lst = [wnl.lemmatize(w) for w in word_lst]
lemmatized_word_lst

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...


['hey',
 'did',
 'you',
 'know',
 'that',
 'the',
 'summer',
 'break',
 'is',
 'coming',
 'amazing',
 'right',
 'it',
 'only',
 'more',
 'day',
 'let',
 'u',
 'go',
 'hiking',
 'id',
 'like',
 'to',
 'know',
 'how',
 'do',
 'you',
 'think',
 'about',
 'it']

# Tokenization

In [76]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Note: if we don't remove punctuations or numbers in advance, they will also be tokenized
tokenized = word_tokenize(sample_txt)
tokenized_clean = word_tokenize(' '.join(w for w in word_lst))
tokenized,tokenized_clean

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(['&',
  '&',
  '&',
  '^',
  '@',
  '@',
  '^',
  '11111',
  'Hey',
  '!',
  'did',
  'you',
  'know',
  'that',
  'the',
  'summer',
  'break',
  'is',
  'coming',
  '?',
  'Amazing',
  'right',
  '!',
  '!',
  'It',
  "'s",
  'only',
  '5',
  'more',
  'days',
  '!',
  '!',
  'Let',
  "'s",
  'go',
  'hiking',
  '!',
  'I',
  "'d",
  'like',
  'to',
  'know',
  'how',
  'do',
  'you',
  'think',
  'about',
  'it',
  '!'],
 ['hey',
  'did',
  'you',
  'know',
  'that',
  'the',
  'summer',
  'break',
  'is',
  'coming',
  'amazing',
  'right',
  'its',
  'only',
  'more',
  'days',
  'let',
  'us',
  'go',
  'hiking',
  'id',
  'like',
  'to',
  'know',
  'how',
  'do',
  'you',
  'think',
  'about',
  'it'])

# POS tagging

In [79]:
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')
word_tokens = word_tokenize(sample_txt)
pos_tagged = nltk.pos_tag(word_tokens)
pos_tagged

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('&', 'CC'),
 ('&', 'CC'),
 ('&', 'CC'),
 ('^', 'NNP'),
 ('@', 'NNP'),
 ('@', 'NNP'),
 ('^', 'VBD'),
 ('11111', 'CD'),
 ('Hey', 'NNP'),
 ('!', '.'),
 ('did', 'VBD'),
 ('you', 'PRP'),
 ('know', 'VBP'),
 ('that', 'IN'),
 ('the', 'DT'),
 ('summer', 'NN'),
 ('break', 'NN'),
 ('is', 'VBZ'),
 ('coming', 'VBG'),
 ('?', '.'),
 ('Amazing', 'VBG'),
 ('right', 'RB'),
 ('!', '.'),
 ('!', '.'),
 ('It', 'PRP'),
 ("'s", 'VBZ'),
 ('only', 'RB'),
 ('5', 'CD'),
 ('more', 'JJR'),
 ('days', 'NNS'),
 ('!', '.'),
 ('!', '.'),
 ('Let', 'VB'),
 ("'s", 'POS'),
 ('go', 'VB'),
 ('hiking', 'NN'),
 ('!', '.'),
 ('I', 'PRP'),
 ("'d", 'MD'),
 ('like', 'VB'),
 ('to', 'TO'),
 ('know', 'VB'),
 ('how', 'WRB'),
 ('do', 'VB'),
 ('you', 'PRP'),
 ('think', 'VB'),
 ('about', 'IN'),
 ('it', 'PRP'),
 ('!', '.')]