In [22]:
# importing libraries

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WhitespaceTokenizer # 1. Tokenization
from string import punctuation                                              # 3. remove punctuations
from nltk.corpus import stopwords                                           # 4. remove stopwords
import contractions                                                         # 5. contraction mapping
from nltk.stem import WordNetLemmatizer, LancasterStemmer                   # 6. Stemming and Lematization
from unidecode import unidecode                                             # 7. handling accented character

In [2]:
# pip install contractions

In [3]:
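# One-time setup (uncomment and run if the NLTK data is missing): the cells below
# rely on NLTK tokenizer models, the stopwords corpus and WordNet.
# nltk.download()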

In [4]:
# pip install nltk

In [5]:
# Sample passage that every preprocessing step below operates on.
# NOTE(review): the passage contains misspellings ('wit', 'Moo') and run-on
# sentences (no full stop after 'days' or '1969') - presumably deliberate raw
# input for the demo; confirm before correcting, since the outputs depend on it.
text = """The Moon is a barren, rocky world without air and water. 
It has dark lava plain on its surface. The Moon is filled wit craters. 
It has no light of its own. It gets its light from the Sun. The Moo keeps changing its
shape as it moves round the Earth. It spins on its axis in 27.3 days stars were named after
the Edwin Aldrin were the first ones to set their foot on the Moon on 21 
July 1969 They reached the Moon in their space craft named Apollo II."""

## Preprocessing Steps

### 1. Tokenization 

In [6]:
# Sentence-level tokenization: split the passage into a list of sentence strings.
# The language is spelled out explicitly; 'english' is also the library default.
tokens_sent = sent_tokenize(text, language='english')
tokens_sent

['The Moon is a barren, rocky world without air and water.',
 'It has dark lava plain on its surface.',
 'The Moon is filled wit craters.',
 'It has no light of its own.',
 'It gets its light from the Sun.',
 'The Moo keeps changing its\nshape as it moves round the Earth.',
 'It spins on its axis in 27.3 days stars were named after\nthe Edwin Aldrin were the first ones to set their foot on the Moon on 21 \nJuly 1969 They reached the Moon in their space craft named Apollo II.']

In [7]:
# Word-level tokenization of the full passage; punctuation marks come out
# as separate tokens.
token_words = word_tokenize(text)

# Short single-sentence sample. It is not tokenized in this cell; the
# whitespace-tokenizer cell that follows reads it.
text1 = 'The Moon is a barren, rocky world without air and water.'

token_words

['The',
 'Moon',
 'is',
 'a',
 'barren',
 ',',
 'rocky',
 'world',
 'without',
 'air',
 'and',
 'water',
 '.',
 'It',
 'has',
 'dark',
 'lava',
 'plain',
 'on',
 'its',
 'surface',
 '.',
 'The',
 'Moon',
 'is',
 'filled',
 'wit',
 'craters',
 '.',
 'It',
 'has',
 'no',
 'light',
 'of',
 'its',
 'own',
 '.',
 'It',
 'gets',
 'its',
 'light',
 'from',
 'the',
 'Sun',
 '.',
 'The',
 'Moo',
 'keeps',
 'changing',
 'its',
 'shape',
 'as',
 'it',
 'moves',
 'round',
 'the',
 'Earth',
 '.',
 'It',
 'spins',
 'on',
 'its',
 'axis',
 'in',
 '27.3',
 'days',
 'stars',
 'were',
 'named',
 'after',
 'the',
 'Edwin',
 'Aldrin',
 'were',
 'the',
 'first',
 'ones',
 'to',
 'set',
 'their',
 'foot',
 'on',
 'the',
 'Moon',
 'on',
 '21',
 'July',
 '1969',
 'They',
 'reached',
 'the',
 'Moon',
 'in',
 'their',
 'space',
 'craft',
 'named',
 'Apollo',
 'II',
 '.']

In [8]:
# WhitespaceTokenizer splits on whitespace only (much like str.split()), so
# punctuation stays attached to the neighbouring word ('barren,', 'water.').
whitespace_tokenizer = WhitespaceTokenizer()
ws_tokens = whitespace_tokenizer.tokenize(text1)
ws_tokens


['The',
 'Moon',
 'is',
 'a',
 'barren,',
 'rocky',
 'world',
 'without',
 'air',
 'and',
 'water.']

### 2. Normalization

In [9]:
# Lower-case every token so that e.g. 'The' and 'the' count as the same word.
token_lower_text = list(map(str.lower, token_words))
token_lower_text

['the',
 'moon',
 'is',
 'a',
 'barren',
 ',',
 'rocky',
 'world',
 'without',
 'air',
 'and',
 'water',
 '.',
 'it',
 'has',
 'dark',
 'lava',
 'plain',
 'on',
 'its',
 'surface',
 '.',
 'the',
 'moon',
 'is',
 'filled',
 'wit',
 'craters',
 '.',
 'it',
 'has',
 'no',
 'light',
 'of',
 'its',
 'own',
 '.',
 'it',
 'gets',
 'its',
 'light',
 'from',
 'the',
 'sun',
 '.',
 'the',
 'moo',
 'keeps',
 'changing',
 'its',
 'shape',
 'as',
 'it',
 'moves',
 'round',
 'the',
 'earth',
 '.',
 'it',
 'spins',
 'on',
 'its',
 'axis',
 'in',
 '27.3',
 'days',
 'stars',
 'were',
 'named',
 'after',
 'the',
 'edwin',
 'aldrin',
 'were',
 'the',
 'first',
 'ones',
 'to',
 'set',
 'their',
 'foot',
 'on',
 'the',
 'moon',
 'on',
 '21',
 'july',
 '1969',
 'they',
 'reached',
 'the',
 'moon',
 'in',
 'their',
 'space',
 'craft',
 'named',
 'apollo',
 'ii',
 '.']

### 3. Remove Punctuation

In [10]:
# Display the punctuation characters imported from the `string` module
# (a single str, not a list).
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
# Drop every token that consists solely of punctuation characters.
#
# `punctuation` is one string, so `token not in punctuation` is a substring
# test: it only catches tokens that happen to appear contiguously inside that
# string and misses multi-character punctuation tokens such as '...', '--'
# or "''". Checking character by character removes those as well, while
# tokens that also contain letters or digits (e.g. '27.3') are kept.
punctuation_chars = set(punctuation)
tokens_without_punc = [
    token
    for token in token_lower_text
    if not all(char in punctuation_chars for char in token)
]
tokens_without_punc

['the',
 'moon',
 'is',
 'a',
 'barren',
 'rocky',
 'world',
 'without',
 'air',
 'and',
 'water',
 'it',
 'has',
 'dark',
 'lava',
 'plain',
 'on',
 'its',
 'surface',
 'the',
 'moon',
 'is',
 'filled',
 'wit',
 'craters',
 'it',
 'has',
 'no',
 'light',
 'of',
 'its',
 'own',
 'it',
 'gets',
 'its',
 'light',
 'from',
 'the',
 'sun',
 'the',
 'moo',
 'keeps',
 'changing',
 'its',
 'shape',
 'as',
 'it',
 'moves',
 'round',
 'the',
 'earth',
 'it',
 'spins',
 'on',
 'its',
 'axis',
 'in',
 '27.3',
 'days',
 'stars',
 'were',
 'named',
 'after',
 'the',
 'edwin',
 'aldrin',
 'were',
 'the',
 'first',
 'ones',
 'to',
 'set',
 'their',
 'foot',
 'on',
 'the',
 'moon',
 'on',
 '21',
 'july',
 '1969',
 'they',
 'reached',
 'the',
 'moon',
 'in',
 'their',
 'space',
 'craft',
 'named',
 'apollo',
 'ii']

### 4. Remove Stopwords

In [12]:
# English stopword list from the NLTK `stopwords` corpus (lower-case strings,
# as the output below shows), used by the filtering cell that follows.
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
# Remove English stopwords from the punctuation-free tokens.
# Membership is tested against a set built once from `stop_words`, so each
# lookup is a hash probe instead of a scan of the whole list; the resulting
# token list is the same. `stop_words` itself is left unchanged.
stop_word_lookup = set(stop_words)
token_without_stopwords = [
    token for token in tokens_without_punc if token not in stop_word_lookup
]
token_without_stopwords

['moon',
 'barren',
 'rocky',
 'world',
 'without',
 'air',
 'water',
 'dark',
 'lava',
 'plain',
 'surface',
 'moon',
 'filled',
 'wit',
 'craters',
 'light',
 'gets',
 'light',
 'sun',
 'moo',
 'keeps',
 'changing',
 'shape',
 'moves',
 'round',
 'earth',
 'spins',
 'axis',
 '27.3',
 'days',
 'stars',
 'named',
 'edwin',
 'aldrin',
 'first',
 'ones',
 'set',
 'foot',
 'moon',
 '21',
 'july',
 '1969',
 'reached',
 'moon',
 'space',
 'craft',
 'named',
 'apollo',
 'ii']

### 5. Contraction Mapping

In [14]:
# Expand the contraction in a sample sentence ("didn't" -> "did not").
# NOTE: this reassigns `text1`, replacing the sample sentence assigned in the
# word-tokenization cell; re-running earlier cells after this one will see
# the new value.
text1 = "I didn't like the movie"
expanded_text = contractions.fix(text1)
expanded_text

'I did not like the movie'

In [16]:
# Second contraction example ("can't" -> "cannot").
# NOTE: both `text1` and `expanded_text` are overwritten here, so only this
# example's values remain in the kernel afterwards.
text1 = "I can't like the movie"
expanded_text = contractions.fix(text1)
expanded_text

'I cannot like the movie'

### 6. Stemming and Lemmatization

In [20]:
# Compare the Lancaster stem and the WordNet lemma of every remaining token.
# The lemmatizer is called without a part-of-speech argument.
stemming = LancasterStemmer()
lemma = WordNetLemmatizer()
separator = '*' * 50

for token in token_without_stopwords:
    # Compute both forms first, then emit the four report lines in one call.
    stemmed = stemming.stem(token)
    lemmatized = lemma.lemmatize(token)
    print(
        f'word : {token}',
        f'Stem word : {stemmed}',
        f'Lemma word : {lemmatized}',
        separator,
        sep='\n',
    )

word : moon
Stem word : moon
Lemma word : moon
**************************************************
word : barren
Stem word : bar
Lemma word : barren
**************************************************
word : rocky
Stem word : rocky
Lemma word : rocky
**************************************************
word : world
Stem word : world
Lemma word : world
**************************************************
word : without
Stem word : without
Lemma word : without
**************************************************
word : air
Stem word : air
Lemma word : air
**************************************************
word : water
Stem word : wat
Lemma word : water
**************************************************
word : dark
Stem word : dark
Lemma word : dark
**************************************************
word : lava
Stem word : lav
Lemma word : lava
**************************************************
word : plain
Stem word : plain
Lemma word : plain
**************************************************
wo

### 7. Handling Accented Characters

In [23]:
# Transliterate accented letters to plain ASCII with unidecode
# (e.g. 'á' -> 'a', 'É' -> 'E', as the output below shows).
accented_character = 'á, Á, å, é, É'
fixed_words = unidecode(accented_character)
fixed_words

'a, A, a, e, E'