<a href="https://colab.research.google.com/github/sunilkonatham/basicPythonForDataScience/blob/master/Advance_text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [3]:
doc = ['How is your week going its going good']

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
tok = Tokenizer()

In [8]:
tok.fit_on_texts(doc)

In [9]:
tok.word_index

{'going': 1, 'good': 7, 'how': 2, 'is': 3, 'its': 6, 'week': 5, 'your': 4}

In [11]:
tok.texts_to_sequences(doc)

[[2, 3, 4, 5, 1, 6, 1, 7]]

In [12]:
test = ['how is your week going', 'hope next week will be better', 'it was good', 'today is friday', 'and then comes saturday', 'SAturday and sunday are holidays', 'I dont know what Im writing']

In [13]:
df = pd.DataFrame({'test_data':test})

In [15]:
df.head(10)

Unnamed: 0,test_data
0,how is your week going
1,hope next week will be better
2,it was good
3,today is friday
4,and then comes saturday
5,SAturday and sunday are holidays
6,I dont know what Im writing


In [16]:
df['sequence'] = df['test_data'].apply(lambda x : tok.texts_to_sequences([x]))

In [17]:
df

Unnamed: 0,test_data,sequence
0,how is your week going,"[[2, 3, 4, 5, 1]]"
1,hope next week will be better,[[5]]
2,it was good,[[7]]
3,today is friday,[[3]]
4,and then comes saturday,[[]]
5,SAturday and sunday are holidays,[[]]
6,I dont know what Im writing,[[]]


In [19]:
#Out of vocabulary tokenizer
tok_oov = Tokenizer(oov_token='<unk>')

In [24]:
tok_oov.fit_on_texts(doc)

In [25]:
tok_oov.word_index

{'<unk>': 1,
 'data': 4,
 'going': 2,
 'good': 11,
 'how': 6,
 'is': 7,
 'its': 10,
 'sequence': 5,
 'test': 3,
 'week': 9,
 'your': 8}

In [26]:
df['tok_oov_sequence'] = df['test_data'].apply(lambda x : tok_oov.texts_to_sequences([x]))

In [27]:
df

Unnamed: 0,test_data,sequence,tok_oov_sequence
0,how is your week going,"[[2, 3, 4, 5, 1]]","[[6, 7, 8, 9, 2]]"
1,hope next week will be better,[[5]],"[[1, 1, 9, 1, 1, 1]]"
2,it was good,[[7]],"[[1, 1, 11]]"
3,today is friday,[[3]],"[[1, 7, 1]]"
4,and then comes saturday,[[]],"[[1, 1, 1, 1]]"
5,SAturday and sunday are holidays,[[]],"[[1, 1, 1, 1, 1]]"
6,I dont know what Im writing,[[]],"[[1, 1, 1, 1, 1, 1]]"


In [28]:
#Padding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
df['pre-padding'] = df['tok_oov_sequence'].apply(lambda x: pad_sequences(x,maxlen=4, padding='pre', truncating='pre', value=0))

In [31]:
df['post-padding'] = df['tok_oov_sequence'].apply(lambda x: pad_sequences(x,maxlen=4, padding='post', truncating='post', value=0))

In [32]:
df

Unnamed: 0,test_data,sequence,tok_oov_sequence,pre-padding,post-padding
0,how is your week going,"[[2, 3, 4, 5, 1]]","[[6, 7, 8, 9, 2]]","[[7, 8, 9, 2]]","[[6, 7, 8, 9]]"
1,hope next week will be better,[[5]],"[[1, 1, 9, 1, 1, 1]]","[[9, 1, 1, 1]]","[[1, 1, 9, 1]]"
2,it was good,[[7]],"[[1, 1, 11]]","[[0, 1, 1, 11]]","[[1, 1, 11, 0]]"
3,today is friday,[[3]],"[[1, 7, 1]]","[[0, 1, 7, 1]]","[[1, 7, 1, 0]]"
4,and then comes saturday,[[]],"[[1, 1, 1, 1]]","[[1, 1, 1, 1]]","[[1, 1, 1, 1]]"
5,SAturday and sunday are holidays,[[]],"[[1, 1, 1, 1, 1]]","[[1, 1, 1, 1]]","[[1, 1, 1, 1]]"
6,I dont know what Im writing,[[]],"[[1, 1, 1, 1, 1, 1]]","[[1, 1, 1, 1]]","[[1, 1, 1, 1]]"


In [33]:
#Stemmer and lemmatizer
import nltk

In [34]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [35]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [36]:
ps = PorterStemmer()
lm = WordNetLemmatizer()

In [37]:
df['Stemmer'] = df['test_data'].apply(lambda x: ps.stem(x))
df['lemmatizer'] = df['test_data'].apply(lambda x: lm.lemmatize(x))

In [38]:
df

Unnamed: 0,test_data,sequence,tok_oov_sequence,pre-padding,post-padding,Stemmer,lemmatizer
0,how is your week going,"[[2, 3, 4, 5, 1]]","[[6, 7, 8, 9, 2]]","[[7, 8, 9, 2]]","[[6, 7, 8, 9]]",how is your week go,how is your week going
1,hope next week will be better,[[5]],"[[1, 1, 9, 1, 1, 1]]","[[9, 1, 1, 1]]","[[1, 1, 9, 1]]",hope next week will be bett,hope next week will be better
2,it was good,[[7]],"[[1, 1, 11]]","[[0, 1, 1, 11]]","[[1, 1, 11, 0]]",it was good,it was good
3,today is friday,[[3]],"[[1, 7, 1]]","[[0, 1, 7, 1]]","[[1, 7, 1, 0]]",today is friday,today is friday
4,and then comes saturday,[[]],"[[1, 1, 1, 1]]","[[1, 1, 1, 1]]","[[1, 1, 1, 1]]",and then comes saturday,and then comes saturday
5,SAturday and sunday are holidays,[[]],"[[1, 1, 1, 1, 1]]","[[1, 1, 1, 1]]","[[1, 1, 1, 1]]",saturday and sunday are holiday,SAturday and sunday are holidays
6,I dont know what Im writing,[[]],"[[1, 1, 1, 1, 1, 1]]","[[1, 1, 1, 1]]","[[1, 1, 1, 1]]",i dont know what im writ,I dont know what Im writing
