# Text Preprocessing

In [1]:

text = "Google's mission has always beento organize the world's information and makeit universally accessible and useful.We're excited about the transformational power of AI and the helpful new ways it can be applied. From research that expands what's possible, to product integrations designed to make <br/><br/> everyday things easier, and applying AI to make a difference in the lives of those who need it most-we're committed to responsible innovation and technologies that benefit all of humanity."


### 1. Convert to LowerCase

In [2]:
text = text.lower()

### 2. Remove HTML tags

In [3]:
import re
def remove_html_tags(text):
    """Strip HTML/XML-style tags from ``text``.

    Everything from a ``<`` up to the next ``>`` is deleted, including tags
    whose attributes wrap onto several lines.

    Args:
        text: Input string, possibly containing markup.

    Returns:
        The string with every ``<...>`` span removed. Text and whitespace
        around the tags are left untouched.
    """
    # ``[^>]*`` instead of ``.*?``: ``.`` does not match a newline, so the
    # lazy-dot pattern silently left multi-line tags in place.
    pattern = re.compile(r'<[^>]*>')
    return pattern.sub('', text)

In [4]:
remove_html_tags(text)

"google's mission has always beento organize the world's information and makeit universally accessible and useful.we're excited about the transformational power of ai and the helpful new ways it can be applied. from research that expands what's possible, to product integrations designed to make  everyday things easier, and applying ai to make a difference in the lives of those who need it most-we're committed to responsible innovation and technologies that benefit all of humanity."

### 3. Remove URLs

In [5]:
text1 = 'check out my notebook http://www.zakir.com/notef34j343hj4'

In [6]:
def remove_url(text):
    """Remove web links from ``text``.

    Two forms are recognised: links that start with ``http://`` or
    ``https://``, and scheme-less links that start with ``www.``. A link
    extends up to the next whitespace character.

    Args:
        text: Input string.

    Returns:
        The string with every recognised link deleted; whitespace around
        the link is kept.
    """
    # The alternation must sit at the top level. Nested under ``https?:`` it
    # required the scheme even for the ``www.`` branch, so bare ``www.``
    # links were never matched.
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)

In [7]:
remove_url(text1)

'check out my notebook '

### 4. Remove Punctuation

In [8]:
import string
# Characters that the punctuation helpers in this section delete.
exclude = string.punctuation
print(exclude)


def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` deleted.

    Runs one ``str.replace`` pass over the string per punctuation character.
    """
    cleaned = text
    for symbol in exclude:
        cleaned = cleaned.replace(symbol, '')
    return cleaned

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [9]:
text2 = "str. with ? punctuation !"
remove_punc(text2) # Slow process

'str with  punctuation '

In [10]:
def remove_punc1(text):
    """Delete all ``exclude`` characters from ``text`` in a single pass.

    Builds a deletion table with ``str.maketrans`` and applies it through
    ``str.translate``.
    """
    deletion_table = str.maketrans('', '', exclude)
    return text.translate(deletion_table)

In [11]:

text2 = "str. with ? punctuation !"
remove_punc1(text2) # fast process

'str with  punctuation '

### 5. Chat Word treatment

In [12]:
# Lookup table from upper-case chat abbreviations to their expansions.
# chat_conversion() upper-cases each token before looking it up here, so
# keys must stay upper-case.
chatwords = {
    "AFK": "Away From Keyboard",
    "BRB": "Be Right Back",
    "CMIIW": "Correct me if i'm wrong",
    "IKR": "I know, right",
    "ILY": "I love you",
    "IRL": "In Real Life",
    "IYKYK": "If You Know You Know",
    "LFG": "let's freaking go",
    "LMFAO": "Laughing my freaking *a* off",
    "LMK": "Let me know",
    "LOL": "Laugh out loud",
    "LTR": "Left To Right",
    "NVM": "Never mind",
    "OFC": "Of course",
    "ROFL": "Rolling on floor laughing",
    "RTL": "Right To Left",
    "SMH": "Shaking my head",
    "STFU": "Shut the *freak* up",
    "TTYL": "Talk to you later",
    "TYSM": "Thank You So Much",
    "TYVM": "Thank you very much",
    "YOLO": "You only live once"
}

In [13]:
def chat_conversion(text):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace and each token is looked up
    case-insensitively in ``chatwords``. Punctuation attached to a token
    (``"brb!"``, ``"(lol)"``) is set aside for the lookup and re-attached to
    the expansion, so abbreviations at the end of a sentence are expanded too.

    Args:
        text: Input string.

    Returns:
        The tokens joined by single spaces, with known abbreviations replaced
        by their expansions. Unknown tokens are passed through unchanged.
    """
    new_text = []
    for w in text.split():
        # Find the alphanumeric core of the token, ignoring any leading or
        # trailing punctuation.
        start, end = 0, len(w)
        while start < end and not w[start].isalnum():
            start += 1
        while end > start and not w[end - 1].isalnum():
            end -= 1

        expansion = chatwords.get(w[start:end].upper())
        if expansion is None:
            new_text.append(w)
        else:
            new_text.append(w[:start] + expansion + w[end:])
    return " ".join(new_text)

In [14]:

chat_conversion('TYSM For Your help!')

'Thank You So Much For Your help!'

### 6. Spelling Correction

In [15]:
from textblob import TextBlob

In [16]:
incorrect_text='ceertain conditions during serveal generetions are modifeid in the same maner.'
textBlb = TextBlob(incorrect_text)
textBlb.correct().string

'certain conditions during several generations are modified in the same manner.'

### 7. Removing Stopwords

In [17]:
from nltk.corpus import stopwords

In [18]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
def remove_stopwords(text):
    """Blank out English stopwords in ``text``.

    The text is split on whitespace and every token that appears in NLTK's
    English stopword list is replaced by an empty string before the tokens
    are re-joined with single spaces. Each removed word therefore leaves an
    extra space behind. The comparison is case-sensitive.

    Args:
        text: Input string.

    Returns:
        The re-joined string with stopwords blanked out.
    """
    # Load the list once and use a set: calling ``stopwords.words`` inside
    # the loop re-read the list and scanned it linearly for every token.
    english_stopwords = set(stopwords.words('english'))
    kept = [
        '' if word in english_stopwords else word
        for word in text.split()
    ]
    return " ".join(kept)

In [20]:
remove_stopwords('Elephant is one of the largest animals in the world') 

'Elephant  one   largest animals   world'

### 8. Handling Emojis

In [21]:
def remove_emoji(text):
    """Remove emoji characters from ``text``.

    Matches the supplementary-plane pictographic blocks (emoticons, symbols &
    pictographs, transport, flags, supplemental symbols) plus the BMP ranges
    that hold common emoji (miscellaneous symbols, dingbats, circled M) and
    the emoji variation selector.

    Args:
        text: Input string.

    Returns:
        The string with every matched character deleted; surrounding
        whitespace is kept.
    """
    # The earlier ranges ``\U00012702-\U0002780F`` and
    # ``\U000124C2-\U0001F251`` had a stray leading ``1``. They swallowed
    # non-emoji scripts (e.g. CJK Extension B, mathematical alphanumerics)
    # and never matched BMP dingbats such as U+2702.
    emoji_pattern = re.compile(
        "["
        "\U0001F000-\U0001FAFF"  # emoticons, pictographs, transport, flags, supplemental
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"             # circled M
        "\U0000FE0F"             # emoji variation selector
        "]+"
    )
    return emoji_pattern.sub('', text)

In [22]:
remove_emoji('it was 😊 , and I am 😎')

'it was  , and I am '

In [23]:
import emoji
print(emoji.demojize('it was 😊 , and I am 😎'))

it was :smiling_face_with_smiling_eyes: , and I am :smiling_face_with_sunglasses:


### 10. Tokenization

#### 1. Using the split function

In [24]:
#word tokenization
sent1="I am going to delhi"
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [25]:
#sentence tokenization
sent2 = 'I am going to delhi.I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

['I am going to delhi',
 'I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [26]:
# problems with split function
sent3 ='I am going to delhi!'
sent3.split()

['I', 'am', 'going', 'to', 'delhi!']

#### 2. Regular Expression

In [27]:

# Word tokenization with a regex: each token is a run of word characters
# and apostrophes.
sent4 = 'I am going to delhi'
# Raw string so that ``\w`` reaches the regex engine untouched; in a plain
# string literal it is an invalid escape sequence and raises a warning.
tokens = re.findall(r"[\w']+", sent4)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [28]:

text = """Lorem ipsum dolor sit amet? this is is the general text. this text will autogenerated and used for dummy text."""
sentences = re.compile('[.!?]').split(text)
sentences

['Lorem ipsum dolor sit amet',
 ' this is is the general text',
 ' this text will autogenerated and used for dummy text',
 '']

#### 3. NLTK

In [29]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [30]:
sent1='I am going to delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'delhi', '!']

In [31]:
text = """Lorem ipsum dolor sit amet? this is is the general text. this text will autogenerated and used for dummy text."""
sent_tokenize(text)

['Lorem ipsum dolor sit amet?',
 'this is is the general text.',
 'this text will autogenerated and used for dummy text.']

In [32]:
sent5= "I have Ph.D in A.I"
sent6="we're here to help mail us on abc@gmail.com"
sent7="5km rides cost $7.5"
word_tokenize(sent5)


['I', 'have', 'Ph.D', 'in', 'A.I']

In [33]:
word_tokenize(sent6)

['we',
 "'re",
 'here',
 'to',
 'help',
 'mail',
 'us',
 'on',
 'abc',
 '@',
 'gmail.com']

In [34]:
word_tokenize(sent7)

['5km', 'rides', 'cost', '$', '7.5']

#### 4. Spacy

In [35]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [36]:
doc1= nlp(sent5)
doc2= nlp(sent6)
doc3= nlp(sent7)
doc4= nlp(sent1)

In [37]:
for token in doc4:
    print(token)

I
am
going
to
delhi
!


### 11. Stemming and Lemmatization

In [38]:
from nltk.stem.porter import PorterStemmer

In [39]:
ps = PorterStemmer()
def stem_words(text):
    """Stem every whitespace-separated word of ``text`` using ``ps``.

    Returns the stems joined by single spaces.
    """
    stems = map(ps.stem, text.split())
    return " ".join(stems)

In [40]:
sample="walk walks walking probably"
stem_words(sample)

'walk walk walk probabl'

In [41]:
from nltk.stem import WordNetLemmatizer

In [42]:
import nltk
wordnet_lemmatizer=WordNetLemmatizer()

sentence = 'he was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun.'
punctuation="?:!.,;"

# Build a new list rather than calling ``list.remove`` while iterating over
# the same list: removing during iteration shifts the remaining items and
# skips the token that follows each removed one.
punctuation_marks = set(punctuation)
sentence_words = [
    word
    for word in nltk.word_tokenize(sentence)
    if word not in punctuation_marks
]

# Lemmatize every remaining token as a verb (pos="v") and print it next to
# the original word in two 20-character columns.
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos="v")))

Word                Lemma               
he                  he                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
