# Text Cleaning in Python

In [1]:
#warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Sample sentences used as input for every cleaning step below
raw_docs = [
    "I am writing some very basic english sentences",
    "I'm just writing it for the practice PURPOSE to make self understanding the basics .",
    "The point is to _learn HOW it works_ on #simple # data.",
]

In [3]:
#importing nltk package
import nltk

In [4]:
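# One-time setup: download the NLTK data packages before running the cells below.
# From Python:  nltk.download()
# From a shell: python -m nltk.downloader all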

# Step 1 - convert to lower case

In [5]:
import string

# Lower-case every document; the result replaces raw_docs.
raw_docs = list(map(str.lower, raw_docs))
print(raw_docs)

['i am writing some very basic english sentences', "i'm just writing it for the practice purpose to make self understanding the basics .", 'the point is to _learn how it works_ on #simple # data.']


# Step 2 - Tokenization

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Word-level tokenization: one list of tokens per document
tokenized_docs = list(map(word_tokenize, raw_docs))
print(tokenized_docs)

print("#######################################################################################")

# Sentence-level tokenization: one list of sentences per document
sent_token = list(map(sent_tokenize, raw_docs))
print(sent_token)

[['i', 'am', 'writing', 'some', 'very', 'basic', 'english', 'sentences'], ['i', "'m", 'just', 'writing', 'it', 'for', 'the', 'practice', 'purpose', 'to', 'make', 'self', 'understanding', 'the', 'basics', '.'], ['the', 'point', 'is', 'to', '_learn', 'how', 'it', 'works_', 'on', '#', 'simple', '#', 'data', '.']]
#######################################################################################
[['i am writing some very basic english sentences'], ["i'm just writing it for the practice purpose to make self understanding the basics ."], ['the point is to _learn how it works_ on #simple # data.']]


# Step 3 - Punctuation Removal

In [7]:
# Strip punctuation characters out of every token and drop tokens that end up empty
import re

# Character class matching any single character listed in string.punctuation
# (see https://docs.python.org/3/library/string.html#string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))

tokenized_docs_no_punctuation = [
    [
        stripped
        for stripped in (regex.sub(u'', token) for token in review)
        if stripped  # a token made only of punctuation becomes '' and is skipped
    ]
    for review in tokenized_docs
]

print(tokenized_docs_no_punctuation)

[['i', 'am', 'writing', 'some', 'very', 'basic', 'english', 'sentences'], ['i', 'm', 'just', 'writing', 'it', 'for', 'the', 'practice', 'purpose', 'to', 'make', 'self', 'understanding', 'the', 'basics'], ['the', 'point', 'is', 'to', 'learn', 'how', 'it', 'works', 'on', 'simple', 'data']]


# Step 4 - Removing Stopwords

In [8]:
# Remove English stopwords from every tokenized document
from nltk.corpus import stopwords

# Build the stopword collection once, outside the loops, instead of calling
# stopwords.words('english') again for every token; a set also makes each
# membership test a hash lookup rather than a scan of the whole list.
english_stopwords = set(stopwords.words('english'))

# Keep, in their original order, only the tokens that are not stopwords.
tokenized_docs_no_stopwords = [
    [word for word in doc if word not in english_stopwords]
    for doc in tokenized_docs_no_punctuation
]

print(tokenized_docs_no_stopwords)

[['writing', 'basic', 'english', 'sentences'], ['writing', 'practice', 'purpose', 'make', 'self', 'understanding', 'basics'], ['point', 'learn', 'works', 'simple', 'data']]


# Step 5 - Stemming and Lemmatization

In [9]:
# Reduce every remaining token to its base form
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
wordnet = WordNetLemmatizer()

# Only lemmatization is applied here; to stem instead, map porter.stem
# over the tokens in place of wordnet.lemmatize.
preprocessed_docs = [
    list(map(wordnet.lemmatize, doc))
    for doc in tokenized_docs_no_stopwords
]

print(preprocessed_docs)

[['writing', 'basic', 'english', 'sentence'], ['writing', 'practice', 'purpose', 'make', 'self', 'understanding', 'basic'], ['point', 'learn', 'work', 'simple', 'data']]


# Advanced cleaning technique 1 - Normalization

In [10]:
# Sample sentence containing dates, ordinals, numbers and dotted abbreviations
text = (
    "On the 30th Jan 2020,Corona virus hit India with 1st case in kerala  anywhere, "
    "G.O.I started acting and allocated fund of 17287 Crores I.N.R"
)

In [1]:
# Normalise the sample sentence with the `normalise` package, supplying our own
# abbreviation expansions.
# NOTE(review): the recorded run of this cell stopped at the import below with
# "ModuleNotFoundError: No module named 'sklearn.semi_supervised.label_propagation'".
# Presumably `normalise` needs an older scikit-learn release that still provides
# that module path -- confirm and pin the environment accordingly.
from normalise import normalise

# Abbreviation -> expansion table handed to normalise() via user_abbrevs
custom_abbr = {
    "G.O.I": "Government Of India",
    "I.N.R": "Indian Rupees",
    "ttyl": "talk to you later",
    "G.O.A.T": "Great Of All Times",
}

# Tokenize first: normalise() is given a list of tokens, not the raw string.
text_tokens = word_tokenize(text)
normalized_tokens = normalise(text_tokens, user_abbrevs=custom_abbr, verbose=False)
display(f"Normalized text: {' '.join(normalized_tokens)}")

ModuleNotFoundError: No module named 'sklearn.semi_supervised.label_propagation'