# Basic text cleaning techniques

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import re, itertools
import nltk
from nltk.corpus import stopwords 

verbatim = '    So coooooooool ! I just love thisGreat product, <div>Great Product</div> http://www.greatproduct.com !!!'

print("Remove whitespaces\n")
verbatim = verbatim.strip()
print(verbatim)

print("\nRemove html tags\n")
verbatim = re.sub(r'<[^<]+?>', ' ', verbatim)
print(verbatim)

print("\nRemove urls\n")
verbatim = re.sub(r'https?:\/\/.*[\r\n]*', ' ', verbatim, flags=re.MULTILINE)
print(verbatim)

print("\nRemove ponctuation\n")
verbatim = re.sub(r'[^\w\s]','',verbatim)
print(verbatim)

print("\nStandardize words\n")
verbatim = ''.join(''.join(s)[:2] for _, s in itertools.groupby(verbatim))
print(verbatim)

print("\nSplit attached words\n")
verbatim = ' '.join(re.findall('[A-Z][^A-Z]*', verbatim))
print(verbatim)

print("\nLowercase\n")
verbatim = verbatim.lower()
print(verbatim)

print("\nStopwords\n")
verbatim = ' '.join([word for word in verbatim.split() if word not in (stopwords.words('english'))])
print(verbatim)

print("\nTokenize\n")
tokens = nltk.word_tokenize(verbatim)
print(tokens)

Remove whitespaces

So coooooooool ! I just love thisGreat product, <div>Great Product</div> http://www.greatproduct.com !!!

Remove html tags

So coooooooool ! I just love thisGreat product,  Great Product  http://www.greatproduct.com !!!

Remove urls

So coooooooool ! I just love thisGreat product,  Great Product   

Remove ponctuation

So coooooooool  I just love thisGreat product  Great Product   

Standardize words

So cool  I just love thisGreat product  Great Product  

Split attached words

So cool   I just love this Great product   Great  Product  

Lowercase

so cool   i just love this great product   great  product  

Stopwords

cool love great product great product

Tokenize

['cool', 'love', 'great', 'product', 'great', 'product']
