# Text cleaning with regular expressions

https://docs.python.org/3.8/library/re.html

RegEx tutorial: https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285

https://regex101.com

In [21]:
import re
import string

In [22]:
text = 'Noisy Text Example with a lot of numbers and special symbols: 12345, #abc, @@qwerty, http://xyz. Data cleaning Done'

In [23]:
# Converting to lowercase
text = text.lower()
text

'noisy text example with a lot of numbers and special symbols: 12345, #abc, @@qwerty, http://xyz. data cleaning done'

In [24]:
# remove mentions
text = re.sub("@\\S+", " ", text)
text

'noisy text example with a lot of numbers and special symbols: 12345, #abc,   http://xyz. data cleaning done'

In [25]:
# remove url
text = re.sub("https*\\S+", " ", text)
text

'noisy text example with a lot of numbers and special symbols: 12345, #abc,     data cleaning done'

In [26]:
# remove hashtags
text = re.sub("#\\S+", " ", text)
print(text)

noisy text example with a lot of numbers and special symbols: 12345,       data cleaning done


In [27]:
# remove all numbers
text = re.sub("\\d", " ", text)  
text

'noisy text example with a lot of numbers and special symbols:      ,       data cleaning done'

In [28]:
# remove punctuations
text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
text

'noisy text example with a lot of numbers and special symbols               data cleaning done'

In [29]:
# remove extra spaces
text = re.sub('\\s{2,}', " ", text)
text

'noisy text example with a lot of numbers and special symbols data cleaning done'

In [30]:
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
text = ' '.join([word for word in text.split(' ') if word not in stop_words])
text

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/jupyter/nltk_data'
    - '/usr/local/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# example of text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub("@\\S+", " ", text)
    text = re.sub("https*\\S+", " ", text)
    text = re.sub("#\\S+", " ", text)
    text = re.sub("\\d", " ", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\\n', ' ', text)
    text = re.sub('\\s{2,}',' ', text)
    text = ' '.join([word for word in text.split(' ') if word not in stop_words])
    return text