#### Handling HTML Text

We can use the Beautiful Soup package to strip HTML markup from web data.

In [None]:
from bs4 import BeautifulSoup

In [None]:
def strip_html_tags(text):
    """Return the text content of ``text`` with the HTML markup removed.

    The input is parsed with Beautiful Soup using the ``html.parser``
    backend, and the text extracted from the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
# Example: the <html> and <h2> tags are stripped, leaving only the text inside them.
strip_html_tags('<html><h2>Some important text</h2></html>')

#### Removing Special Characters

In [None]:
import re

In [None]:
def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: String to clean.
        remove_digits: When true, the digits 0-9 are deleted as well.

    Returns:
        ``text`` with the disallowed characters removed. Characters are
        deleted outright, not replaced by spaces, so surrounding whitespace
        is left exactly as it was.
    """
    # Negated character classes: anything *outside* the class gets removed.
    disallowed = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', text)

In [None]:
# Example: with remove_digits=False the digits are kept; punctuation and symbols are deleted.
remove_special_characters("Well this was fun! What do you think? 123#@!?",
                          remove_digits=False)

In [None]:
# Example: with remove_digits=True the digits are deleted along with the symbols.
remove_special_characters("Well this was fun! What do you think? 123#@!",
                          remove_digits=True)

In [None]:
# Caveat: accented letters fall outside a-zA-Z, so they are deleted outright
# rather than converted to their unaccented form.
remove_special_characters('Sómě Áccěntěd těxt')

#### Removing Accented Characters

In [None]:
import unicodedata

In [None]:
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII, with accents stripped from letters.

    Accented characters that decompose into an ASCII base character plus
    combining marks keep the base character. Any character that still has no
    ASCII representation after decomposition is dropped.
    """
    # NFKD splits a character such as an accented letter into its base
    # character followed by separate combining marks.
    decomposed = unicodedata.normalize('NFKD', text)
    # Encoding with errors='ignore' discards everything that is not ASCII,
    # which includes the combining marks produced above.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
# Example: accented letters are reduced to their unaccented base letters.
remove_accented_chars('Sómě Áccěntěd těxt')

In [None]:
# Example: plain ASCII input contains nothing to strip and comes back unchanged.
remove_accented_chars('My name is Rajeev')

#### Text Lemmatization

In [None]:
import nltk

# Download the NLTK resources by name (this needs network access the first time).
# NOTE(review): 'punkt' is not used by any cell visible in this section —
# presumably needed for tokenization elsewhere; confirm or drop.
nltk.download('punkt')
# NOTE(review): 'wordnet' and 'omw-1.4' are presumably the data behind
# WordNetLemmatizer, which is used below — confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` with WordNet.

    Tokens come from ``str.split()``, so punctuation stays attached to the
    neighbouring word. Each token is passed to ``lemmatize`` without a
    part-of-speech argument, and the results are re-joined with single
    spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, text.split())
    return ' '.join(lemmas)

In [None]:
# Example: tokens are split on whitespace only, so 'crashing,' and 'yesterday,'
# reach the lemmatizer with their commas still attached.
lemmatize_text("My system keeps crashing, his crashed yesterday, ours crashes daily")

#### Text Stemming

In [None]:
def simple_stemmer(text):
    """Apply the Porter stemmer to every whitespace-separated token of ``text``.

    Tokens come from ``str.split()`` and the stemmed tokens are re-joined
    with single spaces.
    """
    stem = nltk.porter.PorterStemmer().stem
    return ' '.join(stem(token) for token in text.split())

In [None]:
# Example: a sentence containing several inflections of "crash". Tokens are
# split on whitespace only, so 'yesterday,' keeps its comma.
simple_stemmer("My system keeps crashing this crashed yesterday, ours crashes daily")

#### Working with Emojis

In [None]:
!pip install emoji --quiet

In [None]:
import emoji

In [None]:
# Input data: a short sentence that ends with an emoji.
input_text = 'He is 😳'

In [None]:
# Replace each emoji with its text name. The name comes back delimited by
# ':' characters, which the following cell removes.
output_text = emoji.demojize(input_text)
output_text

In [None]:
# Remove the ':' delimiters from the emoji text.
# Note: str.replace removes every ':' in the string, including any that
# belong to the ordinary text rather than to an emoji name.
output_text = output_text.replace(':','')
output_text