In [1]:
from bs4 import BeautifulSoup
import re
import inflect

In [2]:
# Strip the HTML markup from a sample string and show the text before and after.
input_text = "<b> This text is in bold</br>, <i> This text is in italics </i>"
output_text = BeautifulSoup(input_text, "html.parser").get_text()
print(f'Input: {input_text}')
print(f'Output: {output_text}')

Input: <b> This text is in bold</br>, <i> This text is in italics </i>
Output:  This text is in bold,  This text is in italics 


# **Stemming and Lemmatization**

In [3]:
from nltk.stem import PorterStemmer 
import nltk.corpus 
from nltk.corpus import wordnet 
from nltk import word_tokenize 
import nltk 
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
porter = PorterStemmer()

In [5]:
# Print every sample word next to the stem the Porter stemmer produces for it.
word_list = [
    "see", "saw", "cat", "cats", "stem", "stemming", "lemma", "lemmatization",
    "known", "knowing", "time", "timing", "football", "footballers",
]
for word in word_list:
  print(f'{word} -> {porter.stem(word)}')

see -> see
saw -> saw
cat -> cat
cats -> cat
stem -> stem
stemming -> stem
lemma -> lemma
lemmatization -> lemmat
known -> known
knowing -> know
time -> time
timing -> time
football -> footbal
footballers -> footbal


In [8]:
# Fetch the Punkt tokenizer data; word_tokenize (used by SentenceStemmer below)
# presumably depends on it being installed -- confirm against the NLTK version in use.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
def SentenceStemmer(sentence):
  """Return ``sentence`` with every token replaced by its Porter stem.

  The sentence is split with ``word_tokenize``, each token is stemmed with the
  module-level ``porter`` stemmer, and the stems are joined by single spaces.
  """
  stemmed_tokens = map(porter.stem, word_tokenize(sentence))
  return " ".join(stemmed_tokens)


In [11]:
SentenceStemmer("The cats and dogs are running")

'the cat and dog are run'

In [12]:
wordnet_lemmatizer = WordNetLemmatizer()

In [13]:
# Lemmatize a few regular and irregular plurals, printing one lemma per line.
print(
    *(wordnet_lemmatizer.lemmatize(noun) for noun in ('horses', 'wolves', 'mice', 'cacti')),
    sep='\n',
)

horse
wolf
mouse
cactus


In [15]:
# Contrast on a word that is not in the dictionary: the lemmatizer hands it back
# unchanged, while the rule-based stemmer still strips the trailing "s".
print(wordnet_lemmatizer.lemmatize('madeupwords'))
print(porter.stem('madeupwords'))

madeupwords
madeupword


In [16]:
# Without an explicit part of speech, the past-tense form 'ran' is returned as-is.
print(wordnet_lemmatizer.lemmatize('ran'))
print(wordnet_lemmatizer.lemmatize('run'))

ran
run


In [17]:
# Passing pos='v' tells the lemmatizer to treat the word as a verb, so 'ran'
# now reduces to its base form 'run'.
print(wordnet_lemmatizer.lemmatize('ran', pos='v'))
print(wordnet_lemmatizer.lemmatize('run', pos='v'))

run
run


In [20]:
# Fetch the averaged-perceptron tagger model; nltk.pos_tag (used below)
# presumably depends on it -- confirm against the NLTK version in use.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [21]:
sentence = 'The cats and dogs are running'

def return_words_pos_tuples(sentence):
  """Tokenize ``sentence`` and return the result of POS-tagging its tokens.

  The value is whatever ``nltk.pos_tag`` yields for the token list, i.e. a
  sequence of ``(token, tag)`` pairs such as ``('cats', 'NNS')``.
  """
  tokens = nltk.word_tokenize(sentence)
  tagged_tokens = nltk.pos_tag(tokens)
  return tagged_tokens

return_words_pos_tuples(sentence)

[('The', 'DT'),
 ('cats', 'NNS'),
 ('and', 'CC'),
 ('dogs', 'NNS'),
 ('are', 'VBP'),
 ('running', 'VBG')]

In [24]:
def get_post_wordnet(pos_tag):
  """Translate a POS tag (e.g. ``'VBG'``) into the matching WordNet POS constant.

  Only the first letter of the tag is inspected, case-insensitively:
  ``N`` -> noun, ``V`` -> verb, ``J`` -> adjective, ``R`` -> adverb.

  Args:
    pos_tag: Tag string as found in the pairs returned by
      ``return_words_pos_tuples``.

  Returns:
    ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADJ`` or ``wordnet.ADV``.
    Any other tag, including the empty string, falls back to ``wordnet.NOUN``.
  """
  # Keys are single letters because the lookup uses only the tag's first
  # character; a two-letter key such as "NN" could never be matched.
  pos_dict = {
      "N": wordnet.NOUN,
      "V": wordnet.VERB,
      "J": wordnet.ADJ,
      "R": wordnet.ADV,
  }

  # Slice instead of indexing so an empty tag yields "" (and therefore the
  # noun default) rather than raising IndexError.
  return pos_dict.get(pos_tag[:1].upper(), wordnet.NOUN)

In [25]:
get_post_wordnet('VBG')

'v'

In [28]:
def lemmatize_with_pos(sentence):
  """Lemmatize each token of ``sentence`` using the part of speech it was tagged with.

  Tokens and tags come from ``return_words_pos_tuples``; each tag is converted
  with ``get_post_wordnet`` before being handed to ``wordnet_lemmatizer``.

  Returns:
    A list of lemmas, one per token, in the original token order.
  """
  return [
      wordnet_lemmatizer.lemmatize(token, pos=get_post_wordnet(tag))
      for token, tag in return_words_pos_tuples(sentence)
  ]

print(lemmatize_with_pos(sentence))

['The', 'cat', 'and', 'dog', 'be', 'run']
