In [2]:
import nltk
import string
import re

In [2]:
# remove whitespace from text
def remove_whitespace(text):
    return  " ".join(text.split())
 
input_str = "   we don't need   the given questions"
remove_whitespace(input_str)

"we don't need the given questions"

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# remove stopwords function
def remove_stopwords(text):
	stop_words = set(stopwords.words("english"))
	word_tokens = word_tokenize(text)
	filtered_text = [word for word in word_tokens if word not in stop_words]
	return filtered_text

example_text = "This is a sample sentence and we are going to remove the stopwords from this."
remove_stopwords(example_text)


['This', 'sample', 'sentence', 'going', 'remove', 'stopwords', '.']

In [4]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()

# stem words in the list of tokenized words
def stem_words(text):
	word_tokens = word_tokenize(text)
	stems = [stemmer.stem(word) for word in word_tokens]
	return stems

text = 'data science uses scientific methods algorithms and many types of processes'
stem_words(text)


['data',
 'scienc',
 'use',
 'scientif',
 'method',
 'algorithm',
 'and',
 'mani',
 'type',
 'of',
 'process']

In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/hp/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()
# lemmatize string
def lemmatize_word(text):
	word_tokens = word_tokenize(text)
	# provide context i.e. part-of-speech
	lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
	return lemmas

text = 'data science uses scientific methods algorithms and many types of processes'
lemmatize_word(text)


['data',
 'science',
 'use',
 'scientific',
 'methods',
 'algorithms',
 'and',
 'many',
 'type',
 'of',
 'process']

In [7]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# convert text into word_tokens with their tags
def pos_tagging(text):
	word_tokens = word_tokenize(text)
	return pos_tag(word_tokens)

pos_tagging('You just gave me a scare')


[('You', 'PRP'),
 ('just', 'RB'),
 ('gave', 'VBD'),
 ('me', 'PRP'),
 ('a', 'DT'),
 ('scare', 'NN')]

In [9]:
# download the tagset
nltk.download('tagsets')

# extract information about the tag
nltk.help.upenn_tagset('NN')


[nltk_data] Downloading package tagsets to /home/hp/nltk_data...


NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


[nltk_data]   Unzipping help/tagsets.zip.


In [None]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# define chunking function with text and regular
# expression representing grammar as parameter
def chunking(text, grammar):
	word_tokens = word_tokenize(text)

	# label words with part of speech
	word_pos = pos_tag(word_tokens)

	# create a chunk parser using grammar
	chunkParser = nltk.RegexpParser(grammar)

	# test it on the list of word tokens with tagged pos
	tree = chunkParser.parse(word_pos)
	
	for subtree in tree.subtrees():
		print(subtree)
	tree.draw()
	
sentence = 'the little yellow bird is flying in the sky'
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunking(sentence, grammar)


(S
  (NP the/DT little/JJ yellow/JJ bird/NN)
  is/VBZ
  flying/VBG
  in/IN
  (NP the/DT sky/NN))
(NP the/DT little/JJ yellow/JJ bird/NN)
(NP the/DT sky/NN)


In [5]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk

def named_entity_recognition(text):
	# tokenize the text
	word_tokens = word_tokenize(text)

	# part of speech tagging of words
	word_pos = pos_tag(word_tokens)

	# tree of word entities
	print(ne_chunk(word_pos))

text = 'Bill works for GeeksforGeeks so he went to Delhi for a meetup.'
named_entity_recognition(text)


(S
  (PERSON Bill/NNP)
  works/VBZ
  for/IN
  (ORGANIZATION GeeksforGeeks/NNP)
  so/RB
  he/PRP
  went/VBD
  to/TO
  (GPE Delhi/NNP)
  for/IN
  a/DT
  meetup/NN
  ./.)


In [4]:
nltk.download('words')

[nltk_data] Downloading package words to /home/hp/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True