## Text Cleanup

In [1]:
!python -m spacy download en_core_web_sm

  File "/usr/local/lib/python3.7/site.py", line 177
    file=sys.stderr)
        ^
SyntaxError: invalid syntax


In [2]:
# Sample paragraph made of three short sentences.
text = (
    'I am learning Natural Language Processing. '
    'I am fond of it. '
    'I like to make my career in the same field.'
)
text

'I am learning Natural Language Processing. I am fond of it. I like to make my career in the same field.'

In [3]:
## Importing the nltk library

import nltk

### Sentence Segmentation

In [4]:
# Break the sample paragraph into a list of sentence strings.
nltk.sent_tokenize(text)

['I am learning Natural Language Processing.',
 'I am fond of it.',
 'I like to make my career in the same field.']

In [5]:
# The dot inside the e-mail address must not be taken for a sentence boundary.
email_example = 'My email address is waj@simplilearn.com. I live in Hyderabad'
nltk.sent_tokenize(email_example)

['My email address is waj@simplilearn.com.', 'I live in Hyderabad']

### Word Tokenization

In [6]:
# Print each sentence next to its word-level tokens.
for sentence in nltk.sent_tokenize(text):
    tokens = nltk.word_tokenize(sentence)
    print(sentence, ' : ', tokens)

I am learning Natural Language Processing.  :  ['I', 'am', 'learning', 'Natural', 'Language', 'Processing', '.']
I am fond of it.  :  ['I', 'am', 'fond', 'of', 'it', '.']
I like to make my career in the same field.  :  ['I', 'like', 'to', 'make', 'my', 'career', 'in', 'the', 'same', 'field', '.']


In [7]:
# Limitation of the NLTK word tokenizer: the backtick in the surname is treated
# as a separate token, so "O`Neil" comes back split into three pieces.
name_example = 'Mr. Michael O`Neil works at Microsoft, located at 45 Avenue, United States of America'
nltk.word_tokenize(name_example)

['Mr.',
 'Michael',
 'O',
 '`',
 'Neil',
 'works',
 'at',
 'Microsoft',
 ',',
 'located',
 'at',
 '45',
 'Avenue',
 ',',
 'United',
 'States',
 'of',
 'America']

In [8]:
### Other tokenizer types available in NLTK

# e.g. nltk.tokenize.TweetTokenizer

In [9]:
text = 'I am working for the England cricket board as analytics engineer'

# Some words are used very frequently; the most common ones here are I, am, for, the, as.
# Stopwords are words that do not add any meaning to the sentence.

# Steps to remove stopwords: lowercase the text, word-tokenize it, then filter out every token found in the stopword list.

### Stopword Removal

In [10]:
# Hand-made stopword list (lowercase, because the text is lowercased before filtering).
my_stopwords = 'i am for the as'.split()

In [11]:
# Lowercase the text, tokenize it, and keep only the tokens that are not in my_stopwords.
clean_text = []
for token in nltk.word_tokenize(text.lower()):
    if token not in my_stopwords:
        clean_text.append(token)
clean_text

['working', 'england', 'cricket', 'board', 'analytics', 'engineer']

In [12]:
# The hand-made list does not contain 'he', 'is', 'an' or 'at', so this
# sentence passes through the filter untouched.
text = 'He is an experienced Natural Language Processing Engineer at Microsoft. '

lowered_tokens = nltk.word_tokenize(text.lower())
clean_text = [token for token in lowered_tokens if token not in my_stopwords]
clean_text

['he',
 'is',
 'an',
 'experienced',
 'natural',
 'language',
 'processing',
 'engineer',
 'at',
 'microsoft',
 '.']

In [13]:
# Use the ready-made English stopword list that ships with NLTK
from nltk.corpus import stopwords

# Note: `mystopwords` (NLTK's list) is a different variable from the hand-made `my_stopwords`.
mystopwords = stopwords.words('english')
print(mystopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
# Filter the tokens against NLTK's stopword list instead of the hand-made one.
text = 'He is a knowledgeable Natural Language Processing Engineer.'

lowered_tokens = nltk.word_tokenize(text.lower())
clean_text = [token for token in lowered_tokens if token not in mystopwords]
clean_text

['knowledgeable', 'natural', 'language', 'processing', 'engineer', '.']

In [15]:
# Stopword filtering alone leaves punctuation ('.') and numeric tokens
# ('4+', '2') in the result.
text = 'He is having experience of 4+ years in the field of NLP. I am working with him as his colleague since 2 years.'

clean_text = []
for token in nltk.word_tokenize(text.lower()):
    if token not in mystopwords:
        clean_text.append(token)
clean_text

['experience',
 '4+',
 'years',
 'field',
 'nlp',
 '.',
 'working',
 'colleague',
 'since',
 '2',
 'years',
 '.']

### Digits and Punctuation Removal

In [16]:
from string import punctuation

In [17]:
# The punctuation characters exported by the string module, as one single string.
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
# str.isdigit() is True for a digit-only string such as '3'.
'3'.isdigit()

True

In [19]:
text = 'He is having experience of 4+ years in the field of NLP. I am working with him as his colleague since 2 years.'

# Lowercase, tokenize and drop stopwords.
clean_text = [word for word in nltk.tokenize.word_tokenize(text.lower()) if word not in mystopwords]
# Drop tokens made up entirely of punctuation characters. `punctuation` is a
# str, so `word not in punctuation` would be a substring test and would let
# multi-character tokens such as '...' or '--' slip through.
clean_text = [word for word in clean_text if not all(char in punctuation for char in word)]
# Drop tokens made up entirely of digits.
clean_text = [word for word in clean_text if not word.isdigit()]
# NOTE: a mixed token such as '4+' is neither pure punctuation nor pure digits,
# so it is kept.
clean_text

['experience',
 '4+',
 'years',
 'field',
 'nlp',
 'working',
 'colleague',
 'since',
 'years']

### Stemming and Lemmatization

In [20]:
from nltk.stem import PorterStemmer

# One stemmer instance, reused by the stemming cells that follow.
stemmer = PorterStemmer()

In [21]:
# Plural noun: the trailing 's' is stripped.
stemmer.stem('cars')

'car'

In [22]:
# The resulting stem ('polici') does not have to be a dictionary word.
stemmer.stem('policies')

'polici'

In [23]:
# Suffix stripping again yields a non-word stem ('revolut').
stemmer.stem('revolution')

'revolut'

In [24]:
# The stemmer leaves the comparative 'better' unchanged.
stemmer.stem('better')

'better'

In [25]:
from nltk.stem import WordNetLemmatizer

# One lemmatizer instance, reused by the lemmatization cells that follow.
lemmatizer = WordNetLemmatizer()

In [26]:
# With pos='a' the lemmatizer maps 'better' to its base form 'good'.
lemmatizer.lemmatize('better',pos='a')

'good'

In [27]:
# Without a pos argument the same word comes back unchanged.
lemmatizer.lemmatize('better')

'better'

In [28]:
#Implementing Stemming

text = 'He is having experience of 4+ years in the field of NLP. I am working with him as his colleague since 2 years.'

# Lowercase, tokenize and drop stopwords.
clean_text = [word for word in nltk.tokenize.word_tokenize(text.lower()) if word not in mystopwords]
# Drop tokens made up entirely of punctuation characters. `punctuation` is a
# str, so `word not in punctuation` would be a substring test and would let
# multi-character tokens such as '...' or '--' slip through.
clean_text = [word for word in clean_text if not all(char in punctuation for char in word)]
# Drop tokens made up entirely of digits.
clean_text = [word for word in clean_text if not word.isdigit()]
# Reduce every remaining token to its Porter stem.
clean_text = [stemmer.stem(word) for word in clean_text]
clean_text

['experi', '4+', 'year', 'field', 'nlp', 'work', 'colleagu', 'sinc', 'year']

In [29]:
#Implementing Lemmatization

text = 'He is having experience of 4+ years in the field of NLP. I am working with him as his colleague since 2 years.'

# Lowercase, tokenize and drop stopwords.
clean_text = [word for word in nltk.tokenize.word_tokenize(text.lower()) if word not in mystopwords]
# Drop tokens made up entirely of punctuation characters. `punctuation` is a
# str, so `word not in punctuation` would be a substring test and would let
# multi-character tokens such as '...' or '--' slip through.
clean_text = [word for word in clean_text if not all(char in punctuation for char in word)]
# Drop tokens made up entirely of digits.
clean_text = [word for word in clean_text if not word.isdigit()]
# Lemmatize every remaining token as a verb (pos='v'): 'working' becomes
# 'work', while nouns such as 'years' are returned unchanged.
clean_text = [lemmatizer.lemmatize(word, pos='v') for word in clean_text]
clean_text

['experience',
 '4+',
 'years',
 'field',
 'nlp',
 'work',
 'colleague',
 'since',
 'years']

### Part-of-Speech Tagging

In [30]:
import spacy

# Load the small English pipeline and run it over one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One line per token: text, lemma, POS, tag, is-alphabetic flag, is-stopword
# flag, and spaCy's plain-English explanation of the POS label.
for token in doc:
    row = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.is_alpha,
        token.is_stop,
        spacy.explain(token.pos_),
    )
    print(*row)

Apple Apple PROPN NNP True False proper noun
is be AUX VBZ True True auxiliary
looking look VERB VBG True False verb
at at ADP IN True True adposition
buying buy VERB VBG True False verb
U.K. U.K. PROPN NNP False False proper noun
startup startup NOUN NN True False noun
for for ADP IN True True adposition
$ $ SYM $ False False symbol
1 1 NUM CD False False numeral
billion billion NUM CD True False numeral


In [31]:
# Keep the text of every token that spaCy does not flag as a stopword.
# `not token.is_stop` replaces the non-idiomatic `token.is_stop==False`.
[token.text for token in doc if not token.is_stop]

['Apple', 'looking', 'buying', 'U.K.', 'startup', '$', '1', 'billion']

### Named Entity Recognition

In [32]:
# List each named entity with its character span, its label and the label's description.
for entity in doc.ents:
    span_info = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(*span_info, spacy.explain(entity.label_))

Apple 0 5 ORG Companies, agencies, institutions, etc.
U.K. 27 31 GPE Countries, cities, states
$1 billion 44 54 MONEY Monetary values, including unit


In [33]:
# Reuse the `nlp` pipeline loaded earlier in the notebook: importing spaCy and
# loading the model a second time is redundant.
doc = nlp("An apple a day keeps a doctor away")

# One line per token: text, lemma, POS, tag, is-alphabetic flag, is-stopword
# flag, and spaCy's plain-English explanation of the POS label.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.is_alpha, token.is_stop, spacy.explain(token.pos_))

An an DET DT True True determiner
apple apple NOUN NN True False noun
a a DET DT True True determiner
day day NOUN NN True False noun
keeps keep VERB VBZ True False verb
a a DET DT True True determiner
doctor doctor NOUN NN True False noun
away away ADV RB True False adverb


In [36]:
# Run the entity listing on the most recent `doc`.
for entity in doc.ents:
    label_description = spacy.explain(entity.label_)
    print(entity.text, entity.start_char, entity.end_char, entity.label_, label_description)