In [6]:
from nltk import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk import WordNetLemmatizer
import nltk

from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Load the raw corpus that every later cell works on.
# The context manager closes the file handle as soon as the read finishes
# (or fails); a bare open() would keep it open for the whole kernel session.
# NOTE(review): UTF-8 is assumed so the result does not depend on the
# platform's default encoding — confirm against how the file was saved.
with open("Natural_Language_Processing_Text.txt", encoding="utf-8") as text_file:
    text = text_file.read()
print(text)

# Basic crunch

In [None]:
# Sentence tokenizing: split the raw text into sentences, then show the
# sentence count followed by the sentences themselves.
sentences = sent_tokenize(text)
print(len(sentences), sentences, sep="\n")

In [None]:
# Word tokenizing: split the raw text into tokens, then show the token
# count followed by the tokens themselves.
words = word_tokenize(text)
print(len(words), words, sep="\n")

In [None]:
# Word frequency distribution over the unfiltered token list
fdist = FreqDist(words)

# Ten most frequent tokens, first as text ...
top_tokens = fdist.most_common(10)
print(top_tokens)

# ... then as a plot
fdist.plot(10)

# Removing punctuation

In [None]:
# Removing punctuation marks.
# Only purely alphabetic tokens are kept, lower-cased. Note that
# `str.isalpha` filters out more than punctuation: numbers and tokens that
# mix letters with other characters are dropped as well.
words_no_punc = list(map(str.lower, filter(str.isalpha, words)))
print(len(words_no_punc), words_no_punc, sep="\n")

In [None]:
# Frequency distribution of the lower-cased, alphabetic-only tokens
fdist_no_punc = FreqDist(words_no_punc)

# Ten most frequent tokens, first as text ...
top_no_punc = fdist_no_punc.most_common(10)
print(top_no_punc)

# ... then as a plot
fdist_no_punc.plot(10)

# Stopwords

In [None]:
stopwords = stopwords.words("english")
print(stopwords)

In [None]:
# Removing stopwords
clean_words = [w for w in words_no_punc if w not in stopwords]
print(len(clean_words))
print(clean_words)

In [None]:
# Frequency distribution of the tokens left after stopword removal
fdist_clean = FreqDist(clean_words)

# Ten most frequent tokens, first as text ...
top_clean = fdist_clean.most_common(10)
print(top_clean)

# ... then as a plot
fdist_clean.plot(10)

# Wordcloud

In [None]:
# Build the cloud from the raw text (not from `clean_words`), using the
# WordCloud defaults.
# NOTE(review): relies on generate() populating and returning the same
# instance, per the wordcloud API docs — confirm for the installed version.
wordcloud = WordCloud()
wordcloud.generate(text)

# Render on a large square canvas with the axes hidden
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis("off")

# Stemming

In [None]:
# Porter's stemmer
porter = PorterStemmer()

# Four inflections of the same word
words_study = ["Study", "Studying", "Studies", "Studied"]
study = list(map(porter.stem, words_study))
print(study)

# Four different words with plural / third-person style endings
# NOTE(review): `random` is also the name of a stdlib module; the name is
# kept here because other cells may refer to it.
words_random = ["Studies", "leaves", "decreases", "plays"]
random = list(map(porter.stem, words_random))
print(random)

In [None]:
# Languages available to SnowballStemmer (class-level attribute, so no
# stemmer instance is needed)
supported_languages = SnowballStemmer.languages
print(supported_languages)

# Lemmatizing

In [None]:
# Lemmatize several forms of the same verb
lemma = WordNetLemmatizer()
word_be = ["am", "is", "are", "was", "were"]

# pos = part of speech; "v" tells the lemmatizer to treat each form as a verb
be = [lemma.lemmatize(form, pos="v") for form in word_be]
print(be)

# Part-of-Speech Tagging

In [None]:
sentence = "A very beautiful young lady is walking on the beach"

# The tagger is given a token list, not the raw string
tokenized_words = word_tokenize(sentence)

# Left as the cell's last expression so the notebook displays the result
pos_tags = nltk.pos_tag(tokenized_words)
pos_tags

# Named Entity Recognition

In [4]:
# Named-entity demo on a single hand-written sentence.
sentence = "Mr. Smith made a deal on a beach of Switzerland near WHO"

# Pipeline: raw string -> tokens -> POS-tagged tokens -> chunk tree
tokenized_words = word_tokenize(sentence)
tagged_words = nltk.pos_tag(tokenized_words)
# binary=False is passed explicitly; the captured output below shows typed
# labels (PERSON, GPE, ORGANIZATION) on the recognised chunks.
n_e_r = nltk.ne_chunk(tagged_words, binary=False)
print(n_e_r)

(S
  (PERSON Mr./NNP)
  (PERSON Smith/NNP)
  made/VBD
  a/DT
  deal/NN
  on/IN
  a/DT
  beach/NN
  of/IN
  (GPE Switzerland/NNP)
  near/IN
  (ORGANIZATION WHO/NNP))


# Bag of Words

In [11]:
# Three small documents; the third one contains two sentences.
# NOTE(review): this rebinds `sentences`, replacing the list produced by
# sent_tokenize in the "Sentence tokenizing" cell.
sentences = [
    "Jim and Pam travelled by the bus",
    "The train was late",
    "The flight was full. Travelling by flight is expensive"
]

# Fit the vectorizer on the documents and count terms per document
cv = CountVectorizer()
doc_term_counts = cv.fit_transform(sentences)

# Convert to a dense array so the whole matrix is shown
bow = doc_term_counts.toarray()

bow

array([[1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1],
       [0, 0, 1, 1, 2, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1]])