# NLTK Basics

In [1]:
!pip install --user -U nltk



In [1]:
import nltk 
import numpy as np

### 1. Stemming

* Base words for stemming & stop words
* Helps reduce the amount of training data that we would need to train our models by capturing different variations of the word

In [14]:
from nltk.stem.snowball import SnowballStemmer

In [15]:
SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
# Alt
# nltk.download("all")

In [23]:
stemmer1 = SnowballStemmer("english")

In [25]:
# The stem for fishes, fishing, fished is "fish"
print(stemmer1.stem("fishing"))
print(stemmer1.stem("fishes"))
print(stemmer1.stem("fished"))

fish
fish
fish


In [26]:
stemmer2 = SnowballStemmer("english", ignore_stopwords="True")

In [29]:
print(stemmer1.stem("having")) # no stop words
print(stemmer2.stem("having")) # stop words not stemmed

have
having


### 2. Lemmatizer

In [30]:
from nltk.stem import WordNetLemmatizer

In [31]:
lemmatizer = WordNetLemmatizer()

In [34]:
lemmatizer.lemmatize("fishing")

'fishing'

In [38]:
lemmatizer.lemmatize("corpora")

'corpus'

### 3. Tokenization

In [78]:
s1 = "Hi, how are you doing? I am fine"

In [79]:
t1 = s.lower()
t1 = t1.replace(",", "")
t1 = t1.replace("?", "")
t1 = t1.split()

In [80]:
t1

['hi', 'how', 'are', 'you', 'doing']

In [81]:
import re

In [84]:
# Get the tokens for building ML model
s2 = "Hi, how are you doing? I am fine"
re.sub(r"[^\w]", " ", s2).split() # replace punctuations with a space

['Hi', 'how', 'are', 'you', 'doing', 'I', 'am', 'fine']

### 4. NLTK based Tokenization

In [85]:
from nltk.tokenize import word_tokenize

In [86]:
word_tokenize(s2)

['Hi', ',', 'how', 'are', 'you', 'doing', '?', 'I', 'am', 'fine']

In [87]:
from nltk.tokenize import wordpunct_tokenize # regex tokenizer

In [88]:
from nltk.tokenize import sent_tokenize

In [90]:
sent_tokenize(s2)

['Hi, how are you doing?', 'I am fine']

### 5. Bag of Words

Bag of Words based encoding or TF-IDF vector is a frequentist based approach to NLP applications.

In [2]:
import requests

url = "http://www.gutenberg.org/files/2554/2554.txt"

response = requests.get(url)
raw_html = response.content
text = raw_html.decode("utf-8-sig")

In [3]:
import urllib.request

url = "https://www.gutenberg.org/files/829/829-0.txt" # gulliver's travels
#Alt
#url = 'https://www.gutenberg.org/files/2701/2701-0.txt' # Moby Dick

file = urllib.request.urlopen(url)
text = [line.decode('utf-8') for line in file]
text = ''.join(text)

### 6. Tokenizing with punkt 

In [4]:
nltk.download('punkt')
from nltk import word_tokenize
tokens = word_tokenize(text)

[nltk_data] Downloading package punkt to /home/yashroff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import string
tokens = [word for word in tokens if word.isalpha()]
table = str.maketrans('', '', string.punctuation)
tokens = [w.translate(table) for w in tokens]
tokens = [word.lower() for word in tokens]

Removing **stop-words** and **stemming**

In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
tokens = [porter.stem(word) for word in tokens]
tokens[200:202]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yashroff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['travel', 'littl']

**Understanding the vocabulary** 

* A vocabulary of a document represents all the words in that document and the frequency they appear.
* `FreqDist` class


In [7]:
from nltk.probability import FreqDist

word_counts = FreqDist(tokens)
word_counts

FreqDist({'could': 395, 'upon': 393, 'would': 370, 'great': 298, 'one': 288, 'two': 252, 'time': 240, 'countri': 232, 'made': 228, 'much': 212, ...})

**Scoring words with frequency**

In [8]:
top = 100
vocabulary = word_counts.most_common(top)

vocabulary[:10]

[('could', 395),
 ('upon', 393),
 ('would', 370),
 ('great', 298),
 ('one', 288),
 ('two', 252),
 ('time', 240),
 ('countri', 232),
 ('made', 228),
 ('much', 212)]

In [9]:
voc_size = len(vocabulary)
doc_vector = np.zeros(voc_size)

word_vector = [(idx,word_counts[word[0]]) for idx, word in enumerate(vocabulary) if word[0] in word_counts.keys()] 
word_vector[10]

(10, 191)

In [12]:
# Generating a model of Bag of Words

from nltk import sent_tokenize

docs = sent_tokenize(text)[703:706]
docs

from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer=CountVectorizer(stop_words='english')
word_count_vector=count_vectorizer.fit_transform(docs)
word_count_vector.shape
word_count_vector.toarray()
count_vectorizer.get_feature_names()

['advance',
 'beards',
 'came',
 'clothes',
 'corn',
 'creep',
 'ears',
 'fallen',
 'field',
 'flesh',
 'forward',
 'heard',
 'impossible',
 'interwoven',
 'laid',
 'pierced',
 'pointed',
 'rain',
 'reapers',
 'shift',
 'stalks',
 'step',
 'strong',
 'till',
 'time',
 'wind',
 'yards']